Coverage Report

Created: 2025-11-24 06:10

/src/llama.cpp/src/llama-model.cpp
Line
Count
Source
1
#include "llama-model.h"
2
3
#include "llama-impl.h"
4
#include "llama-mmap.h"
5
#include "llama-batch.h"
6
#include "llama-cparams.h"
7
#include "llama-model-loader.h"
8
9
#include "llama-kv-cache.h"
10
#include "llama-kv-cache-iswa.h"
11
#include "llama-memory-hybrid.h"
12
#include "llama-memory-recurrent.h"
13
14
#include "ggml-cpp.h"
15
16
#include "models/models.h"
17
18
#include <algorithm>
19
#include <cassert>
20
#include <cfloat>
21
#include <cstring>
22
#include <cmath>
23
#include <functional>
24
#include <map>
25
#include <regex>
26
#include <sstream>
27
#include <stdexcept>
28
29
0
const char * llm_type_name(llm_type type) {
30
0
    switch (type) {
31
0
        case LLM_TYPE_14M:           return "14M";
32
0
        case LLM_TYPE_17M:           return "17M";
33
0
        case LLM_TYPE_22M:           return "22M";
34
0
        case LLM_TYPE_33M:           return "33M";
35
0
        case LLM_TYPE_60M:           return "60M";
36
0
        case LLM_TYPE_70M:           return "70M";
37
0
        case LLM_TYPE_80M:           return "80M";
38
0
        case LLM_TYPE_109M:          return "109M";
39
0
        case LLM_TYPE_137M:          return "137M";
40
0
        case LLM_TYPE_140M:          return "140M";
41
0
        case LLM_TYPE_160M:          return "160M";
42
0
        case LLM_TYPE_190M:          return "190M";
43
0
        case LLM_TYPE_220M:          return "220M";
44
0
        case LLM_TYPE_250M:          return "250M";
45
0
        case LLM_TYPE_256M:          return "256M";
46
0
        case LLM_TYPE_270M:          return "270M";
47
0
        case LLM_TYPE_335M:          return "335M";
48
0
        case LLM_TYPE_350M:          return "350M";
49
0
        case LLM_TYPE_360M:          return "360M";
50
0
        case LLM_TYPE_410M:          return "410M";
51
0
        case LLM_TYPE_450M:          return "450M";
52
0
        case LLM_TYPE_475M:          return "475M";
53
0
        case LLM_TYPE_558M:          return "558M";
54
0
        case LLM_TYPE_700M:          return "700M";
55
0
        case LLM_TYPE_770M:          return "770M";
56
0
        case LLM_TYPE_780M:          return "780M";
57
0
        case LLM_TYPE_950M:          return "950M";
58
0
        case LLM_TYPE_0_3B:          return "0.3B";
59
0
        case LLM_TYPE_0_5B:          return "0.5B";
60
0
        case LLM_TYPE_0_6B:          return "0.6B";
61
0
        case LLM_TYPE_1B:            return "1B";
62
0
        case LLM_TYPE_1_2B:          return "1.2B";
63
0
        case LLM_TYPE_1_3B:          return "1.3B";
64
0
        case LLM_TYPE_1_4B:          return "1.4B";
65
0
        case LLM_TYPE_1_5B:          return "1.5B";
66
0
        case LLM_TYPE_1_6B:          return "1.6B";
67
0
        case LLM_TYPE_1_7B:          return "1.7B";
68
0
        case LLM_TYPE_1_8B:          return "1.8B";
69
0
        case LLM_TYPE_2B:            return "2B";
70
0
        case LLM_TYPE_2_6B:          return "2.6B";
71
0
        case LLM_TYPE_2_8B:          return "2.8B";
72
0
        case LLM_TYPE_2_9B:          return "2.9B";
73
0
        case LLM_TYPE_3B:            return "3B";
74
0
        case LLM_TYPE_4B:            return "4B";
75
0
        case LLM_TYPE_6B:            return "6B";
76
0
        case LLM_TYPE_6_9B:          return "6.9B";
77
0
        case LLM_TYPE_7B:            return "7B";
78
0
        case LLM_TYPE_8B:            return "8B";
79
0
        case LLM_TYPE_9B:            return "9B";
80
0
        case LLM_TYPE_11B:           return "11B";
81
0
        case LLM_TYPE_12B:           return "12B";
82
0
        case LLM_TYPE_13B:           return "13B";
83
0
        case LLM_TYPE_14B:           return "14B";
84
0
        case LLM_TYPE_15B:           return "15B";
85
0
        case LLM_TYPE_16B:           return "16B";
86
0
        case LLM_TYPE_20B:           return "20B";
87
0
        case LLM_TYPE_26B:           return "26B";
88
0
        case LLM_TYPE_27B:           return "27B";
89
0
        case LLM_TYPE_30B:           return "30B";
90
0
        case LLM_TYPE_32B:           return "32B";
91
0
        case LLM_TYPE_34B:           return "34B";
92
0
        case LLM_TYPE_35B:           return "35B";
93
0
        case LLM_TYPE_36B:           return "36B";
94
0
        case LLM_TYPE_40B:           return "40B";
95
0
        case LLM_TYPE_65B:           return "65B";
96
0
        case LLM_TYPE_70B:           return "70B";
97
0
        case LLM_TYPE_120B:          return "120B";
98
0
        case LLM_TYPE_142B:          return "142B";
99
0
        case LLM_TYPE_236B:          return "236B";
100
0
        case LLM_TYPE_290B:          return "290B";
101
0
        case LLM_TYPE_314B:          return "314B";
102
0
        case LLM_TYPE_405B:          return "405B";
103
0
        case LLM_TYPE_671B:          return "671B";
104
0
        case LLM_TYPE_SMALL:         return "0.1B";
105
0
        case LLM_TYPE_MEDIUM:        return "0.4B";
106
0
        case LLM_TYPE_LARGE:         return "0.8B";
107
0
        case LLM_TYPE_XL:            return "1.5B";
108
0
        case LLM_TYPE_A1_7B:         return "A1.7B";
109
0
        case LLM_TYPE_A2_7B:         return "A2.7B";
110
0
        case LLM_TYPE_8x7B:          return "8x7B";
111
0
        case LLM_TYPE_8x22B:         return "8x22B";
112
0
        case LLM_TYPE_16x12B:        return "16x12B";
113
0
        case LLM_TYPE_16x3_8B:       return "16x3.8B";
114
0
        case LLM_TYPE_10B_128x3_66B: return "10B+128x3.66B";
115
0
        case LLM_TYPE_57B_A14B:      return "57B.A14B";
116
0
        case LLM_TYPE_17B_16E:       return "17Bx16E (Scout)";
117
0
        case LLM_TYPE_17B_128E:      return "17Bx128E (Maverick)";
118
0
        case LLM_TYPE_A13B:          return "A13B";
119
0
        case LLM_TYPE_7B_A1B:        return "7B.A1B";
120
0
        case LLM_TYPE_8B_A1B:        return "8B.A1B";
121
0
        case LLM_TYPE_16B_A1B:       return "16B.A1B";
122
0
        case LLM_TYPE_21B_A3B:       return "21B.A3B";
123
0
        case LLM_TYPE_30B_A3B:       return "30B.A3B";
124
0
        case LLM_TYPE_100B_A6B:      return "100B.A6B";
125
0
        case LLM_TYPE_106B_A12B:     return "106B.A12B";
126
0
        case LLM_TYPE_230B_A10B:     return "230B.A10B";
127
0
        case LLM_TYPE_235B_A22B:     return "235B.A22B";
128
0
        case LLM_TYPE_300B_A47B:     return "300B.A47B";
129
0
        case LLM_TYPE_355B_A32B:     return "355B.A32B";
130
0
        case LLM_TYPE_E2B:           return "E2B";
131
0
        case LLM_TYPE_E4B:           return "E4B";
132
0
        default:                     return "?B";
133
0
    }
134
0
}
135
136
0
static const char * llama_expert_gating_func_name(llama_expert_gating_func_type type) {
137
0
    switch (type) {
138
0
        case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX: return "softmax";
139
0
        case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID: return "sigmoid";
140
0
        default:                                    return "unknown";
141
0
    }
142
0
}
143
144
static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
145
    { LLAMA_ROPE_SCALING_TYPE_NONE,       "none"       },
146
    { LLAMA_ROPE_SCALING_TYPE_LINEAR,     "linear"     },
147
    { LLAMA_ROPE_SCALING_TYPE_YARN,       "yarn"       },
148
    { LLAMA_ROPE_SCALING_TYPE_LONGROPE,   "longrope"   },
149
};
150
151
0
std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type) {
152
0
    return LLAMA_ROPE_SCALING_TYPES.at(rope_scaling_type);
153
0
}
154
155
0
static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
156
0
    for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
157
0
        if (kv.second == name) {
158
0
            return (llama_rope_scaling_type) kv.first;
159
0
        }
160
0
    }
161
162
0
    return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
163
0
}
164
165
// checks if the weight tensor can be used with the specified buffer type and device
166
0
static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
167
0
    GGML_ASSERT(w != nullptr);
168
169
0
    if (op == GGML_OP_NONE) {
170
0
        return true;
171
0
    }
172
173
0
    ggml_init_params params = {
174
0
        /*.mem_size   =*/ ggml_tensor_overhead()*8,
175
0
        /*.mem_buffer =*/ NULL,
176
0
        /*.no_alloc   =*/ true,
177
0
    };
178
0
    ggml_context_ptr ctx_ptr { ggml_init(params) };
179
0
    if (!ctx_ptr) {
180
0
        throw std::runtime_error(format("failed to create ggml context"));
181
0
    }
182
0
    ggml_context * ctx = ctx_ptr.get();
183
184
0
    ggml_tensor * op_tensor = nullptr;
185
186
0
    switch (op) {
187
0
        case GGML_OP_GET_ROWS:
188
0
            {
189
0
                ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
190
0
                op_tensor = ggml_get_rows(ctx, w, b);
191
0
            } break;
192
0
        case GGML_OP_MUL_MAT:
193
0
            {
194
0
                ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]);
195
0
                op_tensor = ggml_mul_mat(ctx, w, b);
196
0
            } break;
197
0
        case GGML_OP_MUL_MAT_ID:
198
0
            {
199
0
                int n_expert_used = hparams.n_expert_used;
200
0
                ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
201
0
                ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
202
0
                op_tensor = ggml_mul_mat_id(ctx, w, b, ids);
203
0
            } break;
204
0
        case GGML_OP_ADD:
205
0
            {
206
0
                ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
207
0
                op_tensor = ggml_add(ctx, a, w);
208
0
            } break;
209
0
        case GGML_OP_ADD_ID:
210
0
            {
211
0
                int n_expert_used = hparams.n_expert_used;
212
0
                ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
213
0
                ggml_tensor * c = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
214
0
                op_tensor = ggml_add_id(ctx, a, w, c);
215
0
            } break;
216
0
        case GGML_OP_MUL:
217
0
            {
218
0
                ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
219
0
                op_tensor = ggml_mul(ctx, a, w);
220
0
            } break;
221
0
        case GGML_OP_DIV:
222
0
            {
223
0
                ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, w->ne[0]);
224
0
                op_tensor = ggml_div(ctx, a, w);
225
0
            } break;
226
0
        case GGML_OP_ROPE:
227
0
            {
228
0
                int n_embd_head = hparams.n_embd_head_v;
229
0
                int n_head = hparams.n_head();
230
0
                ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, 512);
231
0
                ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
232
0
                op_tensor = ggml_rope_ext(
233
0
                    ctx, a, b, w,
234
0
                    0, 0, 0, 0, 0,
235
0
                    0, 0, 0, 0
236
0
                );
237
238
0
            } break;
239
0
        case GGML_OP_SSM_CONV:
240
0
            {
241
0
                const int64_t n_seq_tokens = 512;
242
0
                const int64_t n_seqs       = 3;
243
0
                ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0] - 1 + n_seq_tokens, w->ne[1], n_seqs);
244
0
                op_tensor = ggml_ssm_conv(ctx, conv_x, w);
245
0
            } break;
246
0
        case GGML_OP_SSM_SCAN:
247
0
            {
248
                // w is ssm_a, which is used to distinguish Mamba-1 and Mamba-2
249
0
                const int64_t d_state      = w->ne[0] == 1 ? hparams.ssm_d_state : w->ne[0];
250
0
                const int64_t n_head       = w->ne[1];
251
0
                const int64_t head_dim     = hparams.ssm_d_inner / n_head;
252
0
                const int64_t n_group      = hparams.ssm_n_group ? hparams.ssm_n_group : 1;
253
0
                const int64_t n_seq_tokens = 512;
254
0
                const int64_t n_seqs       = 3;
255
0
                ggml_tensor * s   = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, head_dim, n_head, n_seqs);
256
0
                ggml_tensor * x   = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_dim, n_head, n_seq_tokens, n_seqs);
257
0
                ggml_tensor * dt  = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_head, n_seq_tokens, n_seqs);
258
0
                ggml_tensor * B   = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
259
0
                ggml_tensor * C   = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
260
0
                ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_seqs);
261
0
                op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C, ids);
262
0
            } break;
263
0
        case GGML_OP_RWKV_WKV6:
264
0
            {
265
                // FIXME: placeholder dimensions - these should be derived from the model hparams
266
0
                const int64_t S = 123;
267
0
                const int64_t H = 123;
268
0
                const int64_t n_tokens = 123;
269
0
                const int64_t n_seqs = 123;
270
0
                ggml_tensor  * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
271
0
                ggml_tensor  * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
272
0
                ggml_tensor  * r = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
273
0
                ggml_tensor  * tf = w;
274
0
                ggml_tensor  * td = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
275
0
                ggml_tensor  * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H);
276
0
                op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
277
0
            } break;
278
0
        case GGML_OP_IM2COL:
279
0
            {
280
0
                const int n_embd_inp = hparams.n_embd_inp();
281
0
                ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd_inp, w->ne[1], 1, 1);
282
0
                op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
283
0
            } break;
284
0
        case GGML_OP_SCALE:
285
0
            {
286
0
                op_tensor = ggml_scale(ctx, w, 1.0f);
287
0
            } break;
288
0
        default:
289
0
            GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
290
0
    }
291
292
    // create a temporary dummy buffer for the weight so that supports_op can check the buffer type
293
0
    GGML_ASSERT(w->buffer == nullptr);
294
0
    w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
295
0
    bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
296
0
    ggml_backend_buffer_free(w->buffer);
297
0
    w->buffer = nullptr;
298
299
0
    return op_supported;
300
0
}
301
302
// lists of buffer types used for each layer
303
using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;
304
305
// find the first buffer type in the list that can use the tensor
306
0
static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hparams, ggml_tensor * tensor, ggml_op op, const buft_list_t & buft_list) {
307
0
    GGML_ASSERT(!buft_list.empty());
308
0
    for (const auto & cur : buft_list) {
309
0
        ggml_backend_dev_t cur_dev = cur.first;
310
0
        ggml_backend_buffer_type_t cur_buft = cur.second;
311
0
        if (weight_buft_supported(hparams, tensor, op, cur_buft, cur_dev)) {
312
0
            return cur_buft;
313
0
        }
314
0
    }
315
316
0
    return nullptr;
317
0
}
318
319
// CPU: ACCEL -> GPU host -> CPU extra -> CPU
320
0
static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices, bool use_extra_bufts, bool no_host) {
321
0
    buft_list_t buft_list;
322
323
    // add ACCEL buffer types
324
0
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
325
0
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
326
0
        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
327
0
            auto * buft = ggml_backend_dev_buffer_type(dev);
328
            // skip the CPU buffer type here - it is added last as the fallback
329
0
            if (buft != ggml_backend_cpu_buffer_type()) {
330
0
                buft_list.emplace_back(dev, buft);
331
0
            }
332
0
        }
333
0
    }
334
335
    // add a host buffer type
336
    // storing the tensors in a host buffer is useful when the processing of large batches
337
    // is offloaded to a GPU device, since it reduces the time spent on data transfers
338
    // generally, this will be done using the first device in the list
339
    // a better approach would be to handle this on a weight-by-weight basis using the offload_op
340
    // function of the device to determine if it would benefit from being stored in a host buffer
341
0
    if (!no_host) {
342
0
        for (auto * dev : devices) {
343
0
            ggml_backend_buffer_type_t buft = ggml_backend_dev_host_buffer_type(dev);
344
0
            if (buft) {
345
0
                buft_list.emplace_back(dev, buft);
346
0
                break;
347
0
            }
348
0
        }
349
0
    }
350
351
    // add extra buffer types
352
0
    if (use_extra_bufts) {
353
0
        auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
354
0
        if (cpu_dev == nullptr) {
355
0
            throw std::runtime_error(format("%s: no CPU backend found", __func__));
356
0
        }
357
358
0
        auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
359
0
        auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
360
0
            ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
361
0
        if (ggml_backend_dev_get_extra_bufts_fn) {
362
0
            ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
363
0
            while (extra_bufts && *extra_bufts) {
364
0
                buft_list.emplace_back(cpu_dev, *extra_bufts);
365
0
                ++extra_bufts;
366
0
            }
367
0
        }
368
0
    }
369
370
    // add the CPU buffer type
371
0
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
372
0
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
373
0
        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
374
0
            buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));
375
0
        }
376
0
    }
377
378
0
    return buft_list;
379
0
}
380
381
// GPU: split if LLAMA_SPLIT_MODE_ROW -> GPU
382
0
static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, llama_split_mode split_mode, const float * tensor_split) {
383
0
    buft_list_t buft_list;
384
385
    // add the device split buffer type if requested and available
386
0
    if (split_mode == LLAMA_SPLIT_MODE_ROW) {
387
0
        ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
388
0
        auto ggml_backend_split_buffer_type_fn = (ggml_backend_split_buffer_type_t)
389
0
            ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type");
390
0
        if (ggml_backend_split_buffer_type_fn) {
391
0
            size_t dev_index = [&]() {
392
0
                auto * reg = ggml_backend_dev_backend_reg(dev);
393
0
                for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); ++i) {
394
0
                    if (ggml_backend_reg_dev_get(reg, i) == dev) {
395
0
                        return i;
396
0
                    }
397
0
                }
398
0
                throw std::runtime_error(format("device %s not found in its backend reg", ggml_backend_dev_name(dev)));
399
0
            }();
400
0
            auto * buft = ggml_backend_split_buffer_type_fn(dev_index, tensor_split);
401
0
            if (buft != nullptr) {
402
0
                buft_list.emplace_back(dev, buft);
403
0
            }
404
0
        }
405
0
    }
406
407
    // add the device default buffer type
408
0
    buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));
409
410
    // add the device extra buffer type (if any)
411
0
    ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
412
0
    auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
413
0
        ggml_backend_reg_get_proc_address(reg, "ggml_backend_dev_get_extra_bufts");
414
415
0
    if (ggml_backend_dev_get_extra_bufts_fn) {
416
0
        ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(dev);
417
0
        while (extra_bufts && *extra_bufts) {
418
0
            buft_list.emplace_back(dev, *extra_bufts);
419
0
            ++extra_bufts;
420
0
        }
421
0
    }
422
423
0
    return buft_list;
424
0
}
425
426
struct llama_model::impl {
427
0
    impl() {}
428
0
    ~impl() {}
429
430
    uint64_t n_elements = 0;
431
432
    size_t n_bytes = 0;
433
434
    std::string desc_str;
435
436
    // model memory-mapped files
437
    llama_mmaps mappings;
438
439
    // objects representing data potentially being locked in memory
440
    llama_mlocks mlock_bufs;
441
    llama_mlocks mlock_mmaps;
442
443
    // contexts where the model tensor metadata is stored, as well as the corresponding buffers:
444
    std::vector<std::pair<ggml_context_ptr, std::vector<ggml_backend_buffer_ptr>>> ctxs_bufs;
445
446
    buft_list_t cpu_buft_list;
447
    std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;
448
449
    struct layer_dev {
450
        ggml_backend_dev_t dev;
451
        buft_list_t * buft_list;
452
    };
453
454
    layer_dev dev_input = {};
455
    layer_dev dev_output = {};
456
    std::vector<layer_dev> dev_layer;
457
458
    bool has_tensor_overrides;
459
};
460
461
0
llama_model::llama_model(const llama_model_params & params) : params(params), pimpl(std::make_unique<impl>()) {
462
0
    pimpl->has_tensor_overrides = params.tensor_buft_overrides && params.tensor_buft_overrides[0].pattern;
463
0
}
464
465
0
llama_model::~llama_model() {}
466
467
0
void llama_model::load_stats(llama_model_loader & ml) {
468
0
    pimpl->n_elements = ml.n_elements;
469
0
    pimpl->n_bytes = ml.n_bytes;
470
0
}
471
472
0
void llama_model::load_arch(llama_model_loader & ml) {
473
0
    arch = ml.get_arch();
474
0
    if (arch == LLM_ARCH_UNKNOWN) {
475
0
        throw std::runtime_error("unknown model architecture: '" + ml.get_arch_name() + "'");
476
0
    }
477
0
}
478
479
0
void llama_model::load_hparams(llama_model_loader & ml) {
480
0
    const gguf_context * ctx = ml.meta.get();
481
482
    // get metadata as string
483
0
    for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
484
0
        gguf_type type = gguf_get_kv_type(ctx, i);
485
0
        if (type == GGUF_TYPE_ARRAY) {
486
0
            continue;
487
0
        }
488
0
        const char * name = gguf_get_key(ctx, i);
489
0
        const std::string value = gguf_kv_to_str(ctx, i);
490
0
        gguf_kv.emplace(name, value);
491
0
    }
492
493
    // get general kv
494
0
    ml.get_key(LLM_KV_GENERAL_NAME, name, false);
495
496
    // everything past this point is not vocab-related
497
    // for CLIP models, we only need to load tensors, no hparams
498
0
    if (hparams.vocab_only || ml.get_arch() == LLM_ARCH_CLIP) {
499
0
        return;
500
0
    }
501
502
0
    ml.get_key(LLM_KV_CONTEXT_LENGTH,          hparams.n_ctx_train);
503
0
    ml.get_key(LLM_KV_EMBEDDING_LENGTH,        hparams.n_embd);
504
0
    ml.get_key(LLM_KV_BLOCK_COUNT,             hparams.n_layer);
505
0
    ml.get_key(LLM_KV_EXPERT_COUNT,            hparams.n_expert,        false);
506
0
    ml.get_key(LLM_KV_EXPERT_USED_COUNT,       hparams.n_expert_used,   false);
507
0
    ml.get_key(LLM_KV_EXPERT_GROUP_COUNT,      hparams.n_expert_groups, false);
508
0
    ml.get_key(LLM_KV_EXPERT_GROUP_USED_COUNT, hparams.n_group_used,    false);
509
510
0
    if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
511
0
        ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
512
513
0
        ml.get_key(LLM_KV_POSNET_EMBEDDING_LENGTH, hparams.posnet.n_embd);
514
0
        ml.get_key(LLM_KV_POSNET_BLOCK_COUNT,      hparams.posnet.n_layer);
515
516
0
        ml.get_key(LLM_KV_CONVNEXT_EMBEDDING_LENGTH, hparams.convnext.n_embd);
517
0
        ml.get_key(LLM_KV_CONVNEXT_BLOCK_COUNT,      hparams.convnext.n_layer);
518
0
    }
519
520
0
    GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
521
0
    GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
522
0
    if (hparams.n_expert > 0) {
523
0
        GGML_ASSERT(hparams.n_expert_used > 0);
524
0
        GGML_ASSERT(hparams.n_expert_groups < hparams.n_expert);
525
0
        if (hparams.n_expert_groups > 1) {
526
0
            GGML_ASSERT(hparams.n_expert % hparams.n_expert_groups == 0);
527
0
            GGML_ASSERT(hparams.n_group_used > 0);
528
0
            GGML_ASSERT(hparams.n_group_used < hparams.n_expert_groups);
529
0
        }
530
0
    } else {
531
0
        GGML_ASSERT(hparams.n_expert_used == 0);
532
0
        GGML_ASSERT(hparams.n_expert_groups == 0);
533
0
    }
534
535
0
    std::fill(hparams.n_head_arr.begin(),    hparams.n_head_arr.end(),    0);
536
0
    std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
537
0
    std::fill(hparams.n_ff_arr.begin(),      hparams.n_ff_arr.end(),      0);
538
0
    std::fill(
539
0
        hparams.recurrent_layer_arr.begin(),
540
0
        hparams.recurrent_layer_arr.end(),
541
0
        llm_arch_is_recurrent(ml.get_arch()));
542
543
0
    std::fill(hparams.rope_sections.begin(), hparams.rope_sections.end(), 0);
544
0
    std::fill(hparams.swa_layers.begin(), hparams.swa_layers.end(), 0);
545
546
0
    std::fill(hparams.xielu_alpha_n.begin(), hparams.xielu_alpha_n.end(), 0.0f);
547
0
    std::fill(hparams.xielu_alpha_p.begin(), hparams.xielu_alpha_p.end(), 0.0f);
548
0
    std::fill(hparams.xielu_beta.begin(), hparams.xielu_beta.end(), 0.0f);
549
0
    std::fill(hparams.xielu_eps.begin(), hparams.xielu_eps.end(), 0.0f);
550
551
0
    ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH,  hparams.n_ff_arr,   hparams.n_layer, false);
552
0
    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
553
554
    // n_head_kv is optional, default to n_head
555
0
    hparams.n_head_kv_arr = hparams.n_head_arr;
556
557
0
    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, hparams.n_layer, false);
558
559
0
    bool rope_finetuned = false;
560
0
    ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
561
0
    hparams.rope_finetuned = rope_finetuned;
562
563
0
    hparams.n_ctx_orig_yarn = hparams.n_ctx_train;
564
0
    ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn, false);
565
566
    // rope_freq_base (optional)
567
0
    hparams.rope_freq_base_train = 10000.0f;
568
0
    ml.get_key(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train, false);
569
570
0
    std::string rope_scaling("linear");
571
0
    ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
572
0
    hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
573
0
    GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);
574
575
    // rope_freq_scale (inverse of the kv) is optional
576
0
    float ropescale = 0.0f;
577
0
    if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
578
        // try the old key name
579
0
        ml.get_key(LLM_KV_ROPE_SCALE_LINEAR, ropescale, false);
580
0
    }
581
0
    hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
582
583
    // by default assume that the sliding-window layers use the same scaling type as the non-sliding-window layers
584
0
    hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
585
0
    hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
586
587
0
    ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
588
589
    // non-transformer models do not have attention heads
590
0
    if (hparams.n_head() > 0) {
591
        // gpt-neox n_rot = rotary_pct * (n_embd / n_head)
592
        // gpt-j n_rot = rotary_dim
593
594
0
        hparams.n_embd_head_k = hparams.n_embd / hparams.n_head();
595
0
        ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
596
597
0
        hparams.n_embd_head_v = hparams.n_embd / hparams.n_head();
598
0
        ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
599
600
        // sanity check for n_rot (optional)
601
0
        hparams.n_rot = hparams.n_embd_head_k;
602
603
0
        ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
604
605
0
        if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
606
0
            if (hparams.n_rot != hparams.n_embd_head_k) {
607
0
                throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
608
0
            }
609
0
        }
610
0
    } else {
611
0
        hparams.n_rot = 0;
612
0
        hparams.n_embd_head_k = 0;
613
0
        hparams.n_embd_head_v = 0;
614
0
    }
615
616
    // for differentiating model types
617
0
    uint32_t n_vocab = 0;
618
0
    ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);
619
620
    // for classifier models
621
0
    ml.get_arr(LLM_KV_CLASSIFIER_OUTPUT_LABELS, classifier_labels, false);
622
0
    if (!classifier_labels.empty()) {
623
0
        hparams.n_cls_out = classifier_labels.size();
624
0
    }
625
626
    // arch-specific KVs
627
0
    switch (arch) {
628
0
        case LLM_ARCH_LLAMA:
629
0
            {
630
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
631
632
0
                if (hparams.n_expert == 8) {
633
0
                    switch (hparams.n_layer) {
634
0
                        case 32: type = LLM_TYPE_8x7B; break;
635
0
                        case 56: type = LLM_TYPE_8x22B; break;
636
0
                        default: type = LLM_TYPE_UNKNOWN;
637
0
                    }
638
0
                } else {
639
0
                    switch (hparams.n_layer) {
640
0
                        case 16: type = LLM_TYPE_1B; break; // Llama 3.2 1B
641
0
                        case 22: type = LLM_TYPE_1B; break;
642
0
                        case 26: type = LLM_TYPE_3B; break;
643
0
                        case 28: type = LLM_TYPE_3B; break; // Llama 3.2 3B
644
0
                        case 30: type = LLM_TYPE_256M; break; // smoldocling 256M
645
                        // granite uses a vocab with len 49152
646
0
                        case 32: type = n_vocab == 49152 ? LLM_TYPE_3B : (n_vocab < 40000 ? LLM_TYPE_7B : LLM_TYPE_8B); break;
647
0
                        case 36: type = LLM_TYPE_8B; break; // granite
648
0
                        case 40: type = LLM_TYPE_13B; break;
649
0
                        case 48: type = LLM_TYPE_34B; break;
650
0
                        case 60: type = LLM_TYPE_30B; break;
651
0
                        case 80: type = hparams.n_head() == hparams.n_head_kv() ? LLM_TYPE_65B : LLM_TYPE_70B; break;
652
0
                        default: type = LLM_TYPE_UNKNOWN;
653
0
                    }
654
0
                }
655
0
            } break;
656
0
        case LLM_ARCH_LLAMA4:
657
0
            {
658
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
659
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
660
0
                ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP,   hparams.n_moe_layer_step);
661
662
0
                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
663
0
                if (found_swa && hparams.n_swa == 0) {
664
0
                    hparams.swa_type             = LLAMA_SWA_TYPE_NONE;
665
0
                    hparams.n_no_rope_layer_step = hparams.n_layer; // always use rope
666
0
                } else {
667
0
                    hparams.swa_type      = LLAMA_SWA_TYPE_CHUNKED;
668
0
                    hparams.n_swa         = 8192;
669
0
                    hparams.set_swa_pattern(4);   // pattern: 3 chunked - 1 full
670
0
                }
671
672
0
                switch (hparams.n_expert) {
673
0
                    case 0: {
674
                        // MobileLLM (no MoE)
675
0
                        switch (hparams.n_embd) {
676
0
                            case 2048: type = LLM_TYPE_140M; break;
677
0
                            case 4096: type = LLM_TYPE_360M; break;
678
0
                            case 6144: type = LLM_TYPE_950M; break;
679
0
                            default:   type = LLM_TYPE_UNKNOWN;
680
0
                        }
681
0
                    } break;
682
0
                    case 16:  type = LLM_TYPE_17B_16E; break;
683
0
                    case 128: type = LLM_TYPE_17B_128E; break;
684
0
                    default:  type = LLM_TYPE_UNKNOWN;
685
0
                }
686
687
0
                hparams.use_kq_norm = type != LLM_TYPE_17B_128E;
688
0
            } break;
689
0
        case LLM_ARCH_ARCEE:
690
0
            {
691
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
692
693
                // Arcee uses the same structure as Llama
694
0
                switch (hparams.n_layer) {
695
0
                    case 36: type = LLM_TYPE_4B; break;
696
0
                    default: type = LLM_TYPE_UNKNOWN;
697
0
                }
698
0
            } break;
699
0
        case LLM_ARCH_AFMOE:
700
0
            {
701
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
702
0
                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead);
703
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
704
0
                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,         hparams.n_expert_shared);
705
0
                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,          hparams.expert_gating_func, false);
706
0
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale, false);
707
0
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,         hparams.expert_weights_norm, false);
708
0
                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa, false);
709
710
                // Set up interleaved sliding window attention (ISWA)
711
                // Pattern: 3 sliding - 1 full (global_attn_every_n_layers = 4)
712
0
                if (hparams.n_swa > 0) {
713
0
                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
714
0
                    hparams.set_swa_pattern(4);
715
0
                } else {
716
0
                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
717
0
                }
718
719
                // Default to sigmoid if not set
720
0
                if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
721
0
                    hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
722
0
                }
723
724
0
                switch (hparams.n_layer) {
725
0
                    case 56: type = LLM_TYPE_6B; break;
726
0
                    case 32: type = LLM_TYPE_26B; break;
727
0
                    default: type = LLM_TYPE_UNKNOWN;
728
0
                }
729
0
            } break;
730
0
        case LLM_ARCH_DECI:
731
0
            {
732
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
733
0
                switch (hparams.n_layer) {
734
0
                    case 32: type = LLM_TYPE_7B; break;
735
0
                    case 80: type = LLM_TYPE_70B; break;
736
0
                    case 162: type = LLM_TYPE_405B; break;
737
0
                    default: type = LLM_TYPE_UNKNOWN;
738
0
                }
739
0
            } break;
740
0
        case LLM_ARCH_MINICPM:
741
0
            {
742
                // Backward-compatible defaults for older MiniCPM GGUFs
743
0
                hparams.f_embedding_scale = 12.0f;
744
0
                hparams.f_residual_scale  = 1.4f / sqrtf(float(hparams.n_layer));
745
0
                hparams.f_logit_scale     = hparams.n_embd ? (256.0f / float(hparams.n_embd)) : 1.0f;
746
747
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
748
749
                // Optional KV reads; these override the defaults if present in newer GGUF exports
750
0
                ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, /*required=*/false);
751
0
                ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale, /*required=*/false);
752
0
                ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, /*required=*/false);
753
754
                // MiniCPM uses rope by default, unlike Granite which uses it as a switch
755
0
                hparams.rope_finetuned = true;
756
757
0
                switch (hparams.n_layer) {
758
0
                    case 52: type = LLM_TYPE_1B; break;
759
0
                    case 40: type = LLM_TYPE_2B; break;
760
0
                    default: type = LLM_TYPE_UNKNOWN;
761
0
                }
762
0
            } break;
763
0
        case LLM_ARCH_MINICPM3:
764
0
            {
765
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
766
0
                ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK,       hparams.n_lora_q);
767
0
                ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK,      hparams.n_lora_kv);
768
769
0
                switch (hparams.n_layer) {
770
0
                    case 62: type = LLM_TYPE_4B; break;
771
0
                    default: type = LLM_TYPE_UNKNOWN;
772
0
                }
773
0
            } break;
774
0
        case LLM_ARCH_GROK:
775
0
            {
776
                // defaults for old GGUFs
777
0
                hparams.yarn_beta_fast = 8.0f;
778
0
                hparams.f_logit_scale = 0.5773502691896257f;
779
0
                hparams.f_embedding_scale = 78.38367176906169f;
780
0
                hparams.f_attn_out_scale = 0.08838834764831845f;
781
0
                hparams.f_attn_logit_softcapping = 30.0f;
782
0
                hparams.f_router_logit_softcapping = 30.0f;
783
                // no final_logit_softcapping in grok-1
784
0
                hparams.f_final_logit_softcapping = 0.0f;
785
786
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,  hparams.f_norm_rms_eps);
787
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,   hparams.n_ff_exp, false);
788
0
                ml.get_key(LLM_KV_LOGIT_SCALE,                  hparams.f_logit_scale, false);
789
0
                ml.get_key(LLM_KV_EMBEDDING_SCALE,              hparams.f_embedding_scale, false);
790
0
                ml.get_key(LLM_KV_ATTENTION_OUTPUT_SCALE,       hparams.f_attn_out_scale, false);
791
0
                ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING,       hparams.f_attn_logit_softcapping, false);
792
0
                ml.get_key(LLM_KV_ROUTER_LOGIT_SOFTCAPPING,     hparams.f_router_logit_softcapping, false);
793
0
                ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING,      hparams.f_final_logit_softcapping, false);
794
795
0
                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH,  hparams.attn_temp_length, false);
796
0
                ml.get_key(LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR,  hparams.yarn_ext_factor, false);
797
0
                ml.get_key(LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, hparams.yarn_attn_factor, false);
798
0
                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST,   hparams.yarn_beta_fast, false);
799
0
                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW,   hparams.yarn_beta_slow, false);
800
801
0
                switch (hparams.n_layer) {
802
0
                    case 64: type = LLM_TYPE_314B; break;
803
0
                    default: type = LLM_TYPE_UNKNOWN;
804
0
                }
805
0
            } break;
806
0
        case LLM_ARCH_FALCON:
807
0
            {
808
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
809
810
0
                switch (hparams.n_layer) {
811
0
                    case 32: type = LLM_TYPE_7B; break;
812
0
                    case 60: type = LLM_TYPE_40B; break;
813
0
                    default: type = LLM_TYPE_UNKNOWN;
814
0
                }
815
0
            } break;
816
0
        case LLM_ARCH_BAICHUAN:
817
0
            {
818
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
819
0
                switch (hparams.n_layer) {
820
0
                    case 32: type = LLM_TYPE_7B; break;
821
0
                    case 40: type = LLM_TYPE_13B; break;
822
0
                    default: type = LLM_TYPE_UNKNOWN;
823
0
                }
824
825
0
                if (type == LLM_TYPE_13B) {
826
                    // TODO: become GGUF KV parameter
827
0
                    hparams.f_max_alibi_bias = 8.0f;
828
0
                }
829
0
            } break;
830
0
        case LLM_ARCH_STARCODER:
831
0
            {
832
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
833
0
                switch (hparams.n_layer) {
834
0
                    case 24: type = LLM_TYPE_1B; break;
835
0
                    case 36: type = LLM_TYPE_3B; break;
836
0
                    case 42: type = LLM_TYPE_7B; break;
837
0
                    case 40: type = LLM_TYPE_15B; break;
838
0
                    default: type = LLM_TYPE_UNKNOWN;
839
0
                }
840
0
            } break;
841
0
        case LLM_ARCH_REFACT:
842
0
            {
843
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
844
0
                switch (hparams.n_layer) {
845
0
                    case 32: type = LLM_TYPE_1B; break;
846
0
                    default: type = LLM_TYPE_UNKNOWN;
847
0
                }
848
849
                // TODO: become GGUF KV parameter
850
0
                hparams.f_max_alibi_bias = 8.0f;
851
0
            } break;
852
0
        case LLM_ARCH_BERT:
853
0
            {
854
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
855
0
                ml.get_key(LLM_KV_ATTENTION_CAUSAL,           hparams.causal_attn);
856
0
                ml.get_key(LLM_KV_POOLING_TYPE,               hparams.pooling_type, false);
857
858
0
                switch (hparams.n_layer) {
859
0
                    case 3:
860
0
                        type = LLM_TYPE_17M; break; // bge-micro
861
0
                    case 6:
862
0
                        type = LLM_TYPE_22M; break; // MiniLM-L6
863
0
                    case 12:
864
0
                        switch (hparams.n_embd) {
865
0
                            case 384: type = LLM_TYPE_33M; break; // MiniLM-L12, bge-small
866
0
                            case 768: type = LLM_TYPE_109M; break; // bge-base
867
0
                            default: type = LLM_TYPE_UNKNOWN;
868
0
                        } break;
869
0
                    case 24:
870
0
                        type = LLM_TYPE_335M; break; // bge-large
871
0
                    default: type = LLM_TYPE_UNKNOWN;
872
0
                }
873
0
            } break;
874
0
        case LLM_ARCH_JINA_BERT_V2:
875
0
            {
876
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
877
0
                ml.get_key(LLM_KV_ATTENTION_CAUSAL,           hparams.causal_attn);
878
0
                ml.get_key(LLM_KV_POOLING_TYPE,               hparams.pooling_type, false);
879
0
                hparams.f_max_alibi_bias = 8.0f;
880
881
0
                switch (hparams.n_layer) {
882
0
                    case 4:  type = LLM_TYPE_33M;  break; // jina-embeddings-small
883
0
                    case 12: type = LLM_TYPE_137M; break; // jina-embeddings-base
884
0
                    default: type = LLM_TYPE_UNKNOWN;
885
0
                }
886
0
            } break;
887
0
        case LLM_ARCH_JINA_BERT_V3:
888
0
            {
889
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
890
0
                ml.get_key(LLM_KV_ATTENTION_CAUSAL,           hparams.causal_attn);
891
0
                ml.get_key(LLM_KV_POOLING_TYPE,               hparams.pooling_type, false);
892
893
0
                switch (hparams.n_layer) {
894
0
                    case 24:
895
0
                        type = LLM_TYPE_558M; break;
896
0
                    default: type = LLM_TYPE_UNKNOWN;
897
0
                }
898
0
            } break;
899
0
        case LLM_ARCH_NOMIC_BERT:
900
0
        case LLM_ARCH_NOMIC_BERT_MOE:
901
0
            {
902
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
903
0
                ml.get_key(LLM_KV_ATTENTION_CAUSAL,           hparams.causal_attn);
904
0
                ml.get_key(LLM_KV_POOLING_TYPE,               hparams.pooling_type);
905
0
                ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS,         hparams.moe_every_n_layers, 0);
906
907
0
                if (hparams.n_layer == 12 && hparams.n_embd == 768) {
908
0
                    if (arch == LLM_ARCH_NOMIC_BERT) {
909
0
                        type = LLM_TYPE_137M;
910
0
                    } else if (arch == LLM_ARCH_NOMIC_BERT_MOE && hparams.moe_every_n_layers == 2) {
911
0
                        type = LLM_TYPE_475M;
912
0
                    }
913
0
                }
914
0
            } break;
915
0
        case LLM_ARCH_NEO_BERT:
916
0
            {
917
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
918
0
                ml.get_key(LLM_KV_ATTENTION_CAUSAL,            hparams.causal_attn);
919
0
                ml.get_key(LLM_KV_POOLING_TYPE,                hparams.pooling_type);
920
921
0
                if (hparams.n_layer == 28) {
922
0
                    type = LLM_TYPE_250M;
923
0
                }
924
0
            } break;
925
0
        case LLM_ARCH_BLOOM:
926
0
            {
927
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
928
929
0
                switch (hparams.n_layer) {
930
0
                    case 24: type = LLM_TYPE_1B; break;
931
0
                    case 30:
932
0
                        switch (hparams.n_embd) {
933
0
                            case 2560: type = LLM_TYPE_3B; break;
934
0
                            case 4096: type = LLM_TYPE_7B; break;
935
0
                            default: type = LLM_TYPE_UNKNOWN;
936
0
                        } break;
937
0
                    default: type = LLM_TYPE_UNKNOWN;
938
0
                }
939
940
                // TODO: become GGUF KV parameter
941
0
                hparams.f_max_alibi_bias = 8.0f;
942
0
            } break;
943
0
        case LLM_ARCH_MPT:
944
0
            {
945
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,  hparams.f_norm_eps);
946
0
                ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV,      hparams.f_clamp_kqv, false);
947
0
                ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
948
949
0
                switch (hparams.n_layer) {
950
0
                    case 32: type = LLM_TYPE_7B; break;
951
0
                    case 48: type = LLM_TYPE_30B; break;
952
0
                    default: type = LLM_TYPE_UNKNOWN;
953
0
                }
954
0
            } break;
955
0
        case LLM_ARCH_STABLELM:
956
0
            {
957
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
958
959
0
                switch (hparams.n_layer) {
960
0
                    case 24: type = LLM_TYPE_1B; break;
961
0
                    case 32: type = LLM_TYPE_3B; break;
962
0
                    case 40: type = LLM_TYPE_12B; break;
963
0
                    default: type = LLM_TYPE_UNKNOWN;
964
0
               }
965
0
            } break;
966
0
        case LLM_ARCH_QWEN:
967
0
            {
968
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
969
970
0
                switch (hparams.n_layer) {
971
0
                    case 32: type = LLM_TYPE_7B; break;
972
0
                    case 40: type = LLM_TYPE_13B; break;
973
0
                    default: type = LLM_TYPE_UNKNOWN;
974
0
                }
975
0
            } break;
976
0
        case LLM_ARCH_QWEN2VL:
977
0
            {
978
0
                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
979
0
            }
980
            // fall through
981
0
        case LLM_ARCH_QWEN2:
982
0
            {
983
0
                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
984
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
985
0
                switch (hparams.n_layer) {
986
0
                    case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_5B : LLM_TYPE_1B; break;
987
0
                    case 28: type = hparams.n_embd == 1536 ? LLM_TYPE_1_5B : LLM_TYPE_7B; break;
988
0
                    case 32: type = LLM_TYPE_7B; break;
989
0
                    case 36: type = LLM_TYPE_3B; break;
990
0
                    case 40: type = hparams.n_head() == 20 ? LLM_TYPE_4B : LLM_TYPE_13B; break;
991
0
                    case 48: type = LLM_TYPE_14B; break;
992
0
                    case 64: type = LLM_TYPE_32B; break;
993
0
                    case 80: type = LLM_TYPE_70B; break;
994
0
                    default: type = LLM_TYPE_UNKNOWN;
995
0
                }
996
0
            } break;
997
0
        case LLM_ARCH_DREAM:
998
0
            {
999
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1000
                // Dream models are primarily 7B with 28 layers
1001
0
                switch (hparams.n_layer) {
1002
0
                    case 28:
1003
0
                        type = LLM_TYPE_7B;
1004
0
                        break;
1005
0
                    default:
1006
0
                        type = LLM_TYPE_UNKNOWN;
1007
0
                }
1008
                // Set non-causal attention for diffusion models
1009
0
                hparams.causal_attn = false;
1010
0
            }
1011
0
            break;
1012
0
        case LLM_ARCH_LLADA:
1013
0
            {
1014
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1015
                // LLaDA-8B has 32 layers, similar to LLaMA but for diffusion
1016
0
                switch (hparams.n_layer) {
1017
0
                    case 32:
1018
0
                        type = LLM_TYPE_8B;
1019
0
                        break;
1020
0
                    default:
1021
0
                        type = LLM_TYPE_UNKNOWN;
1022
0
                }
1023
                // Set non-causal attention for diffusion models
1024
0
                hparams.causal_attn = false;
1025
0
            }
1026
0
            break;
1027
0
        case LLM_ARCH_LLADA_MOE:
1028
0
            {
1029
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
1030
1031
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1032
                // diffusion language model uses non-causal attention
1033
0
                hparams.causal_attn = false;
1034
0
                switch (hparams.n_layer) {
1035
0
                    case 16: type = LLM_TYPE_A1_7B; break;
1036
0
                    default: type = LLM_TYPE_UNKNOWN;
1037
0
                }
1038
0
            } break;
1039
0
        case LLM_ARCH_QWEN2MOE:
1040
0
            {
1041
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp, false);
1042
0
                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
1043
1044
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1045
0
                switch (hparams.n_layer) {
1046
0
                    case 24: type = LLM_TYPE_A2_7B; break;
1047
0
                    case 28: type = LLM_TYPE_57B_A14B; break;
1048
0
                    default: type = LLM_TYPE_UNKNOWN;
1049
0
                }
1050
0
            } break;
1051
0
        case LLM_ARCH_QWEN3:
1052
0
            {
1053
0
                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
1054
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1055
0
                switch (hparams.n_layer) {
1056
0
                    case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
1057
0
                    case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
1058
0
                    case 40: type = LLM_TYPE_14B; break;
1059
0
                    case 64: type = LLM_TYPE_32B; break;
1060
0
                    default: type = LLM_TYPE_UNKNOWN;
1061
0
                }
1062
0
            } break;
1063
0
        case LLM_ARCH_QWEN3VL:
1064
0
            {
1065
0
                ml.get_key(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers, false);
1066
0
                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
1067
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1068
0
                switch (hparams.n_layer) {
1069
0
                    case 28: type = LLM_TYPE_1_7B; break;
1070
0
                    case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
1071
0
                    case 64: type = LLM_TYPE_32B; break;
1072
0
                    default: type = LLM_TYPE_UNKNOWN;
1073
0
                }
1074
0
            } break;
1075
0
        case LLM_ARCH_QWEN3MOE:
1076
0
            {
1077
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp, false);
1078
1079
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1080
0
                switch (hparams.n_layer) {
1081
0
                    case 48: type = LLM_TYPE_30B_A3B; break;
1082
0
                    case 94: type = LLM_TYPE_235B_A22B; break;
1083
0
                    default: type = LLM_TYPE_UNKNOWN;
1084
0
                }
1085
0
            } break;
1086
0
        case LLM_ARCH_QWEN3VLMOE:
1087
0
            {
1088
0
                ml.get_key(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers, false);
1089
0
                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
1090
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
1091
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1092
0
                switch (hparams.n_layer) {
1093
0
                    case 48: type = LLM_TYPE_30B_A3B; break;
1094
0
                    case 94: type = LLM_TYPE_235B_A22B; break;
1095
0
                    default: type = LLM_TYPE_UNKNOWN;
1096
0
                }
1097
0
            } break;
1098
0
        case LLM_ARCH_PHI2:
1099
0
            {
1100
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1101
1102
0
                switch (hparams.n_layer) {
1103
0
                    case 24: type = LLM_TYPE_1B; break;
1104
0
                    case 32: type = LLM_TYPE_3B; break;
1105
0
                    default: type = LLM_TYPE_UNKNOWN;
1106
0
                }
1107
0
            } break;
1108
0
        case LLM_ARCH_PHI3:
1109
0
            {
1110
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1111
1112
0
                switch (hparams.n_layer) {
1113
0
                    case 24: type = LLM_TYPE_1B; break;
1114
0
                    case 32: type = LLM_TYPE_3B; break;
1115
0
                    case 40: type = LLM_TYPE_14B; break;
1116
0
                    default: type = LLM_TYPE_UNKNOWN;
1117
0
                }
1118
1119
0
                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
1120
1121
0
                if (found_swa && hparams.n_swa > 0) {
1122
0
                    LLAMA_LOG_WARN("%s: Phi SWA is currently disabled - results might be suboptimal for some models (see %s)\n",
1123
0
                            __func__, "https://github.com/ggml-org/llama.cpp/pull/13676");
1124
1125
                    // TODO: fix conversion scripts to correctly populate `n_swa` and `n_swa_pattern`
1126
0
                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
1127
1128
0
                    hparams.n_swa         = 0;
1129
0
                    hparams.set_swa_pattern(1);
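                    // with SWA disabled above, a pattern of 1 leaves every layer on full attention (no SWA layers)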
1130
0
                }
1131
0
            } break;
1132
0
        case LLM_ARCH_PHIMOE:
1133
0
            {
1134
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1135
1136
0
                switch (hparams.n_layer) {
1137
0
                    case 32: type = LLM_TYPE_16x3_8B; break;
1138
0
                    default: type = LLM_TYPE_UNKNOWN;
1139
0
                }
1140
0
            } break;
1141
0
        case LLM_ARCH_PLAMO:
1142
0
            {
1143
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1144
1145
0
                switch (hparams.n_layer) {
1146
0
                    case 40: type = LLM_TYPE_13B; break;
1147
0
                    default: type = LLM_TYPE_UNKNOWN;
1148
0
                }
1149
0
            } break;
1150
0
        case LLM_ARCH_PLAMO2:
1151
0
            {
1152
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1153
1154
                // Load Mamba SSM parameters
1155
0
                ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
1156
0
                ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
1157
0
                ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
1158
0
                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
1159
0
                ml.get_key(LLM_KV_SSM_GROUP_COUNT,    hparams.ssm_n_group);
1160
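                // layers without KV heads (n_head_kv == 0) are the recurrent Mamba layers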
1161
0
                for (uint32_t i = 0; i < hparams.n_layer; ++i) {
1162
0
                    hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
1163
0
                }
1164
1165
0
                switch (hparams.n_layer) {
1166
0
                    case 16: type = LLM_TYPE_1B; break;
1167
0
                    case 32:
1168
0
                        if (hparams.n_embd == 2048) {
1169
0
                            type = LLM_TYPE_2B;
1170
0
                        } else if (hparams.n_embd == 4096) {
1171
0
                            type = LLM_TYPE_8B;
1172
0
                        }
1173
0
                        break;
1174
0
                    default: type = LLM_TYPE_UNKNOWN;
1175
0
                }
1176
1177
                // Load attention parameters
1178
0
                ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH,   hparams.n_embd_head_k, false);
1179
0
                ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
1180
0
            } break;
1181
0
        case LLM_ARCH_GPT2:
1182
0
            {
1183
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1184
0
                switch (hparams.n_layer) {
1185
0
                    case 12: type = LLM_TYPE_SMALL; break;
1186
0
                    case 24: type = LLM_TYPE_MEDIUM; break;
1187
0
                    case 36: type = LLM_TYPE_LARGE; break;
1188
0
                    case 48: type = LLM_TYPE_XL; break;
1189
0
                    default: type = LLM_TYPE_UNKNOWN;
1190
0
                }
1191
0
            } break;
1192
0
        case LLM_ARCH_CODESHELL:
1193
0
            {
1194
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1195
0
                switch (hparams.n_layer) {
1196
0
                    case 42: type = LLM_TYPE_7B; break;
1197
0
                    default: type = LLM_TYPE_UNKNOWN;
1198
0
                }
1199
0
            } break;
1200
0
        case LLM_ARCH_ORION:
1201
0
            {
1202
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1203
1204
0
                switch (hparams.n_layer) {
1205
0
                    case 40: type = LLM_TYPE_14B; break;
1206
0
                    default: type = LLM_TYPE_UNKNOWN;
1207
0
                }
1208
0
            } break;
1209
0
        case LLM_ARCH_INTERNLM2:
1210
0
            {
1211
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1212
0
                switch (hparams.n_layer) {
1213
0
                    case 32: type = LLM_TYPE_7B; break;
1214
0
                    case 48: type = LLM_TYPE_20B; break;
1215
0
                    default: type = LLM_TYPE_UNKNOWN;
1216
0
                }
1217
0
            } break;
1218
0
        case LLM_ARCH_GEMMA:
1219
0
            {
1220
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1221
1222
0
                switch (hparams.n_layer) {
1223
0
                    case 18: type = LLM_TYPE_2B; break;
1224
0
                    case 28: type = LLM_TYPE_7B; break;
1225
0
                    default: type = LLM_TYPE_UNKNOWN;
1226
0
                }
1227
0
            } break;
1228
0
        case LLM_ARCH_GEMMA2:
1229
0
            {
1230
0
                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1231
0
                hparams.n_swa = 4096; // default value of gemma 2
1232
0
                hparams.set_swa_pattern(2);
1233
0
                hparams.attn_soft_cap = true;
1234
1235
0
                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa, false);
1236
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1237
0
                ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING,      hparams.f_attn_logit_softcapping, false);
1238
0
                ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING,     hparams.f_final_logit_softcapping, false);
1239
1240
0
                switch (hparams.n_layer) {
1241
0
                    case 26: type = LLM_TYPE_2B; break;
1242
0
                    case 42: type = LLM_TYPE_9B; break;
1243
0
                    case 46: type = LLM_TYPE_27B; break;
1244
0
                    default: type = LLM_TYPE_UNKNOWN;
1245
0
                }
1246
1247
                // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L173
1248
0
                hparams.f_attention_scale = type == LLM_TYPE_27B
1249
0
                    ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
1250
0
                    : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
1251
0
            } break;
1252
0
        case LLM_ARCH_GEMMA3:
1253
0
            {
1254
0
                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1255
0
                hparams.set_swa_pattern(6);
1256
1257
0
                hparams.rope_freq_base_train_swa  = 10000.0f;
1258
0
                hparams.rope_freq_scale_train_swa = 1.0f;
1259
1260
0
                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa);
1261
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1262
1263
0
                switch (hparams.n_layer) {
1264
0
                    case 18: type = LLM_TYPE_270M; break;
1265
0
                    case 26: type = LLM_TYPE_1B; break;
1266
0
                    case 34: type = LLM_TYPE_4B; break;
1267
0
                    case 48: type = LLM_TYPE_12B; break;
1268
0
                    case 62: type = LLM_TYPE_27B; break;
1269
0
                    default: type = LLM_TYPE_UNKNOWN;
1270
0
                }
1271
1272
                // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L289
1273
0
                hparams.f_attention_scale = type == LLM_TYPE_27B
1274
0
                    ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
1275
0
                    : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
1276
0
            } break;
1277
0
        case LLM_ARCH_GEMMA3N:
1278
0
            {
1279
0
                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1280
0
                hparams.set_swa_pattern(5);
1281
1282
0
                hparams.n_layer_kv_from_start     = 20;
1283
0
                hparams.rope_freq_base_train_swa  = 10000.0f;
1284
0
                hparams.rope_freq_scale_train_swa = 1.0f;
1285
0
                hparams.f_attention_scale         = 1.0f;
1286
1287
0
                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa);
1288
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1289
1290
0
                switch (hparams.n_layer) {
1291
0
                    case 30: type = LLM_TYPE_E2B; break;
1292
0
                    case 35: type = LLM_TYPE_E4B; break;
1293
0
                    default: type = LLM_TYPE_UNKNOWN;
1294
0
                }
1295
0
            } break;
1296
0
        case LLM_ARCH_GEMMA_EMBEDDING:
1297
0
            {
1298
0
                hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
1299
0
                hparams.set_swa_pattern(6);
1300
1301
0
                hparams.causal_attn = false; // embeddings do not use causal attention
1302
0
                hparams.rope_freq_base_train_swa = 10000.0f;
1303
0
                hparams.rope_freq_scale_train_swa = 1.0f;
1304
1305
0
                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
1306
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1307
0
                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
1308
1309
                // applied only if the model was converted with --sentence-transformers-dense-modules
1310
0
                ml.get_key(LLM_KV_DENSE_2_FEAT_IN, hparams.dense_2_feat_in, false);
1311
0
                ml.get_key(LLM_KV_DENSE_2_FEAT_OUT, hparams.dense_2_feat_out, false);
1312
0
                ml.get_key(LLM_KV_DENSE_3_FEAT_IN, hparams.dense_3_feat_in, false);
1313
0
                ml.get_key(LLM_KV_DENSE_3_FEAT_OUT, hparams.dense_3_feat_out, false);
1314
1315
0
                GGML_ASSERT((hparams.dense_2_feat_in == 0 || hparams.dense_2_feat_in == hparams.n_embd) && "dense_2_feat_in must be unset or equal to n_embd");
1316
0
                GGML_ASSERT((hparams.dense_3_feat_out == 0 || hparams.dense_3_feat_out == hparams.n_embd) && "dense_3_feat_out must be unset or equal to n_embd");
1317
1318
0
                switch (hparams.n_layer) {
1319
0
                    case 24: type = LLM_TYPE_0_3B; break;
1320
0
                    default: type = LLM_TYPE_UNKNOWN;
1321
0
                }
1322
0
                hparams.f_attention_scale = 1.0f / std::sqrt(float(hparams.n_embd_head_k));
1323
1324
0
            } break;
1325
0
        case LLM_ARCH_STARCODER2:
1326
0
            {
1327
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1328
0
                switch (hparams.n_layer) {
1329
0
                    case 30: type = LLM_TYPE_3B; break;
1330
0
                    case 32: type = LLM_TYPE_7B; break;
1331
0
                    case 40: type = LLM_TYPE_15B; break;
1332
0
                    case 52: type = LLM_TYPE_20B; break; // granite
1333
0
                    case 88: type = LLM_TYPE_34B; break; // granite
1334
0
                    default: type = LLM_TYPE_UNKNOWN;
1335
0
                }
1336
0
            } break;
1337
0
        case LLM_ARCH_MAMBA:
1338
0
            {
1339
0
                ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
1340
0
                ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
1341
0
                ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
1342
0
                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
1343
0
                ml.get_key(LLM_KV_SSM_DT_B_C_RMS,     hparams.ssm_dt_b_c_rms, false);
1344
1345
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1346
1347
0
                switch (hparams.n_layer) {
1348
0
                    case 24:
1349
0
                        switch (hparams.n_embd) {
1350
0
                            case 768: type = LLM_TYPE_SMALL; break;
1351
0
                            default: type = LLM_TYPE_UNKNOWN;
1352
0
                        } break;
1353
0
                    case 48:
1354
0
                        switch (hparams.n_embd) {
1355
0
                            case 1024: type = LLM_TYPE_MEDIUM; break;
1356
0
                            case 1536: type = LLM_TYPE_LARGE; break;
1357
0
                            case 2048: type = LLM_TYPE_XL; break;
1358
0
                            default:   type = LLM_TYPE_UNKNOWN;
1359
0
                        } break;
1360
0
                    case 64:
1361
0
                        switch (hparams.n_embd) {
1362
0
                            case 2560: type = LLM_TYPE_3B; break;
1363
0
                            default: type = LLM_TYPE_UNKNOWN;
1364
0
                        } break;
1365
0
                    default: type = LLM_TYPE_UNKNOWN;
1366
0
                }
1367
0
            } break;
1368
0
        case LLM_ARCH_MAMBA2:
1369
0
            {
1370
0
                ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
1371
0
                ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
1372
0
                ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
1373
0
                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
1374
0
                ml.get_key(LLM_KV_SSM_GROUP_COUNT,    hparams.ssm_n_group);
1375
1376
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1377
1378
0
                switch (hparams.n_layer) {
1379
0
                    case 24:
1380
0
                        switch (hparams.n_embd) {
1381
0
                            case 768: type = LLM_TYPE_SMALL; break;
1382
0
                            default: type = LLM_TYPE_UNKNOWN;
1383
0
                        } break;
1384
0
                    case 48:
1385
0
                        switch (hparams.n_embd) {
1386
0
                            case 1024: type = LLM_TYPE_MEDIUM; break;
1387
0
                            case 1536: type = LLM_TYPE_LARGE; break;
1388
0
                            case 2048: type = LLM_TYPE_XL; break;
1389
0
                            default: type = LLM_TYPE_UNKNOWN;
1390
0
                        } break;
1391
0
                    case 64:
1392
0
                        switch (hparams.n_embd) {
1393
0
                            case 2560: type = LLM_TYPE_3B; break;
1394
0
                            case 4096: type = LLM_TYPE_7B; break;
1395
0
                            default: type = LLM_TYPE_UNKNOWN;
1396
0
                        } break;
1397
0
                    default: type = LLM_TYPE_UNKNOWN;
1398
0
                }
1399
0
            } break;
1400
0
        case LLM_ARCH_JAMBA:
1401
0
            {
1402
0
                ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
1403
0
                ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
1404
0
                ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
1405
0
                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
1406
1407
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1408
1409
0
                for (uint32_t i = 0; i < hparams.n_layer; ++i) {
1410
0
                    hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
1411
0
                }
1412
1413
0
                switch (hparams.n_layer) {
1414
                    // TODO: Jamba layers are a bit heterogeneous, so naming this is hard.
1415
0
                    case 12: // 900M  8x???M
1416
0
                    case 32: // 51B  16x?B
1417
0
                    default: type = LLM_TYPE_UNKNOWN;
1418
0
                }
1419
0
            } break;
1420
0
        case LLM_ARCH_XVERSE:
1421
0
            {
1422
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1423
0
                switch (hparams.n_layer) {
1424
0
                    case 32: type = LLM_TYPE_7B; break;
1425
0
                    case 40: type = LLM_TYPE_13B; break;
1426
0
                    case 80: type = LLM_TYPE_65B; break;
1427
0
                    default: type = LLM_TYPE_UNKNOWN;
1428
0
                }
1429
0
            } break;
1430
0
        case LLM_ARCH_COMMAND_R:
1431
0
            {
1432
0
                ml.get_key(LLM_KV_LOGIT_SCALE,             hparams.f_logit_scale);
1433
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1434
0
                switch (hparams.n_layer) {
1435
0
                    case 40: type = LLM_TYPE_35B; break;
1436
0
                    default: type = LLM_TYPE_UNKNOWN;
1437
0
                }
1438
0
            } break;
1439
0
        case LLM_ARCH_COHERE2:
1440
0
            {
1441
0
                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1442
0
                hparams.set_swa_pattern(4);
1443
1444
0
                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
1445
0
                ml.get_key(LLM_KV_LOGIT_SCALE,              hparams.f_logit_scale);
1446
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,  hparams.f_norm_eps);
1447
0
                switch (hparams.n_layer) {
1448
0
                    case 32: type = LLM_TYPE_8B; break;
1449
0
                    default: type = LLM_TYPE_UNKNOWN;
1450
0
                }
1451
0
            } break;
1452
0
        case LLM_ARCH_DBRX:
1453
0
        {
1454
0
            ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1455
0
            ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV,     hparams.f_clamp_kqv);
1456
1457
0
            switch (hparams.n_layer) {
1458
0
                case 40: type = LLM_TYPE_16x12B; break;
1459
0
                default: type = LLM_TYPE_UNKNOWN;
1460
0
            }
1461
0
        } break;
1462
0
        case LLM_ARCH_OLMO:
1463
0
            {
1464
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1465
0
                ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV,     hparams.f_clamp_kqv, false);
1466
1467
0
                switch (hparams.n_layer) {
1468
0
                    case 22: type = LLM_TYPE_1B; break;
1469
0
                    case 32: type = LLM_TYPE_7B; break;
1470
0
                    case 80: type = LLM_TYPE_70B; break;
1471
0
                    default: type = LLM_TYPE_UNKNOWN;
1472
0
                }
1473
0
            } break;
1474
0
        case LLM_ARCH_OLMO2:
1475
0
            {
1476
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1477
1478
0
                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
1479
0
                if (found_swa && hparams.n_swa > 0) {
1480
0
                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1481
0
                    hparams.set_swa_pattern(4);
1482
0
                } else {
1483
0
                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
1484
0
                }
1485
1486
0
                switch (hparams.n_layer) {
1487
0
                    case 16: type = LLM_TYPE_1B; break;
1488
0
                    case 32: type = LLM_TYPE_7B; break;
1489
0
                    case 40: type = LLM_TYPE_13B; break;
1490
0
                    case 64: type = LLM_TYPE_32B; break;
1491
0
                    default: type = LLM_TYPE_UNKNOWN;
1492
0
                }
1493
0
            } break;
1494
0
        case LLM_ARCH_SEED_OSS:
1495
0
            {
1496
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1497
0
                switch (hparams.n_layer) {
1498
0
                    case 64: type = LLM_TYPE_36B; break;
1499
0
                    default: type = LLM_TYPE_UNKNOWN;
1500
0
                }
1501
0
            } break;
1502
0
        case LLM_ARCH_OLMOE:
1503
0
            {
1504
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1505
0
                switch (hparams.n_layer) {
1506
0
                    case 16: type = LLM_TYPE_A1_7B; break;
1507
0
                    default: type = LLM_TYPE_UNKNOWN;
1508
0
                }
1509
0
            } break;
1510
0
        case LLM_ARCH_OPENELM:
1511
0
            {
1512
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1513
1514
0
                switch (hparams.n_layer) {
1515
0
                case 16: type = LLM_TYPE_270M; break;
1516
0
                case 20: type = LLM_TYPE_450M; break;
1517
0
                case 28: type = LLM_TYPE_1B; break;
1518
0
                case 36: type = LLM_TYPE_3B; break;
1519
0
                default: type = LLM_TYPE_UNKNOWN;
1520
0
                }
1521
0
            } break;
1522
0
        case LLM_ARCH_GPTNEOX:
1523
0
            {
1524
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1525
0
                ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL,   hparams.use_par_res);
1526
0
                switch (hparams.n_layer) {
1527
0
                    case 6:
1528
0
                        switch (hparams.n_ff()) {
1529
0
                            case 512:  type = LLM_TYPE_14M; break;
1530
0
                            case 2048: type = LLM_TYPE_70M; break;
1531
0
                            default:   type = LLM_TYPE_UNKNOWN;
1532
0
                        } break;
1533
0
                    case 12:
1534
0
                        switch (hparams.n_ff()) {
1535
0
                            case 3072: type = LLM_TYPE_160M; break;
1536
0
                            default: type = LLM_TYPE_UNKNOWN;
1537
0
                        } break;
1538
0
                    case 16:
1539
0
                        switch (hparams.n_ff()) {
1540
0
                            case 8192: type = LLM_TYPE_1B; break;
1541
0
                            default: type = LLM_TYPE_UNKNOWN;
1542
0
                        } break;
1543
0
                    case 24:
1544
0
                        switch (hparams.n_ff()) {
1545
0
                            case 4096: type = LLM_TYPE_410M; break;
1546
0
                            case 8192: type = LLM_TYPE_1_4B; break;
1547
0
                            default: type = LLM_TYPE_UNKNOWN;
1548
0
                        } break;
1549
0
                    case 32:
1550
0
                        switch (hparams.n_ff()) {
1551
0
                            case 10240: type = LLM_TYPE_2_8B; break;
1552
0
                            case 16384: type = LLM_TYPE_6_9B; break;
1553
0
                            default: type = LLM_TYPE_UNKNOWN;
1554
0
                        } break;
1555
0
                    case 36:
1556
0
                        switch (hparams.n_ff()) {
1557
0
                            case 20480: type = LLM_TYPE_12B; break;
1558
0
                            default: type = LLM_TYPE_UNKNOWN;
1559
0
                        } break;
1560
0
                    case 44:
1561
0
                        switch (hparams.n_ff()) {
1562
0
                            case 24576: type = LLM_TYPE_20B; break;
1563
0
                            default: type = LLM_TYPE_UNKNOWN;
1564
0
                        } break;
1565
0
                    default: type = LLM_TYPE_UNKNOWN;
1566
0
                }
1567
0
            } break;
1568
0
        case LLM_ARCH_ARCTIC:
1569
0
            {
1570
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1571
1572
0
                if (hparams.n_expert == 128) {
1573
0
                    switch (hparams.n_layer) {
1574
0
                        case 35: type = LLM_TYPE_10B_128x3_66B; break;
1575
0
                        default: type = LLM_TYPE_UNKNOWN;
1576
0
                    }
1577
0
                } else {
1578
0
                    type = LLM_TYPE_UNKNOWN;
1579
0
                }
1580
0
            } break;
1581
0
        case LLM_ARCH_DEEPSEEK:
1582
0
            {
1583
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1584
0
                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead);
1585
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
1586
0
                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,         hparams.n_expert_shared);
1587
0
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale);
1588
1589
0
                switch (hparams.n_layer) {
1590
0
                    case 28: type = LLM_TYPE_20B; break;
1591
0
                    default: type = LLM_TYPE_UNKNOWN;
1592
0
                }
1593
0
            } break;
1594
0
        case LLM_ARCH_DEEPSEEK2:
1595
0
            {
1596
                // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
1597
0
                bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26);
1598
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1599
0
                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead);
1600
0
                if (!is_lite) {
1601
0
                    ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
1602
0
                }
1603
0
                ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK,     hparams.n_lora_kv);
1604
0
                ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA,   hparams.n_embd_head_k_mla, false);
1605
0
                ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false);
1606
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
1607
0
                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,        hparams.n_expert_shared);
1608
0
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,       hparams.expert_weights_scale);
1609
0
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,        hparams.expert_weights_norm, false);
1610
0
                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,         hparams.expert_gating_func, false);
1611
0
                if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
1612
                    // for compatibility with existing DeepSeek V2 and V2.5 GGUFs
1613
                    // that have no expert_gating_func model parameter set
1614
0
                    hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
1615
0
                }
1616
0
                ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);
1617
1618
0
                switch (hparams.n_layer) {
1619
0
                    case 27: type = LLM_TYPE_16B; break;
1620
0
                    case 60: type = LLM_TYPE_236B; break;
1621
0
                    case 61: type = LLM_TYPE_671B; break;
1622
0
                    default: type = LLM_TYPE_UNKNOWN;
1623
0
                }
1624
0
            } break;
1625
0
        case LLM_ARCH_PLM:
1626
0
            {
1627
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1628
0
                ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
1629
0
                switch (hparams.n_layer) {
1630
0
                    case 32: type = LLM_TYPE_1_8B; break;
1631
0
                    default: type = LLM_TYPE_UNKNOWN;
1632
0
                }
1633
0
            } break;
1634
0
        case LLM_ARCH_CHATGLM:
1635
0
            {
1636
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1637
0
                switch (hparams.n_layer) {
1638
0
                    case 28: {
1639
0
                        if (hparams.n_head(0) == 16) {
1640
0
                            type = LLM_TYPE_1_5B;
1641
0
                        } else {
1642
0
                            type = LLM_TYPE_6B;
1643
0
                        }
1644
0
                    } break;
1645
0
                    case 40: {
1646
0
                        if (hparams.n_head(0) == 24) {
1647
0
                            type = LLM_TYPE_4B;
1648
0
                        } else {
1649
0
                            type = LLM_TYPE_9B;
1650
0
                        }
1651
0
                    } break;
1652
0
                    default: type = LLM_TYPE_UNKNOWN;
1653
0
                }
1654
0
            } break;
1655
0
        case LLM_ARCH_GLM4:
1656
0
            {
1657
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1658
0
                switch (hparams.n_layer) {
1659
0
                    case 40: type = LLM_TYPE_9B; break;
1660
0
                    case 61: type = LLM_TYPE_32B; break;
1661
0
                    default: type = LLM_TYPE_UNKNOWN;
1662
0
                }
1663
0
            } break;
1664
0
        case LLM_ARCH_GLM4_MOE:
1665
0
            {
1666
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
1667
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1668
1669
                // MoE parameters
1670
0
                ml.get_key(LLM_KV_EXPERT_COUNT,                hparams.n_expert);
1671
0
                ml.get_key(LLM_KV_EXPERT_USED_COUNT,           hparams.n_expert_used);
1672
0
                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,         hparams.n_expert_shared);
1673
0
                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead, false);
1674
0
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale);
1675
0
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,         hparams.expert_weights_norm, false);
1676
1677
                // Expert gating function (GLM-4.5 uses sigmoid)
1678
0
                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,          hparams.expert_gating_func, false);
1679
0
                if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
1680
0
                    hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
1681
0
                }
1682
1683
                // NextN/MTP parameters
1684
0
                ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS,        hparams.nextn_predict_layers, false);
1685
1686
                // TODO: when MTP is implemented, this should probably be updated if needed
1687
0
                hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
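                // i.e. the trailing NextN/MTP layers do not get their own KV cache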
1688
1689
0
                switch (hparams.n_layer) {
1690
0
                    case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer)
1691
0
                    case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer)
1692
0
                    default: type = LLM_TYPE_UNKNOWN;
1693
0
                }
1694
0
            } break;
1695
0
        case LLM_ARCH_BITNET:
1696
0
            {
1697
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1698
1699
0
                switch (hparams.n_layer) {
1700
0
                    case 26: type = LLM_TYPE_3B; break;
1701
0
                    default: type = LLM_TYPE_UNKNOWN;
1702
0
                }
1703
0
            } break;
1704
0
        case LLM_ARCH_T5:
1705
0
            {
1706
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,      hparams.f_norm_rms_eps);
1707
0
                ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
1708
1709
0
                uint32_t dec_start_token_id;
1710
0
                if (ml.get_key(LLM_KV_DECODER_START_TOKEN_ID, dec_start_token_id, false)) {
1711
0
                    hparams.dec_start_token_id = dec_start_token_id;
1712
0
                }
1713
1714
0
                hparams.dec_n_layer = hparams.n_layer;
1715
0
                ml.get_key(LLM_KV_DECODER_BLOCK_COUNT, hparams.dec_n_layer, false);
1716
1717
0
                switch (hparams.n_layer) {
1718
0
                    case 6:  type = LLM_TYPE_60M;  break; // t5-small
1719
0
                    case 8:  type = LLM_TYPE_80M;  break; // flan-t5-small
1720
0
                    case 12:
1721
0
                        switch (hparams.n_ff()) {
1722
0
                            case 3072: type = LLM_TYPE_220M; break; // t5-base
1723
0
                            case 2048: type = LLM_TYPE_250M; break; // flan-t5-base
1724
0
                            default: type = LLM_TYPE_UNKNOWN;
1725
0
                        } break;
1726
0
                    case 24:
1727
0
                        switch (hparams.n_ff()) {
1728
0
                            case 4096:  type = LLM_TYPE_770M; break; // t5-large
1729
0
                            case 2816:  type = LLM_TYPE_780M; break; // flan-t5-large
1730
0
                            case 16384: type = LLM_TYPE_3B;   break; // t5-3b
1731
0
                            case 5120:  type = LLM_TYPE_3B;   break; // flan-t5-xl
1732
0
                            case 65536: type = LLM_TYPE_11B;  break; // t5-11b
1733
0
                            case 10240: type = LLM_TYPE_11B;  break; // flan-t5-xxl
1734
0
                            default: type = LLM_TYPE_UNKNOWN;
1735
0
                        } break;
1736
0
                    default: type = LLM_TYPE_UNKNOWN;
1737
0
                }
1738
0
            } break;
1739
0
        case LLM_ARCH_T5ENCODER:
1740
0
            {
1741
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1742
0
                ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
1743
0
                type = LLM_TYPE_UNKNOWN;
1744
0
            } break;
1745
0
        case LLM_ARCH_JAIS:
1746
0
            {
1747
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1748
0
                ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
1749
1750
0
                switch (hparams.n_layer) {
1751
0
                    case 24: type = LLM_TYPE_1_3B; break;
1752
0
                    case 40: type = LLM_TYPE_13B; break;
1753
                    /* TODO: add variants */
1754
0
                    default: type = LLM_TYPE_UNKNOWN;
1755
0
                }
1756
0
            } break;
1757
0
        case LLM_ARCH_NEMOTRON:
1758
0
            {
1759
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1760
0
                switch (hparams.n_layer) {
1761
0
                    case 32: type = LLM_TYPE_4B; break;
1762
0
                    default: type = LLM_TYPE_UNKNOWN;
1763
0
                }
1764
0
            } break;
1765
0
        case LLM_ARCH_NEMOTRON_H:
1766
0
            {
1767
0
                ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
1768
0
                ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
1769
0
                ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
1770
0
                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
1771
0
                ml.get_key(LLM_KV_SSM_GROUP_COUNT,    hparams.ssm_n_group);
1772
1773
                // A layer is recurrent IFF the n_head_kv value is set to 0 and
1774
                // the n_ff value is set to 0
1775
0
                for (uint32_t i = 0; i < hparams.n_layer; ++i) {
1776
0
                    hparams.recurrent_layer_arr[i] = (hparams.n_head_kv(i) == 0 && hparams.n_ff(i) == 0);
1777
0
                }
1778
1779
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1780
1781
0
                switch (hparams.n_layer) {
1782
0
                    case 56: type = LLM_TYPE_9B; break;
1783
0
                    default: type = LLM_TYPE_UNKNOWN;
1784
0
                }
1785
0
            } break;
1786
0
        case LLM_ARCH_EXAONE:
1787
0
            {
1788
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1789
1790
0
                switch (hparams.n_layer) {
1791
0
                    case 32: type = LLM_TYPE_8B; break;
1792
0
                    default: type = LLM_TYPE_UNKNOWN;
1793
0
                }
1794
0
            } break;
1795
0
        case LLM_ARCH_EXAONE4:
1796
0
            {
1797
0
                if (hparams.n_layer == 64) {    // 32B
1798
0
                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1799
0
                    hparams.n_swa = 4096;
1800
0
                    hparams.set_swa_pattern(4);
1801
0
                }
1802
1803
0
                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa, false);
1804
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1805
1806
0
                switch (hparams.n_layer) {
1807
0
                    case 30: type = LLM_TYPE_1_2B; break;
1808
0
                    case 64: type = LLM_TYPE_32B; break;
1809
0
                    default: type = LLM_TYPE_UNKNOWN;
1810
0
                }
1811
0
            } break;
1812
0
        case LLM_ARCH_RWKV6:
1813
0
        case LLM_ARCH_RWKV6QWEN2:
1814
0
            {
1815
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,     hparams.f_norm_eps, false);
1816
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false);
1817
0
                ml.get_key(LLM_KV_WKV_HEAD_SIZE,               hparams.wkv_head_size);
1818
0
                ml.get_key(LLM_KV_TIME_MIX_EXTRA_DIM,          hparams.time_mix_extra_dim);
1819
0
                ml.get_key(LLM_KV_TIME_DECAY_EXTRA_DIM,        hparams.time_decay_extra_dim);
1820
0
                ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS,      hparams.rescale_every_n_layers, false);
1821
0
                ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT,           hparams.token_shift_count, false);
1822
1823
0
                switch (hparams.n_layer) {
1824
0
                    case 24: type = LLM_TYPE_1_6B; break;
1825
0
                    case 32:
1826
0
                        switch (hparams.n_embd) {
1827
0
                            case 2560: type = LLM_TYPE_3B; break;
1828
0
                            case 4096: type = LLM_TYPE_7B; break;
1829
0
                            default: type = LLM_TYPE_UNKNOWN;
1830
0
                        } break;
1831
0
                    case 61: type = LLM_TYPE_14B; break;
1832
0
                    case 64: type = LLM_TYPE_32B; break;
1833
0
                    default: type = LLM_TYPE_UNKNOWN;
1834
0
                }
1835
0
            } break;
1836
0
        case LLM_ARCH_RWKV7:
1837
0
        case LLM_ARCH_ARWKV7:
1838
0
            {
1839
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,                hparams.f_norm_eps, false);
1840
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,            hparams.f_norm_rms_eps, false);
1841
0
                ml.get_key(LLM_KV_WKV_HEAD_SIZE,                          hparams.wkv_head_size);
1842
0
                ml.get_key(LLM_KV_ATTENTION_DECAY_LORA_RANK,              hparams.n_lora_decay);
1843
0
                ml.get_key(LLM_KV_ATTENTION_ICLR_LORA_RANK,               hparams.n_lora_iclr);
1844
0
                ml.get_key(LLM_KV_ATTENTION_VALUE_RESIDUAL_MIX_LORA_RANK, hparams.n_lora_value_res_mix);
1845
0
                ml.get_key(LLM_KV_ATTENTION_GATE_LORA_RANK,               hparams.n_lora_gate, false);
1846
0
                ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT,                      hparams.token_shift_count, false);
1847
1848
0
                switch (hparams.n_layer) {
1849
0
                    case 12:
1850
0
                        switch (hparams.n_embd) {
1851
0
                            case 768: type = LLM_TYPE_190M; break;
1852
0
                            default: type = LLM_TYPE_UNKNOWN;
1853
0
                        } break;
1854
0
                    case 24:
1855
0
                        switch (hparams.n_embd) {
1856
0
                            case 1024: type = LLM_TYPE_450M; break;
1857
0
                            case 2048: type = LLM_TYPE_1_5B; break;
1858
0
                            default: type = LLM_TYPE_UNKNOWN;
1859
0
                        } break;
1860
0
                    case 28:
1861
0
                        switch (hparams.n_embd) {
1862
0
                            case 1536: type = LLM_TYPE_1_5B; break;
1863
0
                            case 3584: type = LLM_TYPE_7B; break;
1864
0
                            default: type = LLM_TYPE_UNKNOWN;
1865
0
                        } break;
1866
0
                    case 32:
1867
0
                        switch (hparams.n_embd) {
1868
0
                            case 2560: type = LLM_TYPE_2_9B; break;
1869
0
                            case 4096: type = LLM_TYPE_7B; break;
1870
0
                            default: type = LLM_TYPE_UNKNOWN;
1871
0
                        } break;
1872
0
                    case 61:
1873
0
                        switch (hparams.n_embd) {
1874
0
                            case 4096: type = LLM_TYPE_14B; break;
1875
0
                            default: type = LLM_TYPE_UNKNOWN;
1876
0
                        } break;
1877
0
                    default: type = LLM_TYPE_UNKNOWN;
1878
0
                }
1879
0
            } break;
1880
0
        case LLM_ARCH_GRANITE:
1881
0
        case LLM_ARCH_GRANITE_MOE:
1882
0
            {
1883
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1884
0
                ml.get_key(LLM_KV_LOGIT_SCALE,                 hparams.f_logit_scale);
1885
0
                ml.get_key(LLM_KV_RESIDUAL_SCALE,              hparams.f_residual_scale);
1886
0
                ml.get_key(LLM_KV_EMBEDDING_SCALE,             hparams.f_embedding_scale);
1887
0
                ml.get_key(LLM_KV_ATTENTION_SCALE,             hparams.f_attention_scale);
1888
1889
                // Granite uses rope_finetuned as a switch for rope, so default to true
1890
0
                bool rope_finetuned = true;
1891
0
                ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
1892
0
                hparams.rope_finetuned = rope_finetuned;
1893
1894
0
                switch (hparams.n_layer) {
1895
0
                    case 32: type = LLM_TYPE_3B; break;
1896
0
                    case 40: type = LLM_TYPE_3B; break;
1897
                    // Add additional layer/vocab/etc checks here for other model sizes
1898
0
                    default: type = LLM_TYPE_UNKNOWN;
1899
0
                }
1900
1901
                // For Granite MoE Shared
1902
0
                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
1903
0
            } break;
1904
0
        case LLM_ARCH_GRANITE_HYBRID:
1905
0
            {
1906
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1907
0
                ml.get_key(LLM_KV_LOGIT_SCALE,                 hparams.f_logit_scale, /* required */ false);
1908
0
                ml.get_key(LLM_KV_RESIDUAL_SCALE,              hparams.f_residual_scale, /* required */ false);
1909
0
                ml.get_key(LLM_KV_EMBEDDING_SCALE,             hparams.f_embedding_scale, /* required */ false);
1910
0
                ml.get_key(LLM_KV_ATTENTION_SCALE,             hparams.f_attention_scale, /* required */ false);
1911
1912
0
                ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
1913
0
                ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
1914
0
                ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
1915
0
                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
1916
0
                ml.get_key(LLM_KV_SSM_GROUP_COUNT,    hparams.ssm_n_group);
1917
1918
                // Granite uses rope_finetuned as a switch for rope, so default to true
1919
0
                bool rope_finetuned = true;
1920
0
                ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
1921
0
                hparams.rope_finetuned = rope_finetuned;
1922
1923
                // A layer is recurrent IFF the n_head_kv value is set to 0
1924
0
                for (uint32_t i = 0; i < hparams.n_layer; ++i) {
1925
0
                    hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
1926
0
                }
1927
1928
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1929
1930
0
                switch (hparams.n_embd) {
1931
0
                    case 768: type = LLM_TYPE_350M; break;
1932
0
                    case 1536: type = (hparams.n_embd == 2048 ? LLM_TYPE_7B_A1B : LLM_TYPE_1B); break;
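                    // note: within case 1536 the n_embd == 2048 check is always false, so this always selects LLM_TYPE_1B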
1933
0
                    case 2048: case 2560: type = LLM_TYPE_3B; break;
1934
0
                    case 4096: type = LLM_TYPE_32B; break;
1935
0
                    default: type = LLM_TYPE_UNKNOWN;
1936
0
                }
1937
1938
                // For Granite MoE Shared
1939
0
                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
1940
0
            } break;
1941
0
        case LLM_ARCH_CHAMELEON:
1942
0
            {
1943
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1944
0
                hparams.f_norm_eps = 1e-5;  // eps for qk-norm, torch default
1945
0
                ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm);
1946
1947
0
                switch (hparams.n_layer) {
1948
0
                    case 32: type = LLM_TYPE_7B; break;
1949
0
                    case 48: type = LLM_TYPE_34B; break;
1950
0
                    default: type = LLM_TYPE_UNKNOWN;
1951
0
                }
1952
0
            } break;
1953
0
        case LLM_ARCH_WAVTOKENIZER_DEC:
1954
0
            {
1955
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
1956
0
                ml.get_key(LLM_KV_ATTENTION_GROUPNORM_EPS,    hparams.f_norm_group_eps);
1957
0
                ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
1958
0
                ml.get_key(LLM_KV_ATTENTION_CAUSAL,           hparams.causal_attn);
1959
0
            } break;
1960
0
        case LLM_ARCH_BAILINGMOE:
1961
0
            {
1962
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1963
0
                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead);
1964
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
1965
0
                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,         hparams.n_expert_shared);
1966
0
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale);
1967
0
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,         hparams.expert_weights_norm, false);
1968
1969
0
                switch (hparams.n_layer) {
1970
0
                    case 28: type = LLM_TYPE_16B; break;
1971
0
                    case 88: type = LLM_TYPE_290B; break;
1972
0
                    default: type = LLM_TYPE_UNKNOWN;
1973
0
                }
1974
0
            } break;
1975
0
        case LLM_ARCH_BAILINGMOE2:
1976
0
            {
1977
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,       hparams.f_norm_rms_eps);
1978
0
                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,         hparams.n_layer_dense_lead);
1979
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp);
1980
0
                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);
1981
0
                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,               hparams.n_expert_shared);
1982
0
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,              hparams.expert_weights_scale);
1983
0
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,               hparams.expert_weights_norm, false);
1984
0
                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,                hparams.expert_gating_func);
1985
0
                ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS,              hparams.nextn_predict_layers, false);
1986
1987
                // TODO: when MTP is implemented, this should probably be updated if needed
1988
0
                hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
1989
1990
0
                switch (hparams.n_layer) {
1991
0
                    case 20: type = LLM_TYPE_16B_A1B; break;
1992
0
                    case 21: type = LLM_TYPE_16B_A1B; break;
1993
0
                    case 32: type = LLM_TYPE_100B_A6B; break;
1994
0
                    case 33: type = LLM_TYPE_100B_A6B; break;
1995
0
                    default: type = LLM_TYPE_UNKNOWN;
1996
0
                }
1997
0
            } break;
1998
0
        case LLM_ARCH_DOTS1:
1999
0
            {
2000
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2001
0
                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead);
2002
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
2003
0
                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,         hparams.n_expert_shared);
2004
0
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale);
2005
0
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,         hparams.expert_weights_norm, false);
2006
0
                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,          hparams.expert_gating_func, false);
2007
0
                switch (hparams.n_layer) {
2008
0
                    case 62: type = LLM_TYPE_142B; break;
2009
0
                    default: type = LLM_TYPE_UNKNOWN;
2010
0
                }
2011
0
            } break;
2012
0
        case LLM_ARCH_ERNIE4_5:
2013
0
        case LLM_ARCH_ERNIE4_5_MOE:
2014
0
            {
2015
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2016
0
                if (arch == LLM_ARCH_ERNIE4_5_MOE) {
2017
0
                    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp);
2018
0
                    ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
2019
0
                    ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP,         hparams.n_moe_layer_step);
2020
0
                    ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,         hparams.n_layer_dense_lead);
2021
0
                }
2022
2023
0
                switch (hparams.n_layer) {
2024
0
                    case 18: type = LLM_TYPE_0_3B; break;
2025
0
                    case 28: type = LLM_TYPE_21B_A3B; break;
2026
0
                    case 54: type = LLM_TYPE_300B_A47B; break;
2027
0
                    default: type = LLM_TYPE_UNKNOWN;
2028
0
                }
2029
0
            } break;
2030
0
        case LLM_ARCH_FALCON_H1:
2031
0
            {
2032
                // Common parameters
2033
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2034
2035
                // SSM parameters
2036
0
                ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
2037
0
                ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
2038
0
                ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
2039
0
                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
2040
0
                ml.get_key(LLM_KV_SSM_GROUP_COUNT,    hparams.ssm_n_group);
2041
2042
0
                std::fill(hparams.recurrent_layer_arr.begin(), hparams.recurrent_layer_arr.end(), true);
2043
2044
0
                switch (hparams.n_layer) {
2045
0
                    case 36:
2046
0
                        type = LLM_TYPE_0_5B; break;
2047
0
                    case 24:
2048
0
                        type = LLM_TYPE_1_5B; break;
2049
0
                    case 66:
2050
0
                        type = LLM_TYPE_1B; break;
2051
0
                    case 32:
2052
0
                        type = LLM_TYPE_3B; break;
2053
0
                    case 44:
2054
0
                        type = LLM_TYPE_7B; break;
2055
0
                    case 72:
2056
0
                        type = LLM_TYPE_34B; break;
2057
0
                    default:
2058
0
                        type = LLM_TYPE_UNKNOWN;
2059
0
                }
2060
0
            } break;
2061
0
        case LLM_ARCH_HUNYUAN_MOE:
2062
0
            {
2063
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,       hparams.f_norm_rms_eps);
2064
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp);
2065
0
                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);
2066
2067
0
                switch (hparams.n_layer) {
2068
0
                    case 32: type = LLM_TYPE_A13B; break;
2069
0
                    default: type = LLM_TYPE_UNKNOWN;
2070
0
                }
2071
0
            } break;
2072
0
        case LLM_ARCH_HUNYUAN_DENSE:
2073
0
            {
2074
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2075
2076
0
                switch (hparams.n_embd) {
2077
0
                    case 1024: type = LLM_TYPE_0_5B; break;
2078
0
                    case 2048: type = LLM_TYPE_1_8B; break;
2079
0
                    case 3072: type = LLM_TYPE_4B; break;
2080
0
                    case 4096: type = LLM_TYPE_7B; break;
2081
0
                    default: type = LLM_TYPE_UNKNOWN;
2082
0
                }
2083
0
            } break;
2084
0
        case LLM_ARCH_SMOLLM3:
2085
0
            {
2086
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2087
0
                hparams.n_no_rope_layer_step = 4;
2088
2089
0
                switch (hparams.n_layer) {
2090
0
                    case 36: type = LLM_TYPE_3B; break;
2091
0
                    default: type = LLM_TYPE_UNKNOWN;
2092
0
                }
2093
0
            } break;
2094
0
        case LLM_ARCH_OPENAI_MOE:
2095
0
            {
2096
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2097
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
2098
0
                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa);
2099
2100
0
                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
2101
0
                hparams.set_swa_pattern(2);
2102
2103
0
                switch (hparams.n_layer) {
2104
0
                    case 24: type = LLM_TYPE_20B; break;
2105
0
                    case 36: type = LLM_TYPE_120B; break;
2106
0
                    default: type = LLM_TYPE_UNKNOWN;
2107
0
                }
2108
0
            } break;
2109
0
        case LLM_ARCH_LFM2:
2110
0
            {
2111
0
                ml.get_key(LLM_KV_SHORTCONV_L_CACHE,           hparams.n_shortconv_l_cache);
2112
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2113
0
                for (uint32_t il = 0; il < hparams.n_layer; ++il) {
2114
0
                    hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
2115
0
                }
2116
0
                hparams.n_layer_dense_lead = hparams.n_layer;
2117
0
                switch (hparams.n_ff()) {
2118
0
                    case  4608: type = LLM_TYPE_350M; break;
2119
0
                    case  6912: type = LLM_TYPE_700M; break;
2120
0
                    case  8192: type = LLM_TYPE_1_2B; break;
2121
0
                    case 10752: type = LLM_TYPE_2_6B; break;
2122
0
                    default:    type = LLM_TYPE_UNKNOWN;
2123
0
                }
2124
0
            } break;
2125
0
        case LLM_ARCH_LFM2MOE:
2126
0
            {
2127
0
                ml.get_key(LLM_KV_SHORTCONV_L_CACHE,           hparams.n_shortconv_l_cache);
2128
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2129
0
                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead);
2130
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
2131
0
                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,          hparams.expert_gating_func);
2132
2133
0
                for (uint32_t il = 0; il < hparams.n_layer; ++il) {
2134
0
                    hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
2135
0
                }
2136
2137
0
                type = LLM_TYPE_8B_A1B;
2138
0
            } break;
2139
0
        case LLM_ARCH_SMALLTHINKER:
2140
0
            {
2141
0
                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
2142
2143
0
                if (found_swa && hparams.n_swa > 0) {
2144
0
                    hparams.swa_type      = LLAMA_SWA_TYPE_STANDARD;
2145
0
                    hparams.n_swa         = 4096;
2146
0
                    hparams.set_swa_pattern(4, true);
2147
0
                } else {
2148
0
                    hparams.swa_type             = LLAMA_SWA_TYPE_NONE;
2149
0
                    hparams.n_no_rope_layer_step = hparams.n_layer;
2150
0
                }
2151
2152
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp, false);
2153
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2154
0
                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,          hparams.expert_gating_func, false);
2155
2156
0
                switch (hparams.n_layer) {
2157
0
                    case 32: type = LLM_TYPE_4B;  break;
2158
0
                    case 52: type = LLM_TYPE_20B; break;
2159
0
                    default: type = LLM_TYPE_UNKNOWN;
2160
0
                }
2161
0
            } break;
2162
0
        case LLM_ARCH_GROVEMOE:
2163
0
            {
2164
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp);
2165
0
                ml.get_key(LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH,  hparams.n_ff_chexp);
2166
0
                ml.get_key(LLM_KV_EXPERT_GROUP_SCALE,                hparams.expert_group_scale);
2167
0
                ml.get_key(LLM_KV_EXPERTS_PER_GROUP,                 hparams.n_group_experts);
2168
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,       hparams.f_norm_rms_eps);
2169
2170
0
                switch (hparams.n_layer) {
2171
0
                    case 48: type = LLM_TYPE_30B_A3B; break;
2172
0
                    default: type = LLM_TYPE_UNKNOWN;
2173
0
                }
2174
0
            } break;
2175
0
        case LLM_ARCH_APERTUS:
2176
0
            {
2177
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2178
0
                ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_N,        hparams.xielu_alpha_n, hparams.n_layer);
2179
0
                ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_P,        hparams.xielu_alpha_p, hparams.n_layer);
2180
0
                ml.get_key_or_arr(LLM_KV_XIELU_BETA,           hparams.xielu_beta,    hparams.n_layer);
2181
0
                ml.get_key_or_arr(LLM_KV_XIELU_EPS,            hparams.xielu_eps,     hparams.n_layer);
2182
2183
0
                switch (hparams.n_layer) {
2184
0
                    case 32: type = LLM_TYPE_8B; break;
2185
0
                    default: type = LLM_TYPE_UNKNOWN;
2186
0
                }
2187
0
            } break;
2188
0
        case LLM_ARCH_MINIMAX_M2:
2189
0
            {
2190
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,  hparams.f_norm_rms_eps);
2191
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,   hparams.n_ff_exp);
2192
0
                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,           hparams.expert_gating_func, false);
2193
2194
0
                switch (hparams.n_layer) {
2195
0
                    case 62: type = LLM_TYPE_230B_A10B; break;
2196
0
                    default: type = LLM_TYPE_UNKNOWN;
2197
0
                }
2198
0
            } break;
2199
0
        case LLM_ARCH_COGVLM:
2200
0
            {
2201
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2202
0
                switch (hparams.n_layer) {
2203
0
                    case 32: type = LLM_TYPE_13B; break;
2204
0
                    default: type = LLM_TYPE_UNKNOWN;
2205
0
                }
2206
0
            } break;
2207
0
        case LLM_ARCH_PANGU_EMBED:
2208
0
            {
2209
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2210
0
                switch (hparams.n_layer) {
2211
0
                    case 26: type = LLM_TYPE_1B; break; // openPangu-Embedded-1B-V1.1
2212
0
                    case 34: type = LLM_TYPE_7B; break; // openPangu-Embedded-7B-V1.1
2213
0
                    default: type = LLM_TYPE_UNKNOWN;
2214
0
                }
2215
0
            } break;
2216
0
        default: throw std::runtime_error("unsupported model architecture");
2217
0
    }
2218
2219
0
    pimpl->n_bytes = ml.n_bytes;
2220
2221
0
    pimpl->desc_str = arch_name() + " " + type_name() + " " + ml.ftype_name();
2222
2223
0
    if (hparams.f_max_alibi_bias > 0.0f) {
2224
0
        hparams.use_alibi = true;
2225
0
    }
2226
2227
0
    hparams.rope_type = llama_model_rope_type(this);
2228
0
}
2229
2230
0
void llama_model::load_vocab(llama_model_loader & ml) {
2231
0
    const auto kv = LLM_KV(arch);
2232
2233
0
    vocab.load(ml, kv);
2234
0
}
2235
2236
0
bool llama_model::load_tensors(llama_model_loader & ml) {
2237
0
    const auto & split_mode   = params.split_mode;
2238
0
    const auto & n_gpu_layers = params.n_gpu_layers;
2239
0
    const auto & use_mlock    = params.use_mlock;
2240
0
    const auto & tensor_split = params.tensor_split;
2241
2242
0
    const int n_layer = hparams.n_layer;
2243
2244
0
    const bool use_mmap_buffer = true;
2245
2246
0
    LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");
2247
2248
    // build a list of buffer types for the CPU and GPU devices
2249
0
    pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts, params.no_host);
2250
0
    for (auto * dev : devices) {
2251
0
        buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
2252
        // add CPU buffer types as a fallback
2253
0
        buft_list.insert(buft_list.end(), pimpl->cpu_buft_list.begin(), pimpl->cpu_buft_list.end());
2254
0
        pimpl->gpu_buft_list.emplace(dev, std::move(buft_list));
2255
0
    }
2256
2257
    // calculate the split points
2258
0
    bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + n_devices(), [](float x) { return x == 0.0f; });
2259
0
    std::vector<float> splits(n_devices());
2260
0
    if (all_zero) {
2261
        // default split, by free memory
2262
0
        for (size_t i = 0; i < n_devices(); ++i) {
2263
0
            ggml_backend_dev_t dev = devices[i];
2264
0
            size_t total;
2265
0
            size_t free;
2266
0
            ggml_backend_dev_memory(dev, &free, &total);
2267
0
            splits[i] = free;
2268
0
        }
2269
0
    } else {
2270
0
        std::copy(tensor_split, tensor_split + n_devices(), splits.begin());
2271
0
    }
2272
2273
    // sum and normalize the splits to get the split points
2274
0
    float split_sum = 0.0f;
2275
0
    for (size_t i = 0; i < n_devices(); ++i) {
2276
0
        split_sum += splits[i];
2277
0
        splits[i] = split_sum;
2278
0
    }
2279
0
    for (size_t i = 0; i < n_devices(); ++i) {
2280
0
        splits[i] /= split_sum;
2281
0
    }
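Aside (not part of the report): the split computation above is just a prefix sum over per-device free memory followed by a normalization to (0, 1]. A minimal standalone sketch with hypothetical free-memory values:

    #include <cstdio>
    #include <vector>

    int main() {
        std::vector<float> splits = {24.0f, 8.0f}; // hypothetical free memory per device, in GiB
        float split_sum = 0.0f;
        for (float & s : splits) {
            split_sum += s;
            s = split_sum;              // prefix sum: 24, 32
        }
        for (float & s : splits) {
            s /= split_sum;             // normalized split points: 0.75, 1.00
        }
        for (float s : splits) {
            printf("%.2f\n", s);        // prints 0.75 then 1.00
        }
        return 0;
    }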
2282
2283
0
    ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
2284
0
    if (cpu_dev == nullptr) {
2285
0
        throw std::runtime_error(format("%s: no CPU backend found", __func__));
2286
0
    }
2287
0
    const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
2288
0
    const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
2289
0
    auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
2290
0
        const bool is_swa = il < (int) hparams.n_layer && hparams.is_swa(il);
2291
0
        if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
2292
0
            LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa);
2293
0
            return {cpu_dev, &pimpl->cpu_buft_list};
2294
0
        }
2295
0
        const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_devices(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin();
2296
0
        auto * dev = devices.at(layer_gpu);
2297
0
        LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(dev), is_swa);
2298
0
        return {dev, &pimpl->gpu_buft_list.at(dev)};
2299
0
    };
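Aside (not part of the report): the lambda above assigns a repeating layer to a device by locating the layer's relative position among the normalized split points. A standalone sketch of that lookup, using hypothetical split points and layer counts:

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    int main() {
        const std::vector<float> splits = {0.75f, 1.00f}; // hypothetical normalized split points
        const int i_gpu_start    = 0;                     // assume every layer is offloaded
        const int act_gpu_layers = 32;                    // hypothetical layer count
        for (int il : {0, 23, 24, 31}) {
            const float pos = float(il - i_gpu_start) / act_gpu_layers;
            const int   dev = std::upper_bound(splits.begin(), splits.end(), pos) - splits.begin();
            printf("layer %2d -> device %d\n", il, dev);  // layers 0-23 -> device 0, 24-31 -> device 1
        }
        return 0;
    }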
2300
2301
    // assign the input layer
2302
    // there is very little benefit to offloading the input layer, so always keep it on the CPU
2303
0
    pimpl->dev_input = { cpu_dev, &pimpl->cpu_buft_list };
2304
2305
    // assign the repeating layers to the devices according to the splits
2306
0
    pimpl->dev_layer.resize(n_layer);
2307
0
    for (int il = 0; il < n_layer; ++il) {
2308
0
        pimpl->dev_layer[il] = get_layer_buft_list(il);
2309
0
    }
2310
2311
    // assign the output layer
2312
0
    pimpl->dev_output = get_layer_buft_list(n_layer);
2313
2314
    // one ggml context per buffer type
2315
0
    int max_n_tensors = ml.n_tensors;
2316
0
    max_n_tensors += 1;         // duplicated output tensor
2317
0
    max_n_tensors += n_layer*2; // duplicated rope freq tensors
2318
0
    const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;
2319
2320
    // define a comparator for the buft -> ctx map to ensure that the order is well-defined:
2321
0
    struct ggml_backend_buft_comparator {
2322
0
        bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
2323
0
            return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
2324
0
        }
2325
0
    };
2326
0
    std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
2327
2328
0
    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
2329
0
        auto it = ctx_map.find(buft);
2330
0
        if (it == ctx_map.end()) {
2331
0
            ggml_init_params params = {
2332
0
                /*.mem_size   =*/ ctx_size,
2333
0
                /*.mem_buffer =*/ NULL,
2334
0
                /*.no_alloc   =*/ true,
2335
0
            };
2336
2337
0
            ggml_context * ctx = ggml_init(params);
2338
0
            if (!ctx) {
2339
0
                throw std::runtime_error(format("failed to create ggml context"));
2340
0
            }
2341
2342
0
            ctx_map.emplace(buft, ctx);
2343
2344
0
            return ctx;
2345
0
        }
2346
0
        return it->second.get();
2347
0
    };
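Aside (not part of the report): ctx_for_buft above is a create-on-first-use cache keyed by buffer type. The same pattern in isolation, with strings standing in for buffer types and integers standing in for contexts (the names are illustrative only):

    #include <cstdio>
    #include <map>
    #include <string>

    int main() {
        std::map<std::string, int> ctx_map; // buffer-type name -> context id (stand-ins)
        int next_id = 0;

        auto ctx_for = [&](const std::string & buft) -> int {
            auto it = ctx_map.find(buft);
            if (it == ctx_map.end()) {
                it = ctx_map.emplace(buft, next_id++).first; // lazily create one entry per buft
            }
            return it->second;
        };

        const int a = ctx_for("CUDA0"); // 0 (created)
        const int b = ctx_for("CPU");   // 1 (created)
        const int c = ctx_for("CUDA0"); // 0 (reused)
        printf("%d %d %d\n", a, b, c);
        return 0;
    }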
2348
2349
0
    const auto TENSOR_DUPLICATED   = llama_model_loader::TENSOR_DUPLICATED;
2350
0
    const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;
2351
0
    const auto TENSOR_SKIP         = llama_model_loader::TENSOR_SKIP;
2352
2353
    // create tensors for the weights
2354
0
    {
2355
        // note: cast to int64_t since we will use these for the tensor dimensions
2356
0
        const int64_t n_head        = hparams.n_head();
2357
0
        const int64_t n_head_kv     = hparams.n_head_kv();
2358
0
        const int64_t n_embd        = hparams.n_embd;
2359
0
        const int64_t n_embd_k_gqa  = hparams.n_embd_k_gqa();
2360
0
        const int64_t n_embd_v_gqa  = hparams.n_embd_v_gqa();
2361
0
        const int64_t n_embd_head_k = hparams.n_embd_head_k;
2362
0
        const int64_t n_embd_head_v = hparams.n_embd_head_v;
2363
0
        const int64_t n_ff          = hparams.n_ff();
2364
0
        const int64_t n_embd_gqa    = n_embd_v_gqa;
2365
0
        const int64_t n_vocab       = vocab.n_tokens();
2366
0
        const int64_t n_token_types = vocab.n_token_types();
2367
0
        const int64_t n_rot         = hparams.n_rot;
2368
0
        const int64_t n_expert      = hparams.n_expert;
2369
0
        const int64_t n_expert_used = hparams.n_expert_used;
2370
0
        const int64_t n_ctx_train   = hparams.n_ctx_train;
2371
2372
0
        if (n_expert > 0 && hparams.n_expert_used == 0) {
2373
0
            throw std::runtime_error("model has expert layers but no expert layers are used");
2374
0
        }
2375
2376
0
        int n_moved_tensors = 0;
2377
0
        ggml_tensor * first_moved_tensor = nullptr;
2378
0
        ggml_backend_buffer_type_t first_moved_from_buft = nullptr;
2379
0
        ggml_backend_buffer_type_t first_moved_to_buft = nullptr;
2380
2381
0
        auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) -> ggml_tensor * {
2382
0
            ggml_tensor * t_meta = ml.get_tensor_meta(tn.str().c_str());
2383
2384
0
            if (!t_meta) {
2385
0
                if (flags & TENSOR_NOT_REQUIRED) {
2386
0
                    return nullptr;
2387
0
                }
2388
0
                throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str()));
2389
0
            }
2390
2391
            // some models use the token embedding tensor as the output, but since these are used in different layers and with different ops
2392
            // the tensor is duplicated
2393
            // to handle this, we check if the tensor is duplicated, and if so, we assume that it is being loaded as the output tensor
2394
0
            llm_tensor tn_tensor = tn.tensor;
2395
0
            if (tn.tensor == LLM_TENSOR_TOKEN_EMBD && flags & TENSOR_DUPLICATED) {
2396
0
                tn_tensor = LLM_TENSOR_OUTPUT;
2397
0
            }
2398
2399
0
            llm_tensor_info info;
2400
0
            try {
2401
0
                info = llm_tensor_info_for(tn_tensor);
2402
0
            } catch (const std::out_of_range & e) {
2403
0
                throw std::runtime_error(format("missing tensor info mapping for %s", tn.str().c_str()));
2404
0
            }
2405
2406
            // skip unused tensors
2407
0
            if (info.op == GGML_OP_NONE || flags & TENSOR_SKIP) {
2408
0
                const size_t nbytes = ggml_nbytes(t_meta);
2409
0
                LLAMA_LOG_WARN("model has unused tensor %s (size = %zu bytes) -- ignoring\n", tn.str().c_str(), nbytes);
2410
2411
0
                ml.size_data -= nbytes;
2412
0
                ml.n_created++;
2413
2414
0
                return nullptr;
2415
0
            }
2416
2417
            // tensors with "bias" suffix are always used with GGML_OP_ADD or GGML_OP_ADD_ID
2418
0
            ggml_op op;
2419
0
            bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
2420
0
            if (bias) {
2421
0
                if (info.op == GGML_OP_MUL_MAT_ID) {
2422
0
                    op = GGML_OP_ADD_ID;
2423
0
                } else {
2424
0
                    op = GGML_OP_ADD;
2425
0
                }
2426
0
            } else {
2427
0
                op = info.op;
2428
0
            }
2429
2430
            // sanity checks
2431
0
            if (info.layer == LLM_TENSOR_LAYER_INPUT || info.layer == LLM_TENSOR_LAYER_OUTPUT) {
2432
0
                if (tn.bid != -1) {
2433
0
                    GGML_ABORT("input/output layer tensor %s used with a layer number", tn.str().c_str());
2434
0
                }
2435
0
            } else {
2436
0
                if (tn.bid == -1) {
2437
0
                    GGML_ABORT("repeating layer tensor %s used without a layer number", tn.str().c_str());
2438
0
                }
2439
0
            }
2440
2441
            // select the buffer type for this tensor
2442
0
            buft_list_t * buft_list;
2443
0
            switch (info.layer) {
2444
0
                case LLM_TENSOR_LAYER_INPUT:
2445
0
                    buft_list = pimpl->dev_input.buft_list;
2446
0
                    break;
2447
0
                case LLM_TENSOR_LAYER_OUTPUT:
2448
0
                    buft_list = pimpl->dev_output.buft_list;
2449
0
                    break;
2450
0
                case LLM_TENSOR_LAYER_REPEATING:
2451
0
                    buft_list = pimpl->dev_layer.at(tn.bid).buft_list;
2452
0
                    break;
2453
0
                default:
2454
0
                    GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
2455
0
            }
2456
2457
0
            ggml_backend_buffer_type_t buft = nullptr;
2458
2459
            // check overrides
2460
0
            if (ml.tensor_buft_overrides) {
2461
0
                std::string tensor_name = tn.str();
2462
0
                for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
2463
0
                    std::regex pattern(overrides->pattern);
2464
0
                    if (std::regex_search(tensor_name, pattern)) {
2465
0
                        if (overrides->buft == ggml_backend_cpu_buffer_type()) {
2466
                            // when overriding to a CPU buffer, consider the extra buffer types
2467
0
                            buft = select_weight_buft(hparams, t_meta, op, pimpl->cpu_buft_list);
2468
0
                        } else {
2469
0
                            buft = overrides->buft;
2470
0
                        }
2471
2472
0
                        LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
2473
0
                                tensor_name.c_str(),
2474
0
                                ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
2475
0
                                ggml_backend_buft_name(buft));
2476
0
                        break;
2477
0
                    }
2478
0
                }
2479
0
            }
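Aside (not part of the report): the override loop above matches each tensor name against user-supplied regex patterns and redirects the first match to the requested buffer type. A standalone sketch of just the matching step; the pattern and tensor names below are made-up examples:

    #include <cstdio>
    #include <regex>
    #include <string>

    int main() {
        const std::regex pattern("ffn_.*_exps"); // hypothetical override pattern
        const std::string names[] = {
            "blk.0.ffn_gate_exps.weight", // matches  -> would take the overridden buffer type
            "blk.0.attn_q.weight",        // no match -> falls through to the default selection
        };
        for (const std::string & name : names) {
            const bool hit = std::regex_search(name, pattern);
            printf("%-30s %s\n", name.c_str(), hit ? "override" : "default");
        }
        return 0;
    }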
2480
2481
0
            if (!buft) {
2482
0
                buft = select_weight_buft(hparams, t_meta, op, *buft_list);
2483
0
                if (!buft) {
2484
0
                    throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
2485
0
                }
2486
0
            }
2487
2488
            // avoid using a host buffer when using mmap
2489
0
            auto * buft_dev = ggml_backend_buft_get_device(buft);
2490
0
            if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
2491
0
                auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
2492
0
                if (!cpu_dev) {
2493
0
                    throw std::runtime_error("no CPU backend found");
2494
0
                }
2495
0
                buft = ggml_backend_dev_buffer_type(cpu_dev);
2496
0
            }
2497
2498
0
            if (buft != buft_list->front().second) {
2499
0
                n_moved_tensors++;
2500
0
                if (!first_moved_tensor) {
2501
0
                    first_moved_tensor = t_meta;
2502
0
                    first_moved_from_buft = buft_list->front().second;
2503
0
                    first_moved_to_buft   = buft;
2504
0
                }
2505
0
            }
2506
2507
0
            ggml_context * ctx = ctx_for_buft(buft);
2508
2509
            // if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one
2510
0
            if (flags & TENSOR_DUPLICATED) {
2511
0
                ggml_tensor * t = ggml_get_tensor(ctx, tn.str().c_str());
2512
0
                if (t) {
2513
0
                    return t;
2514
0
                }
2515
0
            }
2516
0
            return ml.create_tensor(ctx, tn, ne, flags);
2517
0
        };
2518
2519
0
        layers.resize(n_layer);
2520
2521
        // TODO: move to a separate function
2522
0
        const auto tn = LLM_TN(arch);
2523
0
        switch (arch) {
2524
0
            case LLM_ARCH_LLAMA:
2525
0
            case LLM_ARCH_REFACT:
2526
0
            case LLM_ARCH_MINICPM:
2527
0
            case LLM_ARCH_GRANITE:
2528
0
            case LLM_ARCH_GRANITE_MOE:
2529
0
                {
2530
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2531
2532
                    // output
2533
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2534
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
2535
2536
                    // if output is NULL, init from the input tok embed
2537
0
                    if (output == NULL) {
2538
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
2539
0
                    }
2540
2541
0
                    for (int i = 0; i < n_layer; ++i) {
2542
0
                        auto & layer = layers[i];
2543
2544
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2545
2546
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
2547
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
2548
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
2549
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
2550
2551
                        // optional bias tensors
2552
0
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);
2553
0
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
2554
0
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
2555
0
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);
2556
2557
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
2558
2559
0
                        if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
2560
0
                            layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
2561
0
                            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
2562
0
                        }
2563
0
                        else {
2564
0
                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
2565
0
                        }
2566
2567
0
                        if (n_expert == 0) {
2568
0
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
2569
0
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
2570
0
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
2571
2572
                            // optional MLP bias
2573
0
                            layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
2574
0
                            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
2575
0
                            layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
2576
0
                        } else {
2577
0
                            layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
2578
0
                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff, n_expert}, TENSOR_NOT_REQUIRED);
2579
0
                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert}, 0);
2580
0
                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert}, 0);
2581
2582
                            // For Granite MoE Shared
2583
0
                            if (hparams.n_ff_shexp > 0) {
2584
0
                                layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
2585
0
                                layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
2586
0
                                layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
2587
0
                            }
2588
0
                        }
2589
0
                    }
2590
0
                } break;
2591
0
            case LLM_ARCH_LLADA:
2592
0
                {
2593
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
2594
2595
                    // output
2596
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
2597
0
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
2598
2599
                    // if output is NULL, init from the input tok embed
2600
0
                    if (output == NULL) {
2601
0
                        output =
2602
0
                            create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
2603
0
                    }
2604
2605
0
                    for (int i = 0; i < n_layer; ++i) {
2606
0
                        auto & layer = layers[i];
2607
2608
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
2609
2610
                        // Use separate Q, K, V projections without bias, matching LLaDALlamaBlock
2611
0
                        layer.wq =
2612
0
                            create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
2613
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
2614
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
2615
                        // No bias for QKV projections as per config: include_bias=false, include_qkv_bias=false
2616
0
                        layer.wo =
2617
0
                            create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
2618
0
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
2619
2620
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
2621
2622
0
                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_rot / 2 },
2623
0
                                                         TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
2624
2625
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
2626
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
2627
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
2628
2629
                        // optional MLP bias
2630
0
                        layer.ffn_gate_b =
2631
0
                            create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED);
2632
0
                        layer.ffn_down_b =
2633
0
                            create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
2634
0
                        layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED);
2635
0
                    }
2636
0
                }
2637
0
                break;
2638
0
            case LLM_ARCH_LLADA_MOE:
2639
0
                {
2640
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2641
2642
                    // output
2643
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2644
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
2645
2646
0
                    GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for llada-moe");
2647
0
                    GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for llada-moe");
2648
2649
0
                    for (int i = 0; i < n_layer; ++i) {
2650
0
                        auto & layer = layers[i];
2651
2652
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2653
2654
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
2655
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
2656
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
2657
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
2658
0
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
2659
0
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
2660
2661
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
2662
2663
0
                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
2664
2665
0
                        const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
2666
2667
0
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
2668
0
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
2669
0
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
2670
0
                    }
2671
0
                } break;
2672
0
            case LLM_ARCH_LLAMA4:
2673
0
                {
2674
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2675
2676
                    // output
2677
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2678
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
2679
2680
                    // if output is NULL, init from the input tok embed
2681
0
                    if (output == NULL) {
2682
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
2683
0
                    }
2684
2685
0
                    for (int i = 0; i < n_layer; ++i) {
2686
0
                        bool is_moe_layer = hparams.n_moe_layer_step > 0 && (i + 1) % hparams.n_moe_layer_step == 0;
2687
2688
0
                        auto & layer = layers[i];
2689
2690
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2691
2692
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
2693
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
2694
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
2695
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
2696
2697
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
2698
2699
0
                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
2700
2701
0
                        if (is_moe_layer) {
2702
0
                            int n_ff_exp = hparams.n_ff_exp;
2703
2704
0
                            layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
2705
0
                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff_exp, n_expert}, 0);
2706
0
                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff_exp, n_embd, n_expert}, 0);
2707
0
                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff_exp, n_expert}, 0);
2708
2709
                            // Shared expert
2710
0
                            const int64_t n_ff_shexp = n_ff_exp;
2711
0
                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {    n_embd, n_ff_shexp}, 0);
2712
0
                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd    }, 0);
2713
0
                            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {    n_embd, n_ff_shexp}, 0);
2714
0
                        } else {
2715
0
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
2716
0
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
2717
0
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
2718
0
                        }
2719
0
                    }
2720
0
                } break;
2721
0
            case LLM_ARCH_DECI:
2722
0
                {
2723
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2724
2725
                    // output
2726
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2727
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
2728
2729
                    // if output is NULL, init from the input tok embed
2730
0
                    if (output == NULL) {
2731
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
2732
0
                    }
2733
2734
0
                    for (int i = 0; i < n_layer; ++i) {
2735
0
                        auto & layer = layers[i];
2736
0
                        const int64_t n_embd_k_gqa  = hparams.n_embd_k_gqa(i);
2737
0
                        const int64_t n_embd_v_gqa  = hparams.n_embd_v_gqa(i);
2738
0
                        const int64_t n_embd_gqa    = hparams.n_embd_v_gqa(i);
2739
0
                        const int64_t n_ff          = hparams.n_ff(i);
2740
0
                        const int64_t n_head        = hparams.n_head(i);
2741
0
                        const int64_t n_head_kv     = hparams.n_head_kv(i);
2742
2743
0
                        if (n_head_kv == 0 && n_head > 0) {
2744
                            // linear attention for DeciLMCausalModel
2745
0
                            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2746
0
                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
2747
0
                        }
2748
0
                        else if (n_head_kv > 0) {
2749
0
                            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2750
2751
0
                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
2752
0
                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
2753
0
                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
2754
0
                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
2755
0
                        }
2756
2757
                        // optional bias tensors
2758
0
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);
2759
0
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
2760
0
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
2761
0
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);
2762
2763
0
                        if (n_ff > 0) {
2764
0
                            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
2765
0
                        }
2766
2767
0
                        if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
2768
0
                            layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
2769
0
                            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
2770
0
                        }
2771
0
                        else {
2772
0
                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
2773
0
                        }
2774
2775
0
                        if (n_ff > 0) {
2776
0
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
2777
0
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
2778
0
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
2779
0
                        }
2780
2781
                        // optional MLP bias
2782
0
                        layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
2783
0
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
2784
0
                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
2785
0
                    }
2786
0
                } break;
2787
0
            case LLM_ARCH_MINICPM3:
2788
0
                {
2789
0
                    const int64_t n_embd_head_qk_rope = hparams.n_rot;
2790
0
                    const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
2791
2792
0
                    const int64_t q_lora_rank  = hparams.n_lora_q;
2793
0
                    const int64_t kv_lora_rank = hparams.n_lora_kv;
2794
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2795
2796
                    // output
2797
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2798
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
2799
2800
                    // if output is NULL, init from the input tok embed
2801
0
                    if (output == NULL) {
2802
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
2803
0
                    }
2804
2805
0
                    for (int i = 0; i < n_layer; ++i) {
2806
0
                        auto & layer = layers[i];
2807
2808
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2809
0
                        layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
2810
2811
0
                        layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
2812
2813
0
                        layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
2814
0
                        layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
2815
2816
0
                        layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
2817
0
                        layer.wkv_b     = create_tensor(tn(LLM_TENSOR_ATTN_KV_B,     "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
2818
0
                        layer.wo        = create_tensor(tn(LLM_TENSOR_ATTN_OUT,      "weight", i), {              n_head * (                      n_embd_head_v), n_embd}, 0);
2819
2820
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
2821
2822
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
2823
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
2824
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
2825
2826
0
                        layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), { n_embd_head_qk_rope/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
2827
0
                        layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head_qk_rope/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
2828
0
                    }
2829
0
                } break;
2830
0
            case LLM_ARCH_GROK:
2831
0
                {
2832
0
                    if (n_expert == 0) {
2833
0
                        throw std::runtime_error("Grok model cannot have zero experts");
2834
0
                    }
2835
2836
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2837
2838
                    // output
2839
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2840
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
2841
2842
                    // if output is NULL, init from the input tok embed
2843
0
                    if (output == NULL) {
2844
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
2845
0
                    }
2846
2847
0
                    const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff/* / n_expert_used*/; // grok-1 n_ff_exp == n_ff
2848
0
                    for (int i = 0; i < n_layer; ++i) {
2849
0
                        auto & layer = layers[i];
2850
2851
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2852
2853
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
2854
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
2855
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
2856
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
2857
2858
0
                        layer.attn_out_norm   = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
2859
2860
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
2861
2862
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
2863
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff,   n_embd}, TENSOR_NOT_REQUIRED);
2864
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
2865
2866
0
                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
2867
0
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
2868
0
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd,   n_expert}, 0);
2869
0
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff_exp, n_expert}, 0);
2870
2871
0
                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
2872
0
                        if (!layer.ffn_post_norm) {
2873
0
                            layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
2874
0
                        }
2875
0
                    }
2876
0
                } break;
2877
0
            case LLM_ARCH_DBRX:
2878
0
                {
2879
0
                    if (n_expert == 0) {
2880
0
                        throw std::runtime_error("DBRX model cannot have zero experts");
2881
0
                    }
2882
2883
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2884
2885
                    // output
2886
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2887
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
2888
2889
0
                    for (int i = 0; i < n_layer; ++i) {
2890
0
                        auto & layer = layers[i];
2891
2892
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2893
2894
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
2895
0
                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
2896
2897
0
                        layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
2898
2899
0
                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
2900
0
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff,   n_expert}, 0);
2901
0
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff,   n_embd, n_expert}, 0);
2902
0
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff,   n_expert}, 0);
2903
0
                    }
2904
0
                } break;
2905
0
            case LLM_ARCH_BAICHUAN:
2906
0
                {
2907
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2908
0
                    {
2909
0
                        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2910
0
                        output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
2911
0
                    }
2912
2913
0
                    for (int i = 0; i < n_layer; ++i) {
2914
0
                        auto & layer = layers[i];
2915
2916
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2917
2918
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
2919
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
2920
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
2921
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
2922
2923
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
2924
2925
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
2926
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
2927
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
2928
0
                    }
2929
0
                } break;
2930
0
            case LLM_ARCH_FALCON:
2931
0
                {
2932
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2933
2934
                    // output
2935
0
                    {
2936
0
                        output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2937
0
                        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
2938
2939
0
                        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
2940
0
                        if (!output) {
2941
0
                            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // needs to be on GPU
2942
0
                        }
2943
0
                    }
2944
2945
0
                    for (int i = 0; i < n_layer; ++i) {
2946
0
                        auto & layer = layers[i];
2947
2948
0
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2949
0
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
2950
2951
0
                        layer.attn_norm_2   = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
2952
0
                        layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
2953
2954
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
2955
0
                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
2956
2957
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
2958
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
2959
0
                    }
2960
0
                } break;
2961
0
            case LLM_ARCH_STARCODER:
2962
0
                {
2963
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2964
0
                    pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD,   "weight"), {n_embd, n_ctx_train}, 0);
2965
2966
                    // output
2967
0
                    {
2968
0
                        output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2969
0
                        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
2970
0
                        output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
2971
0
                        if (!output) {
2972
                            // needs to be on GPU
2973
0
                            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
2974
0
                        }
2975
2976
0
                    }
2977
2978
0
                    for (int i = 0; i < n_layer; ++i) {
2979
0
                        auto & layer = layers[i];
2980
2981
0
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2982
0
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
2983
2984
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
2985
0
                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, 0);
2986
2987
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
2988
0
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, 0);
2989
2990
0
                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
2991
0
                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
2992
2993
0
                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
2994
0
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
2995
2996
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i),   {n_embd, n_ff}, 0);
2997
0
                        layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i),     {n_ff}, 0);
2998
0
                    }
2999
0
                } break;
3000
0
            case LLM_ARCH_BERT:
3001
0
            case LLM_ARCH_NOMIC_BERT:
3002
0
            case LLM_ARCH_NOMIC_BERT_MOE:
3003
0
            case LLM_ARCH_JINA_BERT_V3:
3004
0
                {
3005
0
                    tok_embd     = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, 0);
3006
0
                    type_embd    = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, TENSOR_NOT_REQUIRED);
3007
3008
0
                    if (arch == LLM_ARCH_BERT) {
3009
0
                        pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD,    "weight"), {n_embd, n_ctx_train}, 0);
3010
3011
0
                        cls   = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
3012
0
                        cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"),   {n_embd},         TENSOR_NOT_REQUIRED);
3013
3014
0
                        cls_out   = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
3015
0
                        cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"),   {hparams.n_cls_out},         TENSOR_NOT_REQUIRED);
3016
0
                    }
3017
3018
0
                    tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
3019
0
                    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"),   {n_embd}, 0);
3020
3021
0
                    for (int i = 0; i < n_layer; ++i) {
3022
0
                        auto & layer = layers[i];
3023
3024
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
3025
0
                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
3026
3027
0
                        if (!layer.wqkv) {
3028
0
                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
3029
0
                            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i),   {n_embd}, 0);
3030
3031
0
                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
3032
0
                            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i),   {n_embd_gqa}, 0);
3033
3034
0
                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
3035
0
                            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i),   {n_embd_gqa}, 0);
3036
0
                        }
3037
3038
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT,      "weight", i), {n_embd, n_embd}, 0);
3039
0
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT,      "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
3040
3041
0
                        layer.attn_out_norm   = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
3042
0
                        layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i),   {n_embd}, 0);
3043
3044
0
                        if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) {
3045
0
                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff,   n_expert}, 0);
3046
0
                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff,   n_embd, n_expert}, 0);
3047
0
                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,   "weight", i), {n_embd, n_expert}, 0);
3048
0
                        } else {
3049
0
                            layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
3050
0
                            layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, TENSOR_NOT_REQUIRED);
3051
0
                            layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3052
0
                            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
3053
3054
0
                            if (arch == LLM_ARCH_NOMIC_BERT) {
3055
0
                                layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
3056
0
                            }
3057
0
                        }
3058
3059
0
                        layer.layer_out_norm   = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
3060
0
                        layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i),   {n_embd}, 0);
3061
0
                    }
3062
0
                } break;
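A minimal standalone sketch (not part of the coverage listing; the head counts below are invented) of why the fused wqkv projection in the BERT-family branch above is sized {n_embd, n_embd + 2*n_embd_gqa}: under grouped-query attention K and V each project to n_embd_gqa = n_embd_head * n_head_kv columns, so the fused width is exactly the sum of the split Q/K/V widths that the fallback path loads.

#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t n_embd_head = 64;   // per-head dimension (assumed)
    const int64_t n_head      = 12;   // query heads (assumed)
    const int64_t n_head_kv   = 4;    // key/value heads under GQA (assumed)

    const int64_t n_embd     = n_embd_head * n_head;     // 768
    const int64_t n_embd_gqa = n_embd_head * n_head_kv;  // 256

    // split projections: Q is n_embd wide, K and V are n_embd_gqa wide each
    const int64_t split_width = n_embd + n_embd_gqa + n_embd_gqa;

    // fused projection, matching the {n_embd, n_embd + 2*n_embd_gqa} shape above
    const int64_t fused_width = n_embd + 2*n_embd_gqa;

    assert(split_width == fused_width);
    printf("QKV output width: %lld\n", (long long) fused_width); // 1280
    return 0;
}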
3063
0
            case LLM_ARCH_NEO_BERT:
3064
0
                {
3065
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, 0);
3066
3067
0
                    cls   = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
3068
0
                    cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"),   {n_embd},         TENSOR_NOT_REQUIRED);
3069
3070
0
                    cls_out   = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
3071
0
                    cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"),   {hparams.n_cls_out},         TENSOR_NOT_REQUIRED);
3072
3073
0
                    output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
3074
3075
0
                    for (int i = 0; i < n_layer; ++i) {
3076
0
                        auto & layer = layers[i];
3077
3078
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3079
3080
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
3081
0
                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3082
3083
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3084
3085
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff*2}, 0);
3086
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3087
0
                    }
3088
0
                } break;
3089
0
            case LLM_ARCH_JINA_BERT_V2:
3090
0
                {
3091
0
                    tok_embd  = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, 0); // word_embeddings
3092
0
                    type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0); // token_type_embeddings
3093
3094
0
                    tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0); // LayerNorm
3095
0
                    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"),   {n_embd}, 0); // LayerNorm bias
3096
3097
0
                    cls   = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, 1}, TENSOR_NOT_REQUIRED);
3098
0
                    cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"),   {1},         TENSOR_NOT_REQUIRED);
3099
0
                    for (int i = 0; i < n_layer; ++i) {
3100
0
                        auto & layer = layers[i]; // JinaBertLayer
3101
3102
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
3103
0
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i),   {n_embd}, 0);
3104
3105
0
                        layer.attn_q_norm   = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
3106
0
                        layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias",   i), {n_embd}, TENSOR_NOT_REQUIRED);
3107
3108
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
3109
0
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias",   i), {n_embd_gqa}, 0);
3110
3111
0
                        layer.attn_k_norm   = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
3112
0
                        layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias",   i), {n_embd}, TENSOR_NOT_REQUIRED);
3113
3114
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
3115
0
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias",   i), {n_embd_gqa}, 0);
3116
3117
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); // output_dense
3118
0
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias",   i), {n_embd}, 0); // output_dense
3119
3120
0
                        layer.attn_out_norm   = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0); // output_norm
3121
0
                        layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias",   i), {n_embd}, 0);
3122
3123
0
                        layer.attn_norm_2   = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
3124
0
                        layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias",   i), {n_embd}, TENSOR_NOT_REQUIRED);
3125
3126
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
3127
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, layer.ffn_gate ? n_ff : n_ff * 2}, 0);
3128
3129
0
                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3130
0
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias",   i), {n_embd}, 0);
3131
3132
0
                        layer.layer_out_norm   = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
3133
0
                        layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias",   i), {n_embd}, 0);
3134
0
                    }
3135
0
                } break;
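A small sketch (illustrative values only) of the size rule used for JinaBERT v2 above, where ffn_up is {n_embd, n_ff} when a separate ffn_gate tensor exists and {n_embd, n_ff*2} when it does not, presumably because the gate and up halves are then packed into a single projection.

#include <cstdint>
#include <cstdio>

// mirrors: layer.ffn_gate ? n_ff : n_ff * 2
static int64_t ffn_up_width(bool has_separate_gate, int64_t n_ff) {
    return has_separate_gate ? n_ff : 2*n_ff;
}

int main() {
    const int64_t n_ff = 3072; // assumed feed-forward width
    printf("separate gate tensor: ffn_up width = %lld\n", (long long) ffn_up_width(true,  n_ff)); // 3072
    printf("fused gate+up:        ffn_up width = %lld\n", (long long) ffn_up_width(false, n_ff)); // 6144
    return 0;
}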
3136
0
            case LLM_ARCH_BLOOM:
3137
0
                {
3138
0
                    tok_embd   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,      "weight"), {n_embd, n_vocab}, 0);
3139
0
                    tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
3140
0
                    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"),   {n_embd}, 0);
3141
3142
                    // output
3143
0
                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3144
0
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
3145
0
                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3146
3147
                    // if output is NULL, init from the input tok embed
3148
0
                    if (output == NULL) {
3149
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3150
0
                    }
3151
3152
0
                    for (int i = 0; i < n_layer; ++i) {
3153
0
                        auto & layer = layers[i];
3154
3155
0
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3156
0
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias",   i), {n_embd}, 0);
3157
3158
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
3159
0
                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias",   i), {n_embd + 2*n_embd_gqa}, 0);
3160
3161
0
                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3162
0
                        layer.bo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias",   i), {n_embd}, 0);
3163
3164
0
                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3165
0
                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias",   i), {n_embd}, 0);
3166
3167
0
                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3168
0
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias",   i), {n_embd}, 0);
3169
3170
0
                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
3171
0
                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias",   i), {n_ff}, 0);
3172
0
                    }
3173
0
                } break;
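The "if output is NULL, init from the input tok embed" pattern above is weight tying: when the GGUF carries no separate output head, the token embedding matrix is reused, created again with TENSOR_DUPLICATED so a copy can also live on the output backend (the MPT and Gemma comments later in this listing make the same point). A minimal sketch with hypothetical stand-in types, not the real loader API:

#include <cstdio>

struct tensor { const char * name; };

// stand-in for create_tensor(..., TENSOR_NOT_REQUIRED): may return nullptr
static tensor * load_optional(bool present) {
    static tensor out { "output.weight" };
    return present ? &out : nullptr;
}

int main() {
    tensor tok_embd { "token_embd.weight" };

    tensor * output = load_optional(/*present=*/false);
    if (output == nullptr) {
        output = &tok_embd; // tied to the input embeddings (duplicated in the real loader)
    }
    printf("lm head uses: %s\n", output->name);
    return 0;
}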
3174
0
            case LLM_ARCH_MPT:
3175
0
                {
3176
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3177
0
                    pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD,   "weight"), {n_embd, n_ctx_train}, TENSOR_NOT_REQUIRED);
3178
3179
                    // output
3180
0
                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3181
0
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, TENSOR_NOT_REQUIRED);
3182
3183
0
                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3184
0
                    if (!output) {
3185
0
                        output    = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // needs to be on GPU
3186
0
                    }
3187
3188
0
                    for (int i = 0; i < n_layer; ++i) {
3189
0
                        auto & layer = layers[i];
3190
3191
0
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3192
0
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
3193
3194
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
3195
0
                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
3196
3197
0
                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3198
0
                        layer.bo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
3199
3200
0
                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3201
0
                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
3202
3203
0
                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3204
0
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
3205
3206
0
                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
3207
0
                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, TENSOR_NOT_REQUIRED);
3208
3209
0
                        layer.attn_q_norm   = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
3210
0
                        layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias",   i), {n_embd}, TENSOR_NOT_REQUIRED);
3211
3212
0
                        layer.attn_k_norm   = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
3213
0
                        layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias",   i), {n_embd}, TENSOR_NOT_REQUIRED);
3214
3215
                        // AWQ ScaleActivation layer
3216
0
                        layer.ffn_act = create_tensor(tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, TENSOR_NOT_REQUIRED);
3217
0
                    }
3218
0
                } break;
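The optional MPT ffn_act tensor above stores n_ff "scales" (the AWQ ScaleActivation layer), i.e. one value per FFN channel. The toy sketch below only shows a channel-wise rescale of the activation; treating it as a division by the stored scales is an assumption of this sketch, not something the listing itself shows.

#include <vector>
#include <cstdio>

int main() {
    // toy FFN activation of width n_ff = 3 and per-channel scales
    // (stand-in for layer.ffn_act); all values are made up
    std::vector<float> act    = {0.5f, -1.0f, 2.0f};
    std::vector<float> scales = {2.0f,  1.0f, 0.5f};

    for (size_t i = 0; i < act.size(); ++i) {
        act[i] /= scales[i]; // channel-wise rescale (direction assumed)
    }
    printf("%g %g %g\n", act[0], act[1], act[2]); // 0.25 -1 4
    return 0;
}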
3219
0
            case LLM_ARCH_STABLELM:
3220
0
                {
3221
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3222
3223
                    // output
3224
0
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
3225
0
                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3226
0
                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
3227
3228
0
                    for (int i = 0; i < n_layer; ++i) {
3229
0
                        auto & layer = layers[i];
3230
3231
0
                        layer.attn_norm =   create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3232
0
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
3233
3234
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
3235
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
3236
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
3237
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3238
3239
                        // optional bias tensors, present in Stable LM 2 1.6B
3240
0
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);
3241
0
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
3242
0
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
3243
3244
                        // optional q and k layernorms, present in Stable LM 2 12B
3245
0
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head},    TENSOR_NOT_REQUIRED);
3246
0
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, TENSOR_NOT_REQUIRED);
3247
3248
                        // optional FFN norm, not present in Stable LM 2 12B, which uses parallel residual
3249
0
                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
3250
0
                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
3251
3252
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
3253
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
3254
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
3255
0
                    }
3256
0
                } break;
3257
0
            case LLM_ARCH_QWEN:
3258
0
                {
3259
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3260
3261
                    // output
3262
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3263
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
3264
3265
0
                    for (int i = 0; i < n_layer; ++i) {
3266
0
                        auto & layer = layers[i];
3267
3268
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3269
3270
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3}, 0);
3271
0
                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd*3}, 0);
3272
0
                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3273
3274
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3275
3276
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff/2}, 0);
3277
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff/2, n_embd}, 0);
3278
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff/2}, 0);
3279
0
                    }
3280
0
                } break;
3281
0
            case LLM_ARCH_QWEN2:
3282
0
            case LLM_ARCH_QWEN2VL:
3283
0
            case LLM_ARCH_DREAM:
3284
0
                {
3285
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3286
3287
                    // output
3288
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3289
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3290
0
                    output_b    = create_tensor(tn(LLM_TENSOR_OUTPUT,      "bias"),   {n_vocab}, TENSOR_NOT_REQUIRED);
3291
                    // if output is NULL, init from the input tok embed
3292
0
                    if (output == NULL) {
3293
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3294
0
                    }
3295
3296
0
                    for (int i = 0; i < n_layer; ++i) {
3297
0
                        auto & layer = layers[i];
3298
3299
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3300
3301
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
3302
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
3303
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
3304
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3305
3306
                        // bias tensors
3307
0
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd}, 0);
3308
0
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, 0);
3309
0
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, 0);
3310
3311
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3312
3313
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
3314
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
3315
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
3316
0
                    }
3317
0
                } break;
3318
0
            case LLM_ARCH_QWEN2MOE:
3319
0
                {
3320
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3321
3322
                    // output
3323
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3324
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
3325
3326
0
                    for (int i = 0; i < n_layer; ++i) {
3327
0
                        auto & layer = layers[i];
3328
3329
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3330
3331
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
3332
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
3333
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
3334
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3335
3336
                        // optional bias tensors
3337
0
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
3338
0
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
3339
0
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
3340
3341
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3342
3343
0
                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
3344
3345
0
                        if (n_expert == 0) {
3346
0
                            throw std::runtime_error("n_expert must be > 0 for QWEN2MOE");
3347
0
                        }
3348
0
                        if (n_expert_used == 0) {
3349
0
                            throw std::runtime_error("n_expert_used must be > 0 for QWEN2MOE");
3350
0
                        }
3351
3352
                        // MoE branch
3353
0
                        const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
3354
3355
0
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
3356
0
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
3357
0
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
3358
3359
                        // Shared expert branch
3360
0
                        const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
3361
3362
0
                        layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd}, 0);
3363
0
                        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {    n_embd, n_ff_shexp}, 0);
3364
0
                        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp,     n_embd}, 0);
3365
0
                        layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {    n_embd, n_ff_shexp}, 0);
3366
0
                    }
3367
0
                } break;
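A worked example (numbers are illustrative, not taken from a real checkpoint) of the per-expert feed-forward width used in the MoE branch above: when the GGUF does not carry an explicit n_ff_exp, it defaults to n_ff / n_expert_used.

#include <cstdint>
#include <cstdio>

// mirrors: hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used
static int64_t moe_ff_exp(int64_t n_ff_exp_hparam, int64_t n_ff, int64_t n_expert_used) {
    return n_ff_exp_hparam ? n_ff_exp_hparam : n_ff / n_expert_used;
}

int main() {
    printf("explicit n_ff_exp: %lld\n", (long long) moe_ff_exp(1408, 5632, 4)); // 1408
    printf("fallback:          %lld\n", (long long) moe_ff_exp(0,    5632, 4)); // 5632/4 = 1408
    return 0;
}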
3368
0
            case LLM_ARCH_QWEN3:
3369
0
            case LLM_ARCH_QWEN3VL:
3370
0
                {
3371
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3372
3373
                    // output
3374
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3375
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3376
                    // if output is NULL, init from the input tok embed
3377
0
                    if (output == NULL) {
3378
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3379
0
                    }
3380
3381
                    // output rerank head
3382
0
                    cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
3383
3384
0
                    for (int i = 0; i < n_layer; ++i) {
3385
0
                        auto & layer = layers[i];
3386
3387
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3388
3389
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
3390
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
3391
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
3392
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
3393
3394
0
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
3395
0
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
3396
3397
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3398
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
3399
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
3400
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
3401
0
                    }
3402
0
                } break;
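In the Qwen3 branch above the attn_q_norm / attn_k_norm tensors hold n_embd_head_k values, i.e. a single head_dim-sized weight that is applied to every attention head (QK-norm). A toy per-head RMS norm is sketched below; it illustrates that reading and is not ggml graph code.

#include <cmath>
#include <cstdio>
#include <vector>

// RMS-normalize each head of q in place; gamma has head_dim entries and is
// shared across heads, matching the {n_embd_head_k} tensor shape above.
static void qk_rms_norm(std::vector<float> & q, const std::vector<float> & gamma,
                        int n_head, int head_dim, float eps = 1e-6f) {
    for (int h = 0; h < n_head; ++h) {
        float * x = q.data() + (size_t) h * head_dim;
        float ss = 0.0f;
        for (int d = 0; d < head_dim; ++d) ss += x[d]*x[d];
        const float inv_rms = 1.0f / std::sqrt(ss/head_dim + eps);
        for (int d = 0; d < head_dim; ++d) x[d] = x[d]*inv_rms*gamma[d];
    }
}

int main() {
    const int n_head = 2, head_dim = 4;           // toy sizes (assumed)
    std::vector<float> q(n_head*head_dim, 1.0f);
    std::vector<float> gamma(head_dim, 1.0f);     // stand-in for attn_q_norm
    qk_rms_norm(q, gamma, n_head, head_dim);
    printf("q[0] after per-head RMS norm: %f\n", q[0]); // ~1.0
    return 0;
}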
3403
0
            case LLM_ARCH_QWEN3MOE:
3404
0
            case LLM_ARCH_QWEN3VLMOE:
3405
0
                {
3406
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3407
3408
                    // output
3409
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3410
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3411
                    // if output is NULL, init from the input tok embed
3412
0
                    if (output == NULL) {
3413
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3414
0
                    }
3415
3416
0
                    for (int i = 0; i < n_layer; ++i) {
3417
0
                        auto & layer = layers[i];
3418
3419
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3420
3421
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
3422
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
3423
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
3424
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
3425
3426
0
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
3427
0
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
3428
3429
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3430
3431
0
                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
3432
3433
0
                        if (n_expert == 0) {
3434
0
                            throw std::runtime_error("n_expert must be > 0 for QWEN3MOE");
3435
0
                        }
3436
0
                        if (n_expert_used == 0) {
3437
0
                            throw std::runtime_error("n_expert_used must be > 0 for QWEN3MOE");
3438
0
                        }
3439
3440
                        // MoE branch
3441
0
                        const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
3442
3443
0
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
3444
0
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
3445
0
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
3446
0
                    }
3447
0
                } break;
3448
0
            case LLM_ARCH_PHI2:
3449
0
                {
3450
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3451
3452
                    // output
3453
0
                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3454
0
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
3455
0
                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
3456
0
                    output_b      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "bias"),   {n_vocab}, 0);
3457
3458
0
                    for (int i = 0; i < n_layer; ++i) {
3459
0
                        auto & layer = layers[i];
3460
3461
0
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3462
0
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
3463
3464
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
3465
0
                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
3466
3467
0
                        if (layer.wqkv == nullptr) {
3468
0
                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
3469
0
                            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i),   {n_embd}, 0);
3470
3471
0
                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
3472
0
                            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i),   {n_embd_gqa}, 0);
3473
3474
0
                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
3475
0
                            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i),   {n_embd_gqa}, 0);
3476
0
                        }
3477
3478
0
                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3479
0
                        layer.bo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, 0);
3480
3481
0
                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3482
0
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
3483
3484
0
                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
3485
0
                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, 0);
3486
0
                    }
3487
0
                } break;
3488
0
            case LLM_ARCH_PHI3:
3489
0
                {
3490
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
3491
3492
                    // output
3493
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
3494
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3495
3496
                    // if output is NULL, init from the input tok embed
3497
0
                    if (output == NULL) {
3498
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3499
0
                    }
3500
3501
0
                    for (int i = 0; i < n_layer; ++i) {
3502
0
                        auto & layer = layers[i];
3503
3504
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
3505
3506
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, TENSOR_NOT_REQUIRED);
3507
0
                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
3508
3509
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
3510
3511
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
3512
0
                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff }, 0);
3513
3514
0
                        layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
3515
0
                        layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
3516
0
                    }
3517
0
                } break;
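The Phi-3 rope_long / rope_short factor tensors above have n_rot/2 entries because RoPE rotates dimensions in pairs, so there is one base frequency, and hence one scaling factor, per pair; the TENSOR_DUPLICATED flag for i != 0 suggests the same factor tensors are reused by every layer. The sketch below assumes the common convention of dividing the base angle by the stored factor; it is an illustration, not the ggml RoPE kernel.

#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    const int   n_rot     = 8;        // toy rotary dimension (assumed)
    const float freq_base = 10000.0f;

    std::vector<float> factors(n_rot/2, 1.0f); // stand-in for rope_long / rope_short

    for (int i = 0; i < n_rot/2; ++i) {
        const float theta  = std::pow(freq_base, -2.0f*i/n_rot); // base angle of pair i
        const float scaled = theta / factors[i];                  // per-pair factor applied (assumed convention)
        printf("pair %d: theta = %g, scaled = %g\n", i, theta, scaled);
    }
    return 0;
}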
3518
0
            case LLM_ARCH_PHIMOE:
3519
0
                {
3520
0
                    const int64_t n_embd_head = n_embd / n_head;
3521
3522
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
3523
3524
                    // output
3525
0
                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
3526
0
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
3527
0
                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), { n_embd, n_vocab }, 0);
3528
0
                    output_b      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "bias"),   { n_vocab }, 0);
3529
3530
0
                    for (int i = 0; i < n_layer; ++i) {
3531
0
                        auto & layer = layers[i];
3532
3533
0
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
3534
0
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias",   i), { n_embd }, 0);
3535
3536
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, TENSOR_NOT_REQUIRED);
3537
0
                        if (layer.wqkv == nullptr) {
3538
0
                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
3539
0
                            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias",   i), {n_embd}, 0);
3540
3541
0
                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
3542
0
                            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias",   i), {n_embd_gqa}, 0);
3543
3544
0
                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
3545
0
                            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias",   i), {n_embd_gqa}, 0);
3546
0
                        }
3547
0
                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
3548
0
                        layer.bo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias",   i), { n_embd }, 0);
3549
3550
0
                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
3551
0
                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias",   i), { n_embd }, 0);
3552
3553
0
                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert},         0);
3554
0
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff,   n_expert}, 0);
3555
0
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff,   n_embd, n_expert}, 0);
3556
0
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff,   n_expert}, 0);
3557
3558
0
                        layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
3559
0
                        layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
3560
0
                    }
3561
0
                } break;
3562
0
            case LLM_ARCH_PLAMO:
3563
0
                {
3564
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3565
3566
                    // output
3567
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3568
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
3569
3570
0
                    for (int i = 0; i < n_layer; ++i) {
3571
0
                        auto & layer = layers[i];
3572
3573
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3574
3575
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
3576
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
3577
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
3578
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3579
3580
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
3581
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
3582
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
3583
0
                    }
3584
0
                } break;
3585
0
            case LLM_ARCH_PLAMO2:
3586
0
                {
3587
                    // mamba parameters
3588
0
                    const uint32_t d_conv             = hparams.ssm_d_conv;
3589
0
                    const uint32_t d_state            = hparams.ssm_d_state;
3590
0
                    const uint32_t num_heads          = hparams.ssm_dt_rank;
3591
0
                    const uint32_t intermediate_size  = hparams.ssm_d_inner;
3592
0
                    const int64_t dt_dim              = std::max(64, int(hparams.n_embd / 16));
3593
3594
                    // attention parameters
3595
0
                    const uint32_t qk_dim = hparams.n_embd_head_k;
3596
0
                    const uint32_t v_dim  = hparams.n_embd_head_v;
3597
3598
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3599
3600
                    // output
3601
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3602
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3603
                    // if output is NULL, init from the input tok embed
3604
0
                    if (output == NULL) {
3605
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3606
0
                    }
3607
3608
0
                    for (int i = 0; i < n_layer; ++i) {
3609
0
                        auto & layer = layers[i];
3610
0
                        bool is_mamba_layer = hparams.is_recurrent(i);
3611
3612
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3613
3614
0
                        if (is_mamba_layer) {
3615
0
                            layer.ssm_in       = create_tensor(tn(LLM_TENSOR_SSM_IN,     "weight", i), {n_embd, 2 * intermediate_size}, 0);
3616
0
                            layer.ssm_conv1d   = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, intermediate_size}, 0);
3617
3618
0
                            layer.ssm_x    = create_tensor(tn(LLM_TENSOR_SSM_X,  "weight", i), {intermediate_size, dt_dim + 2*d_state}, 0);
3619
0
                            layer.ssm_dt   = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_dim, num_heads}, 0);
3620
0
                            layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {num_heads}, 0);
3621
3622
0
                            layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {num_heads}, 0);
3623
0
                            layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {num_heads}, 0);
3624
3625
0
                            layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {intermediate_size, n_embd}, 0);
3626
3627
0
                            layer.ssm_dt_norm = create_tensor(tn(LLM_TENSOR_SSM_DT_NORM, i), {dt_dim}, 0);
3628
0
                            layer.ssm_b_norm = create_tensor(tn(LLM_TENSOR_SSM_B_NORM, i), {d_state}, 0);
3629
0
                            layer.ssm_c_norm = create_tensor(tn(LLM_TENSOR_SSM_C_NORM, i), {d_state}, 0);
3630
0
                        } else {
3631
0
                            const int64_t num_attention_heads = hparams.n_head(i);
3632
0
                            const int64_t q_num_heads         = num_attention_heads;
3633
0
                            const int64_t num_key_value_heads = hparams.n_head_kv(i);
3634
0
                            const int64_t k_num_heads         = num_key_value_heads;
3635
0
                            const int64_t v_num_heads         = num_key_value_heads;
3636
0
                            const int64_t q_proj_dim          = q_num_heads * qk_dim;
3637
0
                            const int64_t k_proj_dim          = k_num_heads * qk_dim;
3638
0
                            const int64_t v_proj_dim          = v_num_heads * v_dim;
3639
3640
0
                            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, q_proj_dim + k_proj_dim + v_proj_dim}, 0);
3641
0
                            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {qk_dim, num_attention_heads}, 0);
3642
0
                            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {qk_dim, k_num_heads}, 0);
3643
0
                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {q_num_heads * v_dim, n_embd}, 0);
3644
0
                        }
3645
3646
                        // All layers have post-attention norm, FFN norm, and FFN tensors
3647
0
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, i), {n_embd}, 0);
3648
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3649
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3650
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff * 2}, 0);
3651
0
                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, i), {n_embd}, 0);
3652
0
                    }
3653
0
                } break;
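A worked example of two small derivations in the PLaMo-2 branch above: the dt projection width is max(64, n_embd/16), and each layer loads either SSM (Mamba) or attention tensors depending on hparams.is_recurrent(i). Sizes below are illustrative.

#include <algorithm>
#include <cstdint>
#include <cstdio>

// mirrors: std::max(64, int(hparams.n_embd / 16))
static int64_t plamo2_dt_dim(int64_t n_embd) {
    return std::max<int64_t>(64, n_embd / 16);
}

int main() {
    printf("n_embd = 2048 -> dt_dim = %lld\n", (long long) plamo2_dt_dim(2048)); // 128
    printf("n_embd =  512 -> dt_dim = %lld\n", (long long) plamo2_dt_dim(512));  //  64 (lower bound)
    return 0;
}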
3654
0
            case LLM_ARCH_GPT2:
3655
0
                {
3656
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3657
0
                    pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD,   "weight"), {n_embd, n_ctx_train}, 0);
3658
3659
                    // output
3660
0
                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3661
0
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
3662
0
                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3663
3664
                    // if output is NULL, init from the input tok embed
3665
0
                    if (output == NULL) {
3666
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3667
0
                    }
3668
3669
0
                    for (int i = 0; i < n_layer; ++i) {
3670
0
                        auto & layer = layers[i];
3671
3672
0
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM,   "weight", i), {n_embd}, 0);
3673
0
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM,   "bias", i),   {n_embd}, 0);
3674
3675
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
3676
0
                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, 0);
3677
3678
0
                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3679
0
                        layer.bo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, 0);
3680
3681
0
                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3682
0
                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
3683
3684
0
                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3685
0
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
3686
3687
0
                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
3688
0
                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, 0);
3689
0
                    }
3690
0
                } break;
3691
0
            case LLM_ARCH_CODESHELL:
3692
0
                {
3693
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3694
3695
                    // if tok embd is NULL, init from output
3696
0
                    if (tok_embd == NULL) {
3697
0
                        tok_embd = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3698
0
                    }
3699
3700
                    // output
3701
0
                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3702
0
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
3703
0
                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
3704
3705
0
                    for (int i = 0; i < n_layer; ++i) {
3706
0
                        auto & layer = layers[i];
3707
3708
0
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3709
0
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
3710
3711
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
3712
0
                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, 0);
3713
3714
0
                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3715
0
                        layer.bo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, 0);
3716
3717
0
                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3718
0
                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
3719
3720
0
                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3721
0
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
3722
3723
0
                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i),   {n_embd, n_ff}, 0);
3724
0
                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i),     {n_ff}, 0);
3725
0
                    }
3726
0
                } break;
3727
0
            case LLM_ARCH_ORION:
3728
0
                {
3729
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3730
3731
0
                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3732
0
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
3733
0
                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
3734
3735
0
                    for (int i = 0; i < n_layer; ++i) {
3736
0
                        auto & layer = layers[i];
3737
3738
0
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3739
0
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
3740
3741
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
3742
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
3743
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
3744
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3745
3746
0
                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3747
0
                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
3748
3749
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
3750
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
3751
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
3752
0
                    }
3753
0
                } break;
3754
0
            case LLM_ARCH_INTERNLM2:
3755
0
                {
3756
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3757
3758
                    // output
3759
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3760
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
3761
3762
0
                    for (int i = 0; i < n_layer; ++i) {
3763
0
                        auto & layer = layers[i];
3764
3765
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3766
                        // layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
3767
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
3768
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
3769
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
3770
3771
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3772
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3773
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
3774
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
3775
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
3776
0
                    }
3777
0
                } break;
3778
0
            case LLM_ARCH_GEMMA:
3779
0
                {
3780
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3781
3782
                    // output
3783
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3784
0
                    output      = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
3785
3786
0
                    for (int i = 0; i < n_layer; ++i) {
3787
0
                        auto & layer = layers[i];
3788
3789
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3790
3791
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
3792
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
3793
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
3794
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
3795
3796
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3797
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
3798
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
3799
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
3800
0
                    }
3801
0
                } break;
3802
0
            case LLM_ARCH_GEMMA2:
3803
0
                {
3804
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3805
3806
                    // output
3807
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3808
0
                    output      = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
3809
3810
0
                    for (int i = 0; i < n_layer; ++i) {
3811
0
                        auto & layer = layers[i];
3812
3813
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3814
3815
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
3816
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
3817
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
3818
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
3819
0
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
3820
3821
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3822
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
3823
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
3824
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
3825
0
                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
3826
0
                    }
3827
0
                } break;
3828
0
            case LLM_ARCH_GEMMA3:
3829
0
            case LLM_ARCH_GEMMA_EMBEDDING:
3830
0
                {
3831
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3832
3833
                    // output
3834
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3835
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3836
3837
                    // if output is NULL, init from the input tok embed
3838
0
                    if (output == NULL) {
3839
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,   "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3840
0
                    }
3841
3842
                    // Dense linear weights
3843
0
                    dense_2_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.dense_2_feat_out}, TENSOR_NOT_REQUIRED);
3844
0
                    dense_3_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_3_OUT, "weight"), {hparams.dense_3_feat_in, n_embd}, TENSOR_NOT_REQUIRED);
3845
3846
3847
0
                    for (int i = 0; i < n_layer; ++i) {
3848
0
                        auto & layer = layers[i];
3849
3850
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3851
3852
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
3853
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
3854
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
3855
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
3856
3857
0
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
3858
0
                        layer.attn_k_norm    = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM,    "weight", i), {n_embd_head_k}, 0);
3859
0
                        layer.attn_q_norm    = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM,    "weight", i), {n_embd_head_k}, 0);
3860
3861
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3862
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
3863
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
3864
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
3865
0
                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
3866
0
                    }
3867
0
                } break;
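The Gemma-3 branch above sizes the Q projection by n_embd_head_k * n_head while K and V use the grouped-query widths n_embd_k_gqa and n_embd_v_gqa. A minimal, runnable sketch of that relationship follows; all sizes are hypothetical and not taken from any particular GGUF.

    // illustrative sketch only -- hypothetical sizes, not llama.cpp code
    #include <cstdint>
    #include <cstdio>

    constexpr int64_t n_embd        = 2560; // assumed model width
    constexpr int64_t n_head        = 8;    // assumed query heads
    constexpr int64_t n_head_kv     = 4;    // assumed key/value heads (GQA)
    constexpr int64_t n_embd_head_k = 256;  // assumed per-head width

    constexpr int64_t n_embd_k_gqa = n_embd_head_k * n_head_kv; // width of the wk output
    static_assert(n_embd_head_k * n_head == 2048, "wq maps n_embd -> n_embd_head_k * n_head");
    static_assert(n_embd_k_gqa == 1024,           "wk maps n_embd -> the smaller GQA width");

    int main() {
        std::printf("wq: {%lld, %lld}, wk: {%lld, %lld}\n",
            (long long) n_embd, (long long) (n_embd_head_k * n_head),
            (long long) n_embd, (long long) n_embd_k_gqa);
        return 0;
    }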
3868
0
            case LLM_ARCH_GEMMA3N:
3869
0
                {
3870
0
                    const int64_t n_altup      = hparams.n_altup;
3871
0
                    const int64_t laurel_rank  = hparams.laurel_rank;
3872
0
                    const int64_t n_embd_altup = hparams.n_embd_altup;
3873
3874
0
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3875
                    // if output is NULL, init from the input tok embed
3876
0
                    if (output == NULL) {
3877
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3878
0
                    }
3879
3880
0
                    tok_embd           = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,           "weight"), {n_embd, n_vocab}, 0);
3881
0
                    tok_embd_per_layer = create_tensor(tn(LLM_TENSOR_PER_LAYER_TOKEN_EMBD, "weight"), {n_embd_altup * n_layer, n_vocab}, 0);
3882
3883
0
                    altup_proj           = create_tensor(tn(LLM_TENSOR_ALTUP_PROJ,           "weight"), {n_embd, n_embd, n_altup - 1}, 0);
3884
0
                    altup_unembd_proj    = create_tensor(tn(LLM_TENSOR_ALTUP_UNEMBD_PROJ,    "weight"), {n_embd, n_embd, n_altup - 1}, 0);
3885
0
                    per_layer_model_proj = create_tensor(tn(LLM_TENSOR_PER_LAYER_MODEL_PROJ, "weight"), {n_embd, n_embd_altup * n_layer}, 0);
3886
0
                    per_layer_proj_norm  = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ_NORM,  "weight"), {n_embd_altup}, 0);
3887
3888
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3889
3890
0
                    for (int i = 0; i < n_layer; ++i) {
3891
0
                        auto & layer = layers[i];
3892
3893
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3894
3895
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
3896
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
3897
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
3898
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
3899
3900
0
                        layer.attn_q_norm    = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM,    "weight", i), {n_embd_head_k}, 0);
3901
0
                        layer.attn_k_norm    = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM,    "weight", i), {n_embd_head_k}, 0);
3902
0
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
3903
3904
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3905
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
3906
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
3907
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
3908
0
                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
3909
3910
                        // altup & laurel
3911
0
                        layer.per_layer_inp_gate   = create_tensor(tn(LLM_TENSOR_PER_LAYER_INP_GATE,  "weight", i), {n_embd, n_embd_altup}, 0);
3912
0
                        layer.per_layer_proj       = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ,      "weight", i), {n_embd_altup, n_embd}, 0);
3913
0
                        layer.per_layer_post_norm  = create_tensor(tn(LLM_TENSOR_PER_LAYER_POST_NORM, "weight", i), {n_embd}, 0);
3914
0
                        layer.altup_correct_coef   = create_tensor(tn(LLM_TENSOR_ALTUP_CORRECT_COEF,  "weight", i), {n_altup, n_altup}, 0);
3915
0
                        layer.altup_correct_scale  = create_tensor(tn(LLM_TENSOR_ALTUP_CORRECT_SCALE, "weight", i), {n_embd}, 0);
3916
0
                        layer.altup_predict_coef   = create_tensor(tn(LLM_TENSOR_ALTUP_PREDICT_COEF,  "weight", i), {n_altup, n_altup * n_altup}, 0);
3917
0
                        layer.altup_router         = create_tensor(tn(LLM_TENSOR_ALTUP_ROUTER,        "weight", i), {n_embd, n_altup}, 0);
3918
0
                        layer.altup_router_norm    = create_tensor(tn(LLM_TENSOR_ALTUP_ROUTER_NORM,   "weight", i), {n_embd}, 0);
3919
0
                        layer.laurel_l             = create_tensor(tn(LLM_TENSOR_LAUREL_L,            "weight", i), {n_embd, laurel_rank}, 0);
3920
0
                        layer.laurel_r             = create_tensor(tn(LLM_TENSOR_LAUREL_R,            "weight", i), {laurel_rank, n_embd}, 0);
3921
0
                        layer.laurel_post_norm     = create_tensor(tn(LLM_TENSOR_LAUREL_POST_NORM,    "weight", i), {n_embd}, 0);
3922
0
                    }
3923
0
                } break;
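Gemma-3n's per-layer token embedding above is a single tensor of shape {n_embd_altup * n_layer, n_vocab}, i.e. one n_embd_altup-wide slice per layer, and altup_predict_coef packs what can be read as an n_altup x n_altup mixing matrix per altup stream. A rough sketch with assumed hyper-parameters:

    // illustrative sketch only -- hypothetical sizes, not llama.cpp code
    #include <cstdint>
    #include <cstdio>

    constexpr int64_t n_layer      = 30;  // assumed
    constexpr int64_t n_embd_altup = 256; // assumed
    constexpr int64_t n_altup      = 4;   // assumed

    // tok_embd_per_layer: {n_embd_altup * n_layer, n_vocab}
    constexpr int64_t per_layer_rows = n_embd_altup * n_layer;
    // altup_predict_coef: {n_altup, n_altup * n_altup}
    constexpr int64_t predict_cols   = n_altup * n_altup;

    int main() {
        std::printf("per-layer embedding rows: %lld (= %lld per layer x %lld layers)\n",
            (long long) per_layer_rows, (long long) n_embd_altup, (long long) n_layer);
        std::printf("predict coefficients per stream: %lld (one n_altup x n_altup matrix)\n",
            (long long) predict_cols);
        return 0;
    }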
3924
0
            case LLM_ARCH_STARCODER2:
3925
0
                {
3926
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3927
3928
                    // output
3929
0
                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3930
0
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
3931
3932
0
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3933
                    // if output is NULL, init from the input tok embed
3934
0
                    if (output == NULL) {
3935
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3936
0
                    }
3937
3938
0
                    for (int i = 0; i < n_layer; ++i) {
3939
0
                        auto & layer = layers[i];
3940
3941
0
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3942
0
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
3943
3944
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
3945
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
3946
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
3947
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3948
3949
                        // optional bias tensors
3950
0
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd}, 0);
3951
0
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, 0);
3952
0
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, 0);
3953
0
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
3954
3955
0
                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3956
0
                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
3957
3958
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
3959
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
3960
3961
                        // optional bias tensors
3962
0
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
3963
0
                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP ,  "bias", i), {  n_ff}, 0);
3964
0
                    }
3965
0
                } break;
3966
0
            case LLM_ARCH_MAMBA:
3967
0
                {
3968
0
                    const int64_t d_conv  = hparams.ssm_d_conv;
3969
0
                    const int64_t d_inner = hparams.ssm_d_inner;
3970
0
                    const int64_t d_state = hparams.ssm_d_state;
3971
0
                    const int64_t dt_rank = hparams.ssm_dt_rank;
3972
3973
                    // only an expansion factor of 2 is supported for now
3974
0
                    if (2 * n_embd != d_inner) {
3975
0
                        throw std::runtime_error("only an expansion factor of 2 is supported for now");
3976
0
                    }
3977
3978
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3979
3980
                    // output
3981
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3982
3983
0
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3984
                    // if output is NULL, init from the input tok embed, duplicated to allow offloading
3985
0
                    if (output == NULL) {
3986
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3987
0
                    }
3988
3989
0
                    for (int i = 0; i < n_layer; ++i) {
3990
0
                        auto & layer = layers[i];
3991
3992
                        // norm
3993
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3994
3995
0
                        layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}, 0);
3996
3997
0
                        layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}, 0);
3998
0
                        layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}, 0);
3999
4000
0
                        layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}, 0);
4001
4002
0
                        layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}, 0);
4003
0
                        layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}, 0);
4004
4005
                        // no "weight" suffix for these
4006
0
                        layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
4007
0
                        layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);
4008
4009
                        // out_proj
4010
0
                        layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
4011
0
                    }
4012
0
                } break;
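The Mamba branch above requires d_inner == 2*n_embd and splits the ssm_x projection into dt_rank + 2*d_state columns (the per-token dt, B and C inputs). A small sketch with assumed sizes roughly in the range of the published Mamba checkpoints:

    // illustrative sketch only -- hypothetical sizes, not llama.cpp code
    #include <cstdint>
    #include <cstdio>

    constexpr int64_t n_embd  = 2048;        // assumed
    constexpr int64_t d_inner = 2 * n_embd;  // the only supported expansion factor
    constexpr int64_t d_state = 16;          // assumed
    constexpr int64_t dt_rank = n_embd / 16; // common Mamba default, assumed here

    // ssm_in: {n_embd, 2*d_inner}            -> x and z branches
    // ssm_x : {d_inner, dt_rank + 2*d_state} -> dt, B and C
    int main() {
        std::printf("ssm_in : {%lld, %lld}\n", (long long) n_embd,  (long long) (2*d_inner));
        std::printf("ssm_x  : {%lld, %lld}\n", (long long) d_inner, (long long) (dt_rank + 2*d_state));
        return 0;
    }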
4013
0
            case LLM_ARCH_MAMBA2:
4014
0
                {
4015
0
                    const int64_t d_conv  = hparams.ssm_d_conv;
4016
0
                    const int64_t d_inner = hparams.ssm_d_inner;
4017
0
                    const int64_t d_state = hparams.ssm_d_state;
4018
0
                    const int64_t n_head  = hparams.ssm_dt_rank;
4019
0
                    const int64_t n_group = hparams.ssm_n_group;
4020
0
                    const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_head;
4021
4022
                    // only an expansion factor of 2 is supported for now
4023
0
                    GGML_ASSERT(2 * n_embd == d_inner);
4024
4025
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4026
4027
                    // output
4028
0
                    {
4029
0
                        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4030
4031
0
                        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4032
                        // if output is NULL, init from the input tok embed, duplicated to allow offloading
4033
0
                        if (output == NULL) {
4034
0
                            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4035
0
                        }
4036
0
                    }
4037
4038
0
                    for (int i = 0; i < n_layer; ++i) {
4039
0
                        auto & layer = layers[i];
4040
4041
                        // norm
4042
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4043
4044
0
                        layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
4045
4046
0
                        layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
4047
0
                        layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, 0);
4048
4049
0
                        layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_head}, 0);
4050
4051
                        // no "weight" suffix for these
4052
0
                        layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_head}, 0);
4053
0
                        layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_head}, 0);
4054
4055
0
                        layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
4056
4057
                        // out_proj
4058
0
                        layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
4059
0
                    }
4060
0
                } break;
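The Mamba-2 in-projection width above is d_in_proj = 2*d_inner + 2*n_group*d_state + n_head: the z and x branches, the grouped B and C inputs, and one dt per SSM head, packed into a single matmul. A compile-time sketch with assumed sizes:

    // illustrative sketch only -- hypothetical sizes, not llama.cpp code
    #include <cstdint>
    #include <cstdio>

    constexpr int64_t n_embd  = 2048;        // assumed
    constexpr int64_t d_inner = 2 * n_embd;  // asserted above
    constexpr int64_t d_state = 128;         // assumed
    constexpr int64_t n_group = 1;           // assumed
    constexpr int64_t n_head  = 64;          // ssm_dt_rank reused as the SSM head count

    constexpr int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_head;
    static_assert(d_in_proj == 8192 + 256 + 64, "z + x, B + C, and dt packed together");

    int main() {
        std::printf("ssm_in: {%lld, %lld}\n", (long long) n_embd, (long long) d_in_proj);
        return 0;
    }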
4061
0
            case LLM_ARCH_JAMBA:
4062
0
                {
4063
0
                    const int64_t d_conv  = hparams.ssm_d_conv;
4064
0
                    const int64_t d_inner = hparams.ssm_d_inner;
4065
0
                    const int64_t d_state = hparams.ssm_d_state;
4066
0
                    const int64_t dt_rank = hparams.ssm_dt_rank;
4067
4068
                    // only an expansion factor of 2 is supported for now
4069
0
                    GGML_ASSERT(2 * n_embd == d_inner);
4070
4071
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4072
4073
                    // output
4074
0
                    {
4075
0
                        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4076
4077
0
                        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4078
                        // if output is NULL, init from the input tok embed, duplicated to allow offloading
4079
0
                        if (output == NULL) {
4080
0
                            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4081
0
                        }
4082
0
                    }
4083
4084
0
                    for (int i = 0; i < n_layer; ++i) {
4085
0
                        const int64_t n_head_kv = hparams.n_head_kv(i);
4086
0
                        const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i);
4087
4088
0
                        auto & layer = layers[i];
4089
4090
                        // norm
4091
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4092
4093
0
                        if (n_head_kv == 0) {
4094
                            // Mamba layer
4095
0
                            layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}, 0);
4096
4097
0
                            layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}, 0);
4098
0
                            layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}, 0);
4099
4100
0
                            layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}, 0);
4101
4102
0
                            layer.ssm_dt_norm = create_tensor(tn(LLM_TENSOR_SSM_DT_NORM, "weight", i), {dt_rank}, 0);
4103
4104
0
                            layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}, 0);
4105
0
                            layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}, 0);
4106
4107
0
                            layer.ssm_b_norm = create_tensor(tn(LLM_TENSOR_SSM_B_NORM, "weight", i), {d_state}, 0);
4108
0
                            layer.ssm_c_norm = create_tensor(tn(LLM_TENSOR_SSM_C_NORM, "weight", i), {d_state}, 0);
4109
4110
                            // no "weight" suffix for these
4111
0
                            layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
4112
0
                            layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);
4113
4114
                            // out_proj
4115
0
                            layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
4116
0
                        } else {
4117
                            // Attention layers
4118
4119
0
                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
4120
0
                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
4121
0
                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
4122
0
                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4123
0
                        }
4124
4125
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4126
4127
0
                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);
4128
4129
0
                        if (layer.ffn_gate_inp) {
4130
                            // MoE
4131
0
                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
4132
0
                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
4133
0
                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff, n_expert}, 0);
4134
0
                        } else {
4135
                            // FFN (no MoE)
4136
0
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
4137
0
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
4138
0
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
4139
0
                        }
4140
0
                    }
4141
0
                } break;
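Jamba's layer loop above switches on n_head_kv(i): layers with zero KV heads get the Mamba tensor set, the rest get attention, and independently a layer is MoE only when ffn_gate_inp exists. A condensed, runnable sketch of that routing; the schedule arrays below are made up, the real values come from hparams.

    // illustrative sketch only -- hypothetical schedules, not llama.cpp code
    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t n_head_kv[8] = {0, 0, 0, 8, 0, 0, 0, 8};    // assumed
        const bool    has_moe  [8] = {false, true, false, true,
                                      false, true, false, true};  // assumed

        for (int i = 0; i < 8; ++i) {
            std::printf("layer %d: %s block, %s FFN\n", i,
                n_head_kv[i] == 0 ? "mamba" : "attention",
                has_moe[i]        ? "MoE"   : "dense");
        }
        return 0;
    }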
4142
0
            case LLM_ARCH_GRANITE_HYBRID:
4143
0
                {
4144
                    // mamba2 Mixer SSM params
4145
                    // NOTE: int64_t for tensor dimensions
4146
0
                    const int64_t d_conv     = hparams.ssm_d_conv;
4147
0
                    const int64_t d_inner    = hparams.ssm_d_inner;
4148
0
                    const int64_t d_state    = hparams.ssm_d_state;
4149
0
                    const int64_t n_ssm_head = hparams.ssm_dt_rank;
4150
0
                    const int64_t n_group    = hparams.ssm_n_group;
4151
0
                    const int64_t d_in_proj  = 2*d_inner + 2*n_group*d_state + n_ssm_head;
4152
4153
                    // only an expansion factor of 2 is supported for now
4154
0
                    GGML_ASSERT(2 * n_embd == d_inner);
4155
4156
                    // embeddings
4157
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4158
4159
                    // output
4160
0
                    {
4161
0
                        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4162
0
                        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4163
                        // if output is NULL, init from the input tok embed, duplicated to allow offloading
4164
0
                        if (output == NULL) {
4165
0
                            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4166
0
                        }
4167
0
                    }
4168
4169
0
                    for (int i = 0; i < n_layer; ++i) {
4170
0
                        auto & layer = layers[i];
4171
4172
                        // norm
4173
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4174
4175
0
                        if (hparams.is_recurrent(i)) {
4176
                            // ssm layers
4177
0
                            layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
4178
4179
0
                            layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
4180
0
                            layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, TENSOR_NOT_REQUIRED);
4181
4182
0
                            layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_ssm_head}, 0);
4183
4184
                            // no "weight" suffix for these
4185
0
                            layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_ssm_head}, 0);
4186
0
                            layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_ssm_head}, 0);
4187
4188
0
                            layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
4189
4190
                            // out_proj
4191
0
                            layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
4192
0
                        } else {
4193
                            // attention layers (with optional bias)
4194
0
                            const int64_t n_head_i = hparams.n_head(i);
4195
0
                            const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(i);
4196
0
                            const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
4197
0
                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head_i}, 0);
4198
0
                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa_i}, 0);
4199
0
                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa_i}, 0);
4200
0
                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
4201
0
                            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},         TENSOR_NOT_REQUIRED);
4202
0
                            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
4203
0
                            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
4204
0
                            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},         TENSOR_NOT_REQUIRED);
4205
0
                        }
4206
4207
                        // feed forward (w/ optional biases)
4208
0
                        if (n_expert > 0) {
4209
                            // MoE FFN
4210
0
                            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4211
0
                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
4212
0
                            layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
4213
0
                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff, n_expert}, TENSOR_NOT_REQUIRED);
4214
0
                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert}, 0);
4215
0
                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert}, 0);
4216
4217
                            // For Granite MoE Shared
4218
0
                            if (hparams.n_ff_shexp > 0) {
4219
0
                                layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
4220
0
                                layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
4221
0
                                layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
4222
0
                            }
4223
0
                        } else {
4224
0
                            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4225
0
                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
4226
0
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
4227
0
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
4228
0
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
4229
0
                            layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
4230
0
                            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
4231
0
                            layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
4232
0
                        }
4233
0
                    }
4234
0
                } break;
4235
0
            case LLM_ARCH_XVERSE:
4236
0
                {
4237
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4238
4239
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4240
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
4241
4242
0
                    for (int i = 0; i < n_layer; ++i) {
4243
0
                        auto & layer = layers[i];
4244
4245
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4246
4247
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
4248
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
4249
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
4250
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4251
4252
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4253
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
4254
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
4255
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
4256
0
                    }
4257
0
                } break;
4258
0
            case LLM_ARCH_COMMAND_R:
4259
0
                {
4260
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4261
4262
                    // output
4263
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4264
                    // init output from the input tok embed
4265
0
                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4266
4267
0
                    for (int i = 0; i < n_layer; ++i) {
4268
0
                        auto & layer = layers[i];
4269
4270
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4271
4272
0
                        if (n_layer >= 64){
4273
0
                            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
4274
0
                            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
4275
0
                        }
4276
4277
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
4278
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
4279
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
4280
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4281
4282
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
4283
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
4284
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
4285
0
                    }
4286
0
                } break;
4287
0
            case LLM_ARCH_COHERE2:
4288
0
                {
4289
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
4290
4291
                    // output
4292
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
4293
                    // init output from the input tok embed
4294
0
                    output      = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab },
4295
0
                                                      TENSOR_DUPLICATED);
4296
4297
0
                    for (int i = 0; i < n_layer; ++i) {
4298
0
                        auto & layer = layers[i];
4299
4300
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
4301
4302
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd }, 0);
4303
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
4304
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
4305
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
4306
4307
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
4308
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
4309
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
4310
0
                    }
4311
0
                }
4312
0
                break;
4313
0
            case LLM_ARCH_OLMO:  // adapted from LLM_ARCH_LLAMA with norm params removed
4314
0
                {
4315
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4316
4317
                    // output
4318
0
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4319
                    // if output is NULL, init from the input tok embed
4320
0
                    if (output == NULL) {
4321
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4322
0
                    }
4323
4324
0
                    for (int i = 0; i < n_layer; ++i) {
4325
0
                        auto & layer = layers[i];
4326
4327
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
4328
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
4329
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
4330
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4331
4332
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
4333
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
4334
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
4335
0
                    }
4336
0
                } break;
4337
0
            case LLM_ARCH_OLMO2:
4338
0
                {
4339
0
                    const int64_t n_embd_head = n_embd / n_head;
4340
4341
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4342
4343
                    // output
4344
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4345
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
4346
4347
0
                    for (int i = 0; i < n_layer; ++i) {
4348
0
                        auto & layer = layers[i];
4349
4350
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
4351
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
4352
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
4353
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4354
0
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
4355
0
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_head_kv * n_embd_head}, 0);
4356
0
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
4357
4358
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
4359
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
4360
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
4361
0
                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
4362
0
                    }
4363
0
                } break;
4364
0
            case LLM_ARCH_SEED_OSS:
4365
0
                {
4366
0
                    const uint32_t head_dim             = hparams.n_embd_head_k;
4367
0
                    const int64_t n_qo_dim              = n_head * head_dim;
4368
0
                    const int64_t n_kv_dim              = n_head_kv * head_dim;
4369
4370
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4371
4372
                    // output
4373
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4374
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4375
                    // if output is NULL, init from the input tok embed
4376
0
                    if (output == NULL) {
4377
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4378
0
                    }
4379
4380
0
                    for (int i = 0; i < n_layer; ++i) {
4381
0
                        auto & layer = layers[i];
4382
4383
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_qo_dim}, 0);
4384
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_kv_dim}, 0);
4385
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_kv_dim}, 0);
4386
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_qo_dim, n_embd}, 0);
4387
4388
0
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_qo_dim},   TENSOR_NOT_REQUIRED);
4389
0
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_kv_dim},   TENSOR_NOT_REQUIRED);
4390
0
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_kv_dim},   TENSOR_NOT_REQUIRED);
4391
4392
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4393
0
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
4394
4395
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
4396
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
4397
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
4398
0
                    }
4399
0
                } break;
4400
4401
0
            case LLM_ARCH_OLMOE:
4402
0
                {
4403
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4404
4405
                    // output
4406
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4407
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
4408
4409
0
                    for (int i = 0; i < n_layer; ++i) {
4410
0
                        auto & layer = layers[i];
4411
4412
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4413
4414
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
4415
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
4416
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
4417
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4418
0
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
4419
0
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0);
4420
4421
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4422
4423
0
                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
4424
4425
0
                        if (n_expert == 0) {
4426
0
                            throw std::runtime_error("n_expert must be > 0");
4427
0
                        }
4428
0
                        if (n_expert_used == 0) {
4429
0
                            throw std::runtime_error("n_expert_used must be > 0");
4430
0
                        }
4431
4432
                        // MoE branch
4433
0
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff,   n_expert}, 0);
4434
0
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff,   n_embd, n_expert}, 0);
4435
0
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff,   n_expert}, 0);
4436
0
                    }
4437
0
                } break;
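OLMoE above stores every expert's FFN weights in single 3-D tensors such as {n_embd, n_ff, n_expert}, one {n_embd, n_ff} slice per expert, rather than separate per-expert tensors. A quick sketch of the resulting parameter count, with assumed sizes:

    // illustrative sketch only -- hypothetical sizes, not llama.cpp code
    #include <cstdint>
    #include <cstdio>

    constexpr int64_t n_embd   = 2048; // assumed
    constexpr int64_t n_ff     = 1024; // assumed per-expert width
    constexpr int64_t n_expert = 64;   // assumed

    // ffn_up_exps: {n_embd, n_ff, n_expert}
    constexpr int64_t per_expert = n_embd * n_ff;
    constexpr int64_t total      = per_expert * n_expert;

    int main() {
        std::printf("up-projection: %lld params/expert, %lld total\n",
            (long long) per_expert, (long long) total);
        return 0;
    }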
4438
0
            case LLM_ARCH_OPENELM:
4439
0
                {
4440
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4441
4442
                    // output
4443
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4444
                    // init output from the input tok embed
4445
0
                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4446
4447
0
                    for (int i = 0; i < n_layer; ++i) {
4448
0
                        const int64_t n_head      =   hparams.n_head(i);
4449
0
                        const int64_t n_head_qkv  = 2*hparams.n_head_kv(i) + n_head;
4450
0
                        const int64_t n_ff        =   hparams.n_ff(i);
4451
4452
0
                        auto & layer = layers[i];
4453
4454
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4455
4456
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_head_qkv*n_embd_head_k}, 0);
4457
0
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
4458
0
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
4459
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head*n_embd_head_k, n_embd}, 0);
4460
4461
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4462
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
4463
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
4464
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
4465
0
                    }
4466
0
                } break;
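OpenELM above reads head counts per layer and fuses Q, K and V into one wqkv tensor of width n_head_qkv * n_embd_head_k, where n_head_qkv = n_head + 2*n_head_kv. A compile-time sketch with assumed per-layer head counts:

    // illustrative sketch only -- hypothetical sizes, not llama.cpp code
    #include <cstdint>
    #include <cstdio>

    constexpr int64_t n_embd_head_k = 64; // assumed
    constexpr int64_t n_head        = 12; // assumed query heads for this layer
    constexpr int64_t n_head_kv     = 3;  // assumed KV heads for this layer

    constexpr int64_t n_head_qkv = n_head + 2*n_head_kv;
    static_assert(n_head_qkv * n_embd_head_k == (12 + 6) * 64, "fused Q, K, V output width");

    int main() {
        std::printf("wqkv columns: %lld\n", (long long) (n_head_qkv * n_embd_head_k));
        return 0;
    }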
4467
0
            case LLM_ARCH_GPTNEOX:
4468
0
                {
4469
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4470
4471
                    // output
4472
0
                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4473
0
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
4474
0
                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
4475
4476
0
                    for (int i = 0; i < n_layer; ++i) {
4477
0
                        auto & layer = layers[i];
4478
4479
0
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4480
0
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
4481
4482
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
4483
0
                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, 0);
4484
4485
0
                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4486
0
                        layer.bo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, 0);
4487
4488
0
                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4489
0
                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
4490
4491
0
                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
4492
0
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
4493
4494
0
                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
4495
0
                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, 0);
4496
0
                    }
4497
0
                } break;
4498
0
            case LLM_ARCH_ARCTIC:
4499
0
                {
4500
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4501
4502
                    // output
4503
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4504
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4505
4506
                    // if output is NULL, init from the input tok embed
4507
0
                    if (output == NULL) {
4508
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4509
0
                    }
4510
4511
0
                    for (int i = 0; i < n_layer; ++i) {
4512
0
                        auto & layer = layers[i];
4513
4514
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4515
4516
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
4517
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
4518
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
4519
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4520
4521
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4522
4523
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd}, 0);
4524
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd}, 0);
4525
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_embd}, 0);
4526
4527
0
                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
4528
0
                        layer.ffn_norm_exps = create_tensor(tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd}, 0);
4529
0
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff, n_expert}, false);
4530
0
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert}, 0);
4531
0
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert}, 0);
4532
0
                    }
4533
0
                } break;
4534
0
            case LLM_ARCH_DEEPSEEK:
4535
0
                {
4536
4537
0
                    const int64_t n_ff_exp        = hparams.n_ff_exp;
4538
0
                    const int64_t n_expert_shared = hparams.n_expert_shared;
4539
4540
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4541
4542
                    // output
4543
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4544
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
4545
4546
0
                    for (int i = 0; i < n_layer; ++i) {
4547
0
                        auto & layer = layers[i];
4548
4549
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4550
4551
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
4552
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
4553
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
4554
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4555
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4556
4557
0
                        if (i < (int) hparams.n_layer_dense_lead) {
4558
0
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
4559
0
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
4560
0
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
4561
0
                        } else {
4562
0
                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
4563
4564
0
                            if (n_expert == 0) {
4565
0
                                throw std::runtime_error("n_expert must be > 0");
4566
0
                            }
4567
0
                            if (n_expert_used == 0) {
4568
0
                                throw std::runtime_error("n_expert_used must be > 0");
4569
0
                            }
4570
4571
                            // MoE branch
4572
0
                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
4573
0
                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
4574
0
                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
4575
4576
                            // Shared expert branch
4577
0
                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
4578
0
                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {        n_ff_exp * n_expert_shared, n_embd}, 0);
4579
0
                            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
4580
0
                        }
4581
0
                    }
4582
0
                } break;
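The DEEPSEEK loader above splits layers into a dense lead (the first `n_layer_dense_lead` layers with a regular gated FFN of width `n_ff`) and MoE layers, where each routed expert has width `n_ff_exp` and the shared-expert branch is `n_ff_exp * n_expert_shared` wide. A minimal standalone sketch of that shape arithmetic follows (the struct and sample numbers are hypothetical, not the llama.cpp API):

```cpp
// Standalone sketch of the DeepSeek dense-vs-MoE FFN shape split used above.
// The struct and the sample values are made up; only the arithmetic mirrors
// the shapes requested by create_tensor().
#include <cstdint>
#include <cstdio>

struct deepseek_ffn_hparams {
    int64_t  n_embd;
    int64_t  n_ff;                // dense FFN width
    int64_t  n_ff_exp;            // per-expert FFN width
    int64_t  n_expert;
    int64_t  n_expert_shared;
    uint32_t n_layer_dense_lead;  // first k layers stay dense
};

int main() {
    const deepseek_ffn_hparams hp = {4096, 11008, 1408, 64, 2, 1}; // illustrative values

    for (int i = 0; i < 3; ++i) {
        if (i < (int) hp.n_layer_dense_lead) {
            // dense layer: ffn_gate/ffn_up are {n_embd, n_ff}, ffn_down is {n_ff, n_embd}
            std::printf("layer %d: dense, gate/up = {%lld, %lld}\n",
                        i, (long long) hp.n_embd, (long long) hp.n_ff);
        } else {
            // MoE layer: routed experts are 3D {n_embd, n_ff_exp, n_expert};
            // the shared-expert branch is a single FFN of width n_ff_exp * n_expert_shared
            const int64_t n_ff_shexp = hp.n_ff_exp * hp.n_expert_shared;
            std::printf("layer %d: MoE, exps = {%lld, %lld, %lld}, shexp width = %lld\n",
                        i, (long long) hp.n_embd, (long long) hp.n_ff_exp,
                        (long long) hp.n_expert, (long long) n_ff_shexp);
        }
    }
    return 0;
}
```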
4583
0
            case LLM_ARCH_DEEPSEEK2:
4584
0
                {
4585
                    // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
4586
0
                    const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26);
4587
4588
0
                    const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
4589
4590
                    // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
4591
0
                    const int64_t n_embd_head_k_mla = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
4592
0
                    const int64_t n_embd_head_v_mla = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;
4593
4594
0
                    const int64_t n_embd_head_qk_rope = hparams.n_rot;
4595
0
                    const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;
4596
4597
0
                    const int64_t q_lora_rank  = hparams.n_lora_q;
4598
0
                    const int64_t kv_lora_rank = hparams.n_lora_kv;
4599
4600
0
                    const int64_t n_ff_exp        = hparams.n_ff_exp;
4601
0
                    const int64_t n_expert_shared = hparams.n_expert_shared;
4602
4603
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4604
4605
                    // output
4606
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4607
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
4608
4609
0
                    for (int i = 0; i < n_layer; ++i) {
4610
0
                        auto & layer = layers[i];
4611
4612
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4613
0
                        if (!is_lite) {
4614
0
                            layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
4615
0
                        }
4616
4617
0
                        layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
4618
4619
0
                        if (!is_lite) {
4620
0
                            layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
4621
0
                            layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, 0);
4622
0
                        } else {
4623
0
                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_embd_head_k_mla}, 0);
4624
0
                        }
4625
4626
0
                        layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope}, 0);
4627
4628
                        // note: only old legacy GGUF files will still contain the unsplit wkv_b tensor
4629
0
                        if (is_mla) {
4630
0
                            layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, kv_lora_rank, n_head}, 0);
4631
0
                            layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, 0);
4632
0
                        } else {
4633
0
                            layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v_mla)}, 0);
4634
0
                        }
4635
4636
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, 0);
4637
4638
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4639
4640
0
                        if (i < (int) hparams.n_layer_dense_lead) {
4641
0
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
4642
0
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
4643
0
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
4644
0
                        } else {
4645
0
                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
4646
0
                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
4647
4648
0
                            if (n_expert == 0) {
4649
0
                                throw std::runtime_error("n_expert must be > 0");
4650
0
                            }
4651
0
                            if (n_expert_used == 0) {
4652
0
                                throw std::runtime_error("n_expert_used must be > 0");
4653
0
                            }
4654
4655
                            // MoE branch
4656
0
                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
4657
0
                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
4658
0
                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
4659
4660
                            // Shared expert branch
4661
0
                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
4662
0
                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {        n_ff_exp * n_expert_shared, n_embd}, 0);
4663
0
                            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
4664
0
                        }
4665
0
                    }
4666
0
                } break;
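In the DEEPSEEK2 branch the per-head K dimension is split into a RoPE part (`n_rot`) and a non-RoPE part, and K/V go through a low-rank `kv_lora_rank` bottleneck (`wkv_a_mqa`, then either `wk_b`/`wv_b` or the legacy unsplit `wkv_b`). A hedged sketch of just that dimension bookkeeping, with made-up values standing in for the hparams:

```cpp
// Sketch of the MLA dimension arithmetic used in the DEEPSEEK2 branch above.
// Values are illustrative only; the formulas mirror the shapes passed to create_tensor().
#include <cstdint>
#include <cstdio>

int main() {
    // hypothetical hparams (roughly DeepSeek-V2-like, not exact)
    const int64_t n_embd_head_k_mla = 192; // per-head K dim after "decompression"
    const int64_t n_embd_head_v_mla = 128; // per-head V dim
    const int64_t n_rot             = 64;  // RoPE portion of the K head
    const int64_t kv_lora_rank      = 512;
    const int64_t n_head            = 16;

    const int64_t n_embd_head_qk_rope = n_rot;
    const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;

    // wkv_a_mqa projects the hidden state into the compressed KV plus the shared RoPE key
    const int64_t wkv_a_cols = kv_lora_rank + n_embd_head_qk_rope;

    // the legacy unsplit wkv_b holds both the no-RoPE K and the V "decompression" per head
    const int64_t wkv_b_cols = n_head * (n_embd_head_qk_nope + n_embd_head_v_mla);

    std::printf("qk_nope = %lld, wkv_a cols = %lld, wkv_b cols = %lld\n",
                (long long) n_embd_head_qk_nope, (long long) wkv_a_cols, (long long) wkv_b_cols);
    return 0;
}
```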
4667
0
            case LLM_ARCH_PLM:
4668
0
                {
4669
0
                    const int64_t n_embd_head_qk_rope = hparams.n_rot;
4670
0
                    const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
4671
0
                    const int64_t kv_lora_rank = hparams.n_lora_kv;
4672
4673
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4674
4675
                    // output
4676
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4677
                    // output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
4678
0
                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4679
4680
0
                    for (int i = 0; i < n_layer; ++i) {
4681
0
                        auto & layer = layers[i];
4682
4683
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4684
4685
0
                        layer.wq        = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
4686
0
                        layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
4687
0
                        layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
4688
0
                        layer.wkv_b     = create_tensor(tn(LLM_TENSOR_ATTN_KV_B,     "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
4689
0
                        layer.wo        = create_tensor(tn(LLM_TENSOR_ATTN_OUT,      "weight", i), {              n_head * (                      n_embd_head_v), n_embd}, 0);
4690
4691
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4692
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
4693
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
4694
0
                    }
4695
0
                } break;
4696
0
            case LLM_ARCH_BITNET:
4697
0
                {
4698
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4699
4700
                    // output
4701
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4702
4703
0
                    for (int i = 0; i < n_layer; ++i) {
4704
0
                        auto & layer = layers[i];
4705
4706
0
                        layer.attn_norm     = create_tensor(tn(LLM_TENSOR_ATTN_NORM,     "weight", i), {n_embd}, 0);
4707
0
                        layer.attn_sub_norm = create_tensor(tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd}, 0);
4708
4709
0
                        layer.wq       = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
4710
0
                        layer.wq_scale = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "scale",  i), {1}, TENSOR_NOT_REQUIRED);
4711
0
                        layer.wk       = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
4712
0
                        layer.wk_scale = create_tensor(tn(LLM_TENSOR_ATTN_K,   "scale",  i), {1}, TENSOR_NOT_REQUIRED);
4713
0
                        layer.wv       = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
4714
0
                        layer.wv_scale = create_tensor(tn(LLM_TENSOR_ATTN_V,   "scale",  i), {1}, TENSOR_NOT_REQUIRED);
4715
0
                        layer.wo       = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4716
0
                        layer.wo_scale = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "scale",  i), {1}, TENSOR_NOT_REQUIRED);
4717
4718
0
                        layer.ffn_norm     = create_tensor(tn(LLM_TENSOR_FFN_NORM,     "weight", i), {n_embd}, 0);
4719
0
                        layer.ffn_sub_norm = create_tensor(tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff}, 0);
4720
4721
0
                        layer.ffn_gate       = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
4722
0
                        layer.ffn_gate_scale = create_tensor(tn(LLM_TENSOR_FFN_GATE, "scale",  i), {1}, TENSOR_NOT_REQUIRED);
4723
0
                        layer.ffn_down       = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
4724
0
                        layer.ffn_down_scale = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "scale",  i), {1}, TENSOR_NOT_REQUIRED);
4725
0
                        layer.ffn_up         = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
4726
0
                        layer.ffn_up_scale   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "scale",  i), {1}, TENSOR_NOT_REQUIRED);
4727
0
                    }
4728
0
                } break;
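BITNET stores each projection as a (typically ternary) weight plus an optional scalar `scale` tensor; applying the layer amounts to a normal matmul followed by a per-tensor rescale when the scale is present. A small sketch of that idea with a hand-rolled matvec and toy data (not ggml, and the scale may be absent just as with TENSOR_NOT_REQUIRED above):

```cpp
// Sketch of the "weight + optional per-tensor scale" pattern from the BITNET branch above.
// The matvec and the numbers are illustrative; in llama.cpp the multiply happens in ggml.
#include <cstdio>
#include <vector>

static std::vector<float> matvec_scaled(const std::vector<float> & w, // row-major [rows x cols]
                                        const std::vector<float> & x,
                                        int rows, int cols,
                                        const float * scale /* nullptr if not present */) {
    std::vector<float> y(rows, 0.0f);
    for (int r = 0; r < rows; ++r) {
        for (int c = 0; c < cols; ++c) {
            y[r] += w[r*cols + c] * x[c];
        }
        if (scale) {
            y[r] *= *scale; // one scalar for the whole tensor
        }
    }
    return y;
}

int main() {
    const std::vector<float> w = {1, -1, 0, 1};  // toy 2x2 "ternary" weight
    const std::vector<float> x = {0.5f, 2.0f};
    const float s = 0.25f;

    const auto y = matvec_scaled(w, x, 2, 2, &s);
    std::printf("y = [%f, %f]\n", y[0], y[1]);
    return 0;
}
```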
4729
0
            case LLM_ARCH_T5:
4730
0
                {
4731
0
                    const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
4732
4733
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4734
4735
                    // output
4736
0
                    output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
4737
0
                    output_norm     = create_tensor(tn(LLM_TENSOR_DEC_OUTPUT_NORM, "weight"), {n_embd}, 0);
4738
4739
0
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4740
                    // if output is NULL, init from the input tok embed
4741
0
                    if (output == NULL) {
4742
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4743
0
                    }
4744
4745
                    // n_layer:     number of encoder_layers
4746
                    // dec_n_layer: number of decoder_layers
4747
0
                    const int dec_n_layer = hparams.dec_n_layer;
4748
0
                    if (dec_n_layer > n_layer) {
4749
0
                        layers.resize(dec_n_layer);
4750
0
                    }
4751
4752
                    // load encoder layers
4753
0
                    for (int i = 0; i < n_layer; ++i) {
4754
0
                        auto & layer = layers[i];
4755
4756
0
                        layer.attn_norm_enc  = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM,  "weight", i), {n_embd}, 0);
4757
0
                        layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
4758
4759
0
                        layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
4760
0
                        layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
4761
0
                        layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
4762
0
                        layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
4763
4764
0
                        layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
4765
0
                        layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd,   n_ff}, TENSOR_NOT_REQUIRED);
4766
0
                        layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
4767
0
                        layer.ffn_up_enc   = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
4768
0
                    }
4769
4770
                    // load decoder layers
4771
0
                    for (int i = 0; i < dec_n_layer; ++i) {
4772
0
                        auto & layer = layers[i];
4773
4774
0
                        layer.attn_norm  = create_tensor(tn(LLM_TENSOR_DEC_ATTN_NORM,  "weight", i), {n_embd}, 0);
4775
0
                        layer.attn_rel_b = create_tensor(tn(LLM_TENSOR_DEC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
4776
4777
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_DEC_ATTN_Q,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
4778
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_DEC_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
4779
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_DEC_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
4780
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_DEC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
4781
4782
0
                        layer.attn_norm_cross  = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_NORM,  "weight", i), {n_embd}, 0);
4783
                        // this tensor seems to be unused in the HF transformers implementation
4784
0
                        layer.attn_rel_b_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
4785
4786
0
                        layer.wq_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_Q,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
4787
0
                        layer.wk_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
4788
0
                        layer.wv_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
4789
0
                        layer.wo_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
4790
4791
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_DEC_FFN_NORM, "weight", i), {n_embd}, 0);
4792
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_DEC_FFN_GATE, "weight", i), {n_embd,   n_ff}, TENSOR_NOT_REQUIRED);
4793
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_DEC_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
4794
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_DEC_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
4795
0
                    }
4796
0
                } break;
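The T5 branch reuses one `layers` vector for both encoder and decoder weights, so when the decoder has more layers than the encoder it grows the vector first (`layers.resize(dec_n_layer)`) and then indexes both loops from 0. A tiny sketch of that sizing rule with placeholder layer counts:

```cpp
// Sketch of the encoder/decoder layer-count handling in the T5 branch above.
// The counts are placeholders; the point is that the shared vector must hold
// max(n_layer, dec_n_layer) slots before either loop runs.
#include <algorithm>
#include <cstdio>
#include <vector>

struct layer_slot { bool has_enc = false; bool has_dec = false; };

int main() {
    const int n_layer     = 12; // encoder layers (hypothetical)
    const int dec_n_layer = 24; // decoder layers (hypothetical)

    std::vector<layer_slot> layers(n_layer);
    if (dec_n_layer > n_layer) {
        layers.resize(dec_n_layer); // same as the resize in the loader above
    }

    for (int i = 0; i < n_layer;     ++i) layers[i].has_enc = true;
    for (int i = 0; i < dec_n_layer; ++i) layers[i].has_dec = true;

    std::printf("slots = %zu (should be %d)\n", layers.size(), std::max(n_layer, dec_n_layer));
    return 0;
}
```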
4797
0
            case LLM_ARCH_T5ENCODER:
4798
0
                {
4799
0
                    const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
4800
4801
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4802
4803
                    // output
4804
0
                    output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
4805
0
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4806
                    // if output is NULL, init from the input tok embed
4807
0
                    if (output == NULL) {
4808
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4809
0
                    }
4810
4811
0
                    for (int i = 0; i < n_layer; ++i) {
4812
0
                        auto & layer = layers[i];
4813
4814
0
                        layer.attn_norm_enc  = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM,  "weight", i), {n_embd}, 0);
4815
0
                        layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
4816
4817
0
                        layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
4818
0
                        layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
4819
0
                        layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
4820
0
                        layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
4821
4822
0
                        layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
4823
0
                        layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd,   n_ff}, TENSOR_NOT_REQUIRED);
4824
0
                        layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
4825
0
                        layer.ffn_up_enc   = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
4826
0
                    }
4827
0
                } break;
4828
0
            case LLM_ARCH_JAIS:
4829
0
                {
4830
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4831
4832
                    // output
4833
0
                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4834
0
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
4835
0
                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
4836
4837
0
                    for (int i = 0; i < n_layer; ++i) {
4838
0
                        auto & layer = layers[i];
4839
4840
0
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM,   "weight", i), {n_embd}, 0);
4841
0
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM,   "bias", i),   {n_embd}, 0);
4842
4843
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
4844
0
                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, 0);
4845
4846
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4847
0
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, 0);
4848
4849
0
                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4850
0
                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
4851
4852
0
                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
4853
0
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
4854
4855
0
                        layer.ffn_gate   = create_tensor(tn(LLM_TENSOR_FFN_GATE,   "weight", i), {n_embd, n_ff}, 0);
4856
0
                        layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE,   "bias", i),   {n_ff}, 0);
4857
4858
0
                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
4859
0
                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, 0);
4860
0
                    }
4861
0
                } break;
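JAIS keeps Q, K and V fused in a single `wqkv`/`bqkv` of width `n_embd + 2*n_embd_gqa`: `n_embd` columns for Q plus two GQA-sized tails for K and V. A hedged sketch of slicing one token's fused output into those three pieces (plain vectors with illustrative sizes; in llama.cpp the split is done on ggml tensors in the graph code):

```cpp
// Sketch of splitting a fused QKV output of width n_embd + 2*n_embd_gqa into Q, K, V,
// matching the wqkv/bqkv shapes in the JAIS branch above. Sizes are illustrative.
#include <cassert>
#include <cstdio>
#include <vector>

int main() {
    const int n_embd     = 8; // hypothetical
    const int n_embd_gqa = 4; // hypothetical (smaller than n_embd under GQA)

    std::vector<float> qkv(n_embd + 2*n_embd_gqa, 0.0f); // one token's fused projection output

    // contiguous slices: [0, n_embd) -> Q, then K, then V
    const std::vector<float> q(qkv.begin(),                       qkv.begin() + n_embd);
    const std::vector<float> k(qkv.begin() + n_embd,              qkv.begin() + n_embd + n_embd_gqa);
    const std::vector<float> v(qkv.begin() + n_embd + n_embd_gqa, qkv.end());

    assert((int) (q.size() + k.size() + v.size()) == n_embd + 2*n_embd_gqa);
    std::printf("q=%zu k=%zu v=%zu\n", q.size(), k.size(), v.size());
    return 0;
}
```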
4862
0
            case LLM_ARCH_CHATGLM:
4863
0
                {
4864
0
                    tok_embd   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,      "weight"), {n_embd, n_vocab}, 0);
4865
4866
                    // output
4867
0
                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4868
0
                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4869
                    // if output is NULL, init from the input tok embed
4870
0
                    if (output == NULL) {
4871
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4872
0
                    }
4873
4874
0
                    for (int i = 0; i < n_layer; ++i) {
4875
0
                        auto & layer = layers[i];
4876
4877
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4878
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
4879
0
                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
4880
4881
0
                        if (layer.wqkv == nullptr) {
4882
0
                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
4883
0
                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
4884
0
                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
4885
0
                            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
4886
0
                            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
4887
0
                            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
4888
0
                        }
4889
4890
0
                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4891
4892
0
                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4893
4894
0
                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff * 2}, 0);
4895
4896
0
                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
4897
0
                    }
4898
0
                } break;
4899
0
            case LLM_ARCH_GLM4:
4900
0
                {
4901
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4902
4903
                    // output
4904
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4905
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4906
                    // if output is NULL, init from the input tok embed
4907
0
                    if (output == NULL) {
4908
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4909
0
                    }
4910
4911
0
                    for (int i = 0; i < n_layer; ++i) {
4912
0
                        auto & layer = layers[i];
4913
4914
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4915
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
4916
0
                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
4917
4918
0
                        if (layer.wqkv == nullptr) {
4919
0
                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
4920
0
                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
4921
0
                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
4922
0
                            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
4923
0
                            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
4924
0
                            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
4925
0
                        }
4926
4927
0
                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4928
4929
0
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
4930
4931
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4932
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
4933
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff * 2}, 0);
4934
4935
0
                        layer.ffn_post_norm  = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
4936
0
                    }
4937
0
                } break;
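In the GLM4 branch `ffn_up` is created with width `n_ff * 2`, which indicates the gate and up projections are fused into one tensor and split downstream into a gated activation. A small sketch of that split-and-gate step; the half ordering and the SiLU gate are assumptions for illustration, the real split/activation lives in the GLM4 graph code:

```cpp
// Sketch of consuming a fused gate+up FFN projection of width n_ff * 2, as created by
// the GLM4 branch above. Which half is the gate, and the SiLU activation, are assumed.
#include <cmath>
#include <cstdio>
#include <vector>

static float silu(float x) { return x / (1.0f + std::exp(-x)); }

int main() {
    const int n_ff = 4; // hypothetical
    // one token's output of the fused up projection, width n_ff * 2
    const std::vector<float> up_out = {1, 2, 3, 4, 0.5f, 0.5f, 0.5f, 0.5f};

    std::vector<float> h(n_ff);
    for (int j = 0; j < n_ff; ++j) {
        const float gate = up_out[j];        // first half  (assumed to be the gate)
        const float up   = up_out[n_ff + j]; // second half (assumed to be the up projection)
        h[j] = silu(gate) * up;              // gated activation fed into ffn_down
    }

    std::printf("h[0] = %f\n", h[0]);
    return 0;
}
```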
4938
0
            case LLM_ARCH_GLM4_MOE:
4939
0
                {
4940
0
                    const int64_t n_expert        = hparams.n_expert;
4941
0
                    const int64_t n_expert_used   = hparams.n_expert_used;
4942
0
                    const int64_t n_expert_shared = hparams.n_expert_shared;
4943
4944
0
                    GGML_ASSERT(hparams.n_expert > 0 && "n_expert must be > 0 for GLM4_MOE MoE layers");
4945
0
                    GGML_ASSERT(hparams.n_expert_used > 0 && "n_expert_used must be > 0 for GLM4_MOE MoE layers");
4946
4947
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
4948
4949
                    // output
4950
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
4951
0
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
4952
                    // if output is NULL, init from the input tok embed
4953
0
                    if (output == NULL) {
4954
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
4955
0
                    }
4956
4957
                    // load ALL tensors, including the NextN layers, so that the total tensor count matches,
4958
                    // but only PROCESS up to the last non-NextN layer in the forward pass
4959
0
                    for (int i = 0; i < n_layer; ++i) {
4960
0
                        int flags = 0;
4961
0
                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
4962
                            // skip all tensors in the NextN layers
4963
0
                            flags |= TENSOR_SKIP;
4964
0
                        }
4965
4966
0
                        auto & layer = layers[i];
4967
4968
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, flags);
4969
4970
                        // GLM-style attention with bias terms
4971
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, flags);
4972
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, flags);
4973
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, flags);
4974
0
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, flags);
4975
0
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, flags);
4976
0
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, flags);
4977
4978
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags);
4979
4980
                        // K/Q norm tensors (optional for GLM-4.5 355B variant)
4981
0
                        layer.attn_q_norm = create_tensor(
4982
0
                            tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED | flags);
4983
0
                        layer.attn_k_norm = create_tensor(
4984
0
                            tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED | flags);
4985
4986
0
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, flags);
4987
4988
                        // Check if this layer uses MoE or dense FFN based on n_layer_dense_lead
4989
                        // GLM 4.5 uses hybrid architecture: layer 0 is dense, layers 1+ are MoE
4990
0
                        const bool use_moe = (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead);
4991
4992
0
                        if (use_moe) {
4993
                            // MoE layers
4994
0
                            layer.ffn_gate_inp =
4995
0
                                create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, flags);
4996
0
                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), { n_expert }, flags);
4997
4998
                            // MoE branch
4999
0
                            const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
5000
5001
0
                            layer.ffn_gate_exps = create_tensor(
5002
0
                                tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
5003
0
                            layer.ffn_down_exps = create_tensor(
5004
0
                                tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, flags);
5005
0
                            layer.ffn_up_exps = create_tensor(
5006
0
                                tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
5007
5008
                            // Shared expert
5009
0
                            if (n_expert_shared > 0) {
5010
0
                                const int64_t n_ff_shexp = n_ff_exp * n_expert_shared;
5011
0
                                layer.ffn_gate_shexp = create_tensor(
5012
0
                                    tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
5013
0
                                layer.ffn_down_shexp = create_tensor(
5014
0
                                    tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, flags);
5015
0
                                layer.ffn_up_shexp = create_tensor(
5016
0
                                    tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
5017
0
                            }
5018
0
                        } else {
5019
                            // Dense layers (first k layers) - GLM uses separate gate/up projections
5020
0
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, flags);
5021
0
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, flags);
5022
0
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), { n_embd, n_ff }, flags);
5023
0
                        }
5024
5025
                        // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
5026
0
                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
5027
0
                            layer.nextn.eh_proj          = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
5028
0
                            layer.nextn.enorm            = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
5029
0
                            layer.nextn.hnorm            = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
5030
5031
                            // Optional tensors
5032
0
                            layer.nextn.embed_tokens     = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
5033
0
                            layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
5034
0
                            layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags | TENSOR_NOT_REQUIRED);
5035
0
                        }
5036
0
                    }
5037
0
                }
5038
0
                break;
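Two details in the GLM4_MOE branch are easy to miss: `n_ff_exp` falls back to `n_ff / n_expert_used` when the hparam is zero, and the trailing `nextn_predict_layers` layers are still created (so the tensor count matches) but marked with `TENSOR_SKIP`. A sketch of just those two decisions with invented counts:

```cpp
// Sketch of the per-layer decisions in the GLM4_MOE branch above: the n_ff_exp fallback
// and which layers count as NextN/MTP layers. All numbers are invented; "skipped" here
// is a stand-in for the TENSOR_SKIP flag, not the llama.cpp constant.
#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t n_layer              = 6;
    const uint32_t nextn_predict_layers = 1;  // last layer is a NextN layer
    const int64_t  n_ff                 = 8192;
    const int64_t  n_expert_used        = 8;
    const int64_t  n_ff_exp_hparam      = 0;  // pretend the hparam is missing

    const int64_t n_ff_exp = n_ff_exp_hparam ? n_ff_exp_hparam : n_ff / n_expert_used;
    std::printf("n_ff_exp = %lld\n", (long long) n_ff_exp);

    for (uint32_t i = 0; i < n_layer; ++i) {
        const bool skip = nextn_predict_layers > 0 && i >= n_layer - nextn_predict_layers;
        std::printf("layer %u: %s\n", (unsigned) i, skip ? "loaded but skipped (NextN)" : "normal");
    }
    return 0;
}
```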
5039
0
            case LLM_ARCH_NEMOTRON:
5040
0
                {
5041
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5042
5043
                    // output
5044
0
                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5045
0
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
5046
0
                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
5047
5048
0
                    for (int i = 0; i < n_layer; ++i) {
5049
0
                        auto & layer = layers[i];
5050
5051
0
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5052
0
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
5053
5054
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
5055
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
5056
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
5057
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
5058
5059
                        // optional bias tensors
5060
0
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);
5061
0
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
5062
0
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
5063
0
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);
5064
5065
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5066
0
                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
5067
5068
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
5069
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
5070
5071
                        // optional MLP bias
5072
0
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
5073
0
                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
5074
0
                    }
5075
0
                } break;
5076
0
            case LLM_ARCH_NEMOTRON_H:
5077
0
                {
5078
                    // mamba2 Mixer SSM params
5079
                    // NOTE: int64_t for tensor dimensions
5080
0
                    const int64_t d_conv     = hparams.ssm_d_conv;
5081
0
                    const int64_t d_inner    = hparams.ssm_d_inner;
5082
0
                    const int64_t d_state    = hparams.ssm_d_state;
5083
0
                    const int64_t n_ssm_head = hparams.ssm_dt_rank;
5084
0
                    const int64_t n_group    = hparams.ssm_n_group;
5085
0
                    const int64_t d_in_proj  = 2*d_inner + 2*n_group*d_state + n_ssm_head;
5086
5087
                    // embeddings
5088
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5089
5090
                    // output
5091
0
                    {
5092
0
                        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5093
0
                        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5094
                        // if output is NULL, init from the input tok embed, duplicated to allow offloading
5095
0
                        if (output == NULL) {
5096
0
                            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5097
0
                        }
5098
0
                    }
5099
5100
0
                    for (int i = 0; i < n_layer; ++i) {
5101
0
                        auto & layer = layers[i];
5102
5103
                        // all blocks use the attn norm
5104
0
                        layer.attn_norm  = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5105
5106
0
                        if (hparams.is_recurrent(i)) {
5107
                            // ssm layers
5108
0
                            layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
5109
5110
0
                            layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
5111
0
                            layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, TENSOR_NOT_REQUIRED);
5112
5113
0
                            layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_ssm_head}, 0);
5114
5115
                            // no "weight" suffix for these
5116
0
                            layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_ssm_head}, 0);
5117
0
                            layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_ssm_head}, 0);
5118
5119
0
                            layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
5120
5121
                            // out_proj
5122
0
                            layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
5123
0
                        } else if (hparams.n_ff(i) == 0) {
5124
                            // attention layers (with optional bias)
5125
0
                            const int64_t n_head_i = hparams.n_head(i);
5126
0
                            const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(i);
5127
0
                            const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
5128
0
                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head_i}, 0);
5129
0
                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa_i}, 0);
5130
0
                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa_i}, 0);
5131
0
                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
5132
0
                            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias",   i), {n_embd},         TENSOR_NOT_REQUIRED);
5133
0
                            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias",   i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
5134
0
                            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias",   i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
5135
0
                            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias",   i), {n_embd},         TENSOR_NOT_REQUIRED);
5136
0
                        } else {
5137
                            // mlp layers
5138
0
                            layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  hparams.n_ff(i), n_embd}, 0);
5139
0
                            layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   hparams.n_ff(i)}, 0);
5140
0
                            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias",   i), {n_embd}, TENSOR_NOT_REQUIRED);
5141
0
                            layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias",   i), {hparams.n_ff(i)}, TENSOR_NOT_REQUIRED);
5142
0
                        }
5143
0
                    }
5144
0
                } break;
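The NEMOTRON_H branch mixes mamba2 SSM blocks, attention blocks and plain MLP blocks; for the SSM blocks the fused input projection width is `d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head` (x, z, B, C and dt in the usual mamba2 formulation, packed into one projection). A sketch of that arithmetic with placeholder hparams:

```cpp
// Sketch of the mamba2 in-projection width used by the Nemotron-H SSM layers above.
// The hparam values are placeholders; the formula matches the d_in_proj computation in
// the loader (x and z of size d_inner, B and C of size n_group*d_state, plus the dt head).
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t d_inner    = 4096; // hypothetical
    const int64_t d_state    = 128;  // hypothetical
    const int64_t n_group    = 8;    // hypothetical
    const int64_t n_ssm_head = 64;   // hypothetical (dt rank / number of SSM heads)

    const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;

    std::printf("d_in_proj = %lld (x+z = %lld, B+C = %lld, dt = %lld)\n",
                (long long) d_in_proj,
                (long long) (2*d_inner),
                (long long) (2*n_group*d_state),
                (long long) n_ssm_head);
    return 0;
}
```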
5145
0
            case LLM_ARCH_EXAONE:
5146
0
                {
5147
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5148
5149
                    // output
5150
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5151
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5152
5153
                    // if output is NULL, init from the input tok embed
5154
0
                    if (output == NULL) {
5155
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5156
0
                    }
5157
5158
0
                    for (int i = 0; i < n_layer; ++i) {
5159
0
                        auto & layer = layers[i];
5160
5161
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5162
5163
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
5164
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
5165
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
5166
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
5167
5168
0
                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM,   "weight", i), {n_embd}, 0);
5169
0
                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
5170
0
                        layer.ffn_gate   = create_tensor(tn(LLM_TENSOR_FFN_GATE,   "weight", i), {n_embd,   n_ff}, 0);
5171
0
                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN,   "weight", i), {  n_ff, n_embd}, 0);
5172
0
                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,     "weight", i), {n_embd,   n_ff}, 0);
5173
0
                    }
5174
0
                } break;
5175
0
            case LLM_ARCH_EXAONE4:
5176
0
                {
5177
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5178
5179
                    // output
5180
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5181
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5182
5183
                    // if output is NULL, init from the input tok embed
5184
0
                    if (output == NULL) {
5185
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5186
0
                    }
5187
5188
0
                    for (int i = 0; i < n_layer; ++i) {
5189
0
                        auto & layer = layers[i];
5190
5191
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
5192
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
5193
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
5194
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
5195
5196
0
                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
5197
5198
0
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
5199
0
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
5200
0
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
5201
5202
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
5203
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
5204
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
5205
0
                        layer.ffn_post_norm  = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
5206
0
                    }
5207
0
                } break;
5208
0
            case LLM_ARCH_RWKV6:
5209
0
                {
5210
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5211
5212
                    // Block 0, LN0
5213
0
                    tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
5214
0
                    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
5215
5216
                    // output
5217
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5218
0
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
5219
0
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
5220
5221
0
                    const int time_mix_extra_dim = hparams.time_mix_extra_dim;
5222
0
                    const int time_decay_extra_dim = hparams.time_decay_extra_dim;
5223
0
                    const int head_size = hparams.wkv_head_size;
5224
0
                    const int attn_hidden_size = n_embd;
5225
0
                    const int ffn_size = hparams.n_ff_arr[0];
5226
5227
0
                    for (int i = 0; i < n_layer; ++i) {
5228
0
                        auto & layer = layers[i];
5229
5230
0
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5231
0
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
5232
5233
0
                        layer.attn_norm_2   = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
5234
0
                        layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i),   {n_embd}, 0);
5235
5236
0
                        layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
5237
0
                        layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
5238
5239
0
                        layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
5240
0
                        layer.time_mix_lerp_w = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
5241
0
                        layer.time_mix_lerp_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
5242
0
                        layer.time_mix_lerp_v = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
5243
0
                        layer.time_mix_lerp_r = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
5244
0
                        layer.time_mix_lerp_g = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
5245
0
                        layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, TENSOR_NOT_REQUIRED);
5246
0
                        GGML_ASSERT(!(layer.time_mix_lerp_fused == NULL && layer.time_mix_lerp_w == NULL));
5247
5248
0
                        layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, 0);
5249
0
                        layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
5250
0
                        layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
5251
0
                        layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
5252
0
                        layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
5253
0
                        layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
5254
0
                        layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
5255
0
                        layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
5256
5257
0
                        layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, 0);
5258
0
                        layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, 0);
5259
0
                        layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
5260
5261
0
                        layer.channel_mix_lerp_k = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);
5262
0
                        layer.channel_mix_lerp_r = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, 0);
5263
5264
0
                        layer.channel_mix_key = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size}, 0);
5265
0
                        layer.channel_mix_value = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd}, 0);
5266
0
                        layer.channel_mix_receptance = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "weight", i), {n_embd, n_embd}, 0);
5267
0
                    }
5268
5269
0
                } break;
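RWKV6 time-mix uses six interpolation ("lerp") coefficients; `x` is always a separate `{n_embd, 1, 1}` tensor, while the other five (w, k, v, r, g) are either fused into one `{n_embd, 1, 1, 5}` tensor or stored as five separate tensors, and the assert above requires at least one of those two forms. A sketch of reading a coefficient from either layout (toy arrays standing in for the ggml tensors; the fused indexing is an assumption for illustration):

```cpp
// Sketch of the fused-vs-separate time_mix lerp layout handled in the RWKV6 branch above:
// either one fused tensor covering 5 channels or five per-channel tensors is present.
#include <cstdio>
#include <vector>

int main() {
    const int n_embd = 4;
    enum { LERP_W, LERP_K, LERP_V, LERP_R, LERP_G, LERP_COUNT };

    // pretend the fused tensor was found and the separate per-channel tensors were not
    std::vector<float> fused(n_embd * LERP_COUNT, 0.1f); // 5 contiguous n_embd-sized slices
    const std::vector<float> * separate[LERP_COUNT] = {nullptr, nullptr, nullptr, nullptr, nullptr};

    auto lerp_coeff = [&](int channel, int j) -> float {
        if (!fused.empty()) {
            return fused[channel*n_embd + j]; // slice of the fused tensor
        }
        return (*separate[channel])[j];       // fall back to the per-channel tensor
    };

    std::printf("lerp_k[0] = %f\n", lerp_coeff(LERP_K, 0));
    return 0;
}
```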
5270
0
            case LLM_ARCH_RWKV6QWEN2:
5271
0
                {
5272
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5273
5274
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5275
0
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
5276
0
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
5277
5278
0
                    const int time_mix_extra_dim = hparams.time_mix_extra_dim;
5279
0
                    const int time_decay_extra_dim = hparams.time_decay_extra_dim;
5280
0
                    const int head_size = hparams.wkv_head_size;
5281
0
                    const int attn_hidden_size = n_embd;
5282
0
                    const int n_head_kv = hparams.n_head_kv();
5283
0
                    int attn_key_value_size;
5284
0
                    if (n_head_kv == 0 || attn_hidden_size / head_size == n_head_kv) {
5285
0
                        attn_key_value_size = attn_hidden_size;
5286
0
                    } else {
5287
0
                        attn_key_value_size = n_head_kv * head_size;
5288
0
                    }
5289
5290
0
                    for (int i = 0; i < n_layer; ++i) {
5291
0
                        auto & layer = layers[i];
5292
5293
0
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5294
5295
0
                        layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
5296
0
                        layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
5297
5298
0
                        layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
5299
0
                        layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, 0);
5300
5301
0
                        layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, TENSOR_NOT_REQUIRED);
5302
0
                        layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
5303
0
                        layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
5304
0
                        layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
5305
0
                        layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {n_embd, attn_key_value_size}, 0);
5306
0
                        layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {n_embd, attn_key_value_size}, 0);
5307
0
                        layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
5308
0
                        layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
5309
                        // optional bias tensors
5310
0
                        layer.time_mix_key_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "bias", i), {attn_key_value_size}, TENSOR_NOT_REQUIRED);
5311
0
                        layer.time_mix_value_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "bias", i), {attn_key_value_size}, TENSOR_NOT_REQUIRED);
5312
0
                        layer.time_mix_receptance_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "bias", i), {attn_hidden_size}, TENSOR_NOT_REQUIRED);
5313
5314
0
                        layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
5315
5316
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5317
5318
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
5319
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
5320
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
5321
0
                    }
5322
0
                } break;
5323
0
            case LLM_ARCH_RWKV7:
5324
0
                {
5325
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5326
5327
                    // Block 0, LN0
5328
0
                    tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
5329
0
                    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
5330
5331
                    // output
5332
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5333
0
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
5334
0
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
5335
5336
0
                    const int n_lora_decay = hparams.n_lora_decay;
5337
0
                    const int n_lora_iclr = hparams.n_lora_iclr;
5338
0
                    const int n_lora_value_res_mix = hparams.n_lora_value_res_mix;
5339
0
                    const int n_lora_gate = hparams.n_lora_gate;
5340
0
                    const int attn_hidden_size = n_embd;
5341
0
                    const int ffn_size = hparams.n_ff_arr[0];
5342
5343
0
                    for (int i = 0; i < n_layer; ++i) {
5344
0
                        auto & layer = layers[i];
5345
5346
0
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5347
0
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
5348
5349
0
                        layer.attn_norm_2   = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
5350
0
                        layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i),   {n_embd}, 0);
5351
5352
0
                        layer.time_mix_w0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W0, "weight", i), {n_embd}, 0);
5353
0
                        layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, n_lora_decay}, 0);
5354
0
                        layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {n_lora_decay, n_embd}, 0);
5355
5356
0
                        layer.time_mix_a0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A0, "weight", i), {n_embd}, 0);
5357
0
                        layer.time_mix_a1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A1, "weight", i), {n_embd, n_lora_iclr}, 0);
5358
0
                        layer.time_mix_a2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A2, "weight", i), {n_lora_iclr, n_embd}, 0);
5359
5360
0
                        if (i == 0) {
5361
                            // actually not used
5362
0
                            layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
5363
0
                            layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_iclr}, 0);
5364
0
                            layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_iclr, n_embd}, 0);
5365
0
                        } else {
5366
0
                            layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
5367
0
                            layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_value_res_mix}, 0);
5368
0
                            layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_value_res_mix, n_embd}, 0);
5369
0
                        }
5370
5371
0
                        layer.time_mix_g1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", i), {n_embd, n_lora_gate}, 0);
5372
0
                        layer.time_mix_g2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", i), {n_lora_gate, n_embd}, 0);
5373
5374
0
                        layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 6}, 0);
5375
5376
0
                        layer.time_mix_k_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_K, "weight", i), {attn_hidden_size}, 0);
5377
0
                        layer.time_mix_k_a = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_A, "weight", i), {attn_hidden_size}, 0);
5378
0
                        layer.time_mix_r_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_R_K, "weight", i), {attn_hidden_size}, 0);
5379
5380
0
                        layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
5381
0
                        layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
5382
0
                        layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
5383
5384
0
                        layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, 0);
5385
0
                        layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, 0);
5386
0
                        layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
5387
5388
0
                        layer.channel_mix_lerp_k = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);
5389
5390
0
                        layer.channel_mix_key = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size}, 0);
5391
0
                        layer.channel_mix_value = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd}, 0);
5392
0
                    }
5393
5394
0
                } break;
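The w1/w2 (and a1/a2, v1/v2, g1/g2) pairs in the RWKV7 case above are low-rank factorizations: an {n_embd, r} projection followed by an {r, n_embd} expansion, with w0/a0/v0 as per-channel offsets. A rough parameter-count comparison with assumed sizes (n_embd and the rank are illustrative, not from a real model) shows the saving:

    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t n_embd       = 2048; // assumed
        const int64_t n_lora_decay = 64;   // assumed rank

        const int64_t full_matrix = n_embd * n_embd;         // 4,194,304 parameters
        const int64_t low_rank    = n_embd * n_lora_decay    // w1
                                  + n_lora_decay * n_embd    // w2
                                  + n_embd;                  // w0 per-channel offset

        std::printf("full: %lld params, low-rank: %lld params\n",
                    (long long) full_matrix, (long long) low_rank);
        return 0;
    }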
5395
0
            case LLM_ARCH_ARWKV7:
5396
0
                {
5397
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5398
5399
                    // output
5400
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5401
0
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
5402
5403
0
                    const int n_lora_decay = hparams.n_lora_decay;
5404
0
                    const int n_lora_iclr = hparams.n_lora_iclr;
5405
0
                    const int n_lora_value_res_mix = hparams.n_lora_value_res_mix;
5406
0
                    const int n_lora_gate = hparams.n_lora_gate;
5407
0
                    const int attn_hidden_size = n_embd;
5408
5409
0
                    for (int i = 0; i < n_layer; ++i) {
5410
0
                        auto & layer = layers[i];
5411
5412
0
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5413
5414
0
                        layer.time_mix_w0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W0, "weight", i), {n_embd}, 0);
5415
0
                        layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, n_lora_decay}, 0);
5416
0
                        layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {n_lora_decay, n_embd}, 0);
5417
5418
0
                        layer.time_mix_a0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A0, "weight", i), {n_embd}, 0);
5419
0
                        layer.time_mix_a1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A1, "weight", i), {n_embd, n_lora_iclr}, 0);
5420
0
                        layer.time_mix_a2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A2, "weight", i), {n_lora_iclr, n_embd}, 0);
5421
5422
0
                        if (i == 0) {
5423
                            // actually not used
5424
0
                            layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
5425
0
                            layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_iclr}, 0);
5426
0
                            layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_iclr, n_embd}, 0);
5427
0
                        } else {
5428
0
                            layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
5429
0
                            layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_value_res_mix}, 0);
5430
0
                            layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_value_res_mix, n_embd}, 0);
5431
0
                        }
5432
5433
0
                        layer.time_mix_g1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", i), {n_embd, n_lora_gate}, TENSOR_NOT_REQUIRED);
5434
0
                        layer.time_mix_g2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", i), {n_lora_gate, n_embd}, TENSOR_NOT_REQUIRED);
5435
5436
0
                        try {
5437
0
                            layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 6}, 0);
5438
0
                        } catch(std::runtime_error & e) {
5439
                            // ARWKV models may not have gate tensors
5440
0
                            layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, 0);
5441
0
                        }
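The try/catch above is a shape fallback: the loader first asks for the 6-component fused lerp and, when that shape is not found, retries with the 5-component one used by ARWKV variants without gate tensors. A generic stand-in for the pattern (load_fused_lerp is a hypothetical helper, not a llama.cpp function):

    #include <cstdio>
    #include <stdexcept>

    static int load_fused_lerp(int n_components) {
        if (n_components != 5) {                 // pretend only the 5-way tensor exists
            throw std::runtime_error("tensor not found");
        }
        return n_components;
    }

    int main() {
        int components;
        try {
            components = load_fused_lerp(6);     // gated variant first
        } catch (const std::runtime_error &) {
            components = load_fused_lerp(5);     // ungated fallback (no gate tensors)
        }
        std::printf("fused lerp components: %d\n", components);
        return 0;
    }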
5442
5443
0
                        layer.time_mix_k_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_K, "weight", i), {attn_hidden_size}, 0);
5444
0
                        layer.time_mix_k_a = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_A, "weight", i), {attn_hidden_size}, 0);
5445
0
                        layer.time_mix_r_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_R_K, "weight", i), {attn_hidden_size}, 0);
5446
5447
0
                        layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
5448
0
                        layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
5449
0
                        layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
5450
5451
0
                        layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
5452
0
                        layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
5453
0
                        layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
5454
5455
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5456
5457
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
5458
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
5459
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
5460
0
                    }
5461
5462
0
                } break;
5463
0
            case LLM_ARCH_CHAMELEON:
5464
0
                {
5465
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5466
5467
                    // output
5468
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5469
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5470
                    // if output is NULL, init from the input tok embed
5471
0
                    if (output == NULL) {
5472
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5473
0
                    }
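This NULL check is the tied-embedding fallback that recurs throughout this function: when no dedicated output.weight is present, the token-embedding matrix is registered again with TENSOR_DUPLICATED and serves as the output head. A simplified illustration (fake_tensor is a stand-in type, not ggml_tensor):

    #include <cstdio>

    struct fake_tensor { const char * name; };

    int main() {
        fake_tensor   tok_embd = { "token_embd.weight" };
        fake_tensor * output   = nullptr;   // no dedicated "output.weight" in the file

        if (output == nullptr) {
            output = &tok_embd;             // reuse the embedding matrix as the head
        }
        std::printf("output head uses: %s\n", output->name);
        return 0;
    }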
5474
5475
0
                    for (int i = 0; i < n_layer; ++i) {
5476
0
                        auto & layer = layers[i];
5477
5478
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5479
0
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
5480
0
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
5481
0
                        layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i),  {n_embd_head_k, n_head}, TENSOR_NOT_REQUIRED);
5482
0
                        layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i),  {n_embd_head_k, n_head_kv}, TENSOR_NOT_REQUIRED);
5483
5484
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
5485
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
5486
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
5487
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
5488
5489
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5490
5491
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
5492
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
5493
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
5494
0
                    }
5495
0
                } break;
5496
0
            case LLM_ARCH_WAVTOKENIZER_DEC:
5497
0
                {
5498
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hparams.n_embd_features, n_vocab}, 0);
5499
5500
0
                    conv1d   = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, hparams.n_embd_features, hparams.posnet.n_embd}, 0);
5501
0
                    conv1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"),   {1, hparams.posnet.n_embd}, 0);
5502
5503
                    // posnet
5504
0
                    {
5505
0
                        const int64_t n_embd = hparams.posnet.n_embd;
5506
5507
0
                        for (uint32_t i = 0; i < hparams.posnet.n_layer; ++i) {
5508
0
                            auto & layer = layers[i].posnet;
5509
5510
                            // posnet:
5511
                            //
5512
                            //  - resnet
5513
                            //  - resnet
5514
                            //  - attn
5515
                            //  - resnet
5516
                            //  - resnet
5517
                            //  - norm
5518
                            //
5519
0
                            switch (i) {
5520
0
                                case 0:
5521
0
                                case 1:
5522
0
                                case 3:
5523
0
                                case 4:
5524
0
                                    {
5525
0
                                        layer.norm1   = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", i), {1, n_embd}, 0);
5526
0
                                        layer.norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias",   i), {1, n_embd}, 0);
5527
5528
0
                                        layer.conv1   = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", i), {3, n_embd, n_embd}, 0);
5529
0
                                        layer.conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias",   i), {1, n_embd}, 0);
5530
5531
0
                                        layer.norm2   = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", i), {1, n_embd}, 0);
5532
0
                                        layer.norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias",   i), {1, n_embd}, 0);
5533
5534
0
                                        layer.conv2   = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", i), {3, n_embd, n_embd}, 0);
5535
0
                                        layer.conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias",   i), {1, n_embd}, 0);
5536
0
                                    } break;
5537
0
                                case 2:
5538
0
                                    {
5539
0
                                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
5540
0
                                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias",   i), {1, n_embd}, 0);
5541
5542
0
                                        layer.attn_q      = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q,    "weight", i), {1, n_embd, n_embd}, 0);
5543
0
                                        layer.attn_q_b    = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q,    "bias",   i), {1, n_embd}, 0);
5544
5545
0
                                        layer.attn_k      = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K,    "weight", i), {1, n_embd, n_embd}, 0);
5546
0
                                        layer.attn_k_b    = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K,    "bias",   i), {1, n_embd}, 0);
5547
5548
0
                                        layer.attn_v      = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V,    "weight", i), {1, n_embd, n_embd}, 0);
5549
0
                                        layer.attn_v_b    = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V,    "bias",   i), {1, n_embd}, 0);
5550
5551
0
                                        layer.attn_o      = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT,  "weight", i), {1, n_embd, n_embd}, 0);
5552
0
                                        layer.attn_o_b    = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT,  "bias",   i), {1, n_embd}, 0);
5553
0
                                    } break;
5554
0
                                case 5:
5555
0
                                    {
5556
0
                                        layer.norm   = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
5557
0
                                        layer.norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias",   i), {1, n_embd}, 0);
5558
0
                                    } break;
5559
0
                                default: GGML_ABORT("unknown posnet layer");
5560
0
                            };
5561
0
                        }
5562
0
                    }
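The switch above implements the six-stage posnet schedule spelled out in the comment: two resnet blocks, an attention block, two more resnet blocks, then a final norm. Restated as data, under the same indexing:

    #include <cstdio>

    int main() {
        // indices 0,1,3,4 = resnet blocks, 2 = attention block, 5 = final norm
        const char * kind[6] = { "resnet", "resnet", "attn", "resnet", "resnet", "norm" };
        for (int i = 0; i < 6; ++i) {
            std::printf("posnet layer %d: %s\n", i, kind[i]);
        }
        return 0;
    }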
5563
5564
0
                    GGML_ASSERT(hparams.posnet.n_embd == hparams.convnext.n_embd);
5565
5566
0
                    tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {hparams.posnet.n_embd}, 0);
5567
0
                    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"),   {hparams.posnet.n_embd}, 0);
5568
5569
                    // convnext
5570
0
                    {
5571
0
                        const int64_t n_embd = hparams.convnext.n_embd;
5572
5573
0
                        for (uint32_t i = 0; i < hparams.convnext.n_layer; ++i) {
5574
0
                            auto & layer = layers[i].convnext;
5575
5576
0
                            layer.dw     = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW,    "weight", i), {7, 1, n_embd}, 0);
5577
0
                            layer.dw_b   = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW,    "bias",   i), {1, n_embd}, 0);
5578
5579
0
                            layer.norm   = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM,  "weight", i), {n_embd}, 0);
5580
0
                            layer.norm_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM,  "bias",   i), {n_embd}, 0);
5581
5582
0
                            layer.pw1    = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1,   "weight", i), {n_embd, n_ff}, 0);
5583
0
                            layer.pw1_b  = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1,   "bias",   i), {n_ff}, 0);
5584
5585
0
                            layer.pw2    = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2,   "weight", i), {n_ff, n_embd}, 0);
5586
0
                            layer.pw2_b  = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2,   "bias",   i), {n_embd}, 0);
5587
5588
0
                            layer.gamma  = create_tensor(tn(LLM_TENSOR_CONVNEXT_GAMMA, "weight", i), {n_embd}, 0);
5589
0
                        }
5590
5591
                        // output
5592
0
                        output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5593
0
                        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
5594
0
                    }
5595
5596
0
                    output   = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0);
5597
0
                    output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"),   {n_embd}, 0);
5598
0
                } break;
5599
0
            case LLM_ARCH_BAILINGMOE:
5600
0
                {
5601
0
                    const int64_t n_ff_exp            = hparams.n_ff_exp;
5602
0
                    const int64_t n_expert_shared     = hparams.n_expert_shared;
5603
5604
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5605
5606
                    // output
5607
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5608
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
5609
5610
0
                    for (int i = 0; i < n_layer; ++i) {
5611
0
                        auto & layer = layers[i];
5612
5613
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5614
5615
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_head * n_rot}, 0);
5616
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_head_kv * n_rot}, 0);
5617
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_head_kv * n_rot}, 0);
5618
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0);
5619
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5620
5621
0
                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
5622
5623
0
                        if (n_expert == 0) {
5624
0
                            throw std::runtime_error("n_expert must be > 0");
5625
0
                        }
5626
0
                        if (n_expert_used == 0) {
5627
0
                            throw std::runtime_error("n_expert_used must be > 0");
5628
0
                        }
5629
5630
0
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
5631
0
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
5632
0
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
5633
5634
0
                        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
5635
0
                        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {        n_ff_exp * n_expert_shared, n_embd}, 0);
5636
0
                        layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
5637
0
                    }
5638
0
                } break;
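In the BAILINGMOE case above the shared-expert FFN tensors are sized as n_ff_exp * n_expert_shared, i.e. the shared experts are stored fused into one wider FFN. A quick check with assumed values (both numbers are illustrative only):

    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t n_ff_exp        = 1408; // assumed per-expert FFN width
        const int64_t n_expert_shared = 2;    // assumed number of shared experts

        const int64_t n_ff_shexp = n_ff_exp * n_expert_shared; // 2816
        std::printf("ffn_*_shexp width = %lld\n", (long long) n_ff_shexp);
        return 0;
    }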
5639
0
            case LLM_ARCH_BAILINGMOE2:
5640
0
                {
5641
0
                    const int64_t n_ff_exp        = hparams.n_ff_exp;
5642
0
                    const int64_t n_expert_shared = hparams.n_expert_shared;
5643
5644
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5645
5646
                    // output
5647
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5648
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
5649
5650
0
                    GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for bailingmoe2");
5651
0
                    GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for bailingmoe2");
5652
5653
0
                    for (int i = 0; i < n_layer; ++i) {
5654
0
                        int flags = 0;
5655
0
                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
5656
                            // skip all tensors in the NextN layers
5657
0
                            flags |= TENSOR_SKIP;
5658
0
                        }
5659
5660
0
                        auto & layer = layers[i];
5661
5662
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags);
5663
5664
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, flags);
5665
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, flags);
5666
5667
0
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, flags);
5668
0
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, flags);
5669
5670
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);
5671
5672
0
                        if (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead) { // MoE layers
5673
0
                            const int64_t n_ff_shexp = (hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff_exp) * n_expert_shared;
5674
5675
0
                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, flags);
5676
0
                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED | flags);
5677
5678
0
                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, flags);
5679
0
                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, flags);
5680
0
                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, flags);
5681
5682
0
                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, flags);
5683
0
                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, flags);
5684
0
                            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_shexp}, flags);
5685
0
                        } else { // Dense layers
5686
0
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, flags);
5687
0
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, flags);
5688
0
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, flags);
5689
0
                        }
5690
5691
                        // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
5692
0
                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
5693
0
                            layer.nextn.eh_proj          = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
5694
0
                            layer.nextn.embed_tokens     = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED | flags);
5695
0
                            layer.nextn.enorm            = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
5696
0
                            layer.nextn.hnorm            = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
5697
0
                            layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED | flags);
5698
0
                            layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, TENSOR_NOT_REQUIRED | flags);
5699
0
                            layer.layer_out_norm         = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, flags);
5700
0
                        }
5701
0
                    }
5702
0
                } break;
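For BAILINGMOE2 the loop above ORs TENSOR_SKIP into the flags of the trailing hparams.nextn_predict_layers layers, so the NextN/MTP tensors in those layers are preserved but skipped, as the comments note. A standalone check of which layer indices that condition selects (layer counts are assumptions):

    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint32_t n_layer              = 32; // assumed
        const uint32_t nextn_predict_layers = 1;  // assumed

        for (uint32_t i = 0; i < n_layer; ++i) {
            if (nextn_predict_layers > 0 && i >= n_layer - nextn_predict_layers) {
                std::printf("layer %u: NextN layer, tensors skipped\n", i);
            }
        }
        return 0;
    }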
5703
0
            case LLM_ARCH_DOTS1:
5704
0
                {
5705
0
                    const int64_t n_ff_exp        = hparams.n_ff_exp;
5706
0
                    const int64_t n_expert_shared = hparams.n_expert_shared;
5707
5708
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5709
5710
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5711
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
5712
5713
0
                    for (int i = 0; i < n_layer; ++i) {
5714
0
                        auto & layer = layers[i];
5715
5716
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5717
5718
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
5719
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
5720
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
5721
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
5722
5723
0
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
5724
0
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
5725
5726
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5727
5728
0
                        if (i < (int) hparams.n_layer_dense_lead) {
5729
0
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
5730
0
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
5731
0
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
5732
0
                        } else {
5733
0
                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
5734
0
                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
5735
5736
0
                            if (n_expert == 0) {
5737
0
                                throw std::runtime_error("n_expert must be > 0");
5738
0
                            }
5739
0
                            if (n_expert_used == 0) {
5740
0
                                throw std::runtime_error("n_expert_used must be > 0");
5741
0
                            }
5742
5743
                            // MoE branch
5744
0
                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
5745
0
                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
5746
0
                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
5747
5748
                            // Shared expert branch
5749
0
                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
5750
0
                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {        n_ff_exp * n_expert_shared, n_embd}, 0);
5751
0
                            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
5752
0
                        }
5753
0
                    }
5754
0
                } break;
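DOTS1 (like BAILINGMOE2, AFMOE and ERNIE4_5_MOE in this function) splits the stack by hparams.n_layer_dense_lead: the first n_layer_dense_lead layers get a plain gate/down/up FFN, every layer after that switches to the routed-expert tensors. A small sketch of the split (both counts are assumptions):

    #include <cstdio>

    int main() {
        const int n_layer            = 27; // assumed
        const int n_layer_dense_lead = 1;  // assumed

        for (int i = 0; i < n_layer; ++i) {
            const bool dense = i < n_layer_dense_lead;
            if (i < 3 || i == n_layer - 1) { // print a few representative layers
                std::printf("layer %2d: %s FFN\n", i, dense ? "dense" : "MoE");
            }
        }
        return 0;
    }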
5755
0
            case LLM_ARCH_ARCEE:
5756
0
                {
5757
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5758
5759
                    // output
5760
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5761
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5762
5763
                    // if output is NULL, init from the input tok embed
5764
0
                    if (output == NULL) {
5765
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5766
0
                    }
5767
5768
0
                    for (int i = 0; i < n_layer; ++i) {
5769
0
                        auto & layer = layers[i];
5770
5771
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5772
5773
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
5774
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
5775
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
5776
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
5777
5778
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5779
5780
0
                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
5781
5782
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
5783
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
5784
0
                    }
5785
0
                } break;
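The rope_freqs line in ARCEE (and again in FALCON_H1 below) combines two loader flags with bitwise OR: the tensor is always optional, and for every layer past the first it is additionally flagged TENSOR_DUPLICATED. A minimal bit-flag sketch of that combination (the enum values here are illustrative, not llama.cpp's actual constants):

    #include <cstdio>

    enum tensor_flag_sketch : int {
        SKETCH_NOT_REQUIRED = 1 << 0,
        SKETCH_DUPLICATED   = 1 << 1,
    };

    int main() {
        for (int i = 0; i < 3; ++i) {
            const int flags = SKETCH_NOT_REQUIRED |
                              (i != 0 ? SKETCH_DUPLICATED : 0);
            std::printf("layer %d: flags = 0x%x\n", i, flags);
        }
        return 0;
    }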
5786
0
            case LLM_ARCH_AFMOE:
5787
0
                {
5788
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5789
5790
                    // output
5791
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5792
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5793
5794
                    // if output is NULL, init from the input tok embed
5795
0
                    if (output == NULL) {
5796
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5797
0
                    }
5798
5799
0
                    const int64_t n_ff_exp = hparams.n_ff_exp;
5800
0
                    const int64_t n_expert_shared = hparams.n_expert_shared;
5801
5802
0
                    for (int i = 0; i < n_layer; ++i) {
5803
0
                        auto & layer = layers[i];
5804
5805
                        // dual attention normalization
5806
0
                        layer.attn_norm      = create_tensor(tn(LLM_TENSOR_ATTN_NORM,      "weight", i), {n_embd}, 0);
5807
0
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
5808
5809
                        // attention projections
5810
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
5811
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
5812
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
5813
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
5814
5815
                        // Q/K normalization
5816
0
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
5817
0
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
5818
5819
                        // attention gating
5820
0
                        layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
5821
5822
                        // dual ffn normalization
5823
0
                        layer.ffn_norm      = create_tensor(tn(LLM_TENSOR_FFN_NORM,      "weight", i), {n_embd}, 0);
5824
0
                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
5825
5826
0
                        if (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead) {
5827
                            // MoE layers
5828
0
                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
5829
0
                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);
5830
5831
                            // grouped expert weights
5832
0
                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
5833
0
                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
5834
0
                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
5835
5836
                            // shared expert
5837
0
                            if (n_expert_shared > 0) {
5838
0
                                const int64_t n_ff_shexp = n_ff_exp * n_expert_shared;
5839
0
                                layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
5840
0
                                layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
5841
0
                                layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_shexp}, 0);
5842
0
                            }
5843
0
                        } else {
5844
                            // Dense layers
5845
0
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
5846
0
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
5847
0
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
5848
0
                        }
5849
0
                    }
5850
0
                } break;
5851
0
            case LLM_ARCH_ERNIE4_5:
5852
0
            case LLM_ARCH_ERNIE4_5_MOE:
5853
0
                {
5854
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5855
5856
                    // output
5857
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5858
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5859
                    // if output is NULL, init from the input tok embed
5860
0
                    if (output == NULL) {
5861
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5862
0
                    }
5863
5864
0
                    for (int i = 0; i < n_layer; ++i) {
5865
0
                        auto & layer = layers[i];
5866
5867
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5868
5869
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
5870
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
5871
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
5872
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
5873
5874
                        // optional bias tensors
5875
0
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);
5876
0
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
5877
0
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
5878
0
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);
5879
5880
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5881
5882
0
                        if (arch == LLM_ARCH_ERNIE4_5_MOE && static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead) { // MoE layers
5883
0
                            int n_ff_exp = hparams.n_ff_exp;
5884
5885
0
                            layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
5886
0
                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
5887
0
                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
5888
0
                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff_exp, n_embd, n_expert}, 0);
5889
0
                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff_exp, n_expert}, 0);
5890
5891
                            // Shared expert (if present)
5892
0
                            if (hparams.n_ff_shexp > 0) {
5893
0
                                layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {    n_embd, hparams.n_ff_shexp}, 0);
5894
0
                                layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd    }, 0);
5895
0
                                layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {    n_embd, hparams.n_ff_shexp}, 0);
5896
0
                            }
5897
0
                        } else { // Dense layers
5898
0
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
5899
0
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
5900
0
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
5901
0
                        }
5902
0
                    }
5903
0
                } break;
5904
0
            case LLM_ARCH_FALCON_H1:
5905
0
                {
5906
                    // Common
5907
0
                    const int64_t hidden_size = hparams.n_embd; // hidden_size
5908
5909
                    // mamba2 Mixer SSM params
5910
0
                    const int64_t ssm_conv_kernel_size  = hparams.ssm_d_conv; // ssm_conv_kernel_size
5911
0
                    const int64_t ssm_n_groups          = hparams.ssm_n_group; // ssm_n_groups
5912
0
                    const int64_t ssm_state_size        = hparams.ssm_d_state; // ssm_state_size
5913
0
                    const int64_t ssm_intermediate_size = hparams.ssm_d_inner; // TODO expand
5914
0
                    const int64_t ssm_num_heads         = hparams.ssm_dt_rank; // ssm_num_heads
5915
0
                    const int64_t ssm_conv_dim          = ssm_intermediate_size + 2 * ssm_n_groups * ssm_state_size;
5916
0
                    const int64_t ssm_projection_size   = ssm_intermediate_size + ssm_conv_dim + ssm_num_heads;
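The two derived sizes above set the conv1d width and the ssm_in projection width for the Mamba-2 mixer. A worked example with hypothetical Falcon-H1-style hyperparameters (all numbers assumed, not read from a real model):

    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t d_inner  = 4096; // ssm_intermediate_size (assumed)
        const int64_t d_state  = 128;  // ssm_state_size (assumed)
        const int64_t n_groups = 1;    // ssm_n_groups (assumed)
        const int64_t n_heads  = 64;   // ssm_num_heads (assumed)

        const int64_t conv_dim  = d_inner + 2 * n_groups * d_state; // 4096 + 256  = 4352
        const int64_t proj_size = d_inner + conv_dim + n_heads;     // 4096 + 4352 + 64 = 8512

        std::printf("ssm_conv_dim = %lld, ssm_projection_size = %lld\n",
                    (long long) conv_dim, (long long) proj_size);
        return 0;
    }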
5917
5918
                    // attn params
5919
0
                    const int64_t attn_num_attention_head = hparams.n_head(0); // attn_num_attention_head
5920
0
                    const int64_t attn_num_key_value_head = hparams.n_head_kv(0);
5921
5922
                    // ffn params
5923
0
                    const int64_t ffn_intermediate_size = hparams.n_ff(0);
5924
5925
                    // embeddings
5926
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, 0);
5927
5928
                    // output
5929
0
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hidden_size, n_vocab}, TENSOR_NOT_REQUIRED);
5930
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {hidden_size}, 0);
5931
5932
                    // if output is NULL, init from the input tok embed
5933
0
                    if (output == NULL) {
5934
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, TENSOR_DUPLICATED);
5935
0
                    }
5936
5937
0
                    for (int i = 0; i < n_layer; ++i) {
5938
0
                        auto & layer = layers[i];
5939
5940
                        /*SSM LAYERS*/
5941
                        // ssm in
5942
0
                        layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {hidden_size, ssm_projection_size}, 0);
5943
                        // ssm 1d conv
5944
0
                        layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {ssm_conv_kernel_size, ssm_conv_dim}, 0);
5945
0
                        layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {ssm_conv_dim}, TENSOR_NOT_REQUIRED);
5946
                        // ssm_dt
5947
0
                        layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {ssm_num_heads}, 0);
5948
                        // no "weight" suffix for these
5949
0
                        layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, ssm_num_heads}, 0);
5950
0
                        layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, ssm_num_heads}, 0);
5951
                        // ssm_norm
5952
0
                        layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {ssm_intermediate_size / ssm_n_groups, ssm_n_groups}, TENSOR_NOT_REQUIRED);
5953
                        // out_proj
5954
0
                        layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {ssm_intermediate_size, hidden_size}, 0);
5955
5956
                        /*ATTENTION LAYERS*/
5957
                        // attention layers (with optional bias)
5958
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {hidden_size, n_embd_head_k * attn_num_attention_head}, 0);
5959
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {hidden_size, attn_num_key_value_head * n_embd_head_k}, 0);
5960
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {hidden_size, attn_num_key_value_head * n_embd_head_v}, 0);
5961
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * attn_num_attention_head, hidden_size}, 0);
5962
0
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
5963
0
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {attn_num_key_value_head * n_embd_head_k}, TENSOR_NOT_REQUIRED);
5964
0
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {attn_num_key_value_head * n_embd_head_v}, TENSOR_NOT_REQUIRED);
5965
0
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
5966
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {hidden_size}, 0);
5967
5968
5969
                        // feed forward (w/ optional biases)
5970
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, i), {hidden_size}, 0);
5971
0
                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
5972
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {hidden_size,   ffn_intermediate_size}, 0);
5973
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  ffn_intermediate_size, hidden_size}, 0);
5974
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {hidden_size,   ffn_intermediate_size}, 0);
5975
5976
0
                        layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {ffn_intermediate_size}, TENSOR_NOT_REQUIRED);
5977
0
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
5978
0
                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {ffn_intermediate_size}, TENSOR_NOT_REQUIRED);
5979
0
                    }
5980
0
                } break;
5981
0
            case LLM_ARCH_HUNYUAN_MOE:
5982
0
                {
5983
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5984
5985
                    // output
5986
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5987
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5988
                    // if output is NULL, init from the input tok embed
5989
0
                    if (output == NULL) {
5990
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5991
0
                    }
5992
5993
0
                    for (int i = 0; i < n_layer; ++i) {
5994
0
                        auto & layer = layers[i];
5995
5996
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5997
5998
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
5999
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
6000
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
6001
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
6002
6003
0
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
6004
0
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
6005
6006
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
6007
6008
0
                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
6009
0
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff, n_expert}, 0);
6010
0
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert}, 0);
6011
0
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert}, 0);
6012
6013
0
                        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
6014
0
                        layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
6015
0
                        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
6016
0
                    }
6017
0
                } break;
6018
0
            case LLM_ARCH_HUNYUAN_DENSE:
6019
0
                {
6020
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6021
6022
                    // output
6023
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6024
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
6025
                    // if output is NULL, init from the input tok embed
6026
0
                    if (output == NULL) {
6027
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
6028
0
                    }
6029
6030
0
                    for (int i = 0; i < n_layer; ++i) {
6031
0
                        auto & layer = layers[i];
6032
6033
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
6034
6035
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
6036
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
6037
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
6038
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
6039
6040
0
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
6041
0
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
6042
6043
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
6044
6045
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
6046
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
6047
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
6048
6049
0
                    }
6050
0
                } break;
6051
0
            case LLM_ARCH_SMOLLM3:
6052
0
                {
6053
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6054
6055
                    // output
6056
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6057
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
6058
6059
                    // if output is NULL, init from the input tok embed
6060
0
                    if (output == NULL) {
6061
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
6062
0
                    }
6063
6064
0
                    for (int i = 0; i < n_layer; ++i) {
6065
0
                        auto & layer = layers[i];
6066
6067
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
6068
6069
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
6070
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
6071
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
6072
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
6073
6074
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
6075
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
6076
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
6077
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
6078
0
                    }
6079
0
                } break;
6080
0
            case LLM_ARCH_OPENAI_MOE:
6081
0
                {
6082
0
                    const int64_t n_ff_exp = hparams.n_ff_exp;
6083
6084
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6085
6086
                    // output
6087
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6088
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
6089
6090
0
                    for (int i = 0; i < n_layer; ++i) {
6091
0
                        auto & layer = layers[i];
6092
6093
0
                        layer.attn_norm      = create_tensor(tn(LLM_TENSOR_ATTN_NORM,      "weight", i), {n_embd}, 0);
6094
0
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
6095
6096
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_head * n_rot}, 0);
6097
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_head_kv * n_rot}, 0);
6098
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_head_kv * n_rot}, 0);
6099
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0);
6100
6101
0
                        layer.attn_sinks = create_tensor(tn(LLM_TENSOR_ATTN_SINKS, "weight", i), {n_head}, 0);
6102
6103
0
                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {  n_embd, n_expert}, 0);
6104
0
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
6105
0
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
6106
0
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
6107
6108
                        // bias
6109
0
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_head * n_rot}, 0);
6110
0
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_head_kv * n_rot}, 0);
6111
0
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_head_kv * n_rot}, 0);
6112
0
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
6113
6114
0
                        layer.ffn_gate_inp_b  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "bias", i), {n_expert}, 0);
6115
0
                        layer.ffn_gate_exps_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "bias", i), {n_ff_exp, n_expert}, 0);
6116
0
                        layer.ffn_down_exps_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "bias", i), {  n_embd, n_expert}, 0);
6117
0
                        layer.ffn_up_exps_b   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "bias", i), {n_ff_exp, n_expert}, 0);
6118
0
                    }
6119
0
                } break;
6120
0
            case LLM_ARCH_LFM2:
6121
0
            case LLM_ARCH_LFM2MOE:
6122
0
                {
6123
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,      "weight"), {n_embd, n_vocab}, 0);
6124
0
                    tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
6125
0
                    output   = create_tensor(tn(LLM_TENSOR_OUTPUT,          "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
6126
6127
0
                    if (output == NULL) {
6128
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
6129
0
                    }
6130
6131
0
                    for (int i = 0; i < n_layer; ++i) {
6132
0
                        auto & layer = layers[i];
6133
6134
0
                        const bool is_moe_layer = i >= static_cast<int>(hparams.n_layer_dense_lead);
6135
6136
                        // the ffn/moe block is the same for transformer and conv layers
6137
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
6138
0
                        if (is_moe_layer) {
6139
0
                            GGML_ASSERT(n_expert && n_expert_used);
6140
0
                            layer.ffn_gate_inp    = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i),  {n_embd, n_expert}, 0);
6141
0
                            layer.ffn_gate_exps   = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, hparams.n_ff_exp, n_expert}, 0);
6142
0
                            layer.ffn_down_exps   = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {hparams.n_ff_exp,   n_embd, n_expert}, 0);
6143
0
                            layer.ffn_up_exps     = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i),   {n_embd, hparams.n_ff_exp, n_expert}, 0);
6144
0
                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);
6145
0
                        } else {  // dense
6146
0
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
6147
0
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
6148
0
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
6149
0
                        }
6150
6151
                        // for operator_norm
6152
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
6153
6154
0
                        if (!hparams.is_recurrent(i)) {
6155
0
                            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
6156
0
                            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
6157
0
                            GGML_ASSERT(n_embd_v_gqa == n_embd_k_gqa);
6158
6159
0
                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
6160
0
                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, hparams.n_embd_k_gqa(i)}, 0);
6161
0
                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, hparams.n_embd_v_gqa(i)}, 0);
6162
6163
0
                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
6164
0
                        } else {
6165
0
                            layer.shortconv.conv     = create_tensor(tn(LLM_TENSOR_SHORTCONV_CONV,    "weight", i), {hparams.n_shortconv_l_cache, n_embd}, 0);
6166
0
                            layer.shortconv.in_proj  = create_tensor(tn(LLM_TENSOR_SHORTCONV_INPROJ,  "weight", i), {n_embd, 3 * n_embd}, 0);
6167
0
                            layer.shortconv.out_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_OUTPROJ, "weight", i), {n_embd, n_embd}, 0);
6168
0
                        }
6169
0
                    }
6170
0
                } break;
6171
0
            case LLM_ARCH_SMALLTHINKER:
6172
0
                {
6173
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
6174
6175
                    // output
6176
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
6177
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
6178
6179
                    // if output is NULL, init from the input tok embed
6180
0
                    if (output == NULL) {
6181
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
6182
0
                    }
6183
6184
0
                    for (int i = 0; i < n_layer; ++i) {
6185
0
                        auto & layer = layers[i];
6186
6187
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
6188
6189
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
6190
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
6191
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
6192
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
6193
6194
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
6195
6196
0
                        GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for SMALLTHINKER");
6197
0
                        GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for SMALLTHINKER");
6198
6199
                        // MoE branch
6200
0
                        const int64_t n_ff_exp = hparams.n_ff_exp;
6201
0
                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0);
6202
0
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
6203
0
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
6204
0
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
6205
0
                    }
6206
0
                } break;
6207
0
            case LLM_ARCH_GROVEMOE:
6208
0
                {
6209
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6210
6211
                    // output
6212
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6213
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
6214
                    // if output is NULL, init from the input tok embed
6215
0
                    if (output == NULL) {
6216
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
6217
0
                    }
6218
6219
0
                    GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for GROVEMOE");
6220
0
                    GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for GROVEMOE");
6221
0
                    GGML_ASSERT(hparams.n_group_experts > 0 && "n_group_experts must be > 0 for GROVEMOE");
6222
6223
0
                    for (int i = 0; i < n_layer; ++i) {
6224
0
                        auto & layer = layers[i];
6225
6226
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
6227
6228
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
6229
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
6230
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
6231
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
6232
6233
0
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
6234
0
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
6235
6236
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
6237
6238
0
                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
6239
6240
                        // MoE branch
6241
0
                        const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
6242
0
                        const int64_t n_ff_chexp = hparams.n_ff_chexp ? hparams.n_ff_chexp : n_embd_head_k;
6243
0
                        const int64_t n_chunk_expert = n_expert / hparams.n_group_experts;
6244
6245
0
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
6246
0
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
6247
0
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
6248
6249
0
                        layer.ffn_gate_chexps = create_tensor(tn(LLM_TENSOR_FFN_GATE_CHEXPS, "weight", i), {  n_embd, n_ff_chexp, n_chunk_expert}, 0);
6250
0
                        layer.ffn_down_chexps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_CHEXPS, "weight", i), {n_ff_chexp,   n_embd, n_chunk_expert}, 0);
6251
0
                        layer.ffn_up_chexps   = create_tensor(tn(LLM_TENSOR_FFN_UP_CHEXPS,   "weight", i), {  n_embd, n_ff_chexp, n_chunk_expert}, 0);
6252
0
                    }
6253
0
                } break;
6254
0
            case LLM_ARCH_APERTUS:
6255
0
                {
6256
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
6257
6258
                    // output
6259
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
6260
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), { n_embd, n_vocab }, 0);
6261
6262
0
                    for (int i = 0; i < n_layer; ++i) {
6263
0
                        auto & layer = layers[i];
6264
6265
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
6266
6267
0
                        if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
6268
0
                            layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
6269
0
                            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
6270
0
                        } else {
6271
0
                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
6272
0
                        }
6273
6274
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
6275
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), { n_embd, n_embd_gqa }, 0);
6276
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), { n_embd, n_embd_gqa }, 0);
6277
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
6278
6279
                        // optional bias tensors
6280
0
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), { n_embd },     TENSOR_NOT_REQUIRED);
6281
0
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), { n_embd_gqa }, TENSOR_NOT_REQUIRED);
6282
0
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), { n_embd_gqa }, TENSOR_NOT_REQUIRED);
6283
0
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd },     TENSOR_NOT_REQUIRED);
6284
6285
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
6286
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
6287
0
                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
6288
6289
                        // Q and K layernorms for Apertus
6290
0
                        layer.attn_q_norm   = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
6291
0
                        layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias",   i), { n_embd_head_k }, TENSOR_NOT_REQUIRED);
6292
0
                        layer.attn_k_norm   = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
6293
0
                        layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias",   i), { n_embd_head_k }, TENSOR_NOT_REQUIRED);
6294
0
                    }
6295
0
                } break;
6296
0
            case LLM_ARCH_MINIMAX_M2:
6297
0
                {
6298
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6299
6300
                    // output
6301
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6302
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
6303
6304
0
                    for (int i = 0; i < n_layer; ++i) {
6305
0
                        auto & layer = layers[i];
6306
6307
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
6308
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
6309
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
6310
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
6311
6312
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
6313
0
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k * n_head}, 0);
6314
0
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_k_gqa}, 0);
6315
6316
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
6317
6318
0
                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
6319
0
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff,   n_expert}, 0);
6320
0
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff,   n_embd, n_expert}, 0);
6321
0
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff,   n_expert}, 0);
6322
0
                        layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);
6323
0
                    }
6324
0
                } break;
6325
0
            case LLM_ARCH_COGVLM:
6326
0
                {
6327
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6328
6329
                    // output
6330
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6331
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
6332
6333
                    // if output is NULL, init from the input tok embed
6334
0
                    if (output == NULL) {
6335
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
6336
0
                    }
6337
6338
0
                    for (int i = 0; i < n_layer; ++i) {
6339
0
                        auto & layer = layers[i];
6340
6341
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
6342
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd_head_k * n_head * 3}, 0);
6343
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
6344
6345
0
                        layer.visexp_attn_wqkv = create_tensor(tn(LLM_TENSOR_VISEXP_ATTN_QKV, "weight", i), {n_embd, n_embd_head_k * n_head * 3}, 0);
6346
0
                        layer.visexp_attn_wo = create_tensor(tn(LLM_TENSOR_VISEXP_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
6347
6348
0
                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
6349
6350
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
6351
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
6352
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
6353
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
6354
6355
0
                        layer.visexp_ffn_gate = create_tensor(tn(LLM_TENSOR_VISEXP_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
6356
0
                        layer.visexp_ffn_down = create_tensor(tn(LLM_TENSOR_VISEXP_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
6357
0
                        layer.visexp_ffn_up   = create_tensor(tn(LLM_TENSOR_VISEXP_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
6358
0
                    }
6359
0
                } break;
6360
0
            case LLM_ARCH_PANGU_EMBED:
6361
0
                {
6362
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6363
6364
                    // output
6365
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6366
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
6367
6368
                    // if output is NULL, init from the input tok embed
6369
0
                    if (output == NULL) {
6370
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
6371
0
                    }
6372
6373
0
                    for (int i = 0; i < n_layer; ++i) {
6374
0
                        auto & layer = layers[i];
6375
6376
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
6377
6378
                        // weight tensors
6379
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
6380
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
6381
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
6382
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
6383
6384
                        // bias tensors
6385
0
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd_head_k * n_head}, 0);
6386
0
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, 0);
6387
0
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, 0);
6388
0
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
6389
6390
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
6391
6392
0
                        if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
6393
0
                            layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
6394
0
                            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
6395
0
                        } else {
6396
0
                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
6397
0
                        }
6398
6399
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
6400
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
6401
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
6402
0
                    }
6403
0
                } break;
6404
0
            default:
6405
0
                throw std::runtime_error("unknown architecture");
6406
0
        }
6407
6408
0
        if (n_moved_tensors > 0) {
6409
0
            LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %d others) cannot be used with preferred buffer type %s, using %s instead\n",
6410
0
                __func__, first_moved_tensor->name, ggml_type_name(first_moved_tensor->type), n_moved_tensors - 1,
6411
0
                ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft));
6412
0
        }
6413
0
    }
6414
6415
0
    ml.done_getting_tensors();
6416
6417
0
    ml.init_mappings(true, use_mlock ? &pimpl->mlock_mmaps : nullptr);
6418
0
    pimpl->mappings.reserve(ml.mappings.size());
6419
6420
    // create the backend buffers
6421
0
    std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_buf_maps;
6422
0
    ctx_buf_maps.reserve(ctx_map.size());
6423
6424
    // ensure we have enough capacity for the maximum number of backend buffers we might create
6425
0
    const size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
6426
0
    pimpl->ctxs_bufs.reserve(n_max_backend_buffer);
6427
6428
0
    for (auto & [buft, ctx_ptr] : ctx_map) {
6429
0
        ggml_context * ctx = ctx_ptr.get();
6430
6431
        // skip contexts without tensors
6432
0
        if (ggml_get_first_tensor(ctx) == nullptr) {
6433
0
            continue;
6434
0
        }
6435
6436
0
        llama_buf_map buf_map;
6437
0
        buf_map.reserve(n_max_backend_buffer);
6438
6439
        // check if it is possible to use buffer_from_host_ptr with this buffer type
6440
0
        ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
6441
0
        if (!dev) {
6442
            // FIXME: workaround for CPU backend buft having a NULL device
6443
0
            dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
6444
0
            if (!dev) {
6445
0
                throw std::runtime_error(format("%s: no CPU backend found", __func__));
6446
0
            }
6447
0
        }
6448
0
        ggml_backend_dev_props props;
6449
0
        ggml_backend_dev_get_props(dev, &props);
6450
0
        bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
6451
0
        bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);
6452
6453
0
        std::vector<ggml_backend_buffer_ptr> bufs;
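        // two allocation strategies follow: if the file is mmapped and the device can wrap
        // host memory (buffer_from_host_ptr) with its default buffer type, expose the mapped
        // region directly as a backend buffer; otherwise allocate a regular backend buffer
        // for the context and optionally mlock it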
6454
0
        if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
6455
0
            for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
6456
                // only the mmap region containing the tensors in the model is mapped to the backend buffer
6457
                // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
6458
                // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
6459
0
                void * addr = nullptr;
6460
0
                size_t first, last; // NOLINT
6461
0
                ml.get_mapping_range(&first, &last, &addr, idx, ctx);
6462
0
                if (first >= last) {
6463
0
                    continue;
6464
0
                }
6465
0
                const size_t max_size = ggml_get_max_tensor_size(ctx);
6466
0
                ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
6467
0
                if (buf == nullptr) {
6468
0
                    throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
6469
0
                }
6470
0
                bufs.emplace_back(buf);
6471
0
                buf_map.emplace(idx, buf);
6472
0
            }
6473
0
        }
6474
0
        else {
6475
0
            ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
6476
0
            if (buf == nullptr) {
6477
0
                throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
6478
0
            }
6479
0
            if (use_mlock && ggml_backend_buffer_is_host(buf)) {
6480
0
                pimpl->mlock_bufs.emplace_back(new llama_mlock);
6481
0
                auto & mlock_buf = pimpl->mlock_bufs.back();
6482
0
                mlock_buf->init   (ggml_backend_buffer_get_base(buf));
6483
0
                mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
6484
0
            }
6485
0
            bufs.emplace_back(buf);
6486
0
            for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
6487
0
                buf_map.emplace(idx, buf);
6488
0
            }
6489
0
        }
6490
0
        pimpl->ctxs_bufs.emplace_back(std::move(ctx_ptr), std::move(bufs));
6491
6492
0
        for (auto & buf : buf_map) {
6493
            // indicate that this buffer contains weights
6494
            // this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
6495
0
            ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
6496
0
        }
6497
6498
0
        ctx_buf_maps.emplace_back(ctx, buf_map);
6499
0
    }
6500
6501
0
    if (llama_supports_gpu_offload()) {
6502
0
        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
6503
6504
0
        LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
6505
0
        if (n_gpu_layers > (int) hparams.n_layer) {
6506
0
            LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__);
6507
0
        }
6508
6509
0
        const int max_backend_supported_layers = hparams.n_layer + 1;
6510
0
        const int max_offloadable_layers       = hparams.n_layer + 1;
6511
6512
0
        LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
6513
0
    }
6514
6515
    // print memory requirements per buffer type
6516
0
    for (auto & [_, bufs] : pimpl->ctxs_bufs) {
6517
0
        for (auto & buf: bufs) {
6518
0
            LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n",
6519
0
                __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
6520
0
        }
6521
0
    }
6522
6523
    // populate tensors_by_name
6524
0
    for (auto & [ctx, _] : pimpl->ctxs_bufs) {
6525
0
        for (auto * cur = ggml_get_first_tensor(ctx.get()); cur != NULL; cur = ggml_get_next_tensor(ctx.get(), cur)) {
6526
0
            tensors_by_name.emplace_back(ggml_get_name(cur), cur);
6527
0
        }
6528
0
    }
6529
6530
    // load tensor data
6531
0
    for (auto & [ctx, buf_map] : ctx_buf_maps) {
6532
0
        if (!ml.load_all_data(ctx, buf_map, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
6533
0
            return false;
6534
0
        }
6535
0
    }
6536
6537
0
    if (use_mmap_buffer) {
6538
0
        for (auto & mapping : ml.mappings) {
6539
0
            pimpl->mappings.emplace_back(std::move(mapping));
6540
0
        }
6541
0
    }
6542
6543
0
    return true;
6544
0
}
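// Illustrative caller-side sketch (not part of this translation unit): the
// params.progress_callback forwarded to ml.load_all_data() above is the public
// llama_progress_callback from llama.h, and returning false from it aborts loading,
// which surfaces here as the `return false` path. The model path below is a placeholder.
//
//     #include "llama.h"
//     #include <cstdio>
//
//     static bool on_progress(float progress, void * /*user_data*/) {
//         fprintf(stderr, "\rloading: %3.0f%%", progress * 100.0f);
//         return true; // returning false cancels the load
//     }
//
//     int main() {
//         llama_backend_init();
//         llama_model_params mparams = llama_model_default_params();
//         mparams.progress_callback  = on_progress;
//         llama_model * model = llama_model_load_from_file("model.gguf", mparams); // placeholder path
//         if (model != nullptr) {
//             llama_model_free(model);
//         }
//         llama_backend_free();
//         return 0;
//     }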
6545
6546
0
std::string llama_model::arch_name() const {
6547
0
    return llm_arch_name(arch);
6548
0
}
6549
6550
0
std::string llama_model::type_name() const {
6551
0
    return llm_type_name(type);
6552
0
}
6553
6554
0
std::string llama_model::desc() const {
6555
0
    return pimpl->desc_str;
6556
0
}
6557
6558
0
size_t llama_model::size() const {
6559
0
    return pimpl->n_bytes;
6560
0
}
6561
6562
0
size_t llama_model::n_tensors() const {
6563
0
    return tensors_by_name.size();
6564
0
}
6565
6566
0
size_t llama_model::n_devices() const {
6567
0
    return devices.size();
6568
0
}
6569
6570
0
std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
6571
0
    std::map<ggml_backend_buffer_type_t, size_t> ret;
6572
0
    for (const auto & [_, bufs] : pimpl->ctxs_bufs) {
6573
0
        for (const auto & buf : bufs) {
6574
0
            ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
6575
0
        }
6576
0
    }
6577
0
    return ret;
6578
0
}
6579
6580
0
uint64_t llama_model::n_elements() const {
6581
0
    return pimpl->n_elements;
6582
0
}
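// A minimal sketch (assuming the public llama.h wrappers llama_model_desc(),
// llama_model_size() and llama_model_n_params(), which are defined elsewhere):
// the accessors above are what a caller sees when querying a loaded model.
//
//     #include "llama.h"
//     #include <cstdio>
//
//     static void print_model_summary(const llama_model * model) {
//         char desc[128];
//         llama_model_desc(model, desc, sizeof(desc));                      // -> llama_model::desc()
//         fprintf(stderr, "%s | size = %.2f GiB | params = %.2f B\n", desc,
//                 llama_model_size(model)     / (1024.0 * 1024.0 * 1024.0), // -> llama_model::size()
//                 llama_model_n_params(model) * 1e-9);                      // -> llama_model::n_elements()
//     }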
6583
6584
0
void llama_model::print_info() const {
6585
0
    const std::string rope_scaling_type = llama_rope_scaling_type_name(hparams.rope_scaling_type_train);
6586
6587
0
    auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
6588
0
        bool is_var = false;
6589
6590
0
        std::vector<uint32_t> v;
6591
0
        for (uint32_t i = 0; i < n; ++i) {
6592
0
            v.push_back(f(i));
6593
0
            if (v[i] != v[0]) {
6594
0
                is_var = true;
6595
0
            }
6596
0
        }
6597
6598
0
        std::stringstream ss;
6599
6600
0
        if (is_var) {
6601
0
            ss << "[";
6602
0
            for (uint32_t i = 0; i < n; ++i) {
6603
0
                ss << v[i];
6604
0
                if (i < n - 1) {
6605
0
                    ss << ", ";
6606
0
                }
6607
0
            }
6608
0
            ss << "]";
6609
0
        } else {
6610
0
            ss << v[0];
6611
0
        }
6612
6613
0
        return ss.str();
6614
0
    };
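    // e.g. print_f(n_head_kv, n_layer) yields "8" when every layer uses 8 KV heads and
    // "[8, 8, 4, 4]" when the per-layer values differ (values here are illustrative)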
6615
6616
    // hparams
6617
0
    LLAMA_LOG_INFO("%s: arch             = %s\n",     __func__, arch_name().c_str());
6618
0
    LLAMA_LOG_INFO("%s: vocab_only       = %d\n",     __func__, hparams.vocab_only);
6619
6620
0
    if (!hparams.vocab_only) {
6621
0
        LLAMA_LOG_INFO("%s: n_ctx_train      = %u\n",     __func__, hparams.n_ctx_train);
6622
0
        LLAMA_LOG_INFO("%s: n_embd           = %u\n",     __func__, hparams.n_embd);
6623
0
        LLAMA_LOG_INFO("%s: n_embd_inp       = %u\n",     __func__, hparams.n_embd_inp());
6624
0
        LLAMA_LOG_INFO("%s: n_layer          = %u\n",     __func__, hparams.n_layer);
6625
0
        LLAMA_LOG_INFO("%s: n_head           = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_head(il);    }, hparams.n_layer).c_str());
6626
0
        LLAMA_LOG_INFO("%s: n_head_kv        = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
6627
0
        LLAMA_LOG_INFO("%s: n_rot            = %u\n",     __func__, hparams.n_rot);
6628
0
        LLAMA_LOG_INFO("%s: n_swa            = %u\n",     __func__, hparams.n_swa);
6629
0
        LLAMA_LOG_INFO("%s: is_swa_any       = %u\n",     __func__, hparams.is_swa_any());
6630
0
        LLAMA_LOG_INFO("%s: n_embd_head_k    = %u\n",     __func__, hparams.n_embd_head_k);
6631
0
        LLAMA_LOG_INFO("%s: n_embd_head_v    = %u\n",     __func__, hparams.n_embd_head_v);
6632
0
        LLAMA_LOG_INFO("%s: n_gqa            = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il);        }, hparams.n_layer).c_str());
6633
0
        LLAMA_LOG_INFO("%s: n_embd_k_gqa     = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str());
6634
0
        LLAMA_LOG_INFO("%s: n_embd_v_gqa     = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str());
6635
0
        LLAMA_LOG_INFO("%s: f_norm_eps       = %.1e\n",   __func__, hparams.f_norm_eps);
6636
0
        LLAMA_LOG_INFO("%s: f_norm_rms_eps   = %.1e\n",   __func__, hparams.f_norm_rms_eps);
6637
0
        LLAMA_LOG_INFO("%s: f_clamp_kqv      = %.1e\n",   __func__, hparams.f_clamp_kqv);
6638
0
        LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n",   __func__, hparams.f_max_alibi_bias);
6639
0
        LLAMA_LOG_INFO("%s: f_logit_scale    = %.1e\n",   __func__, hparams.f_logit_scale);
6640
0
        LLAMA_LOG_INFO("%s: f_attn_scale     = %.1e\n",   __func__, hparams.f_attention_scale);
6641
0
        LLAMA_LOG_INFO("%s: n_ff             = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
6642
0
        LLAMA_LOG_INFO("%s: n_expert         = %u\n",     __func__, hparams.n_expert);
6643
0
        LLAMA_LOG_INFO("%s: n_expert_used    = %u\n",     __func__, hparams.n_expert_used);
6644
0
        LLAMA_LOG_INFO("%s: n_expert_groups  = %d\n",     __func__, hparams.n_expert_groups);
6645
0
        LLAMA_LOG_INFO("%s: n_group_used     = %d\n",     __func__, hparams.n_group_used);
6646
0
        LLAMA_LOG_INFO("%s: causal attn      = %d\n",     __func__, hparams.causal_attn);
6647
0
        LLAMA_LOG_INFO("%s: pooling type     = %d\n",     __func__, hparams.pooling_type);
6648
0
        LLAMA_LOG_INFO("%s: rope type        = %d\n",     __func__, hparams.rope_type);
6649
0
        LLAMA_LOG_INFO("%s: rope scaling     = %s\n",     __func__, rope_scaling_type.c_str());
6650
0
        LLAMA_LOG_INFO("%s: freq_base_train  = %.1f\n",   __func__, hparams.rope_freq_base_train);
6651
0
        LLAMA_LOG_INFO("%s: freq_scale_train = %g\n",     __func__, hparams.rope_freq_scale_train);
6652
0
        LLAMA_LOG_INFO("%s: n_ctx_orig_yarn  = %u\n",     __func__, hparams.n_ctx_orig_yarn);
6653
0
        LLAMA_LOG_INFO("%s: rope_finetuned   = %s\n",     __func__, hparams.rope_finetuned ? "yes" : "unknown");
6654
        // MRoPE (Multi-axis Rotary Position Embedding) sections
6655
0
        if (const auto & s = hparams.rope_sections; s[0] || s[1] || s[2] || s[3]) {
6656
0
            LLAMA_LOG_INFO("%s: mrope sections   = [%d, %d, %d, %d]\n", __func__, s[0], s[1], s[2], s[3]);
6657
0
        }
6658
0
        if (!classifier_labels.empty()) {
6659
0
            LLAMA_LOG_INFO("%s: n_cls_out        = %u\n", __func__, hparams.n_cls_out);
6660
6661
0
            size_t i = 0;
6662
0
            for (auto label : classifier_labels) {
6663
0
                LLAMA_LOG_INFO("%s: cls_label[%2zu]    = %s\n", __func__, i++, label.c_str());
6664
0
            }
6665
0
        }
6666
0
    }
6667
6668
0
    if (arch == LLM_ARCH_MAMBA ||
6669
0
        arch == LLM_ARCH_MAMBA2 ||
6670
0
        arch == LLM_ARCH_JAMBA ||
6671
0
        arch == LLM_ARCH_FALCON_H1 ||
6672
0
        arch == LLM_ARCH_PLAMO2 ||
6673
0
        arch == LLM_ARCH_GRANITE_HYBRID ||
6674
0
        arch == LLM_ARCH_NEMOTRON_H) {
6675
0
        LLAMA_LOG_INFO("%s: ssm_d_conv       = %u\n",     __func__, hparams.ssm_d_conv);
6676
0
        LLAMA_LOG_INFO("%s: ssm_d_inner      = %u\n",     __func__, hparams.ssm_d_inner);
6677
0
        LLAMA_LOG_INFO("%s: ssm_d_state      = %u\n",     __func__, hparams.ssm_d_state);
6678
0
        LLAMA_LOG_INFO("%s: ssm_dt_rank      = %u\n",     __func__, hparams.ssm_dt_rank);
6679
0
        LLAMA_LOG_INFO("%s: ssm_n_group      = %u\n",     __func__, hparams.ssm_n_group);
6680
0
        LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms   = %d\n",     __func__, hparams.ssm_dt_b_c_rms);
6681
0
    }
6682
6683
0
    LLAMA_LOG_INFO("%s: model type       = %s\n",     __func__, type_name().c_str());
6684
0
    if (pimpl->n_elements >= 1e12) {
6685
0
        LLAMA_LOG_INFO("%s: model params     = %.2f T\n", __func__, pimpl->n_elements*1e-12);
6686
0
    } else if (pimpl->n_elements >= 1e9) {
6687
0
        LLAMA_LOG_INFO("%s: model params     = %.2f B\n", __func__, pimpl->n_elements*1e-9);
6688
0
    } else if (pimpl->n_elements >= 1e6) {
6689
0
        LLAMA_LOG_INFO("%s: model params     = %.2f M\n", __func__, pimpl->n_elements*1e-6);
6690
0
    } else {
6691
0
        LLAMA_LOG_INFO("%s: model params     = %.2f K\n", __func__, pimpl->n_elements*1e-3);
6692
0
    }
6693
6694
    // general kv
6695
0
    LLAMA_LOG_INFO("%s: general.name     = %s\n",    __func__, name.c_str());
6696
6697
0
    if (arch == LLM_ARCH_DEEPSEEK) {
6698
0
        LLAMA_LOG_INFO("%s: n_layer_dense_lead   = %d\n",     __func__, hparams.n_layer_dense_lead);
6699
0
        LLAMA_LOG_INFO("%s: n_ff_exp             = %d\n",     __func__, hparams.n_ff_exp);
6700
0
        LLAMA_LOG_INFO("%s: n_expert_shared      = %d\n",     __func__, hparams.n_expert_shared);
6701
0
        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n",   __func__, hparams.expert_weights_scale);
6702
0
    }
6703
6704
0
    if (arch == LLM_ARCH_DEEPSEEK2) {
6705
0
        LLAMA_LOG_INFO("%s: n_layer_dense_lead   = %d\n",     __func__, hparams.n_layer_dense_lead);
6706
0
        LLAMA_LOG_INFO("%s: n_lora_q             = %d\n",     __func__, hparams.n_lora_q);
6707
0
        LLAMA_LOG_INFO("%s: n_lora_kv            = %d\n",     __func__, hparams.n_lora_kv);
6708
0
        LLAMA_LOG_INFO("%s: n_embd_head_k_mla    = %d\n",     __func__, hparams.n_embd_head_k_mla);
6709
0
        LLAMA_LOG_INFO("%s: n_embd_head_v_mla    = %d\n",     __func__, hparams.n_embd_head_v_mla);
6710
0
        LLAMA_LOG_INFO("%s: n_ff_exp             = %d\n",     __func__, hparams.n_ff_exp);
6711
0
        LLAMA_LOG_INFO("%s: n_expert_shared      = %d\n",     __func__, hparams.n_expert_shared);
6712
0
        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n",   __func__, hparams.expert_weights_scale);
6713
0
        LLAMA_LOG_INFO("%s: expert_weights_norm  = %d\n",     __func__, hparams.expert_weights_norm);
6714
0
        LLAMA_LOG_INFO("%s: expert_gating_func   = %s\n",     __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
6715
0
        LLAMA_LOG_INFO("%s: rope_yarn_log_mul    = %.4f\n",   __func__, hparams.rope_yarn_log_mul);
6716
0
    }
6717
6718
0
    if (arch == LLM_ARCH_QWEN2MOE) {
6719
0
        LLAMA_LOG_INFO("%s: n_ff_exp         = %d\n",     __func__, hparams.n_ff_exp);
6720
0
        LLAMA_LOG_INFO("%s: n_ff_shexp       = %d\n",     __func__, hparams.n_ff_shexp);
6721
0
    }
6722
6723
0
    if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE) {
6724
0
        LLAMA_LOG_INFO("%s: n_ff_exp         = %d\n",     __func__, hparams.n_ff_exp);
6725
0
    }
6726
6727
0
    if (arch == LLM_ARCH_MINICPM ||
6728
0
        arch == LLM_ARCH_GRANITE ||
6729
0
        arch == LLM_ARCH_GRANITE_MOE ||
6730
0
        arch == LLM_ARCH_GRANITE_HYBRID) {
6731
0
        LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
6732
0
        LLAMA_LOG_INFO("%s: f_residual_scale  = %f\n", __func__, hparams.f_residual_scale);
6733
0
        LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
6734
0
        LLAMA_LOG_INFO("%s: n_ff_shexp        = %d\n", __func__, hparams.n_ff_shexp);
6735
0
    }
6736
6737
0
    if (arch == LLM_ARCH_BAILINGMOE) {
6738
0
        LLAMA_LOG_INFO("%s: n_layer_dense_lead   = %d\n",     __func__, hparams.n_layer_dense_lead);
6739
0
        LLAMA_LOG_INFO("%s: n_ff_exp             = %d\n",     __func__, hparams.n_ff_exp);
6740
0
        LLAMA_LOG_INFO("%s: n_expert_shared      = %d\n",     __func__, hparams.n_expert_shared);
6741
0
        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n",   __func__, hparams.expert_weights_scale);
6742
0
        LLAMA_LOG_INFO("%s: expert_weights_norm  = %d\n",     __func__, hparams.expert_weights_norm);
6743
0
    }
6744
6745
0
    if (arch == LLM_ARCH_BAILINGMOE2) {
6746
0
        LLAMA_LOG_INFO("%s: n_layer_dense_lead   = %d\n",     __func__, hparams.n_layer_dense_lead);
6747
0
        LLAMA_LOG_INFO("%s: n_ff_exp             = %d\n",     __func__, hparams.n_ff_exp);
6748
0
        LLAMA_LOG_INFO("%s: n_ff_shexp           = %d\n",     __func__, hparams.n_ff_shexp);
6749
0
        LLAMA_LOG_INFO("%s: n_expert_shared      = %d\n",     __func__, hparams.n_expert_shared);
6750
0
        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n",   __func__, hparams.expert_weights_scale);
6751
0
        LLAMA_LOG_INFO("%s: expert_weights_norm  = %d\n",     __func__, hparams.expert_weights_norm);
6752
0
        LLAMA_LOG_INFO("%s: expert_gating_func   = %s\n",     __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
6753
0
        LLAMA_LOG_INFO("%s: nextn_predict_layers = %d\n",     __func__, hparams.nextn_predict_layers);
6754
0
    }
6755
6756
0
    if (arch == LLM_ARCH_SMALLTHINKER || arch == LLM_ARCH_LFM2MOE) {
6757
0
        LLAMA_LOG_INFO("%s: n_ff_exp             = %d\n",     __func__, hparams.n_ff_exp);
6758
0
        LLAMA_LOG_INFO("%s: expert_gating_func   = %s\n",     __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
6759
0
    }
6760
6761
0
    if (arch == LLM_ARCH_GROVEMOE) {
6762
0
        LLAMA_LOG_INFO("%s: n_ff_exp             = %d\n",     __func__, hparams.n_ff_exp);
6763
0
        LLAMA_LOG_INFO("%s: n_ff_chexp           = %d\n",     __func__, hparams.n_ff_chexp);
6764
0
        LLAMA_LOG_INFO("%s: n_group_experts      = %d\n",     __func__, hparams.n_group_experts);
6765
0
        LLAMA_LOG_INFO("%s: expert_group_scale   = %.2f\n",   __func__, hparams.expert_group_scale);
6766
0
    }
6767
6768
0
    vocab.print_info();
6769
0
}
6770
6771
0
ggml_backend_dev_t llama_model::dev_layer(int il) const {
6772
0
    return pimpl->dev_layer.at(il).dev;
6773
0
}
6774
6775
0
ggml_backend_dev_t llama_model::dev_output() const {
6776
0
    return pimpl->dev_output.dev;
6777
0
}
6778
6779
template<typename F>
6780
0
static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
6781
0
    ggml_init_params params = {
6782
0
        /*.mem_size   =*/ ggml_tensor_overhead()*8,
6783
0
        /*.mem_buffer =*/ NULL,
6784
0
        /*.no_alloc   =*/ true,
6785
0
    };
6786
6787
0
    ggml_context_ptr ctx { ggml_init(params) };
6788
0
    if (!ctx) {
6789
0
        throw std::runtime_error(format("failed to create ggml context"));
6790
0
    }
6791
6792
0
    ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
6793
0
    ggml_tensor * op_tensor = fn(ctx.get());
6794
0
    for (int i = 0; i < GGML_MAX_SRC; i++) {
6795
0
        if (op_tensor->src[i] != nullptr) {
6796
0
            assert(op_tensor->src[i]->buffer == nullptr);
6797
0
            op_tensor->src[i]->buffer = buf.get();
6798
0
        }
6799
0
    }
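    // note: the zero-size buffer allocated above holds no data; attaching it to the op's
    // sources only conveys the buffer type, which is presumably all that
    // ggml_backend_dev_supports_op() needs in order to answer the query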
6800
6801
0
    bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
6802
6803
0
    return op_supported;
6804
0
}
6805
6806
template<typename F>
6807
0
static ggml_backend_buffer_type_t select_buft(const buft_list_t & buft_list, const F & fn) {
6808
0
    for (const auto & cur : buft_list) {
6809
0
        ggml_backend_dev_t cur_dev = cur.first;
6810
0
        ggml_backend_buffer_type_t cur_buft = cur.second;
6811
0
        if (buft_supported(cur_buft, cur_dev, fn)) {
6812
0
            return cur_buft;
6813
0
        }
6814
0
    }
6815
6816
0
    throw std::runtime_error(format("no suitable buffer type found"));
6817
0
}
6818
6819
0
ggml_backend_buffer_type_t llama_model::select_buft(int il) const {
6820
0
    return ::select_buft(
6821
0
            *pimpl->dev_layer.at(il).buft_list,
6822
0
            [&](ggml_context * ctx) {
6823
0
                ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
6824
0
                ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
6825
0
                return ggml_add(ctx, cur, layer_dir);
6826
0
            });
6827
0
}
6828
6829
0
bool llama_model::has_tensor_overrides() const {
6830
0
    return pimpl->has_tensor_overrides;
6831
0
}
6832
6833
0
const ggml_tensor * llama_model::get_tensor(const char * name) const {
6834
0
    auto it = std::find_if(tensors_by_name.begin(), tensors_by_name.end(),
6835
0
            [name](const std::pair<std::string, ggml_tensor *> & it) {
6836
0
                return it.first == name;
6837
0
            });
6838
0
    if (it == tensors_by_name.end()) {
6839
0
        return nullptr;
6840
0
    }
6841
6842
0
    return it->second;
6843
0
}
6844
6845
0
float llama_model::get_rope_freq_base (const llama_cparams & cparams, int il) const {
6846
0
    return hparams.is_swa(il) ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base;
6847
0
}
6848
6849
0
float llama_model::get_rope_freq_scale(const llama_cparams & cparams, int il) const {
6850
0
    return hparams.is_swa(il) ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
6851
0
}
6852
6853
0
ggml_tensor * llama_model::get_rope_factors(const llama_cparams & cparams, int il) const {
6854
0
    const uint32_t n_ctx_seq = cparams.n_ctx_seq;
6855
6856
    // choose long/short freq factors based on the context size
6857
0
    if (layers[il].rope_freqs != nullptr) {
6858
0
        return layers[il].rope_freqs;
6859
0
    }
6860
6861
0
    if (n_ctx_seq > hparams.n_ctx_orig_yarn) {
6862
0
        return layers[il].rope_long;
6863
0
    }
6864
6865
0
    return layers[il].rope_short;
6866
0
}
6867
6868
0
llama_memory_i * llama_model::create_memory(const llama_memory_params & params, const llama_cparams & cparams) const {
6869
0
    llama_memory_i * res;
6870
6871
0
    switch (arch) {
6872
        // Models that need specific instantiation should be handled in the
6873
        // switch statement
6874
0
        case LLM_ARCH_BERT:
6875
0
        case LLM_ARCH_JINA_BERT_V2:
6876
0
        case LLM_ARCH_JINA_BERT_V3:
6877
0
        case LLM_ARCH_NOMIC_BERT:
6878
0
        case LLM_ARCH_NOMIC_BERT_MOE:
6879
0
        case LLM_ARCH_NEO_BERT:
6880
0
        case LLM_ARCH_WAVTOKENIZER_DEC:
6881
0
        case LLM_ARCH_GEMMA_EMBEDDING:
6882
0
        case LLM_ARCH_DREAM:
6883
0
        case LLM_ARCH_LLADA:
6884
0
        case LLM_ARCH_LLADA_MOE:
6885
0
            {
6886
0
                res = nullptr;
6887
0
            } break;
6888
        // Models that need standard caching should rely on recurrent/hybrid
6889
        // checks
6890
0
        default:
6891
0
            {
6892
0
                if (llm_arch_is_recurrent(arch)) {
6893
0
                    res = new llama_memory_recurrent(
6894
0
                            *this,
6895
0
                            GGML_TYPE_F32,
6896
0
                            GGML_TYPE_F32,
6897
0
                            cparams.offload_kqv,
6898
0
                            std::max((uint32_t) 1, cparams.n_seq_max),
6899
0
                            cparams.n_seq_max,
6900
0
                            nullptr);
6901
0
                } else if (llm_arch_is_hybrid(arch)) {
6902
6903
                    // The main difference between hybrid architectures is the
6904
                    // layer filters, so pick the right one here
6905
0
                    llama_memory_hybrid::layer_filter_cb filter_attn = nullptr;
6906
0
                    llama_memory_hybrid::layer_filter_cb filter_recr = nullptr;
6907
0
                    if (arch == LLM_ARCH_FALCON_H1) {
6908
0
                        filter_attn = [&](int32_t) { return true; };
6909
0
                        filter_recr = [&](int32_t) { return true; };
6910
0
                    } else if (arch == LLM_ARCH_NEMOTRON_H) {
6911
0
                        filter_attn = [&](int32_t il) {
6912
0
                            return !hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
6913
0
                        };
6914
0
                        filter_recr = [&](int32_t il) {
6915
0
                            return hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
6916
0
                        };
6917
0
                    }
6918
6919
0
                    res = new llama_memory_hybrid(
6920
0
                        /* model             */ *this,
6921
0
                        /* attn_type_k       */ params.type_k,
6922
0
                        /* attn_type_v       */ params.type_v,
6923
0
                        /* attn_v_trans      */ !cparams.flash_attn,
6924
0
                        /* attn_kv_size      */ cparams.n_ctx,
6925
0
                        /* attn_n_pad        */ 1,
6926
0
                        /* attn_n_swa        */ hparams.n_swa,
6927
0
                        /* attn_swa_type     */ hparams.swa_type,
6928
0
                        /* recurrent_type_k  */ GGML_TYPE_F32,
6929
0
                        /* recurrent_type_v  */ GGML_TYPE_F32,
6930
0
                        /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
6931
0
                        /* n_seq_max         */ cparams.n_seq_max,
6932
0
                        /* offload           */ cparams.offload_kqv,
6933
0
                        /* unified           */ cparams.kv_unified,
6934
0
                        /* filter_attn       */ std::move(filter_attn),
6935
0
                        /* filter_recr       */ std::move(filter_recr));
6936
0
                } else {
6937
0
                    llama_memory_i::layer_reuse_cb reuse = nullptr;
6938
6939
0
                    if (arch == LLM_ARCH_GEMMA3N) {
6940
0
                        reuse = [&](int32_t il) {
6941
0
                            if (il >= (int32_t) hparams.n_layer_kv_from_start) {
6942
0
                                return (int32_t) hparams.n_layer_kv_from_start - (hparams.is_swa(il) ? 2 : 1);
6943
0
                            }
6944
6945
0
                            return -1;
6946
0
                        };
6947
0
                    }
6948
6949
0
                    if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
6950
0
                        GGML_ASSERT(hparams.is_swa_any());
6951
6952
0
                        res = new llama_kv_cache_iswa(
6953
0
                                *this,
6954
0
                                params.type_k,
6955
0
                                params.type_v,
6956
0
                                !cparams.flash_attn,
6957
0
                                cparams.offload_kqv,
6958
0
                                params.swa_full,
6959
0
                                cparams.kv_unified,
6960
0
                                cparams.n_ctx_seq,
6961
0
                                cparams.n_seq_max,
6962
0
                                cparams.n_ubatch,
6963
0
                                1,
6964
0
                                nullptr,
6965
0
                                reuse);
6966
0
                    } else {
6967
0
                        GGML_ASSERT(!hparams.is_swa_any());
6968
6969
0
                        res = new llama_kv_cache(
6970
0
                                *this,
6971
0
                                params.type_k,
6972
0
                                params.type_v,
6973
0
                                !cparams.flash_attn,
6974
0
                                cparams.offload_kqv,
6975
0
                                cparams.kv_unified,
6976
0
                                cparams.n_ctx_seq,
6977
0
                                cparams.n_seq_max,
6978
0
                                1,
6979
0
                                hparams.n_swa,
6980
0
                                hparams.swa_type,
6981
0
                                nullptr,
6982
0
                                nullptr);
6983
0
                    }
6984
0
                }
6985
0
            }
6986
0
    }
6987
6988
0
    return res;
6989
0
}
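For context, a minimal caller-side sketch of how create_memory is typically consumed (not part of llama-model.cpp; a loaded llama_model named model and already populated llama_memory_params mparams and llama_cparams cparams are assumed). The architectures handled in the first case of the switch above return nullptr, so the result must always be checked:

    llama_memory_i * mem = model.create_memory(mparams, cparams);
    if (mem == nullptr) {
        // context-free architectures (BERT-style encoders, diffusion LMs) run without a memory module
    } else {
        // recurrent, hybrid, SWA or plain KV-cache memory, depending on the architecture
        delete mem;
    }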
6990
6991
0
ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
6992
0
    std::unique_ptr<llm_graph_context> llm;
6993
6994
0
    switch (arch) {
6995
0
        case LLM_ARCH_LLAMA:
6996
0
            {
6997
0
                llm = std::make_unique<llm_build_llama>(*this, params);
6998
0
            } break;
6999
0
        case LLM_ARCH_LLAMA4:
7000
0
            {
7001
0
                if (hparams.swa_type == LLAMA_SWA_TYPE_NONE) {
7002
0
                    llm = std::make_unique<llm_build_llama>(*this, params);
7003
0
                } else {
7004
0
                    llm = std::make_unique<llm_build_llama_iswa>(*this, params);
7005
0
                }
7006
0
            } break;
7007
0
        case LLM_ARCH_DECI:
7008
0
            {
7009
0
                llm = std::make_unique<llm_build_deci>(*this, params);
7010
0
            } break;
7011
0
        case LLM_ARCH_BAICHUAN:
7012
0
            {
7013
0
                llm = std::make_unique<llm_build_baichuan>(*this, params);
7014
0
            } break;
7015
0
        case LLM_ARCH_FALCON:
7016
0
            {
7017
0
                llm = std::make_unique<llm_build_falcon>(*this, params);
7018
0
            } break;
7019
0
        case LLM_ARCH_GROK:
7020
0
            {
7021
0
                llm = std::make_unique<llm_build_grok>(*this, params);
7022
0
            } break;
7023
0
        case LLM_ARCH_STARCODER:
7024
0
            {
7025
0
                llm = std::make_unique<llm_build_starcoder>(*this, params);
7026
0
            } break;
7027
0
        case LLM_ARCH_REFACT:
7028
0
            {
7029
0
                llm = std::make_unique<llm_build_refact>(*this, params);
7030
0
            } break;
7031
0
        case LLM_ARCH_BERT:
7032
0
        case LLM_ARCH_JINA_BERT_V2:
7033
0
        case LLM_ARCH_JINA_BERT_V3:
7034
0
        case LLM_ARCH_NOMIC_BERT:
7035
0
        case LLM_ARCH_NOMIC_BERT_MOE:
7036
0
            {
7037
0
                llm = std::make_unique<llm_build_bert>(*this, params);
7038
0
            } break;
7039
0
        case LLM_ARCH_NEO_BERT:
7040
0
            {
7041
0
                llm = std::make_unique<llm_build_neo_bert>(*this, params);
7042
0
            } break;
7043
0
        case LLM_ARCH_BLOOM:
7044
0
            {
7045
0
                llm = std::make_unique<llm_build_bloom>(*this, params);
7046
0
            } break;
7047
0
        case LLM_ARCH_MPT:
7048
0
            {
7049
0
                llm = std::make_unique<llm_build_mpt>(*this, params);
7050
0
            } break;
7051
0
        case LLM_ARCH_STABLELM:
7052
0
            {
7053
0
                llm = std::make_unique<llm_build_stablelm>(*this, params);
7054
0
            } break;
7055
0
        case LLM_ARCH_QWEN:
7056
0
            {
7057
0
                llm = std::make_unique<llm_build_qwen>(*this, params);
7058
0
            } break;
7059
0
        case LLM_ARCH_QWEN2:
7060
0
            {
7061
0
                llm = std::make_unique<llm_build_qwen2>(*this, params);
7062
0
            } break;
7063
0
        case LLM_ARCH_DREAM:
7064
0
            {
7065
0
                llm = std::make_unique<llm_build_dream>(*this, params);
7066
0
            }
7067
0
            break;
7068
0
        case LLM_ARCH_LLADA:
7069
0
            {
7070
0
                llm = std::make_unique<llm_build_llada>(*this, params);
7071
0
            }
7072
0
            break;
7073
0
        case LLM_ARCH_LLADA_MOE:
7074
0
            {
7075
0
                llm = std::make_unique<llm_build_llada_moe>(*this, params);
7076
0
            }
7077
0
            break;
7078
0
        case LLM_ARCH_QWEN2VL:
7079
0
            {
7080
0
                llm = std::make_unique<llm_build_qwen2vl>(*this, params);
7081
0
            } break;
7082
0
        case LLM_ARCH_QWEN2MOE:
7083
0
            {
7084
0
                llm = std::make_unique<llm_build_qwen2moe>(*this, params);
7085
0
            } break;
7086
0
        case LLM_ARCH_QWEN3:
7087
0
            {
7088
0
                llm = std::make_unique<llm_build_qwen3>(*this, params);
7089
0
            } break;
7090
0
        case LLM_ARCH_QWEN3MOE:
7091
0
            {
7092
0
                llm = std::make_unique<llm_build_qwen3moe>(*this, params);
7093
0
            } break;
7094
0
        case LLM_ARCH_QWEN3VL:
7095
0
            {
7096
0
                llm = std::make_unique<llm_build_qwen3vl>(*this, params);
7097
0
            } break;
7098
0
        case LLM_ARCH_QWEN3VLMOE:
7099
0
            {
7100
0
                llm = std::make_unique<llm_build_qwen3vlmoe>(*this, params);
7101
0
            } break;
7102
0
        case LLM_ARCH_PHI2:
7103
0
            {
7104
0
                llm = std::make_unique<llm_build_phi2>(*this, params);
7105
0
            } break;
7106
0
        case LLM_ARCH_PHI3:
7107
0
        case LLM_ARCH_PHIMOE:
7108
0
            {
7109
0
                if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
7110
0
                    llm = std::make_unique<llm_build_phi3<true>> (*this, params);
7111
0
                } else {
7112
0
                    llm = std::make_unique<llm_build_phi3<false>>(*this, params);
7113
0
                }
7114
0
            } break;
7115
0
        case LLM_ARCH_PLAMO:
7116
0
            {
7117
0
                llm = std::make_unique<llm_build_plamo>(*this, params);
7118
0
            } break;
7119
0
        case LLM_ARCH_PLAMO2:
7120
0
            {
7121
0
                llm = std::make_unique<llm_build_plamo2>(*this, params);
7122
0
            } break;
7123
0
        case LLM_ARCH_GPT2:
7124
0
            {
7125
0
                llm = std::make_unique<llm_build_gpt2>(*this, params);
7126
0
            } break;
7127
0
        case LLM_ARCH_CODESHELL:
7128
0
            {
7129
0
                llm = std::make_unique<llm_build_codeshell>(*this, params);
7130
0
            } break;
7131
0
        case LLM_ARCH_ORION:
7132
0
            {
7133
0
                llm = std::make_unique<llm_build_orion>(*this, params);
7134
0
            } break;
7135
0
        case LLM_ARCH_INTERNLM2:
7136
0
            {
7137
0
                llm = std::make_unique<llm_build_internlm2>(*this, params);
7138
0
            } break;
7139
0
        case LLM_ARCH_MINICPM3:
7140
0
            {
7141
0
                llm = std::make_unique<llm_build_minicpm3>(*this, params);
7142
0
            } break;
7143
0
        case LLM_ARCH_GEMMA:
7144
0
            {
7145
0
                llm = std::make_unique<llm_build_gemma>(*this, params);
7146
0
            } break;
7147
0
        case LLM_ARCH_GEMMA2:
7148
0
            {
7149
0
                llm = std::make_unique<llm_build_gemma2_iswa>(*this, params);
7150
0
            } break;
7151
0
        case LLM_ARCH_GEMMA3:
7152
0
            {
7153
0
                llm = std::make_unique<llm_build_gemma3_iswa>(*this, params);
7154
0
            } break;
7155
0
        case LLM_ARCH_GEMMA3N:
7156
0
            {
7157
0
                llm = std::make_unique<llm_build_gemma3n_iswa>(*this, params);
7158
0
            } break;
7159
0
        case LLM_ARCH_GEMMA_EMBEDDING:
7160
0
            {
7161
0
                llm = std::make_unique<llm_build_gemma_embedding>(*this, params);
7162
0
            } break;
7163
0
        case LLM_ARCH_STARCODER2:
7164
0
            {
7165
0
                llm = std::make_unique<llm_build_starcoder2>(*this, params);
7166
0
            } break;
7167
0
        case LLM_ARCH_MAMBA:
7168
0
        case LLM_ARCH_MAMBA2:
7169
0
            {
7170
0
                llm = std::make_unique<llm_build_mamba>(*this, params);
7171
0
            } break;
7172
0
        case LLM_ARCH_JAMBA:
7173
0
            {
7174
0
                llm = std::make_unique<llm_build_jamba>(*this, params);
7175
0
            } break;
7176
0
        case LLM_ARCH_XVERSE:
7177
0
            {
7178
0
                llm = std::make_unique<llm_build_xverse>(*this, params);
7179
0
            } break;
7180
0
        case LLM_ARCH_COMMAND_R:
7181
0
            {
7182
0
                llm = std::make_unique<llm_build_command_r>(*this, params);
7183
0
            } break;
7184
0
        case LLM_ARCH_COHERE2:
7185
0
            {
7186
0
                llm = std::make_unique<llm_build_cohere2_iswa>(*this, params);
7187
0
            } break;
7188
0
        case LLM_ARCH_DBRX:
7189
0
            {
7190
0
                llm = std::make_unique<llm_build_dbrx>(*this, params);
7191
0
            } break;
7192
0
        case LLM_ARCH_OLMO:
7193
0
            {
7194
0
                llm = std::make_unique<llm_build_olmo>(*this, params);
7195
0
            } break;
7196
0
        case LLM_ARCH_OLMO2:
7197
0
            {
7198
0
                if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
7199
0
                    llm = std::make_unique<llm_build_olmo2<true>>(*this, params);
7200
0
                } else {
7201
0
                    llm = std::make_unique<llm_build_olmo2<false>>(*this, params);
7202
0
                }
7203
0
            } break;
7204
0
        case LLM_ARCH_OLMOE:
7205
0
            {
7206
0
                llm = std::make_unique<llm_build_olmoe>(*this, params);
7207
0
            } break;
7208
0
        case LLM_ARCH_OPENELM:
7209
0
            {
7210
0
                llm = std::make_unique<llm_build_openelm>(*this, params);
7211
0
            } break;
7212
0
        case LLM_ARCH_GPTNEOX:
7213
0
            {
7214
0
                llm = std::make_unique<llm_build_gptneox>(*this, params);
7215
0
            } break;
7216
0
        case LLM_ARCH_ARCTIC:
7217
0
            {
7218
0
                llm = std::make_unique<llm_build_arctic>(*this, params);
7219
0
            } break;
7220
0
        case LLM_ARCH_DEEPSEEK:
7221
0
            {
7222
0
                llm = std::make_unique<llm_build_deepseek>(*this, params);
7223
0
            } break;
7224
0
        case LLM_ARCH_DEEPSEEK2:
7225
0
            {
7226
0
                llm = std::make_unique<llm_build_deepseek2>(*this, params);
7227
0
            } break;
7228
0
        case LLM_ARCH_CHATGLM:
7229
0
            {
7230
0
                llm = std::make_unique<llm_build_chatglm>(*this, params);
7231
0
            } break;
7232
0
        case LLM_ARCH_GLM4:
7233
0
            {
7234
0
                llm = std::make_unique<llm_build_glm4>(*this, params);
7235
0
            } break;
7236
0
        case LLM_ARCH_GLM4_MOE:
7237
0
            {
7238
0
                llm = std::make_unique<llm_build_glm4_moe>(*this, params);
7239
0
            } break;
7240
0
        case LLM_ARCH_BITNET:
7241
0
            {
7242
0
                llm = std::make_unique<llm_build_bitnet>(*this, params);
7243
0
            } break;
7244
0
        case LLM_ARCH_T5:
7245
0
            {
7246
0
                switch (params.gtype) {
7247
0
                    case LLM_GRAPH_TYPE_ENCODER:
7248
0
                        llm = std::make_unique<llm_build_t5_enc>(*this, params);
7249
0
                        break;
7250
0
                    case LLM_GRAPH_TYPE_DEFAULT:
7251
0
                    case LLM_GRAPH_TYPE_DECODER:
7252
0
                        llm = std::make_unique<llm_build_t5_dec>(*this, params);
7253
0
                        break;
7254
0
                    default:
7255
0
                        GGML_ABORT("invalid graph type");
7256
0
                }
7257
0
            } break;
7258
0
        case LLM_ARCH_T5ENCODER:
7259
0
            {
7260
0
                llm = std::make_unique<llm_build_t5_enc>(*this, params);
7261
0
            }
7262
0
            break;
7263
0
        case LLM_ARCH_JAIS:
7264
0
            {
7265
0
                llm = std::make_unique<llm_build_jais>(*this, params);
7266
0
            } break;
7267
0
        case LLM_ARCH_NEMOTRON:
7268
0
            {
7269
0
                llm = std::make_unique<llm_build_nemotron>(*this, params);
7270
0
            } break;
7271
0
        case LLM_ARCH_NEMOTRON_H:
7272
0
            {
7273
0
                llm = std::make_unique<llm_build_nemotron_h>(*this, params);
7274
0
            } break;
7275
0
        case LLM_ARCH_EXAONE:
7276
0
            {
7277
0
                llm = std::make_unique<llm_build_exaone>(*this, params);
7278
0
            } break;
7279
0
        case LLM_ARCH_EXAONE4:
7280
0
            {
7281
0
                if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
7282
0
                    llm = std::make_unique<llm_build_exaone4<true>>(*this, params);
7283
0
                } else {
7284
0
                    llm = std::make_unique<llm_build_exaone4<false>>(*this, params);
7285
0
                }
7286
0
            } break;
7287
0
        case LLM_ARCH_RWKV6:
7288
0
            {
7289
0
                llm = std::make_unique<llm_build_rwkv6>(*this, params);
7290
0
            } break;
7291
0
        case LLM_ARCH_RWKV6QWEN2:
7292
0
            {
7293
0
                llm = std::make_unique<llm_build_rwkv6qwen2>(*this, params);
7294
0
            } break;
7295
0
        case LLM_ARCH_RWKV7:
7296
0
            {
7297
0
                llm = std::make_unique<llm_build_rwkv7>(*this, params);
7298
0
            } break;
7299
0
        case LLM_ARCH_ARWKV7:
7300
0
            {
7301
0
                llm = std::make_unique<llm_build_arwkv7>(*this, params);
7302
0
            } break;
7303
0
        case LLM_ARCH_GRANITE:
7304
0
        case LLM_ARCH_GRANITE_MOE:
7305
0
        case LLM_ARCH_MINICPM:
7306
0
            {
7307
0
                llm = std::make_unique<llm_build_granite>(*this, params);
7308
0
            } break;
7309
0
        case LLM_ARCH_GRANITE_HYBRID:
7310
0
            {
7311
0
                llm = std::make_unique<llm_build_granite_hybrid>(*this, params);
7312
0
            } break;
7313
0
        case LLM_ARCH_CHAMELEON:
7314
0
            {
7315
0
                llm = std::make_unique<llm_build_chameleon>(*this, params);
7316
0
            } break;
7317
0
        case LLM_ARCH_WAVTOKENIZER_DEC:
7318
0
            {
7319
0
                llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params);
7320
0
            } break;
7321
0
        case LLM_ARCH_PLM:
7322
0
            {
7323
0
                llm = std::make_unique<llm_build_plm>(*this, params);
7324
0
            } break;
7325
0
        case LLM_ARCH_BAILINGMOE:
7326
0
            {
7327
0
                llm = std::make_unique<llm_build_bailingmoe>(*this, params);
7328
0
            } break;
7329
0
        case LLM_ARCH_BAILINGMOE2:
7330
0
            {
7331
0
                llm = std::make_unique<llm_build_bailingmoe2>(*this, params);
7332
0
            } break;
7333
0
        case LLM_ARCH_SEED_OSS:
7334
0
            {
7335
0
                llm = std::make_unique<llm_build_seed_oss>(*this, params);
7336
0
            } break;
7337
0
        case LLM_ARCH_DOTS1:
7338
0
            {
7339
0
                llm = std::make_unique<llm_build_dots1>(*this, params);
7340
0
            } break;
7341
0
        case LLM_ARCH_ARCEE:
7342
0
            {
7343
0
                llm = std::make_unique<llm_build_arcee>(*this, params);
7344
0
            } break;
7345
0
        case LLM_ARCH_AFMOE:
7346
0
            {
7347
0
                llm = std::make_unique<llm_build_afmoe>(*this, params);
7348
0
            } break;
7349
0
        case LLM_ARCH_ERNIE4_5:
7350
0
            {
7351
0
                llm = std::make_unique<llm_build_ernie4_5>(*this, params);
7352
0
            } break;
7353
0
        case LLM_ARCH_ERNIE4_5_MOE:
7354
0
            {
7355
0
                llm = std::make_unique<llm_build_ernie4_5_moe>(*this, params);
7356
0
            } break;
7357
0
        case LLM_ARCH_HUNYUAN_MOE:
7358
0
            {
7359
0
                llm = std::make_unique<llm_build_hunyuan_moe>(*this, params);
7360
0
            } break;
7361
0
        case LLM_ARCH_HUNYUAN_DENSE:
7362
0
            {
7363
0
                llm = std::make_unique<llm_build_hunyuan_dense>(*this, params);
7364
0
            } break;
7365
0
        case LLM_ARCH_SMOLLM3:
7366
0
            {
7367
0
                llm = std::make_unique<llm_build_smollm3>(*this, params);
7368
0
            } break;
7369
0
        case LLM_ARCH_OPENAI_MOE:
7370
0
            {
7371
0
                llm = std::make_unique<llm_build_openai_moe_iswa>(*this, params);
7372
0
            } break;
7373
0
        case LLM_ARCH_FALCON_H1:
7374
0
            {
7375
0
                llm = std::make_unique<llm_build_falcon_h1>(*this, params);
7376
0
            } break;
7377
0
        case LLM_ARCH_LFM2:
7378
0
        case LLM_ARCH_LFM2MOE:
7379
0
            {
7380
0
                llm = std::make_unique<llm_build_lfm2>(*this, params);
7381
0
            } break;
7382
0
        case LLM_ARCH_SMALLTHINKER:
7383
0
            {
7384
0
                if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
7385
0
                    llm = std::make_unique<llm_build_smallthinker<true>> (*this, params);
7386
0
                } else {
7387
0
                    llm = std::make_unique<llm_build_smallthinker<false>>(*this, params);
7388
0
                }
7389
0
            } break;
7390
0
        case LLM_ARCH_GROVEMOE:
7391
0
            {
7392
0
                llm = std::make_unique<llm_build_grovemoe>(*this, params);
7393
0
            } break;
7394
0
        case LLM_ARCH_APERTUS:
7395
0
            {
7396
0
                llm = std::make_unique<llm_build_apertus>(*this, params);
7397
0
            } break;
7398
0
        case LLM_ARCH_MINIMAX_M2:
7399
0
            {
7400
0
                llm = std::make_unique<llm_build_minimax_m2>(*this, params);
7401
0
            } break;
7402
0
        case LLM_ARCH_COGVLM:
7403
0
            {
7404
0
                llm = std::make_unique<llm_build_cogvlm>(*this, params);
7405
0
            } break;
7406
0
        case LLM_ARCH_PANGU_EMBED:
7407
0
            {
7408
0
                llm = std::make_unique<llm_build_pangu_embedded>(*this, params);
7409
0
            } break;
7410
0
        default:
7411
0
            GGML_ABORT("fatal error");
7412
0
    }
7413
7414
    // add on pooling layer
7415
0
    llm->build_pooling(cls, cls_b, cls_out, cls_out_b);
7416
7417
    // if the gguf model was converted with --sentence-transformers-dense-modules
7418
    // there will be two additional dense projection layers
7419
    // dense linear projections are applied after pooling
7420
    // TODO: move reranking logic here and generalize
7421
0
    llm->build_dense_out(dense_2_out_layers, dense_3_out_layers);
7422
7423
0
    return llm->res->get_gf();
7424
0
}
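Adding a new architecture to this dispatcher follows the same pattern as the cases above; a hedged sketch with placeholder names (LLM_ARCH_FOO and llm_build_foo are illustrative only and do not exist in the codebase):

    case LLM_ARCH_FOO:
        {
            // llm_build_foo would be a new llm_graph_context subclass declared in models/models.h
            llm = std::make_unique<llm_build_foo>(*this, params);
        } break;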
7425
7426
7427
//
7428
// interface implementation
7429
//
7430
7431
0
llama_model_params llama_model_default_params() {
7432
0
    llama_model_params result = {
7433
0
        /*.devices                     =*/ nullptr,
7434
0
        /*.tensor_buft_overrides       =*/ nullptr,
7435
0
        /*.n_gpu_layers                =*/ 999,
7436
0
        /*.split_mode                  =*/ LLAMA_SPLIT_MODE_LAYER,
7437
0
        /*.main_gpu                    =*/ 0,
7438
0
        /*.tensor_split                =*/ nullptr,
7439
0
        /*.progress_callback           =*/ nullptr,
7440
0
        /*.progress_callback_user_data =*/ nullptr,
7441
0
        /*.kv_overrides                =*/ nullptr,
7442
0
        /*.vocab_only                  =*/ false,
7443
0
        /*.use_mmap                    =*/ true,
7444
0
        /*.use_mlock                   =*/ false,
7445
0
        /*.check_tensors               =*/ false,
7446
0
        /*.use_extra_bufts             =*/ true,
7447
0
        /*.no_host                     =*/ false,
7448
0
    };
7449
7450
0
    return result;
7451
0
}
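A minimal usage sketch for these defaults (the GGUF path and the n_gpu_layers override are illustrative only; llama_model_load_from_file is the loader entry point declared in llama.h):

    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 0;   // e.g. keep all layers on the CPU instead of the default 999
    llama_model * model = llama_model_load_from_file("model.gguf", mparams);
    if (model != nullptr) {
        // ... use the model ...
        llama_model_free(model);
    }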
7452
7453
0
const llama_vocab * llama_model_get_vocab(const llama_model * model) {
7454
0
    return &model->vocab;
7455
0
}
7456
7457
0
void llama_free_model(llama_model * model) {
7458
0
    llama_model_free(model);
7459
0
}
7460
7461
0
void llama_model_free(llama_model * model) {
7462
0
    delete model;
7463
0
}
7464
7465
0
int32_t llama_model_n_ctx_train(const llama_model * model) {
7466
0
    return model->hparams.n_ctx_train;
7467
0
}
7468
7469
0
int32_t llama_model_n_embd(const llama_model * model) {
7470
0
    return model->hparams.n_embd;
7471
0
}
7472
7473
0
int32_t llama_model_n_embd_inp(const llama_model * model) {
7474
0
    return model->hparams.n_embd_inp();
7475
0
}
7476
7477
0
int32_t llama_model_n_layer(const llama_model * model) {
7478
0
    return model->hparams.n_layer;
7479
0
}
7480
7481
0
int32_t llama_model_n_head(const llama_model * model) {
7482
0
    return model->hparams.n_head();
7483
0
}
7484
7485
0
int32_t llama_model_n_head_kv(const llama_model * model) {
7486
0
    return model->hparams.n_head_kv();
7487
0
}
7488
7489
0
int32_t llama_model_n_swa(const llama_model * model) {
7490
0
    return model->hparams.n_swa;
7491
0
}
7492
7493
0
uint32_t llama_model_n_cls_out(const struct llama_model * model) {
7494
0
    return model->hparams.n_cls_out;
7495
0
}
7496
7497
0
const char * llama_model_cls_label(const struct llama_model * model, uint32_t i) {
7498
0
    if (i < model->classifier_labels.size()) {
7499
0
        return model->classifier_labels[i].c_str();
7500
0
    }
7501
7502
0
    return nullptr;
7503
0
}
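A minimal sketch that enumerates classifier output labels (only meaningful for classification/reranking models; model is assumed to be an already loaded llama_model pointer and <cstdio> to be included):

    const uint32_t n_cls = llama_model_n_cls_out(model);
    for (uint32_t i = 0; i < n_cls; ++i) {
        const char * label = llama_model_cls_label(model, i);
        printf("class %u: %s\n", i, label != nullptr ? label : "(unnamed)");
    }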
7504
7505
// deprecated
7506
0
int32_t llama_n_ctx_train(const llama_model * model) {
7507
0
    return llama_model_n_ctx_train(model);
7508
0
}
7509
7510
// deprecated
7511
0
int32_t llama_n_embd(const llama_model * model) {
7512
0
    return llama_model_n_embd(model);
7513
0
}
7514
7515
// deprecated
7516
0
int32_t llama_n_layer(const llama_model * model) {
7517
0
    return llama_model_n_layer(model);
7518
0
}
7519
7520
// deprecated
7521
0
int32_t llama_n_head(const llama_model * model) {
7522
0
    return llama_model_n_head(model);
7523
0
}
7524
7525
0
llama_rope_type llama_model_rope_type(const llama_model * model) {
7526
0
    switch (model->arch) {
7527
        // these models do not use RoPE
7528
0
        case LLM_ARCH_CLIP:
7529
0
        case LLM_ARCH_GPT2:
7530
0
        case LLM_ARCH_GPTJ:
7531
0
        case LLM_ARCH_MPT:
7532
0
        case LLM_ARCH_REFACT:
7533
0
        case LLM_ARCH_BLOOM:
7534
0
        case LLM_ARCH_MAMBA:
7535
0
        case LLM_ARCH_MAMBA2:
7536
0
        case LLM_ARCH_JAMBA:
7537
0
        case LLM_ARCH_JINA_BERT_V2:
7538
0
        case LLM_ARCH_T5:
7539
0
        case LLM_ARCH_T5ENCODER:
7540
0
        case LLM_ARCH_JAIS:
7541
0
        case LLM_ARCH_RWKV6:
7542
0
        case LLM_ARCH_RWKV6QWEN2:
7543
0
        case LLM_ARCH_RWKV7:
7544
0
        case LLM_ARCH_ARWKV7:
7545
0
        case LLM_ARCH_WAVTOKENIZER_DEC:
7546
0
        case LLM_ARCH_NEMOTRON_H:
7547
0
            return LLAMA_ROPE_TYPE_NONE;
7548
7549
        // use what we call a normal RoPE, operating on pairs of consecutive head values
7550
0
        case LLM_ARCH_LLAMA:
7551
0
        case LLM_ARCH_LLADA:
7552
0
        case LLM_ARCH_LLAMA4:
7553
0
        case LLM_ARCH_DECI:
7554
0
        case LLM_ARCH_BAICHUAN:
7555
0
        case LLM_ARCH_STARCODER:
7556
0
        case LLM_ARCH_INTERNLM2:
7557
0
        case LLM_ARCH_MINICPM:
7558
0
        case LLM_ARCH_XVERSE:
7559
0
        case LLM_ARCH_COMMAND_R:
7560
0
        case LLM_ARCH_COHERE2:
7561
0
        case LLM_ARCH_OLMO:
7562
0
        case LLM_ARCH_ARCTIC:
7563
0
        case LLM_ARCH_DEEPSEEK:
7564
0
        case LLM_ARCH_DEEPSEEK2:
7565
0
        case LLM_ARCH_PLM:
7566
0
        case LLM_ARCH_CHATGLM:
7567
0
        case LLM_ARCH_GLM4:
7568
0
        case LLM_ARCH_GRANITE:
7569
0
        case LLM_ARCH_GRANITE_MOE:
7570
0
        case LLM_ARCH_GRANITE_HYBRID:
7571
0
        case LLM_ARCH_CHAMELEON:
7572
0
        case LLM_ARCH_BAILINGMOE:
7573
0
        case LLM_ARCH_NEO_BERT:
7574
0
        case LLM_ARCH_SMOLLM3:
7575
0
        case LLM_ARCH_ARCEE:
7576
0
        case LLM_ARCH_ERNIE4_5:
7577
0
        case LLM_ARCH_ERNIE4_5_MOE:
7578
0
            return LLAMA_ROPE_TYPE_NORM;
7579
7580
        // the pairs of head values are offset by n_rot/2
7581
0
        case LLM_ARCH_FALCON:
7582
0
        case LLM_ARCH_FALCON_H1:
7583
0
        case LLM_ARCH_GROK:
7584
0
        case LLM_ARCH_DBRX:
7585
0
        case LLM_ARCH_BERT:
7586
0
        case LLM_ARCH_JINA_BERT_V3:
7587
0
        case LLM_ARCH_NOMIC_BERT:
7588
0
        case LLM_ARCH_NOMIC_BERT_MOE:
7589
0
        case LLM_ARCH_STABLELM:
7590
0
        case LLM_ARCH_BITNET:
7591
0
        case LLM_ARCH_QWEN:
7592
0
        case LLM_ARCH_QWEN2:
7593
0
        case LLM_ARCH_DREAM:
7594
0
        case LLM_ARCH_QWEN2MOE:
7595
0
        case LLM_ARCH_QWEN3:
7596
0
        case LLM_ARCH_QWEN3MOE:
7597
0
        case LLM_ARCH_LLADA_MOE:
7598
0
        case LLM_ARCH_OLMO2:
7599
0
        case LLM_ARCH_OLMOE:
7600
0
        case LLM_ARCH_PHI2:
7601
0
        case LLM_ARCH_PHI3:
7602
0
        case LLM_ARCH_PHIMOE:
7603
0
        case LLM_ARCH_PLAMO:
7604
0
        case LLM_ARCH_PLAMO2:
7605
0
        case LLM_ARCH_GEMMA:
7606
0
        case LLM_ARCH_GEMMA2:
7607
0
        case LLM_ARCH_GEMMA3:
7608
0
        case LLM_ARCH_GEMMA3N:
7609
0
        case LLM_ARCH_GEMMA_EMBEDDING:
7610
0
        case LLM_ARCH_STARCODER2:
7611
0
        case LLM_ARCH_OPENELM:
7612
0
        case LLM_ARCH_GPTNEOX:
7613
0
        case LLM_ARCH_CODESHELL:
7614
0
        case LLM_ARCH_ORION:
7615
0
        case LLM_ARCH_NEMOTRON:
7616
0
        case LLM_ARCH_EXAONE:
7617
0
        case LLM_ARCH_EXAONE4:
7618
0
        case LLM_ARCH_MINICPM3:
7619
0
        case LLM_ARCH_BAILINGMOE2:
7620
0
        case LLM_ARCH_DOTS1:
7621
0
        case LLM_ARCH_HUNYUAN_MOE:
7622
0
        case LLM_ARCH_OPENAI_MOE:
7623
0
        case LLM_ARCH_HUNYUAN_DENSE:
7624
0
        case LLM_ARCH_LFM2:
7625
0
        case LLM_ARCH_LFM2MOE:
7626
0
        case LLM_ARCH_SMALLTHINKER:
7627
0
        case LLM_ARCH_GLM4_MOE:
7628
0
        case LLM_ARCH_SEED_OSS:
7629
0
        case LLM_ARCH_GROVEMOE:
7630
0
        case LLM_ARCH_APERTUS:
7631
0
        case LLM_ARCH_MINIMAX_M2:
7632
0
        case LLM_ARCH_COGVLM:
7633
0
        case LLM_ARCH_PANGU_EMBED:
7634
0
        case LLM_ARCH_AFMOE:
7635
0
            return LLAMA_ROPE_TYPE_NEOX;
7636
7637
0
        case LLM_ARCH_QWEN2VL:
7638
0
            return LLAMA_ROPE_TYPE_MROPE;
7639
0
        case LLM_ARCH_QWEN3VL:
7640
0
        case LLM_ARCH_QWEN3VLMOE:
7641
0
            return LLAMA_ROPE_TYPE_IMROPE;
7642
7643
        // all model arches should be listed explicitly here
7644
0
        case LLM_ARCH_UNKNOWN:
7645
0
            GGML_ABORT("unknown architecture");
7646
0
    }
7647
7648
0
    return LLAMA_ROPE_TYPE_NONE;
7649
0
}
7650
7651
0
float llama_model_rope_freq_scale_train(const llama_model * model) {
7652
0
    return model->hparams.rope_freq_scale_train;
7653
0
}
7654
7655
0
int32_t llama_model_meta_val_str(const llama_model * model, const char * key, char * buf, size_t buf_size) {
7656
0
    const auto & it = model->gguf_kv.find(key);
7657
0
    if (it == model->gguf_kv.end()) {
7658
0
        if (buf_size > 0) {
7659
0
            buf[0] = '\0';
7660
0
        }
7661
0
        return -1;
7662
0
    }
7663
0
    return snprintf(buf, buf_size, "%s", it->second.c_str());
7664
0
}
7665
7666
0
int32_t llama_model_meta_count(const llama_model * model) {
7667
0
    return (int)model->gguf_kv.size();
7668
0
}
7669
7670
0
int32_t llama_model_meta_key_by_index(const llama_model * model, int i, char * buf, size_t buf_size) {
7671
0
    if (i < 0 || i >= (int)model->gguf_kv.size()) {
7672
0
        if (buf_size > 0) {
7673
0
            buf[0] = '\0';
7674
0
        }
7675
0
        return -1;
7676
0
    }
7677
0
    auto it = model->gguf_kv.begin();
7678
0
    std::advance(it, i);
7679
0
    return snprintf(buf, buf_size, "%s", it->first.c_str());
7680
0
}
7681
7682
0
int32_t llama_model_meta_val_str_by_index(const llama_model * model, int32_t i, char * buf, size_t buf_size) {
7683
0
    if (i < 0 || i >= (int)model->gguf_kv.size()) {
7684
0
        if (buf_size > 0) {
7685
0
            buf[0] = '\0';
7686
0
        }
7687
0
        return -1;
7688
0
    }
7689
0
    auto it = model->gguf_kv.begin();
7690
0
    std::advance(it, i);
7691
0
    return snprintf(buf, buf_size, "%s", it->second.c_str());
7692
0
}
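These metadata accessors follow the snprintf convention: the return value is the length that would have been written, or -1 when the key or index is out of range. A minimal sketch that dumps all GGUF key/value pairs of a loaded model (the helper name and buffer sizes are illustrative; assumes llama.h and <cstdio> are included):

    static void dump_model_metadata(const llama_model * model) {
        char key[256];
        char val[256];
        const int32_t n = llama_model_meta_count(model);
        for (int32_t i = 0; i < n; ++i) {
            if (llama_model_meta_key_by_index    (model, i, key, sizeof(key)) >= 0 &&
                llama_model_meta_val_str_by_index(model, i, val, sizeof(val)) >= 0) {
                printf("%s = %s\n", key, val);
            }
        }
    }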
7693
7694
0
int32_t llama_model_desc(const llama_model * model, char * buf, size_t buf_size) {
7695
0
    return snprintf(buf, buf_size, "%s", model->desc().c_str());
7696
0
}
7697
7698
0
uint64_t llama_model_size(const llama_model * model) {
7699
0
    return model->size();
7700
0
}
7701
7702
0
const char * llama_model_chat_template(const llama_model * model, const char * name) {
7703
0
    const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE)
7704
0
        : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
7705
0
    const auto & it = model->gguf_kv.find(key);
7706
0
    if (it == model->gguf_kv.end()) {
7707
        // one-off fix for very popular models (so we are not flooded with issues)
7708
        // do not extend this list unless absolutely necessary
7709
        // Mistral-Small-2503 does not have a built-in chat template
7710
0
        llama_vocab_pre_type pre_type = model->vocab.get_pre_type();
7711
0
        if (!name && pre_type == LLAMA_VOCAB_PRE_TYPE_TEKKEN && model->layers.size() == 40) {
7712
0
            return "mistral-v7-tekken";
7713
0
        }
7714
7715
0
        return nullptr;
7716
0
    }
7717
7718
0
    return it->second.c_str();
7719
0
}
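A minimal sketch for retrieving the default built-in chat template (passing nullptr as the name selects the default key; a nullptr result means the GGUF carries no template and the caller must supply its own):

    const char * tmpl = llama_model_chat_template(model, /*name=*/nullptr);
    if (tmpl == nullptr) {
        // no built-in template; fall back to a user-provided one
    }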
7720
7721
0
uint64_t llama_model_n_params(const llama_model * model) {
7722
0
    return model->n_elements();
7723
0
}
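A minimal sketch combining llama_model_desc, llama_model_size and llama_model_n_params into a human-readable summary line (the conversions to GiB and billions of parameters are for display only; model is assumed loaded and <cstdio> included):

    char desc[256];
    llama_model_desc(model, desc, sizeof(desc));
    printf("%s | size = %.2f GiB | params = %.2f B\n",
           desc,
           llama_model_size(model)     / 1024.0 / 1024.0 / 1024.0,
           llama_model_n_params(model) / 1e9);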
7724
7725
0
bool llama_model_has_encoder(const llama_model * model) {
7726
0
    switch (model->arch) {
7727
0
        case LLM_ARCH_T5:        return true;
7728
0
        case LLM_ARCH_T5ENCODER: return true;
7729
0
        default:                 return false;
7730
0
    }
7731
0
}
7732
7733
0
bool llama_model_has_decoder(const llama_model * model) {
7734
0
    switch (model->arch) {
7735
0
        case LLM_ARCH_T5ENCODER: return false;
7736
0
        default:                 return true;
7737
0
    }
7738
0
}
7739
7740
0
llama_token llama_model_decoder_start_token(const llama_model * model) {
7741
0
    return model->hparams.dec_start_token_id;
7742
0
}
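For encoder-decoder models such as T5, a caller typically branches on these helpers before decoding; a minimal sketch (ctx and batch are assumed to exist; llama_encode, llama_vocab_bos and LLAMA_TOKEN_NULL are declared in llama.h):

    if (llama_model_has_encoder(model)) {
        llama_encode(ctx, batch);   // run the encoder pass over the prompt first
        llama_token dec_start = llama_model_decoder_start_token(model);
        if (dec_start == LLAMA_TOKEN_NULL) {
            dec_start = llama_vocab_bos(llama_model_get_vocab(model));
        }
        // begin decoding from dec_start
    }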
7743
7744
0
bool llama_model_is_recurrent(const llama_model * model) {
7745
0
    return llm_arch_is_recurrent(model->arch);
7746
0
}
7747
7748
0
bool llama_model_is_hybrid(const llama_model * model) {
7749
0
    return llm_arch_is_hybrid(model->arch);
7750
0
}
7751
7752
0
bool llama_model_is_diffusion(const llama_model * model) {
7753
0
    return llm_arch_is_diffusion(model->arch);
7754
0
}
7755
7756
0
const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model) {
7757
0
    return model->tensors_by_name;
7758
0
}