#include "llama.h"

#include "ggml-cpp.h"
#include "llama-impl.h"

#include "llama-chat.h"
#include "llama-context.h"
#include "llama-mmap.h"
#include "llama-vocab.h"
#include "llama-model-loader.h"
#include "llama-model-saver.h"
#include "llama-model.h"

#include "ggml.h"
#include "ggml-backend.h"
#include "gguf.h"

#include <algorithm>
#include <cassert>
#include <cinttypes>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <stdexcept>

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

//
// interface implementation
//

const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type) {
    switch (flash_attn_type) {
        case LLAMA_FLASH_ATTN_TYPE_AUTO:
            return "auto";
        case LLAMA_FLASH_ATTN_TYPE_DISABLED:
            return "disabled";
        case LLAMA_FLASH_ATTN_TYPE_ENABLED:
            return "enabled";
    }
    GGML_ABORT("fatal error");
}

struct llama_device_memory_data {
    int64_t total;
    int64_t free;
    llama_memory_breakdown_data mb;
};

static std::vector<llama_device_memory_data> llama_get_device_memory_data(
        const char * path_model, const llama_model_params * mparams, const llama_context_params * cparams,
        std::vector<ggml_backend_dev_t> & devs, uint32_t & hp_ngl, uint32_t & hp_n_ctx_train, uint32_t & hp_n_expert,
        const ggml_log_level log_level) {
    struct user_data_t {
        struct {
            ggml_log_callback callback;
            void * user_data;
        } original_logger;
        ggml_log_level min_level; // prints below this log level go to debug log
    };
    user_data_t ud;
    llama_log_get(&ud.original_logger.callback, &ud.original_logger.user_data);
    ud.min_level = log_level;

    llama_log_set([](ggml_log_level level, const char * text, void * user_data) {
        const user_data_t * ud = (const user_data_t *) user_data;
        const ggml_log_level level_eff = level >= ud->min_level ? level : GGML_LOG_LEVEL_DEBUG;
        ud->original_logger.callback(level_eff, text, ud->original_logger.user_data);
    }, &ud);

    llama_model_params mparams_copy = *mparams;
    mparams_copy.no_alloc  = true;
    mparams_copy.use_mmap  = false;
    mparams_copy.use_mlock = false;

    llama_model * model = llama_model_load_from_file(path_model, mparams_copy);
    if (model == nullptr) {
        llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
        throw std::runtime_error("failed to load model");
    }

    llama_context * ctx = llama_init_from_model(model, *cparams);
    if (ctx == nullptr) {
        llama_model_free(model);
        llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
        throw std::runtime_error("failed to create llama_context from model");
    }

    std::vector<llama_device_memory_data> ret(model->devices.size());

    std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown = ctx->memory_breakdown();

    for (const auto & [buft, mb] : memory_breakdown) {
        if (ggml_backend_buft_is_host(buft)) {
            continue;
        }

        ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
        if (!dev) {
            continue;
        }
        for (size_t i = 0; i < ret.size(); i++) {
            if (model->devices[i] == dev) {
                ret[i].mb.model   += mb.model;
                ret[i].mb.context += mb.context;
                ret[i].mb.compute += mb.compute;
                break;
            }
        }
    }
    for (size_t i = 0; i < ret.size(); i++) {
        size_t free;
        size_t total;
        ggml_backend_dev_memory(model->devices[i], &free, &total);

        // devices can return 0 bytes for free and total memory if they do not
        // have any to report. in this case, we will use the host memory as a fallback
        // fixes: https://github.com/ggml-org/llama.cpp/issues/18577
        if (free == 0 && total == 0) {
            ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
            if (cpu_dev == nullptr) {
                throw std::runtime_error(format("%s: no CPU backend found", __func__));
            }
            ggml_backend_dev_memory(cpu_dev, &free, &total);
        }
        ret[i].free  = free;
        ret[i].total = total;
    }

    devs           = model->devices;
    hp_ngl         = model->hparams.n_layer;
    hp_n_ctx_train = model->hparams.n_ctx_train;
    hp_n_expert    = model->hparams.n_expert;

    llama_memory_breakdown_print(ctx); // goes to debug log

    llama_free(ctx);
    llama_model_free(model);
    llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
    return ret;
}
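
// illustrative sketch of how the helper above is invoked internally (cf.
// llama_params_fit_impl below); the model path and values are placeholders:
//
//   std::vector<ggml_backend_dev_t> devs;
//   uint32_t n_layer = 0, n_ctx_train = 0, n_expert = 0;
//   llama_model_params   mp = llama_model_default_params();
//   llama_context_params cp = llama_context_default_params();
//   const auto dmds = llama_get_device_memory_data(
//       "/path/to/model.gguf", &mp, &cp, devs, n_layer, n_ctx_train, n_expert, GGML_LOG_LEVEL_ERROR);
//   // dmds[i].free/total describe the device's memory, dmds[i].mb splits the
//   // projected use into model weights, context (KV cache), and compute buffers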

// enum to identify part of a layer for distributing its tensors:
enum layer_fraction_t {
    LAYER_FRACTION_NONE = 0, // nothing
    LAYER_FRACTION_ATTN = 1, // attention
    LAYER_FRACTION_UP   = 2, // attention + up
    LAYER_FRACTION_GATE = 3, // attention + up + gate
    LAYER_FRACTION_MOE  = 4, // everything but sparse MoE weights
};
// this enum is only used in llama_params_fit_impl but needs to be defined outside of it to fix a Windows compilation issue

class llama_params_fit_exception : public std::runtime_error {
    using std::runtime_error::runtime_error;
};

static void llama_params_fit_impl(
        const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
        float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
        size_t * margins_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
    constexpr int64_t MiB = 1024*1024;
    typedef std::vector<llama_device_memory_data> dmds_t;
    const llama_model_params default_mparams = llama_model_default_params();

    std::vector<ggml_backend_dev_t> devs;
    uint32_t hp_ngl = 0; // hparams.n_layer
    uint32_t hp_nct = 0; // hparams.n_ctx_train
    uint32_t hp_nex = 0; // hparams.n_expert

    // step 1: get data for default parameters and check whether any changes are necessary in the first place

    LLAMA_LOG_DEBUG("%s: getting device memory data for initial parameters:\n", __func__);
    const dmds_t dmds_full = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
    const size_t nd = devs.size(); // number of devices
    if (nd == 0) {
        LLAMA_LOG_INFO("%s: no devices with dedicated memory found\n", __func__);
        return;
    }

    std::vector<int64_t> margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
    margins.reserve(nd);
    for (size_t id = 0; id < nd; id++) {
        margins.push_back(margins_s[id]);
    }

    std::vector<std::string> dev_names;
    {
        dev_names.reserve(nd);
        size_t max_length = 0;
        for (ggml_backend_dev_t dev : devs) {
            std::string name = ggml_backend_dev_name(dev);
            name += " (";
            name += ggml_backend_dev_description(dev);
            name += ")";
            dev_names.push_back(name);
            max_length = std::max(max_length, name.length());
        }
        for (std::string & dn : dev_names) {
            dn.insert(dn.end(), max_length - dn.length(), ' ');
        }
    }

    int64_t sum_free            = 0;
    int64_t sum_projected_free  = 0;
    int64_t sum_projected_used  = 0;
    int64_t sum_projected_model = 0;
    std::vector<int64_t> projected_free_per_device;
    projected_free_per_device.reserve(nd);

    if (nd > 1) {
        LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);
    }
    for (size_t id = 0; id < nd; id++) {
        const llama_device_memory_data & dmd = dmds_full[id];

        const int64_t projected_used = dmd.mb.total();
        const int64_t projected_free = dmd.free - projected_used;
        projected_free_per_device.push_back(projected_free);

        sum_free            += dmd.free;
        sum_projected_used  += projected_used;
        sum_projected_free  += projected_free;
        sum_projected_model += dmd.mb.model;

        if (nd > 1) {
            LLAMA_LOG_INFO("%s:   - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
                __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
        }
    }
    assert(sum_free >= 0 && sum_projected_used >= 0);
    LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
        __func__, sum_projected_used/MiB, sum_free/MiB);
    if (nd == 1) {
        if (projected_free_per_device[0] >= margins[0]) {
            LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
                __func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
            return;
        }
    } else {
        bool changes_needed = false;
        for (size_t id = 0; id < nd; id++) {
            if (projected_free_per_device[id] < margins[id]) {
                changes_needed = true;
                break;
            }
        }
        if (!changes_needed) {
            LLAMA_LOG_INFO("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
            return;
        }
    }

    // step 2: try reducing memory use by reducing the context size

    {
        int64_t global_surplus = sum_projected_free;
        for (size_t id = 0; id < nd; id++) {
            global_surplus -= margins[id];
        }
        if (global_surplus < 0) {
            if (nd == 1) {
                LLAMA_LOG_INFO("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n",
                    __func__, margins[0]/MiB, -global_surplus/MiB);
            } else {
                LLAMA_LOG_INFO(
                    "%s: cannot meet free memory targets on all devices, need to use %" PRId64 " MiB less in total\n",
                    __func__, -global_surplus/MiB);
            }
            if (cparams->n_ctx == 0) {
                if (hp_nct > n_ctx_min) {
                    int64_t sum_used_target = sum_free;
                    for (size_t id = 0; id < nd; id++) {
                        sum_used_target -= margins[id];
                    }
                    if (nd > 1) {
                        // for multiple devices we need to be more conservative in terms of how much context we think can fit:
                        //   - for dense models only whole layers can be assigned to devices
                        //   - for MoE models only whole tensors can be assigned to devices, which we estimate to be <= 1/3 of a layer
                        //   - on average we expect a waste of 0.5 layers/tensors per device
                        //   - use slightly more than the expected average for nd devices to be safe
                        const int64_t model_per_layer = sum_projected_model / std::min(uint32_t(mparams->n_gpu_layers), hp_ngl);
                        sum_used_target -= (nd + 1) * model_per_layer / (hp_nex == 0 ? 2 : 6);
                    }

                    int64_t sum_projected_used_min_ctx = 0;
                    cparams->n_ctx = n_ctx_min;
                    const dmds_t dmds_min_ctx = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
                    for (const auto & dmd : dmds_min_ctx) {
                        sum_projected_used_min_ctx += dmd.mb.total();
                    }
                    if (sum_used_target > sum_projected_used_min_ctx) {
                        // linear interpolation between minimum and maximum context size:
                        cparams->n_ctx += (hp_nct - n_ctx_min) * (sum_used_target - sum_projected_used_min_ctx)
                            / (sum_projected_used - sum_projected_used_min_ctx);
                        cparams->n_ctx = std::max(cparams->n_ctx - cparams->n_ctx % 256, n_ctx_min); // round down context for CUDA backend

                        const int64_t bytes_per_ctx = (sum_projected_used - sum_projected_used_min_ctx) / (hp_nct - n_ctx_min);
                        const int64_t memory_reduction = (hp_nct - cparams->n_ctx) * bytes_per_ctx;
                        LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
                            __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
                        if (nd == 1) {
                            LLAMA_LOG_INFO("%s: entire model can be fit by reducing context\n", __func__);
                            return;
                        }
                        LLAMA_LOG_INFO("%s: entire model should be fit across devices by reducing context\n", __func__);
                    } else {
                        const int64_t memory_reduction = sum_projected_used - sum_projected_used_min_ctx;
                        LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
                            __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
                    }
                } else {
                    if (n_ctx_min == UINT32_MAX) {
                        LLAMA_LOG_INFO("%s: user has requested full context size of %" PRIu32 " -> no change\n", __func__, hp_nct);
                    } else {
                        LLAMA_LOG_INFO("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
                            __func__, hp_nct, n_ctx_min);
                    }
                }
            } else {
                LLAMA_LOG_INFO("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx);
            }
        }
    }

    if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {
        throw llama_params_fit_exception("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
    }
    if (nd > 1) {
        if (!tensor_split) {
            throw llama_params_fit_exception("did not provide a buffer to write the tensor_split to, abort");
        }
        if (mparams->tensor_split) {
            for (size_t id = 0; id < nd; id++) {
                if (mparams->tensor_split[id] != 0.0f) {
                    throw llama_params_fit_exception("model_params::tensor_split already set by user, abort");
                }
            }
        }
        if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) {
            throw llama_params_fit_exception("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
        }
    }
    if (!tensor_buft_overrides) {
        throw llama_params_fit_exception("did not provide buffer to set tensor_buft_overrides, abort");
    }
    if (mparams->tensor_buft_overrides && (mparams->tensor_buft_overrides->pattern || mparams->tensor_buft_overrides->buft)) {
        throw llama_params_fit_exception("model_params::tensor_buft_overrides already set by user, abort");
    }

    // step 3: iteratively fill the devices back-to-front with "dense" layers
    //   - for a dense model simply fill full layers, giving each device a contiguous slice of the model
    //   - for a MoE model, same as dense model but with all MoE tensors in system memory

    // utility function that returns a static C string matching the tensors for a specific layer index and layer fraction:
    auto get_overflow_pattern = [&](const size_t il, const layer_fraction_t lf) -> const char * {
        constexpr size_t n_strings = 1000;
        if (il >= n_strings) {
            throw std::runtime_error("at most " + std::to_string(n_strings) + " model layers are supported");
        }
        switch (lf) {
            case LAYER_FRACTION_ATTN: {
                static std::array<std::string, n_strings> patterns;
                if (patterns[il].empty()) {
                    patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(up|gate|down).*";
                }
                return patterns[il].c_str();
            }
            case LAYER_FRACTION_UP: {
                static std::array<std::string, n_strings> patterns;
                if (patterns[il].empty()) {
                    patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(gate|down).*";
                }
                return patterns[il].c_str();
            }
            case LAYER_FRACTION_GATE: {
                static std::array<std::string, n_strings> patterns;
                if (patterns[il].empty()) {
                    patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_down.*";
                }
                return patterns[il].c_str();
            }
            case LAYER_FRACTION_MOE: {
                static std::array<std::string, n_strings> patterns;
                if (patterns[il].empty()) {
                    patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(up|down|gate)_(ch|)exps";
                }
                return patterns[il].c_str();
            }
            default:
                GGML_ABORT("fatal error");
        }
    };
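
    // for example (derived from the patterns above): get_overflow_pattern(17, LAYER_FRACTION_UP)
    // yields "blk\.17\.ffn_(gate|down).*", i.e. the attention and up tensors of layer 17 stay
    // on the device while its gate/down tensors overflow to another buffer type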

    struct ngl_t {
        uint32_t n_layer = 0; // number of total layers
        uint32_t n_part  = 0; // number of partial layers, <= n_layer

        // for the first partial layer varying parts can overflow, all further layers use LAYER_FRACTION_MOE:
        layer_fraction_t overflow_type = LAYER_FRACTION_MOE;

        uint32_t n_full() const {
            assert(n_layer >= n_part);
            return n_layer - n_part;
        }
    };
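    // e.g. n_layer=5 with n_part=2 means 3 fully resident layers (n_full()) plus
    // 2 layers whose overflowing fraction is placed on another buffer type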

    const size_t ntbo = llama_max_tensor_buft_overrides();

    // utility function to set n_gpu_layers, tensor_split, and tensor_buft_overrides
    auto set_ngl_tensor_split_tbo = [&](
            const std::vector<ngl_t> & ngl_per_device,
            const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
            llama_model_params & mparams) {
        mparams.n_gpu_layers = 0;
        for (size_t id = 0; id < nd; id++) {
            mparams.n_gpu_layers += ngl_per_device[id].n_layer;
            if (nd > 1) {
                tensor_split[id] = ngl_per_device[id].n_layer;
            }
        }
        assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl + 1);
        uint32_t il0 = hp_ngl + 1 - mparams.n_gpu_layers; // start index for tensor buft overrides

        mparams.tensor_split = tensor_split;

        size_t itbo = 0;
        for (size_t id = 0; id < nd; id++) {
            il0 += ngl_per_device[id].n_full();
            for (uint32_t il = il0; il < il0 + ngl_per_device[id].n_part; il++) {
                if (itbo + 1 >= ntbo) {
                    tensor_buft_overrides[itbo].pattern = nullptr;
                    tensor_buft_overrides[itbo].buft    = nullptr;
                    itbo++;
                    mparams.tensor_buft_overrides = tensor_buft_overrides;
                    throw llama_params_fit_exception("llama_max_tensor_buft_overrides() == "
                        + std::to_string(ntbo) + " is insufficient for model");
                }
                tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE);
                tensor_buft_overrides[itbo].buft = il == il0 ? overflow_bufts[id] : ggml_backend_cpu_buffer_type();
                itbo++;
            }
            il0 += ngl_per_device[id].n_part;
        }
        tensor_buft_overrides[itbo].pattern = nullptr;
        tensor_buft_overrides[itbo].buft    = nullptr;
        itbo++;
        mparams.tensor_buft_overrides = tensor_buft_overrides;
    };
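
    // worked example (hypothetical numbers): with nd=2, hp_ngl=40 and
    // ngl_per_device = {{n_layer=10, n_part=1}, {n_layer=31, n_part=0}} this sets
    // n_gpu_layers=41 (all 40 repeating layers + output), tensor_split={10, 31},
    // and writes a single override that sends the overflowing fraction of layer 9
    // (the partial layer on device 0) to overflow_bufts[0]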

    // utility function that returns the memory use per device for given numbers of layers per device
    auto get_memory_for_layers = [&](
            const char * func_name,
            const std::vector<ngl_t> & ngl_per_device,
            const std::vector<ggml_backend_buffer_type_t> & overflow_bufts) -> std::vector<int64_t> {
        llama_model_params mparams_copy = *mparams;
        set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy);

        const dmds_t dmd_nl = llama_get_device_memory_data(
            path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);

        LLAMA_LOG_DEBUG("%s: memory for test allocation by device:\n", func_name);
        for (size_t id = 0; id < nd; id++) {
            const ngl_t & n = ngl_per_device[id];
            LLAMA_LOG_DEBUG(
                "%s: id=%zu, n_layer=%2" PRIu32 ", n_part=%2" PRIu32 ", overflow_type=%d, mem=%6" PRId64 " MiB\n",
                func_name, id, n.n_layer, n.n_part, int(n.overflow_type), dmd_nl[id].mb.total()/MiB);
        }

        std::vector<int64_t> ret;
        ret.reserve(nd);
        for (const llama_device_memory_data & dmd : dmd_nl) {
            ret.push_back(dmd.mb.total());
        }
        return ret;
    };

    int64_t global_surplus_cpu_moe = 0;
    if (hp_nex > 0) {
        const static std::string pattern_moe_all = "blk\\.\\d+\\.ffn_(up|down|gate)_(ch|)exps"; // matches all MoE tensors
        ggml_backend_buffer_type_t cpu_buft = ggml_backend_cpu_buffer_type();
        tensor_buft_overrides[0] = {pattern_moe_all.c_str(), cpu_buft};
        tensor_buft_overrides[1] = {nullptr, nullptr};
        mparams->tensor_buft_overrides = tensor_buft_overrides;

        LLAMA_LOG_DEBUG("%s: getting device memory data with all MoE tensors moved to system memory:\n", __func__);
        const dmds_t dmds_cpu_moe = llama_get_device_memory_data(
            path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);

        for (size_t id = 0; id < nd; id++) {
            global_surplus_cpu_moe += dmds_cpu_moe[id].free;
            global_surplus_cpu_moe -= int64_t(dmds_cpu_moe[id].mb.total()) + margins[id];
        }

        if (global_surplus_cpu_moe > 0) {
            LLAMA_LOG_INFO("%s: with only dense weights in device memory there is a total surplus of %" PRId64 " MiB\n",
                __func__, global_surplus_cpu_moe/MiB);
        } else {
            LLAMA_LOG_INFO("%s: with only dense weights in device memory there is still a total deficit of %" PRId64 " MiB\n",
                __func__, -global_surplus_cpu_moe/MiB);
        }

        // reset
        tensor_buft_overrides[0] = {nullptr, nullptr};
        mparams->tensor_buft_overrides = tensor_buft_overrides;
    }

    std::vector<int64_t> targets; // maximum acceptable memory use per device
    targets.reserve(nd);
    for (size_t id = 0; id < nd; id++) {
        targets.push_back(dmds_full[id].free - margins[id]);
        LLAMA_LOG_DEBUG("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
    }

    std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the first partial layer of a device overflows to:
    overflow_bufts.reserve(nd);
    for (size_t id = 0; id < nd; id++) {
        overflow_bufts.push_back(ggml_backend_cpu_buffer_type());
    }

    std::vector<ngl_t> ngl_per_device(nd);
    std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts);

    // optimize the number of layers per device using the method of false position:
    //   - ngl_per_device has 0 layers for each device, lower bound
    //   - try a "high" configuration where a device is given all unassigned layers
    //   - interpolate the memory use / layer between low and high linearly to get a guess where it meets our target
    //   - check memory use of our guess, replace either the low or high bound
    //   - once we only have a difference of a single layer, stop and return the lower bound that just barely still fits
    //   - the last device has the output layer, which cannot be a partial layer
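    // numeric illustration (hypothetical numbers): if the low bound uses mem[id]=2 GiB,
    // the high bound mem_high[id]=14 GiB with delta=12 more layers, and the target is
    // 8 GiB, the interpolated step is 12 * (8-2)/(14-2) = 6 layers; the test allocation
    // then replaces whichever bound it falls on, shrinking delta until it reaches 1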
    if (hp_nex == 0) {
        LLAMA_LOG_INFO("%s: filling dense layers back-to-front:\n", __func__);
    } else {
        LLAMA_LOG_INFO("%s: filling dense-only layers back-to-front:\n", __func__);
    }
    for (int id = nd - 1; id >= 0; id--) {
        uint32_t n_unassigned = hp_ngl + 1;
        for (size_t jd = id + 1; jd < nd; ++jd) {
            assert(n_unassigned >= ngl_per_device[jd].n_layer);
            n_unassigned -= ngl_per_device[jd].n_layer;
        }

        std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
        ngl_per_device_high[id].n_layer = n_unassigned;
        if (hp_nex > 0) {
            ngl_per_device_high[id].n_part = size_t(id) < nd - 1 ? ngl_per_device_high[id].n_layer : ngl_per_device_high[id].n_layer - 1;
        }
        if (ngl_per_device_high[id].n_layer > 0) {
            std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
            if (mem_high[id] > targets[id]) {
                assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer);
                uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
                LLAMA_LOG_DEBUG("%s: start filling device %" PRIu32 ", delta=%" PRIu32 "\n", __func__, id, delta);
                while (delta > 1) {
                    uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
                    step_size = std::max(step_size, uint32_t(1));
                    step_size = std::min(step_size, delta - 1);

                    std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
                    ngl_per_device_test[id].n_layer += step_size;
                    if (hp_nex) {
                        ngl_per_device_test[id].n_part += size_t(id) == nd - 1 && ngl_per_device_test[id].n_part == 0 ?
                            step_size - 1 : step_size; // the first layer is the output layer which must always be full
                    }
                    const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);

                    if (mem_test[id] <= targets[id]) {
                        ngl_per_device = ngl_per_device_test;
                        mem            = mem_test;
                        LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
                    } else {
                        ngl_per_device_high = ngl_per_device_test;
                        mem_high            = mem_test;
                        LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device_high[id].n_layer);
                    }
                    delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
                }
            } else {
                assert(ngl_per_device_high[id].n_layer == n_unassigned);
                ngl_per_device = ngl_per_device_high;
                mem            = mem_high;
                LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
            }
        }

        const int64_t projected_margin = dmds_full[id].free - mem[id];
        LLAMA_LOG_INFO(
            "%s:   - %s: %2" PRIu32 " layers, %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
            __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, mem[id]/MiB, projected_margin/MiB);
    }
    if (hp_nex == 0 || global_surplus_cpu_moe <= 0) {
        set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
        return;
    }

    // step 4: for a MoE model where all dense tensors fit,
    //     convert the dense-only layers in the back to full layers in the front until all devices are full
    // essentially the same procedure as for the dense-only layers except front-to-back
    // also, try fitting at least part of one more layer to reduce waste for "small" GPUs with e.g. 24 GiB VRAM

    size_t id_dense_start = nd;
    for (int id = nd - 1; id >= 0; id--) {
        if (ngl_per_device[id].n_layer > 0) {
            id_dense_start = id;
            continue;
        }
        break;
    }
    assert(id_dense_start < nd);

    LLAMA_LOG_INFO("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__);
    for (size_t id = 0; id <= id_dense_start && id_dense_start < nd; id++) {
        std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
        for (size_t jd = id_dense_start; jd < nd; jd++) {
            const uint32_t n_layer_move = jd < nd - 1 ? ngl_per_device_high[jd].n_layer : ngl_per_device_high[jd].n_layer - 1;
            ngl_per_device_high[id].n_layer += n_layer_move;
            ngl_per_device_high[jd].n_layer -= n_layer_move;
            ngl_per_device_high[jd].n_part = 0;
        }
        size_t id_dense_start_high = nd - 1;
        std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);

        if (mem_high[id] > targets[id]) {
            assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
            uint32_t delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
            while (delta > 1) {
                uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
                step_size = std::max(step_size, uint32_t(1));
                step_size = std::min(step_size, delta - 1);

                std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
                size_t id_dense_start_test = id_dense_start;
                uint32_t n_converted_test = 0;
                for (;id_dense_start_test < nd; id_dense_start_test++) {
                    const uint32_t n_convert_jd = std::min(step_size - n_converted_test, ngl_per_device_test[id_dense_start_test].n_part);
                    ngl_per_device_test[id_dense_start_test].n_layer -= n_convert_jd;
                    ngl_per_device_test[id_dense_start_test].n_part -= n_convert_jd;
                    ngl_per_device_test[id].n_layer += n_convert_jd;
                    n_converted_test += n_convert_jd;

                    if (ngl_per_device_test[id_dense_start_test].n_part > 0) {
                        break;
                    }
                }
                const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);

                if (mem_test[id] <= targets[id]) {
                    ngl_per_device = ngl_per_device_test;
                    mem            = mem_test;
                    id_dense_start = id_dense_start_test;
                    LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
                        __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
                } else {
                    ngl_per_device_high = ngl_per_device_test;
                    mem_high            = mem_test;
                    id_dense_start_high = id_dense_start_test;
                    LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n",
                        __func__, id, ngl_per_device_high[id].n_layer, ngl_per_device_high[id].n_part, id_dense_start_high);
                }
                assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
                delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
            }
        } else {
            ngl_per_device = ngl_per_device_high;
            mem            = mem_high;
            id_dense_start = id_dense_start_high;
            LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
                __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
        }

        // try to fit at least part of one more layer
        if (ngl_per_device[id_dense_start].n_layer > (id < nd - 1 ? 0 : 1)) {
            std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
            size_t id_dense_start_test = id_dense_start;
            ngl_per_device_test[id_dense_start_test].n_layer--;
            ngl_per_device_test[id_dense_start_test].n_part--;
            ngl_per_device_test[id].n_layer++;
            ngl_per_device_test[id].n_part++;
            if (ngl_per_device_test[id_dense_start_test].n_part == 0) {
                id_dense_start_test++;
            }
            ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP;
            std::vector<ggml_backend_buffer_type_t> overflow_bufts_test = overflow_bufts;
            if (id < nd - 1) {
                overflow_bufts_test[id] = ggml_backend_dev_buffer_type(devs[id + 1]);
            }
            LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
            std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
            if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
                ngl_per_device = ngl_per_device_test;
                overflow_bufts = overflow_bufts_test;
                mem            = mem_test;
                id_dense_start = id_dense_start_test;
                LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n",
                    __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);

                ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
                LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
                mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
                if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
                    ngl_per_device = ngl_per_device_test;
                    overflow_bufts = overflow_bufts_test;
                    mem            = mem_test;
                    id_dense_start = id_dense_start_test;
                    LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n",
                        __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
                }
            } else {
                ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
                LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
                mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
                if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
                    ngl_per_device = ngl_per_device_test;
                    overflow_bufts = overflow_bufts_test;
                    mem            = mem_test;
                    id_dense_start = id_dense_start_test;
                    LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n",
                        __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
                }
            }
        }

        const int64_t projected_margin = dmds_full[id].free - mem[id];
        LLAMA_LOG_INFO(
            "%s:   - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
            __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
    }

    // print info for devices that were not changed during the conversion from dense only to full layers:
    for (size_t id = id_dense_start + 1; id < nd; id++) {
        const int64_t projected_margin = dmds_full[id].free - mem[id];
        LLAMA_LOG_INFO(
            "%s:   - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
            __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
    }

    set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
}

enum llama_params_fit_status llama_params_fit(
        const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
        float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
        size_t * margins, uint32_t n_ctx_min, enum ggml_log_level log_level) {
    const int64_t t0_us = llama_time_us();
    llama_params_fit_status status = LLAMA_PARAMS_FIT_STATUS_SUCCESS;
    try {
        llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margins, n_ctx_min, log_level);
        LLAMA_LOG_INFO("%s: successfully fit params to free device memory\n", __func__);
    } catch (const llama_params_fit_exception & e) {
        LLAMA_LOG_WARN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
        status = LLAMA_PARAMS_FIT_STATUS_FAILURE;
    } catch (const std::runtime_error & e) {
        LLAMA_LOG_ERROR("%s: encountered an error while trying to fit params to free device memory: %s\n", __func__, e.what());
        status = LLAMA_PARAMS_FIT_STATUS_ERROR;
    }
    const int64_t t1_us = llama_time_us();
    LLAMA_LOG_INFO("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6);
    return status;
}
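
// illustrative sketch (hypothetical caller code, not part of this file): how an
// application might use llama_params_fit before loading; buffer sizes follow the
// limits exposed below:
//
//   llama_model_params   mparams = llama_model_default_params();
//   llama_context_params cparams = llama_context_default_params();
//   std::vector<float> tensor_split(llama_max_devices(), 0.0f);
//   std::vector<llama_model_tensor_buft_override> tbo(llama_max_tensor_buft_overrides());
//   std::vector<size_t> margins(llama_max_devices(), 1024*1024*1024); // leave ~1 GiB free per device
//   if (llama_params_fit("/path/to/model.gguf", &mparams, &cparams,
//                        tensor_split.data(), tbo.data(),
//                        margins.data(), 4096, GGML_LOG_LEVEL_INFO) == LLAMA_PARAMS_FIT_STATUS_SUCCESS) {
//       llama_model * model = llama_model_load_from_file("/path/to/model.gguf", mparams);
//       // ... create a context with cparams, etc.
//   }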

struct llama_sampler_chain_params llama_sampler_chain_default_params() {
    struct llama_sampler_chain_params result = {
        /*.no_perf =*/ true,
    };

    return result;
}

size_t llama_max_devices(void) {
    return 16;
}

size_t llama_max_tensor_buft_overrides() {
    return 4096;
}

bool llama_supports_mmap(void) {
    return llama_mmap::SUPPORTED;
}

bool llama_supports_mlock(void) {
    return llama_mlock::SUPPORTED;
}

bool llama_supports_gpu_offload(void) {
    return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr ||
           ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU) != nullptr ||
           llama_supports_rpc();
}

bool llama_supports_rpc(void) {
    return ggml_backend_reg_by_name("RPC") != nullptr;
}

void llama_backend_init(void) {
    ggml_time_init();

    // needed to initialize f16 tables
    {
        struct ggml_init_params params = { 0, NULL, false };
        struct ggml_context * ctx = ggml_init(params);
        ggml_free(ctx);
    }
}
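
// note (sketch of the typical lifecycle, assuming standard llama.cpp usage): a process
// calls llama_backend_init() once before creating models/contexts and pairs it with
// llama_backend_free() at shutdown:
//
//   llama_backend_init();
//   // ... load models, create contexts, run inference ...
//   llama_backend_free();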

void llama_numa_init(enum ggml_numa_strategy numa) {
    if (numa != GGML_NUMA_STRATEGY_DISABLED) {
        auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
        GGML_ASSERT(dev && "CPU backend is not loaded");
        auto * reg = ggml_backend_dev_backend_reg(dev);
        auto * numa_init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_numa_init");
        if (numa_init_fn) {
            numa_init_fn(numa);
        }
    }
}

void llama_backend_free(void) {
    ggml_quantize_free();
}

int64_t llama_time_us(void) {
    return ggml_time_us();
}

// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
static int llama_model_load(struct gguf_context * metadata, llama_model_set_tensor_data_t set_tensor_data, void * set_tensor_data_ud,
        const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) {
    // loading time will be recalculated after the first eval, so
    // we take page faults deferred by mmap() into consideration
    model.t_load_us = 0;
    time_meas tm(model.t_load_us);

    model.t_start_us = tm.t_start_us;

    try {
        llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, params.use_mmap, params.use_direct_io,
            params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);

        ml.print_info();

        model.hparams.vocab_only = params.vocab_only;
        model.hparams.no_alloc   = params.no_alloc;

        try {
            model.load_arch(ml);
        } catch(const std::exception & e) {
            throw std::runtime_error("error loading model architecture: " + std::string(e.what()));
        }
        try {
            model.load_hparams(ml);
        } catch(const std::exception & e) {
            throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
        }
        if (model.arch == LLM_ARCH_CLIP) {
            throw std::runtime_error("CLIP cannot be used as main model, use it with --mmproj instead");
        }
        try {
            model.load_vocab(ml);
        } catch(const std::exception & e) {
            throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
        }

        model.load_stats(ml);
        model.print_info();

        if (params.vocab_only) {
            LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
            return 0;
        }

        if (!model.load_tensors(ml)) {
            return -2;
        }
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
        return -1;
    }

    return 0;
}

static struct llama_model * llama_model_load_from_file_impl(
        struct gguf_context * metadata,
        llama_model_set_tensor_data_t set_tensor_data,
        void * set_tensor_data_ud,
        const std::string & path_model,
        std::vector<std::string> & splits,
        struct llama_model_params params) {
    GGML_ASSERT((metadata == nullptr) != path_model.empty() && "exactly one out of metadata and path_model needs to be defined");
    ggml_time_init();

    if (!params.vocab_only && ggml_backend_reg_count() == 0) {
        LLAMA_LOG_ERROR("%s: no backends are loaded. hint: use ggml_backend_load() or ggml_backend_load_all() to load a backend before calling this function\n", __func__);
        return nullptr;
    }

    unsigned cur_percentage = 0;
    if (params.progress_callback == NULL) {
        params.progress_callback_user_data = &cur_percentage;
        params.progress_callback = [](float progress, void * ctx) {
            unsigned * cur_percentage_p = (unsigned *) ctx;
            unsigned percentage = (unsigned) (100 * progress);
            while (percentage > *cur_percentage_p) {
                *cur_percentage_p = percentage;
                LLAMA_LOG_CONT(".");
                if (percentage >= 100) {
                    LLAMA_LOG_CONT("\n");
                }
            }
            return true;
        };
    }

    llama_model * model = new llama_model(params);

    // create list of devices to use with this model
    if (params.devices) {
        for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {
            model->devices.push_back(*dev);
        }
    } else {
        // default device selection

        // build list of available devices
        std::vector<ggml_backend_dev_t> gpus;
        std::vector<ggml_backend_dev_t> igpus;
        std::vector<ggml_backend_dev_t> rpc_servers;

        for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
            switch (ggml_backend_dev_type(dev)) {
                case GGML_BACKEND_DEVICE_TYPE_CPU:
                case GGML_BACKEND_DEVICE_TYPE_ACCEL:
                    // skip CPU backends since they are handled separately
                    break;

                case GGML_BACKEND_DEVICE_TYPE_GPU: {
                    ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
                    if (ggml_backend_reg_name(reg) == std::string("RPC")) {
                        rpc_servers.push_back(dev);
                    } else {
                        // check if there is already a GPU with the same device id
                        ggml_backend_dev_props props;
                        ggml_backend_dev_get_props(dev, &props);
                        auto it = std::find_if(gpus.begin(), gpus.end(), [&props](ggml_backend_dev_t d) {
                            ggml_backend_dev_props d_props;
                            ggml_backend_dev_get_props(d, &d_props);
                            if (props.device_id && d_props.device_id) {
                                return strcmp(props.device_id, d_props.device_id) == 0;
                            }
                            return false;
                        });

                        if (it != gpus.end()) {
                            LLAMA_LOG_INFO("%s: skipping device %s (%s) with id %s - already using device %s (%s) with the same id\n",
                                    __func__,
                                    ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
                                    props.device_id ? props.device_id : "unknown id",
                                    ggml_backend_dev_name(*it), ggml_backend_dev_description(*it));
                        } else {
                            gpus.push_back(dev);
                        }
                    }
                    break;
                }

                case GGML_BACKEND_DEVICE_TYPE_IGPU:
                    igpus.push_back(dev);
                    break;
            }
        }

        // add RPC servers at the front of the list to minimize network transfers
        model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());

        // add GPUs
        model->devices.insert(model->devices.end(), gpus.begin(), gpus.end());

        // add integrated GPUs only if no other devices were found
        if (model->devices.empty()) {
            model->devices.insert(model->devices.end(), igpus.begin(), igpus.end());
        }
    }

    // if using single GPU mode, remove all except the main GPU
    if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
        if (params.main_gpu < 0) {
            model->devices.clear();
        } else {
            if (params.main_gpu >= (int)model->devices.size()) {
                LLAMA_LOG_ERROR("%s: invalid value for main_gpu: %d (available devices: %zu)\n", __func__, params.main_gpu, model->devices.size());
                llama_model_free(model);
                return nullptr;
            }
            ggml_backend_dev_t main_gpu = model->devices[params.main_gpu];
            model->devices.clear();
            model->devices.push_back(main_gpu);
        }
    }

    for (auto * dev : model->devices) {
        ggml_backend_dev_props props;
        ggml_backend_dev_get_props(dev, &props);
        LLAMA_LOG_INFO("%s: using device %s (%s) (%s) - %zu MiB free\n", __func__,
                ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
                props.device_id ? props.device_id : "unknown id",
                props.memory_free/1024/1024);
    }

    const int status = llama_model_load(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, *model, params);
    GGML_ASSERT(status <= 0);
    if (status < 0) {
        if (status == -1) {
            LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
        } else if (status == -2) {
            LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
        }

        llama_model_free(model);
        return nullptr;
    }

    return model;
}

struct llama_model * llama_model_init_from_user(
        struct gguf_context * metadata,
        llama_model_set_tensor_data_t set_tensor_data,
        void * set_tensor_data_ud,
        struct llama_model_params params) {
    GGML_ASSERT(metadata != nullptr);
    std::string path_model;
    std::vector<std::string> splits = {};
    params.use_mmap = false;
    params.use_extra_bufts = false;
    return llama_model_load_from_file_impl(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, params);
}
// deprecated
struct llama_model * llama_load_model_from_file(
        const char * path_model,
        struct llama_model_params params) {
    return llama_model_load_from_file(path_model, params);
}

struct llama_model * llama_model_load_from_file(
        const char * path_model,
        struct llama_model_params params) {
    std::vector<std::string> splits = {};
    return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, params);
}

struct llama_model * llama_model_load_from_splits(
        const char ** paths,
        size_t n_paths,
        struct llama_model_params params) {
    std::vector<std::string> splits;
    if (n_paths == 0) {
        LLAMA_LOG_ERROR("%s: list of splits is empty\n", __func__);
        return nullptr;
    }
    splits.reserve(n_paths);
    for (size_t i = 0; i < n_paths; ++i) {
        splits.push_back(paths[i]);
    }
    return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, splits.front(), splits, params);
}

void llama_model_save_to_file(const struct llama_model * model, const char * path_model) {
    llama_model_saver ms(model);
    ms.add_kv_from_model();
    ms.add_tensors_from_model();
    ms.save(path_model);
}

//
// chat templates
//

int32_t llama_chat_apply_template(
                              const char * tmpl,
         const struct llama_chat_message * chat,
                                  size_t   n_msg,
                                    bool   add_ass,
                                    char * buf,
                                 int32_t   length) {
    const std::string curr_tmpl(tmpl == nullptr ? "chatml" : tmpl);

    // format the chat to string
    std::vector<const llama_chat_message *> chat_vec;
    chat_vec.resize(n_msg);
    for (size_t i = 0; i < n_msg; i++) {
        chat_vec[i] = &chat[i];
    }

    std::string formatted_chat;
    llm_chat_template detected_tmpl = llm_chat_detect_template(curr_tmpl);
    if (detected_tmpl == LLM_CHAT_TEMPLATE_UNKNOWN) {
        return -1;
    }
    int32_t res = llm_chat_apply_template(detected_tmpl, chat_vec, formatted_chat, add_ass);
    if (res < 0) {
        return res;
    }
    if (buf && length > 0) {
        strncpy(buf, formatted_chat.c_str(), length);
    }
    return res;
}
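
// illustrative sketch (hypothetical caller code): the usual pattern is to apply the
// template into a buffer and retry with a larger one if the returned length exceeds
// the buffer size:
//
//   llama_chat_message msgs[] = {{"system", "You are helpful."}, {"user", "Hi!"}};
//   std::vector<char> buf(1024);
//   int32_t n = llama_chat_apply_template(nullptr /* default: chatml */, msgs, 2, true, buf.data(), buf.size());
//   if (n > (int32_t) buf.size()) {
//       buf.resize(n);
//       n = llama_chat_apply_template(nullptr, msgs, 2, true, buf.data(), buf.size());
//   }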

//
// model split
//

int32_t llama_split_path(
    char * split_path,
    size_t maxlen,
    const char * path_prefix,
    int32_t split_no,
    int32_t split_count) {

    static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf";

    const int written = snprintf(
        split_path,
        maxlen,
        SPLIT_PATH_FORMAT,
        path_prefix,
        split_no + 1,
        split_count
    );

    if (written < 0 || (size_t) written >= maxlen) {
        return 0;
    }

    return (int32_t) written;
}

int32_t llama_split_prefix(
    char * split_prefix,
    size_t maxlen,
    const char * split_path,
    int32_t split_no,
    int32_t split_count) {

    const std::string str_split_path(split_path);

    char postfix[32];
    snprintf(postfix, sizeof(postfix), "-%05d-of-%05d.gguf", split_no + 1, split_count);

    const std::string str_postfix(postfix);
    if (str_split_path.size() <= str_postfix.size()) {
        return 0;
    }

    const size_t size_prefix = str_split_path.size() - str_postfix.size();

    if (str_split_path.compare(size_prefix, std::string::npos, str_postfix) == 0) {
        const size_t copy_len = std::min(size_prefix + 1, maxlen);
        snprintf(split_prefix, copy_len, "%s", split_path);

        return (int32_t) size_prefix;
    }

    return 0;
}
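
// for example (derived from the format string above):
//   llama_split_path(buf, sizeof(buf), "/models/llama", 1, 4)
// writes "/models/llama-00002-of-00004.gguf" (split_no is 0-based, the file name is 1-based),
// and llama_split_prefix(buf, sizeof(buf), "/models/llama-00002-of-00004.gguf", 1, 4)
// recovers the prefix "/models/llama"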

const char * llama_print_system_info(void) {
    static std::string s;
    s.clear(); // Clear the string, since it's static, otherwise it will accumulate data from previous calls.

    for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
        auto * reg = ggml_backend_reg_get(i);
        auto * get_features_fn = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
        if (get_features_fn) {
            ggml_backend_feature * features = get_features_fn(reg);
            s += ggml_backend_reg_name(reg);
            s += " : ";
            for (; features->name; features++) {
                s += features->name;
                s += " = ";
                s += features->value;
                s += " | ";
            }
        }
    }

    return s.c_str();
}