Coverage Report

Created: 2026-06-13 06:24

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/llama.cpp/common/fit.cpp
Line
Count
Source
1
#include "fit.h"
2
3
#include "log.h"
4
5
#include "../src/llama-ext.h"
6
7
#include <array>
8
#include <cassert>
9
#include <stdexcept>
10
#include <cinttypes>
11
#include <set>
12
#include <string>
13
#include <vector>
14
15
// this enum is only used in common_params_fit_impl but needs to be defined outside of it to fix a Windows compilation issue
16
// enum to identify part of a layer for distributing its tensors:
17
enum common_layer_fraction_t {
18
    LAYER_FRACTION_NONE = 0, // nothing
19
    LAYER_FRACTION_ATTN = 1, // attention
20
    LAYER_FRACTION_UP   = 2, // attention + up
21
    LAYER_FRACTION_GATE = 3, // attention + up + gate
22
    LAYER_FRACTION_MOE  = 4, // everything but sparse MoE weights
23
};
24
25
// Exception type thrown when the parameter fitting has to be aborted.
// It adds no state of its own and only reuses the std::runtime_error constructors.
class common_params_fit_exception : public std::runtime_error {
public:
    using std::runtime_error::runtime_error;
};
28
29
static std::vector<llama_device_memory_data> common_get_device_memory_data_impl(
30
        const char * path_model,
31
        const llama_model_params * mparams,
32
        const llama_context_params * cparams,
33
        std::vector<ggml_backend_dev_t> & devs,
34
        uint32_t & hp_ngl,
35
        uint32_t & hp_n_ctx_train,
36
        uint32_t & hp_n_expert,
37
0
        ggml_log_level log_level) {
38
0
    struct user_data_t {
39
0
        struct {
40
0
            ggml_log_callback callback;
41
0
            void * user_data;
42
0
        } original_logger;
43
0
        ggml_log_level min_level; // prints below this log level go to debug log
44
0
    };
45
0
    user_data_t ud;
46
0
    llama_log_get(&ud.original_logger.callback, &ud.original_logger.user_data);
47
0
    ud.min_level = log_level;
48
49
0
    llama_log_set([](ggml_log_level level, const char * text, void * user_data) {
50
0
        const user_data_t * ud = (const user_data_t *) user_data;
51
0
        const ggml_log_level level_eff = level >= ud->min_level ? level : GGML_LOG_LEVEL_DEBUG;
52
0
        ud->original_logger.callback(level_eff, text, ud->original_logger.user_data);
53
0
    }, &ud);
54
55
0
    llama_model_params mparams_copy = *mparams;
56
0
    mparams_copy.no_alloc  = true;
57
0
    mparams_copy.use_mmap  = false;
58
0
    mparams_copy.use_mlock = false;
59
60
0
    llama_model * model = llama_model_load_from_file(path_model, mparams_copy);
61
0
    if (model == nullptr) {
62
0
        llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
63
0
        throw std::runtime_error("failed to load model");
64
0
    }
65
66
0
    llama_context * ctx = llama_init_from_model(model, *cparams);
67
0
    if (ctx == nullptr) {
68
0
        llama_model_free(model);
69
0
        llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
70
0
        throw std::runtime_error("failed to create llama_context from model");
71
0
    }
72
73
0
    const size_t nd = llama_model_n_devices(model);
74
0
    std::vector<llama_device_memory_data> ret(nd + 1);
75
76
0
    llama_memory_breakdown memory_breakdown = llama_get_memory_breakdown(ctx);
77
78
0
    for (const auto & [buft, mb] : memory_breakdown) {
79
0
        if (ggml_backend_buft_is_host(buft)) {
80
0
            ret.back().mb.model   += mb.model;
81
0
            ret.back().mb.context += mb.context;
82
0
            ret.back().mb.compute += mb.compute;
83
0
            continue;
84
0
        }
85
86
0
        ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
87
0
        if (!dev) {
88
0
            continue;
89
0
        }
90
0
        for (size_t i = 0; i < nd; i++) {
91
0
            if (dev == llama_model_get_device(model, i)) {
92
0
                ret[i].mb.model   += mb.model;
93
0
                ret[i].mb.context += mb.context;
94
0
                ret[i].mb.compute += mb.compute;
95
0
                break;
96
0
            }
97
0
        }
98
0
    }
99
100
0
    {
101
0
        ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
102
0
        if (cpu_dev == nullptr) {
103
0
            throw std::runtime_error("no CPU backend found");
104
0
        }
105
0
        size_t free;
106
0
        size_t total;
107
0
        ggml_backend_dev_memory(cpu_dev, &free, &total);
108
0
        ret.back().free  = free;
109
0
        ret.back().total = total;
110
0
    }
111
0
    for (size_t i = 0; i < nd; i++) {
112
0
        ggml_backend_dev_t dev = llama_model_get_device(model, i);
113
114
0
        size_t free;
115
0
        size_t total;
116
0
        ggml_backend_dev_memory(dev, &free, &total);
117
118
        // Some non-GPU accelerator backends, such as BLAS, report 0/0 and rely on
119
        // the host-memory fallback. For GPU-like backends, keep 0/0 so --fit does
120
        // not assign anything to a device with an unknown memory budget.
121
0
        if (free == 0 && total == 0) {
122
0
            const enum ggml_backend_dev_type type = ggml_backend_dev_type(dev);
123
0
            if (type == GGML_BACKEND_DEVICE_TYPE_GPU || type == GGML_BACKEND_DEVICE_TYPE_IGPU) {
124
0
                LOG_WRN("%s: device %s did not report memory; --fit will not use it\n",
125
0
                        __func__, ggml_backend_dev_name(dev));
126
0
            } else {
127
0
                free  = ret.back().free;
128
0
                total = ret.back().total;
129
0
            }
130
0
        }
131
0
        ret[i].free  = free;
132
0
        ret[i].total = total;
133
0
    }
134
135
0
    devs.clear();
136
0
    for (int i = 0; i < llama_model_n_devices(model); i++) {
137
0
        devs.push_back(llama_model_get_device(model, i));
138
0
    }
139
140
0
    hp_ngl         = llama_model_n_layer(model);
141
0
    hp_n_ctx_train = llama_model_n_ctx_train(model);
142
0
    hp_n_expert    = llama_model_n_expert(model);
143
144
0
    common_memory_breakdown_print(ctx);
145
146
0
    llama_free(ctx);
147
0
    llama_model_free(model);
148
0
    llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
149
150
0
    return ret;
151
0
}
152
153
// Public wrapper around common_get_device_memory_data_impl: forwards all arguments unchanged and
// copies total, free and the model/context/compute breakdown of every returned entry into a
// common_device_memory_data_vec of the same size.
common_device_memory_data_vec common_get_device_memory_data(
        const char * path_model,
        const llama_model_params * mparams,
        const llama_context_params * cparams,
        std::vector<ggml_backend_dev_t> & devs,
        uint32_t & hp_ngl,
        uint32_t & hp_n_ctx_train,
        uint32_t & hp_n_expert,
        ggml_log_level log_level) {
    const std::vector<llama_device_memory_data> dmds = common_get_device_memory_data_impl(
            path_model, mparams, cparams, devs, hp_ngl, hp_n_ctx_train, hp_n_expert, log_level);

    const size_t n_entries = dmds.size();
    common_device_memory_data_vec result(n_entries);

    for (size_t idx = 0; idx < n_entries; ++idx) {
        const llama_device_memory_data & src = dmds[idx];
        auto & dst = result[idx];

        dst.total   = src.total;
        dst.free    = src.free;
        dst.model   = src.mb.model;
        dst.context = src.mb.context;
        dst.compute = src.mb.compute;
    }

    return result;
}
175
176
static void common_params_fit_impl(
177
        const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
178
        float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
179
0
        size_t * margins_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
180
0
    if (mparams->split_mode == LLAMA_SPLIT_MODE_TENSOR) {
181
0
        throw common_params_fit_exception("llama_params_fit is not implemented for SPLIT_MODE_TENSOR, abort");
182
0
    }
183
0
    constexpr int64_t MiB = 1024*1024;
184
0
    typedef std::vector<llama_device_memory_data> dmds_t;
185
0
    const llama_model_params default_mparams = llama_model_default_params();
186
187
0
    std::vector<ggml_backend_dev_t> devs;
188
0
    uint32_t hp_ngl = 0; // hparams.n_gpu_layers
189
0
    uint32_t hp_nct = 0; // hparams.n_ctx_train
190
0
    uint32_t hp_nex = 0; // hparams.n_expert
191
192
    // step 1: get data for default parameters and check whether any changes are necessary in the first place
193
194
0
    LOG_TRC("%s: getting device memory data for initial parameters:\n", __func__);
195
0
    const dmds_t dmds_full = common_get_device_memory_data_impl(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
196
0
    const size_t nd = devs.size(); // number of devices
197
198
0
    std::vector<int64_t> margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
199
0
    margins.reserve(nd);
200
0
    if (nd == 0) {
201
0
        margins.push_back(margins_s[0]);
202
0
    } else {
203
0
        for (size_t id = 0; id < nd; id++) {
204
0
            margins.push_back(margins_s[id]);
205
0
        }
206
0
    }
207
208
0
    std::vector<std::string> dev_names;
209
0
    {
210
0
        dev_names.reserve(nd);
211
0
        size_t max_length = 0;
212
0
        for (const auto & dev : devs) {
213
0
            std::string name = ggml_backend_dev_name(dev);
214
0
            name += " (";
215
0
            name += ggml_backend_dev_description(dev);
216
0
            name += ")";
217
0
            dev_names.push_back(name);
218
0
            max_length = std::max(max_length, name.length());
219
0
        }
220
0
        for (std::string & dn : dev_names) {
221
0
            dn.insert(dn.end(), max_length - dn.length(), ' ');
222
0
        }
223
0
    }
224
225
0
    int64_t sum_free            = 0;
226
0
    int64_t sum_projected_free  = 0;
227
0
    int64_t sum_projected_used  = 0;
228
0
    int64_t sum_projected_model = 0;
229
0
    std::vector<int64_t> projected_free_per_device;
230
0
    projected_free_per_device.reserve(nd);
231
232
0
    if (nd == 0) {
233
0
        sum_projected_used = dmds_full.back().mb.total();
234
0
        sum_free           = dmds_full.back().total;
235
0
        sum_projected_free = sum_free - sum_projected_used;
236
0
        LOG_INF("%s: projected to use %" PRId64 " MiB of host memory vs. %" PRId64 " MiB of total host memory\n",
237
0
            __func__, sum_projected_used/MiB, sum_free/MiB);
238
0
        if (sum_projected_free >= margins[0]) {
239
0
            LOG_TRC("%s: will leave %" PRId64 " >= %" PRId64 " MiB of system memory, no changes needed\n",
240
0
                __func__, sum_projected_free/MiB, margins[0]/MiB);
241
0
            return;
242
0
        }
243
0
    } else {
244
0
        if (nd > 1) {
245
0
            LOG_TRC("%s: projected memory use with initial parameters [MiB]:\n", __func__);
246
0
        }
247
0
        for (size_t id = 0; id < nd; id++) {
248
0
            const llama_device_memory_data & dmd = dmds_full[id];
249
250
0
            const int64_t projected_used = dmd.mb.total();
251
0
            const int64_t projected_free = dmd.free - projected_used;
252
0
            projected_free_per_device.push_back(projected_free);
253
254
0
            sum_free            += dmd.free;
255
0
            sum_projected_used  += projected_used;
256
0
            sum_projected_free  += projected_free;
257
0
            sum_projected_model += dmd.mb.model;
258
259
0
            if (nd > 1) {
260
0
                LOG_TRC("%s:   - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
261
0
                    __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
262
0
            }
263
0
        }
264
0
        assert(sum_free >= 0 && sum_projected_used >= 0);
265
0
        LOG_TRC("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
266
0
            __func__, sum_projected_used/MiB, sum_free/MiB);
267
0
        if (nd == 1) {
268
0
            if (projected_free_per_device[0] >= margins[0]) {
269
0
                LOG_TRC("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
270
0
                    __func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
271
0
                return;
272
0
            }
273
0
        } else {
274
0
            bool changes_needed = false;
275
0
            for (size_t id = 0; id < nd; id++) {
276
0
                if (projected_free_per_device[id] < margins[id]) {
277
0
                    changes_needed = true;
278
0
                    break;
279
0
                }
280
0
            }
281
0
            if (!changes_needed) {
282
0
                LOG_TRC("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
283
0
                return;
284
0
            }
285
0
        }
286
0
    }
287
288
    // step 2: try reducing memory use by reducing the context size
289
290
0
    {
291
0
        int64_t global_surplus = sum_projected_free;
292
0
        if (nd == 0) {
293
0
            global_surplus -= margins[0];
294
0
        } else {
295
0
            for (size_t id = 0; id < nd; id++) {
296
0
                global_surplus -= margins[id];
297
0
            }
298
0
        }
299
0
        if (global_surplus < 0) {
300
0
            if (nd <= 1) {
301
0
                LOG_TRC("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n",
302
0
                    __func__, margins[0]/MiB, -global_surplus/MiB);
303
0
            } else {
304
0
                LOG_TRC(
305
0
                    "%s: cannot meet free memory targets on all devices, need to use %" PRId64 " MiB less in total\n",
306
0
                    __func__, -global_surplus/MiB);
307
0
            }
308
0
            if (cparams->n_ctx == 0) {
309
0
                if (hp_nct > n_ctx_min) {
310
0
                    int64_t sum_used_target = sum_free;
311
0
                    if (nd == 0) {
312
0
                        sum_used_target -= margins[0];
313
0
                    } else {
314
0
                        for (size_t id = 0; id < nd; id++) {
315
0
                            sum_used_target -= margins[id];
316
0
                        }
317
0
                    }
318
0
                    if (nd > 1) {
319
                        // for multiple devices we need to be more conservative in terms of how much context we think can fit:
320
                        //   - for dense models only whole layers can be assigned to devices
321
                        //   - for MoE models only whole tensors can be assigned to devices, which we estimate to be <= 1/3 of a layer
322
                        //   - on average we expect a waste of 0.5 layers/tensors per device
323
                        //   - use slightly more than the expected average for nd devices to be safe
324
0
                        const int64_t model_per_layer = sum_projected_model / std::min(uint32_t(mparams->n_gpu_layers), hp_ngl);
325
0
                        sum_used_target -= (nd + 1) * model_per_layer / (hp_nex == 0 ? 2 : 6);
326
0
                    }
327
328
0
                    int64_t sum_projected_used_min_ctx = 0;
329
0
                    cparams->n_ctx = n_ctx_min;
330
0
                    const dmds_t dmds_min_ctx = common_get_device_memory_data_impl(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
331
0
                    if (nd == 0) {
332
0
                        sum_projected_used_min_ctx = dmds_min_ctx.back().mb.total();
333
0
                    } else {
334
0
                        for (size_t id = 0; id < nd; id++) {
335
0
                            sum_projected_used_min_ctx += dmds_min_ctx[id].mb.total();
336
0
                        }
337
0
                    }
338
0
                    if (sum_used_target > sum_projected_used_min_ctx) {
339
                        // linear interpolation between minimum and maximum context size:
340
0
                        cparams->n_ctx += (hp_nct - n_ctx_min) * (sum_used_target - sum_projected_used_min_ctx)
341
0
                            / (sum_projected_used - sum_projected_used_min_ctx);
342
0
                        cparams->n_ctx = std::max(cparams->n_ctx - cparams->n_ctx % 256, n_ctx_min); // round down context for CUDA backend
343
344
0
                        const int64_t bytes_per_ctx = (sum_projected_used - sum_projected_used_min_ctx) / (hp_nct - n_ctx_min);
345
0
                        const int64_t memory_reduction = (hp_nct - cparams->n_ctx) * bytes_per_ctx;
346
0
                        LOG_TRC("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
347
0
                            __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
348
0
                        if (nd <= 1) {
349
0
                            LOG_TRC("%s: entire model can be fit by reducing context\n", __func__);
350
0
                            return;
351
0
                        }
352
0
                        LOG_TRC("%s: entire model should be fit across devices by reducing context\n", __func__);
353
0
                    } else {
354
0
                        const int64_t memory_reduction = sum_projected_used - sum_projected_used_min_ctx;
355
0
                        LOG_TRC("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
356
0
                            __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
357
0
                    }
358
0
                } else {
359
0
                    if (n_ctx_min == UINT32_MAX) {
360
0
                        LOG_TRC("%s: user has requested full context size of %" PRIu32 " -> no change\n", __func__, hp_nct);
361
0
                    } else {
362
0
                        LOG_TRC("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
363
0
                            __func__, hp_nct, n_ctx_min);
364
0
                    }
365
0
                }
366
0
            } else {
367
0
                LOG_TRC("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx);
368
0
            }
369
0
        }
370
0
    }
371
0
    if (nd == 0) {
372
0
        throw common_params_fit_exception("was unable to fit model into system memory by reducing context, abort");
373
0
    }
374
375
0
    if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {
376
0
        throw common_params_fit_exception("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
377
0
    }
378
0
    if (nd > 1) {
379
0
        if (!tensor_split) {
380
0
            throw common_params_fit_exception("did not provide a buffer to write the tensor_split to, abort");
381
0
        }
382
0
        if (mparams->tensor_split) {
383
0
            for (size_t id = 0; id < nd; id++) {
384
0
                if (mparams->tensor_split[id] != 0.0f) {
385
0
                    throw common_params_fit_exception("model_params::tensor_split already set by user, abort");
386
0
                }
387
0
            }
388
0
        }
389
0
        if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) {
390
0
            throw common_params_fit_exception("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
391
0
        }
392
0
    }
393
0
    if (!tensor_buft_overrides) {
394
0
        throw common_params_fit_exception("did not provide buffer to set tensor_buft_overrides, abort");
395
0
    }
396
0
    if (mparams->tensor_buft_overrides && (mparams->tensor_buft_overrides->pattern || mparams->tensor_buft_overrides->buft)) {
397
0
        throw common_params_fit_exception("model_params::tensor_buft_overrides already set by user, abort");
398
0
    }
399
400
    // step 3: iteratively fill the back to front with "dense" layers
401
    //   - for a dense model simply fill full layers, giving each device a contiguous slice of the model
402
    //   - for a MoE model, same as dense model but with all MoE tensors in system memory
403
404
    // utility function that returns a static C string matching the tensors for a specific layer index and layer fraction:
405
0
    auto get_overflow_pattern = [&](const size_t il, const common_layer_fraction_t lf) -> const char * {
406
0
        constexpr size_t n_strings = 1000;
407
0
        if (il >= n_strings) {
408
0
            throw std::runtime_error("at most " + std::to_string(n_strings) + " model layers are supported");
409
0
        }
410
0
        switch (lf) {
411
0
            case LAYER_FRACTION_ATTN: {
412
0
                static std::array<std::string, n_strings> patterns;
413
0
                if (patterns[il].empty()) {
414
0
                    patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(gate|up|gate_up|down).*";
415
0
                }
416
0
                return patterns[il].c_str();
417
0
            }
418
0
            case LAYER_FRACTION_UP: {
419
0
                static std::array<std::string, n_strings> patterns;
420
0
                if (patterns[il].empty()) {
421
0
                    patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(gate|gate_up|down).*";
422
0
                }
423
0
                return patterns[il].c_str();
424
0
            }
425
0
            case LAYER_FRACTION_GATE: {
426
0
                static std::array<std::string, n_strings> patterns;
427
0
                if (patterns[il].empty()) {
428
0
                    patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_down.*";
429
0
                }
430
0
                return patterns[il].c_str();
431
0
            }
432
0
            case LAYER_FRACTION_MOE: {
433
0
                static std::array<std::string, n_strings> patterns;
434
0
                if (patterns[il].empty()) {
435
0
                    patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(up|down|gate_up|gate)_(ch|)exps";
436
0
                }
437
0
                return patterns[il].c_str();
438
0
            }
439
0
            default:
440
0
                GGML_ABORT("fatal error");
441
0
        }
442
0
    };
443
444
0
    struct ngl_t {
445
0
        uint32_t n_layer = 0; // number of total layers
446
0
        uint32_t n_part  = 0; // number of partial layers, <= n_layer
447
448
        // for the first partial layer varying parts can overflow, all further layers use LAYER_FRACTION_MOE:
449
0
        common_layer_fraction_t overflow_type = LAYER_FRACTION_MOE;
450
451
0
        uint32_t n_full() const {
452
0
            assert(n_layer >= n_part);
453
0
            return n_layer - n_part;
454
0
        }
455
0
    };
456
457
0
    const size_t ntbo = llama_max_tensor_buft_overrides();
458
459
    // utility function to set n_gpu_layers and tensor_split
460
0
    auto set_ngl_tensor_split_tbo = [&](
461
0
            const std::vector<ngl_t> & ngl_per_device,
462
0
            const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
463
0
            llama_model_params & mparams) {
464
0
        mparams.n_gpu_layers = 0;
465
0
        for (size_t id = 0; id < nd; id++) {
466
0
            mparams.n_gpu_layers += ngl_per_device[id].n_layer;
467
0
            if (nd > 1) {
468
0
                tensor_split[id] = ngl_per_device[id].n_layer;
469
0
            }
470
0
        }
471
0
        assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl + 1);
472
0
        uint32_t il0 = hp_ngl + 1 - mparams.n_gpu_layers; // start index for tensor buft overrides
473
474
0
        mparams.tensor_split = tensor_split;
475
476
0
        size_t itbo = 0;
477
0
        for (size_t id = 0; id < nd; id++) {
478
0
            il0 += ngl_per_device[id].n_full();
479
0
            for (uint32_t il = il0; il < il0 + ngl_per_device[id].n_part; il++) {
480
0
                if (itbo + 1 >= ntbo) {
481
0
                    tensor_buft_overrides[itbo].pattern = nullptr;
482
0
                    tensor_buft_overrides[itbo].buft    = nullptr;
483
0
                    itbo++;
484
0
                    mparams.tensor_buft_overrides = tensor_buft_overrides;
485
0
                    throw common_params_fit_exception("llama_max_tensor_buft_overrides() == "
486
0
                        + std::to_string(ntbo) + " is insufficient for model");
487
0
                }
488
0
                tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE);
489
0
                tensor_buft_overrides[itbo].buft = il == il0 ? overflow_bufts[id] : ggml_backend_cpu_buffer_type();
490
0
                itbo++;
491
0
            }
492
0
            il0 += ngl_per_device[id].n_part;
493
0
        }
494
0
        tensor_buft_overrides[itbo].pattern = nullptr;
495
0
        tensor_buft_overrides[itbo].buft    = nullptr;
496
0
        itbo++;
497
0
        mparams.tensor_buft_overrides = tensor_buft_overrides;
498
0
    };
499
500
    // utility function that returns the memory use per device for given numbers of layers per device
501
0
    auto get_memory_for_layers = [&](
502
0
            const char * func_name,
503
0
            const std::vector<ngl_t> & ngl_per_device,
504
0
            const std::vector<ggml_backend_buffer_type_t> & overflow_bufts) -> std::vector<int64_t> {
505
0
        llama_model_params mparams_copy = *mparams;
506
0
        set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy);
507
508
0
        const dmds_t dmd_nl = common_get_device_memory_data_impl(
509
0
            path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
510
511
0
        LOG_TRC("%s: memory for test allocation by device:\n", func_name);
512
0
        for (size_t id = 0; id < nd; id++) {
513
0
            const ngl_t & n = ngl_per_device[id];
514
0
            LOG_TRC(
515
0
                "%s: id=%zu, n_layer=%2" PRIu32 ", n_part=%2" PRIu32 ", overflow_type=%d, mem=%6" PRId64 " MiB\n",
516
0
                func_name, id, n.n_layer, n.n_part, int(n.overflow_type), dmd_nl[id].mb.total()/MiB);
517
0
        }
518
519
0
        std::vector<int64_t> ret;
520
0
        ret.reserve(nd);
521
0
        for (size_t id = 0; id < nd; id++) {
522
0
            ret.push_back(dmd_nl[id].mb.total());
523
0
        }
524
0
        return ret;
525
0
    };
526
527
0
    int64_t global_surplus_cpu_moe = 0;
528
0
    if (hp_nex > 0) {
529
0
        const static std::string pattern_moe_all = "blk\\.\\d+\\.ffn_(up|down|gate_up|gate)_(ch|)exps"; // matches all MoE tensors
530
0
        ggml_backend_buffer_type_t cpu_buft = ggml_backend_cpu_buffer_type();
531
0
        tensor_buft_overrides[0] = {pattern_moe_all.c_str(), cpu_buft};
532
0
        tensor_buft_overrides[1] = {nullptr, nullptr};
533
0
        mparams->tensor_buft_overrides = tensor_buft_overrides;
534
535
0
        LOG_TRC("%s: getting device memory data with all MoE tensors moved to system memory:\n", __func__);
536
0
        const dmds_t dmds_cpu_moe = common_get_device_memory_data_impl(
537
0
            path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
538
539
0
        for (size_t id = 0; id < nd; id++) {
540
0
            global_surplus_cpu_moe += dmds_cpu_moe[id].free;
541
0
            global_surplus_cpu_moe -= int64_t(dmds_cpu_moe[id].mb.total()) + margins[id];
542
0
        }
543
544
0
        if (global_surplus_cpu_moe > 0) {
545
0
            LOG_TRC("%s: with only dense weights in device memory there is a total surplus of %" PRId64 " MiB\n",
546
0
                __func__, global_surplus_cpu_moe/MiB);
547
0
        } else {
548
0
            LOG_TRC("%s: with only dense weights in device memory there is still a total deficit of %" PRId64 " MiB\n",
549
0
                __func__, -global_surplus_cpu_moe/MiB);
550
0
        }
551
552
        // reset
553
0
        tensor_buft_overrides[0] = {nullptr, nullptr};
554
0
        mparams->tensor_buft_overrides = tensor_buft_overrides;
555
0
    }
556
557
0
    std::vector<int64_t> targets; // maximum acceptable memory use per device
558
0
    targets.reserve(nd);
559
0
    for (size_t id = 0; id < nd; id++) {
560
0
        targets.push_back(dmds_full[id].free - margins[id]);
561
0
        LOG_TRC("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
562
0
    }
563
564
0
    std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the first partial layer of a device overflows to:
565
0
    overflow_bufts.reserve(nd);
566
0
    for (size_t id = 0; id < nd; id++) {
567
0
        overflow_bufts.push_back(ggml_backend_cpu_buffer_type());
568
0
    }
569
570
0
    std::vector<ngl_t> ngl_per_device(nd);
571
0
    std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts);
572
573
    // optimize the number of layers per device using the method of false position:
574
    //   - ngl_per_device has 0 layers for each device, lower bound
575
    //   - try a "high" configuration where a device is given all unassigned layers
576
    //   - interpolate the memory use / layer between low and high linearly to get a guess where it meets our target
577
    //   - check memory use of our guess, replace either the low or high bound
578
    //   - once we only have a difference of a single layer, stop and return the lower bound that just barely still fits
579
    //   - the last device has the output layer, which cannot be a partial layer
580
0
    if (hp_nex == 0) {
581
0
        LOG_TRC("%s: filling dense layers back-to-front:\n", __func__);
582
0
    } else {
583
0
        LOG_TRC("%s: filling dense-only layers back-to-front:\n", __func__);
584
0
    }
585
0
    for (int id = nd - 1; id >= 0; id--) {
586
0
        uint32_t n_unassigned = hp_ngl + 1;
587
0
        for (size_t jd = id + 1; jd < nd; ++jd) {
588
0
            assert(n_unassigned >= ngl_per_device[jd].n_layer);
589
0
            n_unassigned -= ngl_per_device[jd].n_layer;
590
0
        }
591
592
0
        std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
593
0
        ngl_per_device_high[id].n_layer = n_unassigned;
594
0
        if (hp_nex > 0) {
595
0
            ngl_per_device_high[id].n_part = size_t(id) < nd - 1 ? ngl_per_device_high[id].n_layer : ngl_per_device_high[id].n_layer - 1;
596
0
        }
597
0
        if (ngl_per_device_high[id].n_layer > 0) {
598
0
            std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
599
0
            if (mem_high[id] > targets[id]) {
600
0
                assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer);
601
0
                uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
602
0
                LOG_TRC("%s: start filling device %" PRIu32 ", delta=%" PRIu32 "\n", __func__, id, delta);
603
0
                while (delta > 1) {
604
0
                    uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
605
0
                    step_size = std::max(step_size, uint32_t(1));
606
0
                    step_size = std::min(step_size, delta - 1);
607
608
0
                    std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
609
0
                    ngl_per_device_test[id].n_layer += step_size;
610
0
                    if (hp_nex) {
611
0
                        ngl_per_device_test[id].n_part += size_t(id) == nd - 1 && ngl_per_device_test[id].n_part == 0 ?
612
0
                            step_size - 1 : step_size; // the first layer is the output layer which must always be full
613
0
                    }
614
0
                    const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
615
616
0
                    if (mem_test[id] <= targets[id]) {
617
0
                        ngl_per_device = ngl_per_device_test;
618
0
                        mem            = mem_test;
619
0
                        LOG_TRC("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
620
0
                    } else {
621
0
                        ngl_per_device_high = ngl_per_device_test;
622
0
                        mem_high            = mem_test;
623
0
                        LOG_TRC("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device_high[id].n_layer);
624
0
                    }
625
0
                    delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
626
0
                }
627
0
            } else {
628
0
                assert(ngl_per_device_high[id].n_layer == n_unassigned);
629
0
                ngl_per_device = ngl_per_device_high;
630
0
                mem            = mem_high;
631
0
                LOG_TRC("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
632
0
            }
633
0
        }
634
635
0
        const int64_t projected_margin = dmds_full[id].free - mem[id];
636
0
        LOG_TRC(
637
0
            "%s:   - %s: %2" PRIu32 " layers, %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
638
0
            __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, mem[id]/MiB, projected_margin/MiB);
639
0
    }
640
0
    if (hp_nex == 0 || global_surplus_cpu_moe <= 0) {
641
0
        set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
642
0
        return;
643
0
    }
644
645
    // step 4: for a MoE model where all dense tensors fit,
646
    //     convert the dense-only layers in the back to full layers in the front until all devices are full
647
    // essentially the same procedure as for the dense-only layers except front-to-back
648
    // also, try fitting at least part of one more layer to reduce waste for "small" GPUs with e.g. 24 GiB VRAM
649
650
0
    size_t id_dense_start = nd;
651
0
    for (int id = nd - 1; id >= 0; id--) {
652
0
        if (ngl_per_device[id].n_layer > 0) {
653
0
            id_dense_start = id;
654
0
            continue;
655
0
        }
656
0
        break;
657
0
    }
658
0
    assert(id_dense_start < nd);
659
660
0
    LOG_TRC("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__);
661
0
    for (size_t id = 0; id <= id_dense_start && id_dense_start < nd; id++) {
662
0
        std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
663
0
        for (size_t jd = id_dense_start; jd < nd; jd++) {
664
0
            const uint32_t n_layer_move = jd < nd - 1 ? ngl_per_device_high[jd].n_layer : ngl_per_device_high[jd].n_layer - 1;
665
0
            ngl_per_device_high[id].n_layer += n_layer_move;
666
0
            ngl_per_device_high[jd].n_layer -= n_layer_move;
667
0
            ngl_per_device_high[jd].n_part = 0;
668
0
        }
669
0
        size_t id_dense_start_high = nd - 1;
670
0
        std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
671
672
0
        if (mem_high[id] > targets[id]) {
673
0
            assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
674
0
            uint32_t delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
675
0
            while (delta > 1) {
676
0
                uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
677
0
                step_size = std::max(step_size, uint32_t(1));
678
0
                step_size = std::min(step_size, delta - 1);
679
680
0
                std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
681
0
                size_t id_dense_start_test = id_dense_start;
682
0
                uint32_t n_converted_test = 0;
683
0
                for (;id_dense_start_test < nd; id_dense_start_test++) {
684
0
                    const uint32_t n_convert_jd = std::min(step_size - n_converted_test, ngl_per_device_test[id_dense_start_test].n_part);
685
0
                    ngl_per_device_test[id_dense_start_test].n_layer -= n_convert_jd;
686
0
                    ngl_per_device_test[id_dense_start_test].n_part -= n_convert_jd;
687
0
                    ngl_per_device_test[id].n_layer += n_convert_jd;
688
0
                    n_converted_test += n_convert_jd;
689
690
0
                    if (ngl_per_device_test[id_dense_start_test].n_part > 0) {
691
0
                        break;
692
0
                    }
693
0
                }
694
0
                const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
695
696
0
                if (mem_test[id] <= targets[id]) {
697
0
                    ngl_per_device = ngl_per_device_test;
698
0
                    mem            = mem_test;
699
0
                    id_dense_start = id_dense_start_test;
700
0
                    LOG_TRC("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
701
0
                        __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
702
0
                } else {
703
0
                    ngl_per_device_high = ngl_per_device_test;
704
0
                    mem_high            = mem_test;
705
0
                    id_dense_start_high = id_dense_start_test;
706
0
                    LOG_TRC("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n",
707
0
                        __func__, id, ngl_per_device_high[id].n_layer, ngl_per_device_high[id].n_part, id_dense_start_high);
708
0
                }
709
0
                assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
710
0
                delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
711
0
            }
712
0
        } else {
713
0
            ngl_per_device = ngl_per_device_high;
714
0
            mem            = mem_high;
715
0
            id_dense_start = id_dense_start_high;
716
0
            LOG_TRC("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
717
0
                __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
718
0
        }
719
720
        // try to fit at least part of one more layer
721
0
        if (ngl_per_device[id_dense_start].n_layer > (id < nd - 1 ? 0 : 1)) {
722
0
            std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
723
0
            size_t id_dense_start_test = id_dense_start;
724
0
            ngl_per_device_test[id_dense_start_test].n_layer--;
725
0
            ngl_per_device_test[id_dense_start_test].n_part--;
726
0
            ngl_per_device_test[id].n_layer++;
727
0
            ngl_per_device_test[id].n_part++;
728
0
            if (ngl_per_device_test[id_dense_start_test].n_part == 0) {
729
0
                id_dense_start_test++;
730
0
            }
731
0
            ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP;
732
0
            std::vector<ggml_backend_buffer_type_t> overflow_bufts_test = overflow_bufts;
733
0
            if (id < nd - 1) {
734
0
                overflow_bufts_test[id] = ggml_backend_dev_buffer_type(devs[id + 1]);
735
0
            }
736
0
            LOG_TRC("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
737
0
            std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
738
0
            if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
739
0
                ngl_per_device = ngl_per_device_test;
740
0
                overflow_bufts = overflow_bufts_test;
741
0
                mem            = mem_test;
742
0
                id_dense_start = id_dense_start_test;
743
0
                LOG_TRC("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n",
744
0
                    __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
745
746
0
                ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
747
0
                LOG_TRC("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
748
0
                mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
749
0
                if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
750
0
                    ngl_per_device = ngl_per_device_test;
751
0
                    overflow_bufts = overflow_bufts_test;
752
0
                    mem            = mem_test;
753
0
                    id_dense_start = id_dense_start_test;
754
0
                    LOG_TRC("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n",
755
0
                        __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
756
0
                }
757
0
            } else {
758
0
                ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
759
0
                LOG_TRC("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
760
0
                mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
761
0
                if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
762
0
                    ngl_per_device = ngl_per_device_test;
763
0
                    overflow_bufts = overflow_bufts_test;
764
0
                    mem            = mem_test;
765
0
                    id_dense_start = id_dense_start_test;
766
0
                    LOG_TRC("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n",
767
0
                        __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
768
0
                }
769
0
            }
770
0
        }
771
772
0
        const int64_t projected_margin = dmds_full[id].free - mem[id];
773
0
        LOG_TRC(
774
0
            "%s:   - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
775
0
            __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
776
0
    }
777
778
    // print info for devices that were not changed during the conversion from dense only to full layers:
779
0
    for (size_t id = id_dense_start + 1; id < nd; id++) {
780
0
        const int64_t projected_margin = dmds_full[id].free - mem[id];
781
0
        LOG_TRC(
782
0
            "%s:   - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
783
0
            __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
784
0
    }
785
786
0
    set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
787
0
}
788
789
enum common_params_fit_status common_fit_params(
790
        const char * path_model,
791
        llama_model_params * mparams,
792
        llama_context_params * cparams,
793
        float * tensor_split,
794
        llama_model_tensor_buft_override * tensor_buft_overrides,
795
        size_t * margins,
796
        uint32_t n_ctx_min,
797
0
        ggml_log_level log_level) {
798
0
    const int64_t t0_us = llama_time_us();
799
0
    common_params_fit_status status = COMMON_PARAMS_FIT_STATUS_SUCCESS;
800
0
    try {
801
0
        common_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margins, n_ctx_min, log_level);
802
0
        LOG_TRC("%s: successfully fit params to free device memory\n", __func__);
803
0
    } catch (const common_params_fit_exception & e) {
804
0
        LOG_WRN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
805
0
        status = COMMON_PARAMS_FIT_STATUS_FAILURE;
806
0
    } catch (const std::runtime_error & e) {
807
0
        LOG_ERR("%s: encountered an error while trying to fit params to free device memory: %s\n", __func__, e.what());
808
0
        status = COMMON_PARAMS_FIT_STATUS_ERROR;
809
0
    }
810
0
    const int64_t t1_us = llama_time_us();
811
0
    LOG_TRC("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6);
812
0
    return status;
813
0
}
814
815
0
void common_memory_breakdown_print(const struct llama_context * ctx) {
816
    //const auto & devices = ctx->get_model().devices;
817
0
    const auto * model = llama_get_model(ctx);
818
819
0
    std::vector<ggml_backend_dev_t> devices;
820
0
    for (int i = 0; i < llama_model_n_devices(model); i++) {
821
0
        devices.push_back(llama_model_get_device(model, i));
822
0
    }
823
824
0
    llama_memory_breakdown memory_breakdown = llama_get_memory_breakdown(ctx);
825
826
0
    std::vector<std::array<std::string, 9>> table_data;
827
0
    table_data.reserve(devices.size());
828
0
    const std::string template_header = "%s: | %s | %s   %s    %s   %s   %s   %s    %s |\n";
829
0
    const std::string template_gpu    = "%s: | %s | %s = %s + (%s = %s + %s + %s) + %s |\n";
830
0
    const std::string template_other  = "%s: | %s | %s   %s    %s = %s + %s + %s    %s |\n";
831
832
0
    table_data.push_back({template_header, "memory breakdown [MiB]", "total", "free", "self", "model", "context", "compute", "unaccounted"});
833
834
0
    constexpr size_t MiB = 1024 * 1024;
835
0
    const std::vector<std::string> desc_prefixes_strip = {"NVIDIA ", "GeForce ", "Tesla ", "AMD ", "Radeon ", "Instinct "};
836
837
    // track seen buffer types to avoid double counting:
838
0
    std::set<ggml_backend_buffer_type_t> seen_buffer_types;
839
840
    // accumulative memory breakdown for each device and for host:
841
0
    std::vector<llama_memory_breakdown_data> mb_dev(devices.size());
842
0
    llama_memory_breakdown_data              mb_host;
843
844
0
    for (const auto & buft_mb : memory_breakdown) {
845
0
        ggml_backend_buffer_type_t          buft = buft_mb.first;
846
0
        const llama_memory_breakdown_data & mb   = buft_mb.second;
847
0
        if (ggml_backend_buft_is_host(buft)) {
848
0
            mb_host.model   += mb.model;
849
0
            mb_host.context += mb.context;
850
0
            mb_host.compute += mb.compute;
851
0
            seen_buffer_types.insert(buft);
852
0
            continue;
853
0
        }
854
0
        ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
855
0
        if (dev) {
856
0
            int i_dev = -1;
857
0
            for (size_t i = 0; i < devices.size(); i++) {
858
0
                if (devices[i] == dev) {
859
0
                    i_dev = i;
860
0
                    break;
861
0
                }
862
0
            }
863
0
            if (i_dev != -1) {
864
0
                mb_dev[i_dev].model   += mb.model;
865
0
                mb_dev[i_dev].context += mb.context;
866
0
                mb_dev[i_dev].compute += mb.compute;
867
0
                seen_buffer_types.insert(buft);
868
0
                continue;
869
0
            }
870
0
        }
871
0
    }
872
873
    // print memory breakdown for each device:
874
0
    for (size_t i = 0; i < devices.size(); i++) {
875
0
        ggml_backend_dev_t dev = devices[i];
876
0
        llama_memory_breakdown_data mb = mb_dev[i];
877
878
0
        const std::string name = ggml_backend_dev_name(dev);
879
0
        std::string desc = ggml_backend_dev_description(dev);
880
0
        for (const std::string & prefix : desc_prefixes_strip) {
881
0
            if (desc.length() >= prefix.length() && desc.substr(0, prefix.length()) == prefix) {
882
0
                desc = desc.substr(prefix.length());
883
0
            }
884
0
        }
885
886
0
        size_t free, total;
887
0
        ggml_backend_dev_memory(dev, &free, &total);
888
889
0
        const size_t self = mb.model + mb.context + mb.compute;
890
0
        const int64_t unaccounted = static_cast<int64_t>(total) - static_cast<int64_t>(free) - static_cast<int64_t>(self);
891
892
0
        table_data.push_back({
893
0
            template_gpu,
894
0
            "  - " + name + " (" + desc + ")",
895
0
            std::to_string(total / MiB),
896
0
            std::to_string(free / MiB),
897
0
            std::to_string(self / MiB),
898
0
            std::to_string(mb.model / MiB),
899
0
            std::to_string(mb.context / MiB),
900
0
            std::to_string(mb.compute / MiB),
901
0
            std::to_string(unaccounted / static_cast<int64_t>(MiB))});
902
0
    }
903
904
    // print memory breakdown for host:
905
0
    {
906
0
        const size_t self = mb_host.model + mb_host.context + mb_host.compute;
907
0
        table_data.push_back({
908
0
            template_other,
909
0
            "  - Host",
910
0
            "", // total
911
0
            "", // free
912
0
            std::to_string(self / MiB),
913
0
            std::to_string(mb_host.model / MiB),
914
0
            std::to_string(mb_host.context / MiB),
915
0
            std::to_string(mb_host.compute / MiB),
916
0
            ""}); // unaccounted
917
0
    }
918
919
    // print memory breakdown for all remaining buffer types:
920
0
    for (const auto & buft_mb : memory_breakdown) {
921
0
        ggml_backend_buffer_type_t          buft = buft_mb.first;
922
0
        const llama_memory_breakdown_data & mb   = buft_mb.second;
923
0
        if (seen_buffer_types.count(buft) == 1) {
924
0
            continue;
925
0
        }
926
0
        const std::string name = ggml_backend_buft_name(buft);
927
0
        const size_t self = mb.model + mb.context + mb.compute;
928
0
        table_data.push_back({
929
0
            template_other,
930
0
            "  - " + name,
931
0
            "", // total
932
0
            "", // free
933
0
            std::to_string(self / MiB),
934
0
            std::to_string(mb.model / MiB),
935
0
            std::to_string(mb.context / MiB),
936
0
            std::to_string(mb.compute / MiB),
937
0
            ""}); // unaccounted
938
0
        seen_buffer_types.insert(buft);
939
0
    }
940
941
0
    for (size_t j = 1; j < table_data[0].size(); j++) {
942
0
        size_t max_len = 0;
943
0
        for (const auto & td : table_data) {
944
0
            max_len = std::max(max_len, td[j].length());
945
0
        }
946
0
        for (auto & td : table_data) {
947
0
            td[j].insert(j == 1 ? td[j].length() : 0, max_len - td[j].length(), ' ');
948
0
        }
949
0
    }
950
0
    for (const auto & td : table_data) {
951
0
        LOG_TRC(td[0].c_str(),
952
0
            __func__, td[1].c_str(), td[2].c_str(), td[3].c_str(), td[4].c_str(), td[5].c_str(),
953
0
            td[6].c_str(), td[7].c_str(), td[8].c_str());
954
0
    }
955
0
}
956
957
void common_fit_print(
958
        const char * path_model,
959
        llama_model_params * mparams,
960
0
        llama_context_params * cparams) {
961
0
    std::vector<ggml_backend_dev_t> devs;
962
0
    uint32_t hp_ngl = 0; // hparams.n_gpu_layers
963
0
    uint32_t hp_nct = 0; // hparams.n_ctx_train
964
0
    uint32_t hp_nex = 0; // hparams.n_expert
965
966
0
    auto dmd = common_get_device_memory_data_impl(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, GGML_LOG_LEVEL_ERROR);
967
0
    GGML_ASSERT(dmd.size() == devs.size() + 1);
968
969
0
    for (size_t id = 0; id < devs.size(); id++) {
970
0
        printf("%s ",  ggml_backend_dev_name(devs[id]));
971
0
        printf("%zu ", dmd[id].mb.model/1024/1024);
972
0
        printf("%zu ", dmd[id].mb.context/1024/1024);
973
0
        printf("%zu ", dmd[id].mb.compute/1024/1024);
974
0
        printf("\n");
975
0
    }
976
977
0
    printf("Host ");
978
0
    printf("%zu ", dmd.back().mb.model/1024/1024);
979
0
    printf("%zu ", dmd.back().mb.context/1024/1024);
980
0
    printf("%zu ", dmd.back().mb.compute/1024/1024);
981
0
    printf("\n");
982
0
}