Coverage Report

Created: 2025-11-28 06:56

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/llama.cpp/src/llama.cpp
Line
Count
Source
1
#include "llama-impl.h"
2
3
#include "llama-chat.h"
4
#include "llama-mmap.h"
5
#include "llama-vocab.h"
6
#include "llama-model-loader.h"
7
#include "llama-model-saver.h"
8
#include "llama-model.h"
9
10
#include "ggml.h"
11
#include "ggml-backend.h"
12
13
#include <algorithm>
14
#include <cstddef>
15
#include <cstdint>
16
#include <cstdio>
17
#include <cstring>
18
#include <ctime>
19
20
#if defined(_MSC_VER)
21
#pragma warning(disable: 4244 4267) // possible loss of data
22
#endif
23
24
//
25
// interface implementation
26
//
27
28
0
const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type) {
29
0
    switch (flash_attn_type) {
30
0
        case LLAMA_FLASH_ATTN_TYPE_AUTO:
31
0
            return "auto";
32
0
        case LLAMA_FLASH_ATTN_TYPE_DISABLED:
33
0
            return "disabled";
34
0
        case LLAMA_FLASH_ATTN_TYPE_ENABLED:
35
0
            return "enabled";
36
0
    }
37
0
    GGML_ABORT("fatal error");
38
0
}
39
40
0
struct llama_sampler_chain_params llama_sampler_chain_default_params() {
41
0
    struct llama_sampler_chain_params result = {
42
0
        /*.no_perf                     =*/ true,
43
0
    };
44
45
0
    return result;
46
0
}
47
48
0
// Returns the fixed device limit reported by this API (16).
size_t llama_max_devices(void) {
    const size_t max_devices = 16;
    return max_devices;
}
51
52
0
bool llama_supports_mmap(void) {
53
0
    return llama_mmap::SUPPORTED;
54
0
}
55
56
0
bool llama_supports_mlock(void) {
57
0
    return llama_mlock::SUPPORTED;
58
0
}
59
60
0
bool llama_supports_gpu_offload(void) {
61
0
    return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr ||
62
0
           ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU) != nullptr ||
63
0
           llama_supports_rpc();
64
0
}
65
66
0
bool llama_supports_rpc(void) {
67
0
    return ggml_backend_reg_by_name("RPC") != nullptr;
68
0
}
69
70
0
void llama_backend_init(void) {
71
0
    ggml_time_init();
72
73
    // needed to initialize f16 tables
74
0
    {
75
0
        struct ggml_init_params params = { 0, NULL, false };
76
0
        struct ggml_context * ctx = ggml_init(params);
77
0
        ggml_free(ctx);
78
0
    }
79
0
}
80
81
0
void llama_numa_init(enum ggml_numa_strategy numa) {
82
0
    if (numa != GGML_NUMA_STRATEGY_DISABLED) {
83
0
        auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
84
0
        GGML_ASSERT(dev && "CPU backend is not loaded");
85
0
        auto * reg = ggml_backend_dev_backend_reg(dev);
86
0
        auto * numa_init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_numa_init");
87
0
        if (numa_init_fn) {
88
0
            numa_init_fn(numa);
89
0
        }
90
0
    }
91
0
}
92
93
0
void llama_backend_free(void) {
94
0
    ggml_quantize_free();
95
0
}
96
97
0
int64_t llama_time_us(void) {
98
0
    return ggml_time_us();
99
0
}
100
101
// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
102
0
static int llama_model_load(const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) {
103
    // loading time will be recalculated after the first eval, so
104
    // we take page faults deferred by mmap() into consideration
105
0
    model.t_load_us = 0;
106
0
    time_meas tm(model.t_load_us);
107
108
0
    model.t_start_us = tm.t_start_us;
109
110
0
    try {
111
0
        llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides, params.tensor_buft_overrides);
112
113
0
        ml.print_info();
114
115
0
        model.hparams.vocab_only = params.vocab_only;
116
117
0
        try {
118
0
            model.load_arch(ml);
119
0
        } catch(const std::exception & e) {
120
0
            throw std::runtime_error("error loading model architecture: " + std::string(e.what()));
121
0
        }
122
0
        try {
123
0
            model.load_hparams(ml);
124
0
        } catch(const std::exception & e) {
125
0
            throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
126
0
        }
127
0
        if (model.arch == LLM_ARCH_CLIP) {
128
0
            throw std::runtime_error("CLIP cannot be used as main model, use it with --mmproj instead");
129
0
        }
130
0
        try {
131
0
            model.load_vocab(ml);
132
0
        } catch(const std::exception & e) {
133
0
            throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
134
0
        }
135
136
0
        model.load_stats(ml);
137
0
        model.print_info();
138
139
0
        if (params.vocab_only) {
140
0
            LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
141
0
            return 0;
142
0
        }
143
144
0
        if (!model.load_tensors(ml)) {
145
0
            return -2;
146
0
        }
147
0
    } catch (const std::exception & err) {
148
0
        LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
149
0
        return -1;
150
0
    }
151
152
0
    return 0;
153
0
}
154
155
static struct llama_model * llama_model_load_from_file_impl(
156
        const std::string & path_model,
157
        std::vector<std::string> & splits,
158
0
        struct llama_model_params params) {
159
0
    ggml_time_init();
160
161
0
    if (!params.vocab_only && ggml_backend_reg_count() == 0) {
162
0
        LLAMA_LOG_ERROR("%s: no backends are loaded. hint: use ggml_backend_load() or ggml_backend_load_all() to load a backend before calling this function\n", __func__);
163
0
        return nullptr;
164
0
    }
165
166
0
    unsigned cur_percentage = 0;
167
0
    if (params.progress_callback == NULL) {
168
0
        params.progress_callback_user_data = &cur_percentage;
169
0
        params.progress_callback = [](float progress, void * ctx) {
170
0
            unsigned * cur_percentage_p = (unsigned *) ctx;
171
0
            unsigned percentage = (unsigned) (100 * progress);
172
0
            while (percentage > *cur_percentage_p) {
173
0
                *cur_percentage_p = percentage;
174
0
                LLAMA_LOG_CONT(".");
175
0
                if (percentage >= 100) {
176
0
                    LLAMA_LOG_CONT("\n");
177
0
                }
178
0
            }
179
0
            return true;
180
0
        };
181
0
    }
182
183
0
    llama_model * model = new llama_model(params);
184
185
    // create list of devices to use with this model
186
0
    if (params.devices) {
187
0
        for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {
188
0
            model->devices.push_back(*dev);
189
0
        }
190
0
    } else {
191
        // default device selection
192
193
        // build list of available devices
194
0
        std::vector<ggml_backend_dev_t> gpus;
195
0
        std::vector<ggml_backend_dev_t> igpus;
196
0
        std::vector<ggml_backend_dev_t> rpc_servers;
197
198
0
        for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
199
0
            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
200
0
            switch (ggml_backend_dev_type(dev)) {
201
0
                case GGML_BACKEND_DEVICE_TYPE_CPU:
202
0
                case GGML_BACKEND_DEVICE_TYPE_ACCEL:
203
                    // skip CPU backends since they are handled separately
204
0
                    break;
205
206
0
                case GGML_BACKEND_DEVICE_TYPE_GPU: {
207
0
                    ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
208
0
                    if (ggml_backend_reg_name(reg) == std::string("RPC")) {
209
0
                        rpc_servers.push_back(dev);
210
0
                    } else {
211
                        // check if there is already a GPU with the same device id
212
0
                        ggml_backend_dev_props props;
213
0
                        ggml_backend_dev_get_props(dev, &props);
214
0
                        auto it = std::find_if(gpus.begin(), gpus.end(), [&props](ggml_backend_dev_t d) {
215
0
                            ggml_backend_dev_props d_props;
216
0
                            ggml_backend_dev_get_props(d, &d_props);
217
0
                            if (props.device_id && d_props.device_id) {
218
0
                                return strcmp(props.device_id, d_props.device_id) == 0;
219
0
                            }
220
0
                            return false;
221
0
                        });
222
223
0
                        if (it != gpus.end()) {
224
0
                            LLAMA_LOG_INFO("%s: skipping device %s (%s) with id %s - already using device %s (%s) with the same id\n",
225
0
                                    __func__,
226
0
                                    ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
227
0
                                    props.device_id ? props.device_id : "unknown id",
228
0
                                    ggml_backend_dev_name(*it), ggml_backend_dev_description(*it));
229
0
                        } else {
230
0
                            gpus.push_back(dev);
231
0
                        }
232
0
                    }
233
0
                    break;
234
0
                }
235
236
0
                case GGML_BACKEND_DEVICE_TYPE_IGPU:
237
0
                    igpus.push_back(dev);
238
0
                    break;
239
0
            }
240
0
        }
241
242
        // add RPC servers at the front of the list to minimize network transfers
243
0
        model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());
244
245
        // add GPUs
246
0
        model->devices.insert(model->devices.end(), gpus.begin(), gpus.end());
247
248
        // add integrated GPUs only if no other devices were found
249
0
        if (model->devices.empty()) {
250
0
            model->devices.insert(model->devices.end(), igpus.begin(), igpus.end());
251
0
        }
252
0
    }
253
254
    // if using single GPU mode, remove all except the main GPU
255
0
    if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
256
0
        if (params.main_gpu < 0) {
257
0
            model->devices.clear();
258
0
        } else {
259
0
            if (params.main_gpu >= (int)model->devices.size()) {
260
0
                LLAMA_LOG_ERROR("%s: invalid value for main_gpu: %d (available devices: %zu)\n", __func__, params.main_gpu, model->devices.size());
261
0
                llama_model_free(model);
262
0
                return nullptr;
263
0
            }
264
0
            ggml_backend_dev_t main_gpu = model->devices[params.main_gpu];
265
0
            model->devices.clear();
266
0
            model->devices.push_back(main_gpu);
267
0
        }
268
0
    }
269
270
0
    for (auto * dev : model->devices) {
271
0
        ggml_backend_dev_props props;
272
0
        ggml_backend_dev_get_props(dev, &props);
273
0
        LLAMA_LOG_INFO("%s: using device %s (%s) (%s) - %zu MiB free\n", __func__,
274
0
                ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
275
0
                props.device_id ? props.device_id : "unknown id",
276
0
                props.memory_free/1024/1024);
277
0
    }
278
279
0
    const int status = llama_model_load(path_model, splits, *model, params);
280
0
    GGML_ASSERT(status <= 0);
281
0
    if (status < 0) {
282
0
        if (status == -1) {
283
0
            LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
284
0
        } else if (status == -2) {
285
0
            LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
286
0
        }
287
288
0
        llama_model_free(model);
289
0
        return nullptr;
290
0
    }
291
292
0
    return model;
293
0
}
294
295
// deprecated
296
struct llama_model * llama_load_model_from_file(
297
        const char * path_model,
298
0
        struct llama_model_params params) {
299
0
    return llama_model_load_from_file(path_model, params);
300
0
}
301
302
struct llama_model * llama_model_load_from_file(
303
        const char * path_model,
304
0
        struct llama_model_params params) {
305
0
    std::vector<std::string> splits = {};
306
0
    return llama_model_load_from_file_impl(path_model, splits, params);
307
0
}
308
309
struct llama_model * llama_model_load_from_splits(
310
        const char ** paths,
311
        size_t n_paths,
312
0
        struct llama_model_params params) {
313
0
    std::vector<std::string> splits;
314
0
    if (n_paths == 0) {
315
0
        LLAMA_LOG_ERROR("%s: list of splits is empty\n", __func__);
316
0
        return nullptr;
317
0
    }
318
0
    splits.reserve(n_paths);
319
0
    for (size_t i = 0; i < n_paths; ++i) {
320
0
        splits.push_back(paths[i]);
321
0
    }
322
0
    return llama_model_load_from_file_impl(splits.front(), splits, params);
323
0
}
324
325
0
void llama_model_save_to_file(const struct llama_model * model, const char * path_model) {
326
0
    llama_model_saver ms(*model);
327
0
    ms.add_kv_from_model();
328
0
    ms.add_tensors_from_model();
329
0
    ms.save(path_model);
330
0
}
331
332
//
333
// chat templates
334
//
335
336
int32_t llama_chat_apply_template(
337
                              const char * tmpl,
338
         const struct llama_chat_message * chat,
339
                                  size_t   n_msg,
340
                                    bool   add_ass,
341
                                    char * buf,
342
1.51k
                                 int32_t   length) {
343
1.51k
    const std::string curr_tmpl(tmpl == nullptr ? "chatml" : tmpl);
344
345
    // format the chat to string
346
1.51k
    std::vector<const llama_chat_message *> chat_vec;
347
1.51k
    chat_vec.resize(n_msg);
348
10.5k
    for (size_t i = 0; i < n_msg; i++) {
349
9.07k
        chat_vec[i] = &chat[i];
350
9.07k
    }
351
352
1.51k
    std::string formatted_chat;
353
1.51k
    llm_chat_template detected_tmpl = llm_chat_detect_template(curr_tmpl);
354
1.51k
    if (detected_tmpl == LLM_CHAT_TEMPLATE_UNKNOWN) {
355
595
        return -1;
356
595
    }
357
917
    int32_t res = llm_chat_apply_template(detected_tmpl, chat_vec, formatted_chat, add_ass);
358
917
    if (res < 0) {
359
0
        return res;
360
0
    }
361
917
    if (buf && length > 0) {
362
917
        strncpy(buf, formatted_chat.c_str(), length);
363
917
    }
364
917
    return res;
365
917
}
366
367
//
368
// model split
369
//
370
371
0
// Builds the file name of one split of a multi-part model:
//   "<path_prefix>-<split_no + 1>-of-<split_count>.gguf"
// with both numbers zero-padded to 5 digits.
//   split_path:  output buffer of maxlen bytes
//   path_prefix: common prefix of all split files
//   split_no:    zero-based index of the split
//   split_count: total number of splits
// Returns the number of characters stored in split_path (excluding the
// terminating null byte; the text is truncated if it does not fit), or 0 if
// the buffer or prefix is null, maxlen is 0, or formatting fails.
int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) {
    static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf";

    // nothing can be written, and strlen() on such a buffer would be invalid
    if (split_path == nullptr || maxlen == 0 || path_prefix == nullptr) {
        return 0;
    }

    const int n_needed = snprintf(split_path, maxlen, SPLIT_PATH_FORMAT, path_prefix, split_no + 1, split_count);
    if (n_needed < 0) {
        // encoding error: the buffer contents are unspecified, leave an empty string
        split_path[0] = '\0';
        return 0;
    }

    // snprintf() reports the untruncated length; at most maxlen - 1 characters were stored
    if ((size_t) n_needed >= maxlen) {
        return (int) (maxlen - 1);
    }
    return n_needed;
}
378
379
0
// Extracts the prefix of a split file name, i.e. the part of split_path that
// precedes "-<split_no + 1>-of-<split_count>.gguf" (numbers zero-padded to
// 5 digits).
// When split_path ends with that suffix and the prefix is non-empty, the
// prefix is copied to split_prefix (truncated to fit maxlen bytes including
// the terminator) and its full length is returned. Otherwise 0 is returned
// and split_prefix is left untouched.
int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count) {
    const std::string full_path(split_path);

    char suffix_buf[32];
    snprintf(suffix_buf, sizeof(suffix_buf), "-%05d-of-%05d.gguf", split_no + 1, split_count);
    const std::string suffix(suffix_buf);

    // length of whatever precedes the suffix; not positive when the path is too short
    const int prefix_len = full_path.size() - suffix.size();
    if (prefix_len <= 0) {
        return 0;
    }

    // the path must end with exactly the expected suffix
    if (full_path.compare(prefix_len, std::string::npos, suffix) != 0) {
        return 0;
    }

    // limiting the size to prefix_len + 1 cuts the copy off right before the suffix
    const size_t copy_size = std::min((size_t) prefix_len + 1, maxlen);
    snprintf(split_prefix, copy_size, "%s", split_path);
    return prefix_len;
}
394
395
0
const char * llama_print_system_info(void) {
396
0
    static std::string s;
397
0
    s.clear(); // Clear the string, since it's static, otherwise it will accumulate data from previous calls.
398
399
0
    for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
400
0
        auto * reg = ggml_backend_reg_get(i);
401
0
        auto * get_features_fn = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
402
0
        if (get_features_fn) {
403
0
            ggml_backend_feature * features = get_features_fn(reg);
404
0
            s += ggml_backend_reg_name(reg);
405
0
            s += " : ";
406
0
            for (; features->name; features++) {
407
0
                s += features->name;
408
0
                s += " = ";
409
0
                s += features->value;
410
0
                s += " | ";
411
0
            }
412
0
        }
413
0
    }
414
415
0
    return s.c_str();
416
0
}
417