/src/llama.cpp/src/llama.cpp
Line | Count | Source |
1 | | #include "llama-impl.h" |
2 | | |
3 | | #include "llama-chat.h" |
4 | | #include "llama-mmap.h" |
5 | | #include "llama-vocab.h" |
6 | | #include "llama-model-loader.h" |
7 | | #include "llama-model-saver.h" |
8 | | #include "llama-model.h" |
9 | | |
10 | | #include "ggml.h" |
11 | | #include "ggml-backend.h" |
12 | | |
13 | | #include <algorithm> |
14 | | #include <cstddef> |
15 | | #include <cstdint> |
16 | | #include <cstdio> |
17 | | #include <cstring> |
18 | | #include <ctime> |
19 | | |
20 | | #if defined(_MSC_VER) |
21 | | #pragma warning(disable: 4244 4267) // possible loss of data |
22 | | #endif |
23 | | |
24 | | // |
25 | | // interface implementation |
26 | | // |
27 | | |
28 | 0 | const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type) { |
29 | 0 | switch (flash_attn_type) { |
30 | 0 | case LLAMA_FLASH_ATTN_TYPE_AUTO: |
31 | 0 | return "auto"; |
32 | 0 | case LLAMA_FLASH_ATTN_TYPE_DISABLED: |
33 | 0 | return "disabled"; |
34 | 0 | case LLAMA_FLASH_ATTN_TYPE_ENABLED: |
35 | 0 | return "enabled"; |
36 | 0 | } |
37 | 0 | GGML_ABORT("fatal error"); |
38 | 0 | } |
39 | | |
40 | 0 | struct llama_sampler_chain_params llama_sampler_chain_default_params() { |
41 | 0 | struct llama_sampler_chain_params result = { |
42 | 0 | /*.no_perf =*/ true, |
43 | 0 | }; |
44 | | 
45 | 0 | return result; |
46 | 0 | } |
47 | | |
48 | 0 | size_t llama_max_devices(void) { |
49 | 0 | return 16; |
50 | 0 | } |
51 | | |
52 | 0 | bool llama_supports_mmap(void) { |
53 | 0 | return llama_mmap::SUPPORTED; |
54 | 0 | } |
55 | | |
56 | 0 | bool llama_supports_mlock(void) { |
57 | 0 | return llama_mlock::SUPPORTED; |
58 | 0 | } |
59 | | |
60 | 0 | bool llama_supports_gpu_offload(void) { |
61 | 0 | return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr || |
62 | 0 | ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU) != nullptr || |
63 | 0 | llama_supports_rpc(); |
64 | 0 | } |
65 | | |
66 | 0 | bool llama_supports_rpc(void) { |
67 | 0 | return ggml_backend_reg_by_name("RPC") != nullptr; |
68 | 0 | } |
69 | | |
70 | 0 | void llama_backend_init(void) { |
71 | 0 | ggml_time_init(); |
72 | | |
73 | | // needed to initialize f16 tables |
74 | 0 | { |
75 | 0 | struct ggml_init_params params = { 0, NULL, false }; |
76 | 0 | struct ggml_context * ctx = ggml_init(params); |
77 | 0 | ggml_free(ctx); |
78 | 0 | } |
79 | 0 | } |
80 | | |
81 | 0 | void llama_numa_init(enum ggml_numa_strategy numa) { |
82 | 0 | if (numa != GGML_NUMA_STRATEGY_DISABLED) { |
83 | 0 | auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); |
84 | 0 | GGML_ASSERT(dev && "CPU backend is not loaded"); |
85 | 0 | auto * reg = ggml_backend_dev_backend_reg(dev); |
86 | 0 | auto * numa_init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_numa_init"); |
87 | 0 | if (numa_init_fn) { |
88 | 0 | numa_init_fn(numa); |
89 | 0 | } |
90 | 0 | } |
91 | 0 | } |
92 | | |
93 | 0 | void llama_backend_free(void) { |
94 | 0 | ggml_quantize_free(); |
95 | 0 | } |
96 | | |
97 | 0 | int64_t llama_time_us(void) { |
98 | 0 | return ggml_time_us(); |
99 | 0 | } |
100 | | |
101 | | // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback |
102 | 0 | static int llama_model_load(const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) { |
103 | | // loading time will be recalculated after the first eval, so |
104 | | // we take page faults deferred by mmap() into consideration |
105 | 0 | model.t_load_us = 0; |
106 | 0 | time_meas tm(model.t_load_us); |
107 | | 
108 | 0 | model.t_start_us = tm.t_start_us; |
109 | | 
110 | 0 | try { |
111 | 0 | llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides, params.tensor_buft_overrides); |
112 | | 
113 | 0 | ml.print_info(); |
114 | | 
115 | 0 | model.hparams.vocab_only = params.vocab_only; |
116 | | 
117 | 0 | try { |
118 | 0 | model.load_arch(ml); |
119 | 0 | } catch(const std::exception & e) { |
120 | 0 | throw std::runtime_error("error loading model architecture: " + std::string(e.what())); |
121 | 0 | } |
122 | 0 | try { |
123 | 0 | model.load_hparams(ml); |
124 | 0 | } catch(const std::exception & e) { |
125 | 0 | throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what())); |
126 | 0 | } |
127 | 0 | if (model.arch == LLM_ARCH_CLIP) { |
128 | 0 | throw std::runtime_error("CLIP cannot be used as main model, use it with --mmproj instead"); |
129 | 0 | } |
130 | 0 | try { |
131 | 0 | model.load_vocab(ml); |
132 | 0 | } catch(const std::exception & e) { |
133 | 0 | throw std::runtime_error("error loading model vocabulary: " + std::string(e.what())); |
134 | 0 | } |
135 | | |
136 | 0 | model.load_stats(ml); |
137 | 0 | model.print_info(); |
138 | | 
139 | 0 | if (params.vocab_only) { |
140 | 0 | LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__); |
141 | 0 | return 0; |
142 | 0 | } |
143 | | |
144 | 0 | if (!model.load_tensors(ml)) { |
145 | 0 | return -2; |
146 | 0 | } |
147 | 0 | } catch (const std::exception & err) { |
148 | 0 | LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what()); |
149 | 0 | return -1; |
150 | 0 | } |
151 | | |
152 | 0 | return 0; |
153 | 0 | } |
154 | | |
155 | | static struct llama_model * llama_model_load_from_file_impl( |
156 | | const std::string & path_model, |
157 | | std::vector<std::string> & splits, |
158 | 0 | struct llama_model_params params) { |
159 | 0 | ggml_time_init(); |
160 | | 
161 | 0 | if (!params.vocab_only && ggml_backend_reg_count() == 0) { |
162 | 0 | LLAMA_LOG_ERROR("%s: no backends are loaded. hint: use ggml_backend_load() or ggml_backend_load_all() to load a backend before calling this function\n", __func__); |
163 | 0 | return nullptr; |
164 | 0 | } |
165 | | |
166 | 0 | unsigned cur_percentage = 0; |
167 | 0 | if (params.progress_callback == NULL) { |
168 | 0 | params.progress_callback_user_data = &cur_percentage; |
169 | 0 | params.progress_callback = [](float progress, void * ctx) { |
170 | 0 | unsigned * cur_percentage_p = (unsigned *) ctx; |
171 | 0 | unsigned percentage = (unsigned) (100 * progress); |
172 | 0 | while (percentage > *cur_percentage_p) { |
173 | 0 | *cur_percentage_p = percentage; |
174 | 0 | LLAMA_LOG_CONT("."); |
175 | 0 | if (percentage >= 100) { |
176 | 0 | LLAMA_LOG_CONT("\n"); |
177 | 0 | } |
178 | 0 | } |
179 | 0 | return true; |
180 | 0 | }; |
181 | 0 | } |
182 | | 
183 | 0 | llama_model * model = new llama_model(params); |
184 | | |
185 | | // create list of devices to use with this model |
186 | 0 | if (params.devices) { |
187 | 0 | for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) { |
188 | 0 | model->devices.push_back(*dev); |
189 | 0 | } |
190 | 0 | } else { |
191 | | // default device selection |
192 | | |
193 | | // build list of available devices |
194 | 0 | std::vector<ggml_backend_dev_t> gpus; |
195 | 0 | std::vector<ggml_backend_dev_t> igpus; |
196 | 0 | std::vector<ggml_backend_dev_t> rpc_servers; |
197 | | 
198 | 0 | for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { |
199 | 0 | ggml_backend_dev_t dev = ggml_backend_dev_get(i); |
200 | 0 | switch (ggml_backend_dev_type(dev)) { |
201 | 0 | case GGML_BACKEND_DEVICE_TYPE_CPU: |
202 | 0 | case GGML_BACKEND_DEVICE_TYPE_ACCEL: |
203 | | // skip CPU backends since they are handled separately |
204 | 0 | break; |
205 | | |
206 | 0 | case GGML_BACKEND_DEVICE_TYPE_GPU: { |
207 | 0 | ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev); |
208 | 0 | if (ggml_backend_reg_name(reg) == std::string("RPC")) { |
209 | 0 | rpc_servers.push_back(dev); |
210 | 0 | } else { |
211 | | // check if there is already a GPU with the same device id |
212 | 0 | ggml_backend_dev_props props; |
213 | 0 | ggml_backend_dev_get_props(dev, &props); |
214 | 0 | auto it = std::find_if(gpus.begin(), gpus.end(), [&props](ggml_backend_dev_t d) { |
215 | 0 | ggml_backend_dev_props d_props; |
216 | 0 | ggml_backend_dev_get_props(d, &d_props); |
217 | 0 | if (props.device_id && d_props.device_id) { |
218 | 0 | return strcmp(props.device_id, d_props.device_id) == 0; |
219 | 0 | } |
220 | 0 | return false; |
221 | 0 | }); |
222 | | 
223 | 0 | if (it != gpus.end()) { |
224 | 0 | LLAMA_LOG_INFO("%s: skipping device %s (%s) with id %s - already using device %s (%s) with the same id\n", |
225 | 0 | __func__, |
226 | 0 | ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), |
227 | 0 | props.device_id ? props.device_id : "unknown id", |
228 | 0 | ggml_backend_dev_name(*it), ggml_backend_dev_description(*it)); |
229 | 0 | } else { |
230 | 0 | gpus.push_back(dev); |
231 | 0 | } |
232 | 0 | } |
233 | 0 | break; |
234 | 0 | } |
235 | | |
236 | 0 | case GGML_BACKEND_DEVICE_TYPE_IGPU: |
237 | 0 | igpus.push_back(dev); |
238 | 0 | break; |
239 | 0 | } |
240 | 0 | } |
241 | | |
242 | | // add RPC servers at the front of the list to minimize network transfers |
243 | 0 | model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end()); |
244 | | |
245 | | // add GPUs |
246 | 0 | model->devices.insert(model->devices.end(), gpus.begin(), gpus.end()); |
247 | | |
248 | | // add integrated GPUs only if no other devices were found |
249 | 0 | if (model->devices.empty()) { |
250 | 0 | model->devices.insert(model->devices.end(), igpus.begin(), igpus.end()); |
251 | 0 | } |
252 | 0 | } |
253 | | |
254 | | // if using single GPU mode, remove all except the main GPU |
255 | 0 | if (params.split_mode == LLAMA_SPLIT_MODE_NONE) { |
256 | 0 | if (params.main_gpu < 0) { |
257 | 0 | model->devices.clear(); |
258 | 0 | } else { |
259 | 0 | if (params.main_gpu >= (int)model->devices.size()) { |
260 | 0 | LLAMA_LOG_ERROR("%s: invalid value for main_gpu: %d (available devices: %zu)\n", __func__, params.main_gpu, model->devices.size()); |
261 | 0 | llama_model_free(model); |
262 | 0 | return nullptr; |
263 | 0 | } |
264 | 0 | ggml_backend_dev_t main_gpu = model->devices[params.main_gpu]; |
265 | 0 | model->devices.clear(); |
266 | 0 | model->devices.push_back(main_gpu); |
267 | 0 | } |
268 | 0 | } |
269 | | |
270 | 0 | for (auto * dev : model->devices) { |
271 | 0 | ggml_backend_dev_props props; |
272 | 0 | ggml_backend_dev_get_props(dev, &props); |
273 | 0 | LLAMA_LOG_INFO("%s: using device %s (%s) (%s) - %zu MiB free\n", __func__, |
274 | 0 | ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), |
275 | 0 | props.device_id ? props.device_id : "unknown id", |
276 | 0 | props.memory_free/1024/1024); |
277 | 0 | } |
278 | | 
279 | 0 | const int status = llama_model_load(path_model, splits, *model, params); |
280 | 0 | GGML_ASSERT(status <= 0); |
281 | 0 | if (status < 0) { |
282 | 0 | if (status == -1) { |
283 | 0 | LLAMA_LOG_ERROR("%s: failed to load model\n", __func__); |
284 | 0 | } else if (status == -2) { |
285 | 0 | LLAMA_LOG_INFO("%s: cancelled model load\n", __func__); |
286 | 0 | } |
287 | | 
288 | 0 | llama_model_free(model); |
289 | 0 | return nullptr; |
290 | 0 | } |
291 | | |
292 | 0 | return model; |
293 | 0 | } |
294 | | |
295 | | // deprecated |
296 | | struct llama_model * llama_load_model_from_file( |
297 | | const char * path_model, |
298 | 0 | struct llama_model_params params) { |
299 | 0 | return llama_model_load_from_file(path_model, params); |
300 | 0 | } |
301 | | |
302 | | struct llama_model * llama_model_load_from_file( |
303 | | const char * path_model, |
304 | 0 | struct llama_model_params params) { |
305 | 0 | std::vector<std::string> splits = {}; |
306 | 0 | return llama_model_load_from_file_impl(path_model, splits, params); |
307 | 0 | } |
308 | | |
309 | | struct llama_model * llama_model_load_from_splits( |
310 | | const char ** paths, |
311 | | size_t n_paths, |
312 | 0 | struct llama_model_params params) { |
313 | 0 | std::vector<std::string> splits; |
314 | 0 | if (n_paths == 0) { |
315 | 0 | LLAMA_LOG_ERROR("%s: list of splits is empty\n", __func__); |
316 | 0 | return nullptr; |
317 | 0 | } |
318 | 0 | splits.reserve(n_paths); |
319 | 0 | for (size_t i = 0; i < n_paths; ++i) { |
320 | 0 | splits.push_back(paths[i]); |
321 | 0 | } |
322 | 0 | return llama_model_load_from_file_impl(splits.front(), splits, params); |
323 | 0 | } |
324 | | |
325 | 0 | void llama_model_save_to_file(const struct llama_model * model, const char * path_model) { |
326 | 0 | llama_model_saver ms(*model); |
327 | 0 | ms.add_kv_from_model(); |
328 | 0 | ms.add_tensors_from_model(); |
329 | 0 | ms.save(path_model); |
330 | 0 | } |
331 | | |
332 | | // |
333 | | // chat templates |
334 | | // |
335 | | |
336 | | int32_t llama_chat_apply_template( |
337 | | const char * tmpl, |
338 | | const struct llama_chat_message * chat, |
339 | | size_t n_msg, |
340 | | bool add_ass, |
341 | | char * buf, |
342 | 1.51k | int32_t length) { |
343 | 1.51k | const std::string curr_tmpl(tmpl == nullptr ? "chatml" : tmpl); |
344 | | |
345 | | // format the chat to string |
346 | 1.51k | std::vector<const llama_chat_message *> chat_vec; |
347 | 1.51k | chat_vec.resize(n_msg); |
348 | 10.5k | for (size_t i = 0; i < n_msg; i++) { |
349 | 9.07k | chat_vec[i] = &chat[i]; |
350 | 9.07k | } |
351 | | |
352 | 1.51k | std::string formatted_chat; |
353 | 1.51k | llm_chat_template detected_tmpl = llm_chat_detect_template(curr_tmpl); |
354 | 1.51k | if (detected_tmpl == LLM_CHAT_TEMPLATE_UNKNOWN) { |
355 | 595 | return -1; |
356 | 595 | } |
357 | 917 | int32_t res = llm_chat_apply_template(detected_tmpl, chat_vec, formatted_chat, add_ass); |
358 | 917 | if (res < 0) { |
359 | 0 | return res; |
360 | 0 | } |
361 | 917 | if (buf && length > 0) { |
362 | 917 | strncpy(buf, formatted_chat.c_str(), length); |
363 | 917 | } |
364 | 917 | return res; |
365 | 917 | } |
366 | | |
367 | | // |
368 | | // model split |
369 | | // |
370 | | |
371 | 0 | int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) { |
372 | 0 | static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf"; |
373 | 0 | if (snprintf(split_path, maxlen, SPLIT_PATH_FORMAT, path_prefix, split_no + 1, split_count)) { |
374 | 0 | return strlen(split_path); |
375 | 0 | } |
376 | 0 | return 0; |
377 | 0 | } |
378 | | |
379 | 0 | int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count) { |
380 | 0 | std::string str_split_path(split_path); |
381 | 0 | char postfix[32]; |
382 | 0 | snprintf(postfix, 32, "-%05d-of-%05d.gguf", split_no + 1, split_count); |
383 | 0 | std::string str_postfix(postfix); |
384 | | |
385 | | // check if split_prefix ends with postfix |
386 | 0 | int size_prefix = str_split_path.size() - str_postfix.size(); |
387 | 0 | if (size_prefix > 0 && str_split_path.find(str_postfix, size_prefix) != std::string::npos) { |
388 | 0 | snprintf(split_prefix, std::min((size_t) size_prefix + 1, maxlen), "%s", split_path); |
389 | 0 | return size_prefix; |
390 | 0 | } |
391 | | |
392 | 0 | return 0; |
393 | 0 | } |
394 | | |
395 | 0 | const char * llama_print_system_info(void) { |
396 | 0 | static std::string s; |
397 | 0 | s.clear(); // Clear the string, since it's static, otherwise it will accumulate data from previous calls. |
398 | | 
399 | 0 | for (size_t i = 0; i < ggml_backend_reg_count(); i++) { |
400 | 0 | auto * reg = ggml_backend_reg_get(i); |
401 | 0 | auto * get_features_fn = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features"); |
402 | 0 | if (get_features_fn) { |
403 | 0 | ggml_backend_feature * features = get_features_fn(reg); |
404 | 0 | s += ggml_backend_reg_name(reg); |
405 | 0 | s += " : "; |
406 | 0 | for (; features->name; features++) { |
407 | 0 | s += features->name; |
408 | 0 | s += " = "; |
409 | 0 | s += features->value; |
410 | 0 | s += " | "; |
411 | 0 | } |
412 | 0 | } |
413 | 0 | } |
414 | | 
415 | 0 | return s.c_str(); |
416 | 0 | } |
417 | | |
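End of listing. For orientation only, the following is a minimal, hypothetical driver that exercises the public entry points defined in this file (backend init, default model loading, system info). It assumes the matching declarations in llama.h and ggml-backend.h; it is an illustrative sketch, not part of the instrumented source or the coverage data above.

// hypothetical usage sketch, assuming the public llama.h / ggml-backend.h API
#include "llama.h"
#include "ggml-backend.h"

#include <cstdio>

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s <model.gguf>\n", argv[0]);
        return 1;
    }

    // load available backends first; otherwise llama_model_load_from_file_impl
    // logs "no backends are loaded" and returns nullptr
    ggml_backend_load_all();
    llama_backend_init(); // ggml_time_init() + f16 table initialization

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file(argv[1], mparams);
    if (model == nullptr) {
        fprintf(stderr, "failed to load model '%s'\n", argv[1]);
        llama_backend_free();
        return 1;
    }

    printf("%s\n", llama_print_system_info());

    llama_model_free(model);
    llama_backend_free();
    return 0;
}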