/src/llama.cpp/src/llama.cpp

Source
#include "llama.h"

#include "llama-impl.h"

#include "llama-chat.h"
#include "llama-context.h"
#include "llama-mmap.h"
#include "llama-vocab.h"
#include "llama-model-loader.h"
#include "llama-model-saver.h"
#include "llama-model.h"

#include "ggml.h"
#include "ggml-cpp.h"
#include "ggml-backend.h"
#include "gguf.h"

#include <algorithm>
#include <cassert>
#include <cinttypes>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <stdexcept>
#include <vector>

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

//
// interface implementation
//

// Maps a flash-attention mode to its lowercase textual name ("auto", "disabled" or "enabled").
// Aborts if the value is not one of the enumerators handled below.
const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type) {
    const char * type_name = nullptr;

    // no default label on purpose: every enumerator is listed explicitly
    switch (flash_attn_type) {
        case LLAMA_FLASH_ATTN_TYPE_AUTO:     type_name = "auto";     break;
        case LLAMA_FLASH_ATTN_TYPE_DISABLED: type_name = "disabled"; break;
        case LLAMA_FLASH_ATTN_TYPE_ENABLED:  type_name = "enabled";  break;
    }

    if (type_name == nullptr) {
        GGML_ABORT("fatal error");
    }
    return type_name;
}

// Returns the default sampler chain parameters: the first member (annotated as no_perf) is
// set to true, any further members are value-initialized by the aggregate initialization.
struct llama_sampler_chain_params llama_sampler_chain_default_params() {
    return {
        /*.no_perf =*/ true,
    };
}

// Returns the fixed upper bound on the number of devices (currently 16).
size_t llama_max_devices(void) {
    constexpr size_t max_devices = 16;
    return max_devices;
}

// Returns the fixed upper bound on the number of tensor buffer type overrides (currently 4096).
size_t llama_max_tensor_buft_overrides() {
    constexpr size_t max_overrides = 4096;
    return max_overrides;
}

// Reports the value of llama_mmap::SUPPORTED.
bool llama_supports_mmap(void) {
    const bool mmap_supported = llama_mmap::SUPPORTED;
    return mmap_supported;
}

// Reports the value of llama_mlock::SUPPORTED.
bool llama_supports_mlock(void) {
    const bool mlock_supported = llama_mlock::SUPPORTED;
    return mlock_supported;
}

// Returns true if a GPU device, an integrated GPU device, or the RPC backend is available.
// Loads all backends first when none has been registered yet.
bool llama_supports_gpu_offload(void) {
    if (ggml_backend_reg_count() == 0) {
        ggml_backend_load_all();
    }

    // checked in this order; the first hit short-circuits the remaining lookups
    if (ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr) {
        return true;
    }
    if (ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU) != nullptr) {
        return true;
    }
    return llama_supports_rpc();
}

bool llama_supports_rpc(void) {
    if (!ggml_backend_reg_count()) {
        ggml_backend_load_all();
    }
    return ggml_backend_reg_by_name("RPC") != nullptr;
}

void llama_backend_init(void) {
    ggml_time_init();

    // needed to initialize f16 tables
    {
        struct ggml_init_params params = { 0, NULL, false };
        struct ggml_context * ctx = ggml_init(params);
        ggml_free(ctx);
    }

    if (!ggml_backend_reg_count()) {
        ggml_backend_load_all();
    }
}

// Forwards the NUMA strategy to the CPU backend, unless the strategy is
// GGML_NUMA_STRATEGY_DISABLED (then nothing happens).
// Asserts that a CPU device exists; silently does nothing when the CPU backend does not
// expose "ggml_backend_cpu_numa_init".
void llama_numa_init(enum ggml_numa_strategy numa) {
    if (numa == GGML_NUMA_STRATEGY_DISABLED) {
        return;
    }

    auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
    GGML_ASSERT(dev && "CPU backend is not loaded");

    // the init function is looked up by name through the backend registry
    auto * cpu_reg = ggml_backend_dev_backend_reg(dev);
    auto * init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_cpu_numa_init");
    if (init_fn == nullptr) {
        return;
    }

    init_fn(numa);
}

// Library teardown counterpart of llama_backend_init(); currently this only
// calls ggml_quantize_free().
void llama_backend_free(void) {
    ggml_quantize_free();
    return;
}

// Returns the current ggml time value (ggml_time_us()).
int64_t llama_time_us(void) {
    const int64_t now_us = ggml_time_us();
    return now_us;
}

// Fills model->devices with the devices this model will use.
//
// - If params.devices is set, that null-terminated list is used: wrapped into a single meta
//   device for LLAMA_SPLIT_MODE_TENSOR, otherwise appended entry by entry.
// - Otherwise the registered backend devices are enumerated and a default list is built:
//   RPC servers first, then GPUs, then an integrated GPU only when no GPU entry was collected.
// - For LLAMA_SPLIT_MODE_NONE the list is finally reduced to the entry at index
//   params.main_gpu, or emptied when params.main_gpu is negative.
//
// returns true on success; returns false when tensor split mode has no device to work with
// or when params.main_gpu is out of range
static bool llama_prepare_model_devices(const llama_model_params & params, llama_model * model) {
    // create list of devices to use with this model
    if (params.devices) {
        if (params.split_mode == LLAMA_SPLIT_MODE_TENSOR) {
            // count the entries of the null-terminated device list
            size_t n_devs = 0;
            while (params.devices[n_devs]) {
                n_devs++;
            }
            if (n_devs == 0) {
                LLAMA_LOG_ERROR("%s: LLAMA_SPLIT_MODE_TENSOR needs >= 1 devices\n", __func__);
                return false;
            }
            LLAMA_LOG_INFO("%s: creating a Meta device with %zu devices\n", __func__, n_devs);
            for (size_t i = 0; i < n_devs; ++i) {
                LLAMA_LOG_INFO("%s: - device %zu: %s\n", __func__, i, ggml_backend_dev_name(params.devices[i]));
            }
            // the split-state callback receives a pointer to this user data, which lives inside the model
            model->get_split_state_ud.n_devices = n_devs;
            model->get_split_state_ud.model = model;
            // all user devices are represented by one meta device entry
            // (first member is true only for meta devices here - presumably an "is meta" flag, TODO confirm against llama_device)
            model->devices.push_back({
                true, ggml_backend_meta_device(
                params.devices, n_devs, llama_meta_device_get_split_state, &model->get_split_state_ud)
            });
        } else {
            // use the user-provided devices as-is, in the given order
            for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {
                model->devices.push_back({false, *dev});
            }
        }
    } else {
        // default device selection

        // build list of available devices
        std::vector<llama_device> gpus;
        std::vector<llama_device> igpus;
        std::vector<llama_device> rpc_servers;

        if (params.split_mode == LLAMA_SPLIT_MODE_TENSOR) {
            // collect every registered device whose buffer type is not the CPU buffer type
            std::vector<ggml_backend_dev_t> devs;
            devs.reserve(ggml_backend_dev_count());
            for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
                auto * dev = ggml_backend_dev_get(i);
                if (ggml_backend_dev_buffer_type(dev) == ggml_backend_cpu_buffer_type()) {
                    LLAMA_LOG_INFO("%s: skipping %s (%s) for tensor parallelism\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev));
                    continue;
                }
                devs.push_back(dev);
            }
            if (devs.empty()) {
                LLAMA_LOG_ERROR("%s: LLAMA_SPLIT_MODE_TENSOR needs >= 1 devices\n", __func__);
                return false;
            }

            LLAMA_LOG_INFO("%s: creating a Meta device for tensor parallelism from %zu devices:\n", __func__, devs.size());
            for (size_t i = 0; i < devs.size(); ++i) {
                LLAMA_LOG_INFO("%s: - device %zu: %s (%s)\n", __func__, i, ggml_backend_dev_name(devs[i]), ggml_backend_dev_description(devs[i]));
            }

            // cannot fire: the empty case already returned above
            GGML_ASSERT(!devs.empty());
            model->get_split_state_ud.n_devices = devs.size();
            model->get_split_state_ud.model     = model;
            // the meta device is stored in the GPU list so that it is appended with the GPUs below
            gpus.push_back({
                true, ggml_backend_meta_device(
                devs.data(), devs.size(), llama_meta_device_get_split_state, &model->get_split_state_ud)
            });
        } else {
            // sort every registered device into one of the three lists by its type
            for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
                ggml_backend_dev_t dev = ggml_backend_dev_get(i);
                switch (ggml_backend_dev_type(dev)) {
                    case GGML_BACKEND_DEVICE_TYPE_CPU:
                    case GGML_BACKEND_DEVICE_TYPE_ACCEL:
                        // skip CPU backends since they are handled separately
                        break;

                    case GGML_BACKEND_DEVICE_TYPE_GPU: {
                        // GPU-type devices that belong to the "RPC" backend are tracked separately
                        ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
                        if (ggml_backend_reg_name(reg) == std::string("RPC")) {
                            rpc_servers.push_back({false, dev});
                        } else {
                            // check if there is already a GPU with the same device id
                            ggml_backend_dev_props props;
                            ggml_backend_dev_get_props(dev, &props);
                            auto it = std::find_if(gpus.begin(), gpus.end(), [&props](const llama_device & d) {
                                ggml_backend_dev_props d_props;
                                ggml_backend_dev_get_props(d.dev, &d_props);
                                // devices without an id never match each other
                                if (props.device_id && d_props.device_id) {
                                    return strcmp(props.device_id, d_props.device_id) == 0;
                                }
                                return false;
                            });

                            if (it != gpus.end()) {
                                LLAMA_LOG_INFO("%s: skipping device %s (%s) with id %s - already using device %s (%s) with the same id\n",
                                        __func__,
                                        ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
                                        props.device_id ? props.device_id : "unknown id",
                                        ggml_backend_dev_name(it->dev), ggml_backend_dev_description(it->dev));
                            } else {
                                gpus.push_back({false, dev});
                            }
                        }
                        break;
                    }

                    case GGML_BACKEND_DEVICE_TYPE_IGPU:
                        // only the first integrated GPU encountered is kept
                        if (igpus.empty()) {
                            igpus.push_back({false, dev});
                        }
                        break;
                    case GGML_BACKEND_DEVICE_TYPE_META:
                        // a meta device is not expected among the registered devices
                        GGML_ABORT("fatal error");
                }
            }
        }

        // add RPC servers at the front of the list to minimize network transfers
        model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());

        // add GPUs
        model->devices.insert(model->devices.end(), gpus.begin(), gpus.end());

        // add integrated GPUs only if no discrete GPUs were found
        // (RPC servers do not count, otherwise the local iGPU would be dropped on iGPU+RPC setups)
        if (gpus.empty()) {
            model->devices.insert(model->devices.end(), igpus.begin(), igpus.end());
        }
    }

    // if using single GPU mode, remove all except the main GPU
    if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
        if (params.main_gpu < 0) {
            // a negative main_gpu leaves the model without any device in the list
            model->devices.clear();
        } else {
            // main_gpu is an index into the list assembled above
            if (params.main_gpu >= (int)model->devices.size()) {
                LLAMA_LOG_ERROR("%s: invalid value for main_gpu: %d (available devices: %zu)\n", __func__, params.main_gpu, model->devices.size());
                return false;
            }
            // copy the selected entry before clearing the list it lives in
            llama_device main_gpu = model->devices[params.main_gpu];
            model->devices.clear();
            model->devices.push_back(main_gpu);
        }
    }

    // log the final device selection together with the free memory reported in the device properties
    for (const auto & dev : model->devices) {
        ggml_backend_dev_props props;
        ggml_backend_dev_get_props(dev.dev, &props);
        LLAMA_LOG_INFO("%s: using device %s (%s) (%s) - %zu MiB free\n", __func__,
                ggml_backend_dev_name(dev.dev), ggml_backend_dev_description(dev.dev),
                props.device_id ? props.device_id : "unknown id",
                props.memory_free/1024/1024);
    }

    return true;
}

// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
static std::pair<int, llama_model *> llama_model_load(struct gguf_context * metadata, llama_model_set_tensor_data_t set_tensor_data, void * set_tensor_data_ud,
        const std::string & fname, std::vector<std::string> & splits, FILE * file, llama_model_params & params) {
    try {
        llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, file, params.use_mmap, params.use_direct_io,
            params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);

        ml.print_info();
        std::unique_ptr<llama_model> model_ptr(llama_model_create(ml, params));

        bool ok = llama_prepare_model_devices(params, model_ptr.get());
        if (!ok) {
            return {-1, nullptr};
        }

        auto * model = dynamic_cast<llama_model_base *>(model_ptr.get());
        if (model == nullptr) {
            GGML_ABORT("fatal error: model does not implement llama_model_base");
        }

        // loading time will be recalculated after the first eval, so
        // we take page faults deferred by mmap() into consideration
        model->t_load_us = 0;
        time_meas tm(model->t_load_us);

        model->t_start_us = tm.t_start_us;

        model->hparams.vocab_only = params.vocab_only;
        model->hparams.no_alloc   = params.no_alloc;

        try {
            model->load_hparams(ml);
        } catch(const std::exception & e) {
            throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
        }
        if (model->arch == LLM_ARCH_CLIP) {
            throw std::runtime_error("CLIP cannot be used as main model, use it with --mmproj instead");
        }
        try {
            model->load_vocab(ml);
        } catch(const std::exception & e) {
            throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
        }

        model->load_stats(ml);
        model->print_info();

        if (params.vocab_only) {
            LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
            return {0, model_ptr.release()};
        }

        if (!model->load_tensors(ml)) {
            return {-2, nullptr};
        }

        return {0, model_ptr.release()};
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
        return {-1, nullptr};
    }
}

// Shared implementation behind the public model loading entry points.
//
// Exactly one of `metadata`, `path_model` (non-empty) and `file` must be provided. Unless
// params.vocab_only is set, at least one backend has to be registered already. When the
// caller did not supply a progress callback, a default one that prints dots is installed.
//
// Returns the loaded model, or nullptr on invalid arguments, error or cancellation.
static struct llama_model * llama_model_load_from_file_impl(
        struct gguf_context * metadata,
        llama_model_set_tensor_data_t set_tensor_data,
        void * set_tensor_data_ud,
        const std::string & path_model,
        std::vector<std::string> & splits,
        FILE * file,
        struct llama_model_params params) {
    // count how many of the three possible model sources were given
    const int n_sources_defined =
        (metadata != nullptr ? 1 : 0) +
        (path_model.empty()  ? 0 : 1) +
        (file != nullptr     ? 1 : 0);
    if (n_sources_defined != 1) {
        LLAMA_LOG_ERROR("%s: exactly one out metadata, path_model, and file must be defined\n", __func__);
        return nullptr;
    }

    ggml_time_init();

    const bool needs_backend = !params.vocab_only;
    if (needs_backend && ggml_backend_reg_count() == 0) {
        LLAMA_LOG_ERROR("%s: no backends are loaded. hint: use ggml_backend_load() or ggml_backend_load_all() to load a backend before calling this function\n", __func__);
        return nullptr;
    }

    // default progress reporting: one dot each time the integer percentage increases,
    // followed by a newline once 100% is reached
    unsigned cur_percentage = 0;
    if (params.progress_callback == NULL) {
        params.progress_callback_user_data = &cur_percentage;
        params.progress_callback = [](float progress, void * ctx) {
            unsigned & last_reported = *(unsigned *) ctx;
            const unsigned percentage = (unsigned) (100 * progress);
            if (percentage > last_reported) {
                last_reported = percentage;
                LLAMA_LOG_CONT(".");
                if (percentage >= 100) {
                    LLAMA_LOG_CONT("\n");
                }
            }
            return true;
        };
    }

    const auto [status, model] = llama_model_load(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, file, params);
    GGML_ASSERT(status <= 0);
    if (status == 0) {
        return model;
    }

    // negative status: report it, release whatever was returned, and signal failure
    switch (status) {
        case -1:
            LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
            break;
        case -2:
            LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
            break;
        default:
            break;
    }

    if (model != nullptr) {
        llama_model_free(model);
    }
    return nullptr;
}

// Creates a model from caller-provided GGUF metadata; tensor data is supplied through the
// `set_tensor_data` callback and its user data pointer.
// `metadata` must not be null (asserted). use_mmap and use_extra_bufts are forced to false.
struct llama_model * llama_model_init_from_user(
        struct gguf_context * metadata,
        llama_model_set_tensor_data_t set_tensor_data,
        void * set_tensor_data_ud,
        struct llama_model_params params) {
    GGML_ASSERT(metadata != nullptr);

    // these two options are always disabled on this path
    params.use_mmap        = false;
    params.use_extra_bufts = false;

    // no file path and no split list: the metadata is the only model source
    const std::string no_path;
    std::vector<std::string> no_splits;
    return llama_model_load_from_file_impl(metadata, set_tensor_data, set_tensor_data_ud, no_path, no_splits, /*file*/ nullptr, params);
}
// deprecated: forwards unchanged to llama_model_load_from_file()
struct llama_model * llama_load_model_from_file(
        const char * path_model,
        struct llama_model_params params) {
    struct llama_model * loaded = llama_model_load_from_file(path_model, params);
    return loaded;
}

struct llama_model * llama_model_load_from_file(
        const char * path_model,
        struct llama_model_params params) {
    std::vector<std::string> splits = {};
    return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, /*file*/ nullptr, params);
}

// Loads a model from an explicit list of split files.
//
// `paths` must point to `n_paths` non-null C strings; the first entry is used as the main
// model path. Returns the loaded model, or nullptr when the list is empty, when `paths` or
// one of its entries is NULL, or when loading fails.
struct llama_model * llama_model_load_from_splits(
        const char ** paths,
        size_t n_paths,
        struct llama_model_params params) {
    std::vector<std::string> splits;
    if (n_paths == 0) {
        LLAMA_LOG_ERROR("%s: list of splits is empty\n", __func__);
        return nullptr;
    }
    if (paths == nullptr) {
        LLAMA_LOG_ERROR("%s: list of splits is NULL\n", __func__);
        return nullptr;
    }

    splits.reserve(n_paths);
    for (size_t i = 0; i < n_paths; ++i) {
        // constructing a std::string from a null pointer is undefined behavior
        if (paths[i] == nullptr) {
            LLAMA_LOG_ERROR("%s: split path %zu is NULL\n", __func__, i);
            return nullptr;
        }
        splits.push_back(paths[i]);
    }
    return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, splits.front(), splits, /*file*/ nullptr, params);
}

// Loads a model from an already opened FILE handle.
// Returns the loaded model, or nullptr when `file` is NULL or when loading fails.
struct llama_model * llama_model_load_from_file_ptr(FILE * file, struct llama_model_params params) {
    if (file == nullptr) {
        LLAMA_LOG_ERROR("%s: file is NULL\n", __func__);
        return nullptr;
    }

    // no file path and no split list: the FILE handle is the only model source
    const std::string no_path;
    std::vector<std::string> no_splits;
    return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, no_path, no_splits, file, params);
}

void llama_model_save_to_file(const struct llama_model * model, const char * path_model) {
    llama_model_saver ms(model);
    ms.add_kv_from_model();
    ms.add_tensors_from_model();
    ms.save(path_model);
}

//
// chat templates
//

int32_t llama_chat_apply_template(
                              const char * tmpl,
         const struct llama_chat_message * chat,
                                  size_t   n_msg,
                                    bool   add_ass,
                                    char * buf,
                                 int32_t   length) {
    const std::string curr_tmpl(tmpl == nullptr ? "chatml" : tmpl);

    // format the chat to string
    std::vector<const llama_chat_message *> chat_vec;
    chat_vec.resize(n_msg);
    for (size_t i = 0; i < n_msg; i++) {
        chat_vec[i] = &chat[i];
    }

    std::string formatted_chat;
    llm_chat_template detected_tmpl = llm_chat_detect_template(curr_tmpl);
    if (detected_tmpl == LLM_CHAT_TEMPLATE_UNKNOWN) {
        return -1;
    }
    int32_t res = llm_chat_apply_template(detected_tmpl, chat_vec, formatted_chat, add_ass);
    if (res < 0) {
        return res;
    }
    if (buf && length > 0) {
        strncpy(buf, formatted_chat.c_str(), length);
    }
    return res;
}

//
// model split
//

// Builds the file name of one split, "<path_prefix>-NNNNN-of-MMMMM.gguf", into `split_path`.
// NNNNN is `split_no + 1` and MMMMM is `split_count`, both zero-padded to 5 digits.
//
// Returns the number of characters written (excluding the terminating null), or 0 when
// formatting fails or the result does not fit into `maxlen` bytes.
int32_t llama_split_path(
    char * split_path,
    size_t maxlen,
    const char * path_prefix,
    int32_t split_no,
    int32_t split_count) {

    const int n_chars = snprintf(split_path, maxlen, "%s-%05d-of-%05d.gguf", path_prefix, split_no + 1, split_count);

    // negative: formatting error; >= maxlen: the output was truncated
    const bool fits = n_chars >= 0 && (size_t) n_chars < maxlen;

    return fits ? (int32_t) n_chars : 0;
}

// Extracts the prefix of a split file name: if `split_path` ends with
// "-NNNNN-of-MMMMM.gguf" (NNNNN = `split_no + 1`, MMMMM = `split_count`) and has a non-empty
// part in front of it, that part is copied into `split_prefix`.
//
// At most `maxlen` bytes (including the terminating null) are written, so the copy may be
// truncated. Returns the full length of the prefix, or 0 when the path does not match.
int32_t llama_split_prefix(
    char * split_prefix,
    size_t maxlen,
    const char * split_path,
    int32_t split_no,
    int32_t split_count) {

    const std::string full_path(split_path);

    // expected suffix for this split number / split count
    char suffix_buf[32];
    snprintf(suffix_buf, sizeof(suffix_buf), "-%05d-of-%05d.gguf", split_no + 1, split_count);
    const std::string suffix(suffix_buf);

    // the path must be strictly longer than the suffix, i.e. the prefix must not be empty
    if (full_path.size() <= suffix.size()) {
        return 0;
    }

    const size_t prefix_len = full_path.size() - suffix.size();
    const bool ends_with_suffix = full_path.compare(prefix_len, suffix.size(), suffix) == 0;
    if (!ends_with_suffix) {
        return 0;
    }

    // snprintf's size limit cuts the copy right after the prefix (or earlier if maxlen is smaller)
    snprintf(split_prefix, std::min(prefix_len + 1, maxlen), "%s", split_path);
    return (int32_t) prefix_len;
}

const char * llama_print_system_info(void) {
    static std::string s;
    s.clear(); // Clear the string, since it's static, otherwise it will accumulate data from previous calls.

    for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
        auto * reg = ggml_backend_reg_get(i);
        auto * get_features_fn = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
        if (get_features_fn) {
            ggml_backend_feature * features = get_features_fn(reg);
            s += ggml_backend_reg_name(reg);
            s += " : ";
            for (; features->name; features++) {
                s += features->name;
                s += " = ";
                s += features->value;
                s += " | ";
            }
        }
    }

    return s.c_str();
}


Coverage Report

Created: 2026-06-13 06:24

Line	Count	Source
1		#include "llama.h"
2
3		#include "llama-impl.h"
4
5		#include "llama-chat.h"
6		#include "llama-context.h"
7		#include "llama-mmap.h"
8		#include "llama-vocab.h"
9		#include "llama-model-loader.h"
10		#include "llama-model-saver.h"
11		#include "llama-model.h"
12
13		#include "ggml.h"
14		#include "ggml-cpp.h"
15		#include "ggml-backend.h"
16		#include "gguf.h"
17
18		#include <algorithm>
19		#include <cassert>
20		#include <cinttypes>
21		#include <cstddef>
22		#include <cstdint>
23		#include <cstdio>
24		#include <cstring>
25		#include <ctime>
26		#include <stdexcept>
27		#include <vector>
28
29		#if defined(_MSC_VER)
30		#pragma warning(disable: 4244 4267) // possible loss of data
31		#endif
32
33		//
34		// interface implementation
35		//
36
37	0	const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type) {
38	0	switch (flash_attn_type) {
39	0	case LLAMA_FLASH_ATTN_TYPE_AUTO:
40	0	return "auto";
41	0	case LLAMA_FLASH_ATTN_TYPE_DISABLED:
42	0	return "disabled";
43	0	case LLAMA_FLASH_ATTN_TYPE_ENABLED:
44	0	return "enabled";
45	0	}
46	0	GGML_ABORT("fatal error");
47	0	}
48
49	0	struct llama_sampler_chain_params llama_sampler_chain_default_params() {
50	0	struct llama_sampler_chain_params result = {
51	0	/*.no_perf =*/ true,
52	0	};
53
54	0	return result;
55	0	}
56
57	0	size_t llama_max_devices(void) {
58	0	return 16;
59	0	}
60
61	0	size_t llama_max_tensor_buft_overrides() {
62	0	return 4096;
63	0	}
64
65	0	bool llama_supports_mmap(void) {
66	0	return llama_mmap::SUPPORTED;
67	0	}
68
69	0	bool llama_supports_mlock(void) {
70	0	return llama_mlock::SUPPORTED;
71	0	}
72
73	0	bool llama_supports_gpu_offload(void) {
74	0	if (!ggml_backend_reg_count()) {
75	0	ggml_backend_load_all();
76	0	}
77	0	return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr ||
78	0	ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU) != nullptr ||
79	0	llama_supports_rpc();
80	0	}
81
82	0	bool llama_supports_rpc(void) {
83	0	if (!ggml_backend_reg_count()) {
84	0	ggml_backend_load_all();
85	0	}
86	0	return ggml_backend_reg_by_name("RPC") != nullptr;
87	0	}
88
89	0	void llama_backend_init(void) {
90	0	ggml_time_init();
91
92		// needed to initialize f16 tables
93	0	{
94	0	struct ggml_init_params params = { 0, NULL, false };
95	0	struct ggml_context * ctx = ggml_init(params);
96	0	ggml_free(ctx);
97	0	}
98
99	0	if (!ggml_backend_reg_count()) {
100	0	ggml_backend_load_all();
101	0	}
102	0	}
103
104	0	void llama_numa_init(enum ggml_numa_strategy numa) {
105	0	if (numa != GGML_NUMA_STRATEGY_DISABLED) {
106	0	auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
107	0	GGML_ASSERT(dev && "CPU backend is not loaded");
108	0	auto * reg = ggml_backend_dev_backend_reg(dev);
109	0	auto * numa_init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_numa_init");
110	0	if (numa_init_fn) {
111	0	numa_init_fn(numa);
112	0	}
113	0	}
114	0	}
115
116	0	void llama_backend_free(void) {
117	0	ggml_quantize_free();
118	0	}
119
120	0	int64_t llama_time_us(void) {
121	0	return ggml_time_us();
122	0	}
123
124		// returns true on success
125	0	static bool llama_prepare_model_devices(const llama_model_params & params, llama_model * model) {
126		// create list of devices to use with this model
127	0	if (params.devices) {
128	0	if (params.split_mode == LLAMA_SPLIT_MODE_TENSOR) {
129	0	size_t n_devs = 0;
130	0	while (params.devices[n_devs]) {
131	0	n_devs++;
132	0	}
133	0	if (n_devs == 0) {
134	0	LLAMA_LOG_ERROR("%s: LLAMA_SPLIT_MODE_TENSOR needs >= 1 devices\n", __func__);
135	0	return false;
136	0	}
137	0	LLAMA_LOG_INFO("%s: creating a Meta device with %zu devices\n", __func__, n_devs);
138	0	for (size_t i = 0; i < n_devs; ++i) {
139	0	LLAMA_LOG_INFO("%s: - device %zu: %s\n", __func__, i, ggml_backend_dev_name(params.devices[i]));
140	0	}
141	0	model->get_split_state_ud.n_devices = n_devs;
142	0	model->get_split_state_ud.model = model;
143	0	model->devices.push_back({
144	0	true, ggml_backend_meta_device(
145	0	params.devices, n_devs, llama_meta_device_get_split_state, &model->get_split_state_ud)
146	0	});
147	0	} else {
148	0	for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {
149	0	model->devices.push_back({false, *dev});
150	0	}
151	0	}
152	0	} else {
153		// default device selection
154
155		// build list of available devices
156	0	std::vector<llama_device> gpus;
157	0	std::vector<llama_device> igpus;
158	0	std::vector<llama_device> rpc_servers;
159
160	0	if (params.split_mode == LLAMA_SPLIT_MODE_TENSOR) {
161	0	std::vector<ggml_backend_dev_t> devs;
162	0	devs.reserve(ggml_backend_dev_count());
163	0	for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
164	0	auto * dev = ggml_backend_dev_get(i);
165	0	if (ggml_backend_dev_buffer_type(dev) == ggml_backend_cpu_buffer_type()) {
166	0	LLAMA_LOG_INFO("%s: skipping %s (%s) for tensor parallelism\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev));
167	0	continue;
168	0	}
169	0	devs.push_back(dev);
170	0	}
171	0	if (devs.empty()) {
172	0	LLAMA_LOG_ERROR("%s: LLAMA_SPLIT_MODE_TENSOR needs >= 1 devices\n", __func__);
173	0	return false;
174	0	}
175
176	0	LLAMA_LOG_INFO("%s: creating a Meta device for tensor parallelism from %zu devices:\n", __func__, devs.size());
177	0	for (size_t i = 0; i < devs.size(); ++i) {
178	0	LLAMA_LOG_INFO("%s: - device %zu: %s (%s)\n", __func__, i, ggml_backend_dev_name(devs[i]), ggml_backend_dev_description(devs[i]));
179	0	}
180
181	0	GGML_ASSERT(!devs.empty());
182	0	model->get_split_state_ud.n_devices = devs.size();
183	0	model->get_split_state_ud.model = model;
184	0	gpus.push_back({
185	0	true, ggml_backend_meta_device(
186	0	devs.data(), devs.size(), llama_meta_device_get_split_state, &model->get_split_state_ud)
187	0	});
188	0	} else {
189	0	for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
190	0	ggml_backend_dev_t dev = ggml_backend_dev_get(i);
191	0	switch (ggml_backend_dev_type(dev)) {
192	0	case GGML_BACKEND_DEVICE_TYPE_CPU:
193	0	case GGML_BACKEND_DEVICE_TYPE_ACCEL:
194		// skip CPU backends since they are handled separately
195	0	break;
196
197	0	case GGML_BACKEND_DEVICE_TYPE_GPU: {
198	0	ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
199	0	if (ggml_backend_reg_name(reg) == std::string("RPC")) {
200	0	rpc_servers.push_back({false, dev});
201	0	} else {
202		// check if there is already a GPU with the same device id
203	0	ggml_backend_dev_props props;
204	0	ggml_backend_dev_get_props(dev, &props);
205	0	auto it = std::find_if(gpus.begin(), gpus.end(), [&props](const llama_device & d) {
206	0	ggml_backend_dev_props d_props;
207	0	ggml_backend_dev_get_props(d.dev, &d_props);
208	0	if (props.device_id && d_props.device_id) {
209	0	return strcmp(props.device_id, d_props.device_id) == 0;
210	0	}
211	0	return false;
212	0	});
213
214	0	if (it != gpus.end()) {
215	0	LLAMA_LOG_INFO("%s: skipping device %s (%s) with id %s - already using device %s (%s) with the same id\n",
216	0	__func__,
217	0	ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
218	0	props.device_id ? props.device_id : "unknown id",
219	0	ggml_backend_dev_name(it->dev), ggml_backend_dev_description(it->dev));
220	0	} else {
221	0	gpus.push_back({false, dev});
222	0	}
223	0	}
224	0	break;
225	0	}
226
227	0	case GGML_BACKEND_DEVICE_TYPE_IGPU:
228	0	if (igpus.empty()) {
229	0	igpus.push_back({false, dev});
230	0	}
231	0	break;
232	0	case GGML_BACKEND_DEVICE_TYPE_META:
233	0	GGML_ABORT("fatal error");
234	0	}
235	0	}
236	0	}
237
238		// add RPC servers at the front of the list to minimize network transfers
239	0	model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());
240
241		// add GPUs
242	0	model->devices.insert(model->devices.end(), gpus.begin(), gpus.end());
243
244		// add integrated GPUs only if no discrete GPUs were found
245		// (RPC servers do not count, otherwise the local iGPU would be dropped on iGPU+RPC setups)
246	0	if (gpus.empty()) {
247	0	model->devices.insert(model->devices.end(), igpus.begin(), igpus.end());
248	0	}
249	0	}
250
251		// if using single GPU mode, remove all except the main GPU
252	0	if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
253	0	if (params.main_gpu < 0) {
254	0	model->devices.clear();
255	0	} else {
256	0	if (params.main_gpu >= (int)model->devices.size()) {
257	0	LLAMA_LOG_ERROR("%s: invalid value for main_gpu: %d (available devices: %zu)\n", __func__, params.main_gpu, model->devices.size());
258	0	return false;
259	0	}
260	0	llama_device main_gpu = model->devices[params.main_gpu];
261	0	model->devices.clear();
262	0	model->devices.push_back(main_gpu);
263	0	}
264	0	}
265
266	0	for (const auto & dev : model->devices) {
267	0	ggml_backend_dev_props props;
268	0	ggml_backend_dev_get_props(dev.dev, &props);
269	0	LLAMA_LOG_INFO("%s: using device %s (%s) (%s) - %zu MiB free\n", __func__,
270	0	ggml_backend_dev_name(dev.dev), ggml_backend_dev_description(dev.dev),
271	0	props.device_id ? props.device_id : "unknown id",
272	0	props.memory_free/1024/1024);
273	0	}
274
275	0	return true;
276	0	}
277
278		// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
279		static std::pair<int, llama_model *> llama_model_load(struct gguf_context * metadata, llama_model_set_tensor_data_t set_tensor_data, void * set_tensor_data_ud,
280	0	const std::string & fname, std::vector<std::string> & splits, FILE * file, llama_model_params & params) {
281	0	try {
282	0	llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, file, params.use_mmap, params.use_direct_io,
283	0	params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
284
285	0	ml.print_info();
286	0	std::unique_ptr<llama_model> model_ptr(llama_model_create(ml, params));
287
288	0	bool ok = llama_prepare_model_devices(params, model_ptr.get());
289	0	if (!ok) {
290	0	return {-1, nullptr};
291	0	}
292
293	0	auto * model = dynamic_cast<llama_model_base *>(model_ptr.get());
294	0	if (model == nullptr) {
295	0	GGML_ABORT("fatal error: model does not implement llama_model_base");
296	0	}
297
298		// loading time will be recalculated after the first eval, so
299		// we take page faults deferred by mmap() into consideration
300	0	model->t_load_us = 0;
301	0	time_meas tm(model->t_load_us);
302
303	0	model->t_start_us = tm.t_start_us;
304
305	0	model->hparams.vocab_only = params.vocab_only;
306	0	model->hparams.no_alloc = params.no_alloc;
307
308	0	try {
309	0	model->load_hparams(ml);
310	0	} catch(const std::exception & e) {
311	0	throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
312	0	}
313	0	if (model->arch == LLM_ARCH_CLIP) {
314	0	throw std::runtime_error("CLIP cannot be used as main model, use it with --mmproj instead");
315	0	}
316	0	try {
317	0	model->load_vocab(ml);
318	0	} catch(const std::exception & e) {
319	0	throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
320	0	}
321
322	0	model->load_stats(ml);
323	0	model->print_info();
324
325	0	if (params.vocab_only) {
326	0	LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
327	0	return {0, model_ptr.release()};
328	0	}
329
330	0	if (!model->load_tensors(ml)) {
331	0	return {-2, nullptr};
332	0	}
333
334	0	return {0, model_ptr.release()};
335	0	} catch (const std::exception & err) {
336	0	LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
337	0	return {-1, nullptr};
338	0	}
339	0	}
340
341		static struct llama_model * llama_model_load_from_file_impl(
342		struct gguf_context * metadata,
343		llama_model_set_tensor_data_t set_tensor_data,
344		void * set_tensor_data_ud,
345		const std::string & path_model,
346		std::vector<std::string> & splits,
347		FILE * file,
348	0	struct llama_model_params params) {
349	0	{
350	0	int n_sources_defined = 0;
351	0	if (metadata != nullptr) {
352	0	n_sources_defined++;
353	0	}
354	0	if (!path_model.empty()) {
355	0	n_sources_defined++;
356	0	}
357	0	if (file != nullptr) {
358	0	n_sources_defined++;
359	0	}
360	0	if (n_sources_defined != 1) {
361	0	LLAMA_LOG_ERROR("%s: exactly one out metadata, path_model, and file must be defined\n", __func__);
362	0	return nullptr;
363	0	}
364	0	}
365	0	ggml_time_init();
366
367	0	if (!params.vocab_only && ggml_backend_reg_count() == 0) {
368	0	LLAMA_LOG_ERROR("%s: no backends are loaded. hint: use ggml_backend_load() or ggml_backend_load_all() to load a backend before calling this function\n", __func__);
369	0	return nullptr;
370	0	}
371
372	0	unsigned cur_percentage = 0;
373	0	if (params.progress_callback == NULL) {
374	0	params.progress_callback_user_data = &cur_percentage;
375	0	params.progress_callback = [](float progress, void * ctx) {
376	0	unsigned * cur_percentage_p = (unsigned *) ctx;
377	0	unsigned percentage = (unsigned) (100 * progress);
378	0	while (percentage > *cur_percentage_p) {
379	0	*cur_percentage_p = percentage;
380	0	LLAMA_LOG_CONT(".");
381	0	if (percentage >= 100) {
382	0	LLAMA_LOG_CONT("\n");
383	0	}
384	0	}
385	0	return true;
386	0	};
387	0	}
388
389	0	const auto [status, model] = llama_model_load(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, file, params);
390	0	GGML_ASSERT(status <= 0);
391	0	if (status < 0) {
392	0	if (status == -1) {
393	0	LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
394	0	} else if (status == -2) {
395	0	LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
396	0	}
397
398	0	if (model) {
399	0	llama_model_free(model);
400	0	}
401	0	return nullptr;
402	0	}
403
404	0	return model;
405	0	}
406
407		struct llama_model * llama_model_init_from_user(
408		struct gguf_context * metadata,
409		llama_model_set_tensor_data_t set_tensor_data,
410		void * set_tensor_data_ud,
411	0	struct llama_model_params params) {
412	0	GGML_ASSERT(metadata != nullptr);
413	0	std::string path_model;
414	0	std::vector<std::string> splits = {};
415	0	params.use_mmap = false;
416	0	params.use_extra_bufts = false;
417	0	return llama_model_load_from_file_impl(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, /file/ nullptr, params);
418	0	}
419		// deprecated
420		struct llama_model * llama_load_model_from_file(
421		const char * path_model,
422	0	struct llama_model_params params) {
423	0	return llama_model_load_from_file(path_model, params);
424	0	}
425
426		struct llama_model * llama_model_load_from_file(
427		const char * path_model,
428	0	struct llama_model_params params) {
429	0	std::vector<std::string> splits = {};
430	0	return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, /file/ nullptr, params);
431	0	}
432
433		struct llama_model * llama_model_load_from_splits(
434		const char ** paths,
435		size_t n_paths,
436	0	struct llama_model_params params) {
437	0	std::vector<std::string> splits;
438	0	if (n_paths == 0) {
439	0	LLAMA_LOG_ERROR("%s: list of splits is empty\n", __func__);
440	0	return nullptr;
441	0	}
442	0	splits.reserve(n_paths);
443	0	for (size_t i = 0; i < n_paths; ++i) {
444	0	splits.push_back(paths[i]);
445	0	}
446	0	return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, splits.front(), splits, /file/ nullptr, params);
447	0	}
448
449	0	struct llama_model * llama_model_load_from_file_ptr(FILE * file, struct llama_model_params params) {
450	0	if (!file) {
451	0	LLAMA_LOG_ERROR("%s: file is NULL\n", __func__);
452	0	return nullptr;
453	0	}
454	0	std::string path_model;
455	0	std::vector<std::string> splits = {};
456	0	return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, file, params);
457	0	}
458
459	0	void llama_model_save_to_file(const struct llama_model * model, const char * path_model) {
460	0	llama_model_saver ms(model);
461	0	ms.add_kv_from_model();
462	0	ms.add_tensors_from_model();
463	0	ms.save(path_model);
464	0	}
465
466		//
467		// chat templates
468		//
469
470		int32_t llama_chat_apply_template(
471		const char * tmpl,
472		const struct llama_chat_message * chat,
473		size_t n_msg,
474		bool add_ass,
475		char * buf,
476	0	int32_t length) {
477	0	const std::string curr_tmpl(tmpl == nullptr ? "chatml" : tmpl);
478
479		// format the chat to string
480	0	std::vector<const llama_chat_message *> chat_vec;
481	0	chat_vec.resize(n_msg);
482	0	for (size_t i = 0; i < n_msg; i++) {
483	0	chat_vec[i] = &chat[i];
484	0	}
485
486	0	std::string formatted_chat;
487	0	llm_chat_template detected_tmpl = llm_chat_detect_template(curr_tmpl);
488	0	if (detected_tmpl == LLM_CHAT_TEMPLATE_UNKNOWN) {
489	0	return -1;
490	0	}
491	0	int32_t res = llm_chat_apply_template(detected_tmpl, chat_vec, formatted_chat, add_ass);
492	0	if (res < 0) {
493	0	return res;
494	0	}
495	0	if (buf && length > 0) {
496	0	strncpy(buf, formatted_chat.c_str(), length);
497	0	}
498	0	return res;
499	0	}
500
501		//
502		// model split
503		//
504
// Build the path of split number `split_no` (0-based) out of `split_count` splits:
//   "<path_prefix>-<split_no + 1, 5 digits>-of-<split_count, 5 digits>.gguf"
//
// The result is written to `split_path`, which has room for `maxlen` bytes including
// the terminating NUL.
//
// Returns the length of the generated path (excluding the NUL), or 0 if an argument is
// NULL, formatting failed, or the path did not fit into `maxlen` bytes.
int32_t llama_split_path(
        char * split_path,
        size_t maxlen,
        const char * path_prefix,
        int32_t split_no,
        int32_t split_count) {

    static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf";

    // passing NULL for "%s", or a NULL destination with a non-zero size, is undefined behaviour
    if (path_prefix == nullptr || (split_path == nullptr && maxlen > 0)) {
        return 0;
    }

    const int written = snprintf(
        split_path,
        maxlen,
        SPLIT_PATH_FORMAT,
        path_prefix,
        split_no + 1,
        split_count
    );

    // snprintf reports the untruncated length: anything >= maxlen means the output was cut
    if (written < 0 || (size_t) written >= maxlen) {
        return 0;
    }

    return (int32_t) written;
}
529
// Extract the prefix of a split path, i.e. the part of `split_path` that precedes the
// "-<split_no + 1, 5 digits>-of-<split_count, 5 digits>.gguf" postfix.
//
// At most `maxlen` bytes (including the terminating NUL) are written to `split_prefix`;
// a longer prefix is truncated, but the full prefix length is still returned.
//
// Returns the prefix length, or 0 if `split_path` is NULL, does not end with the
// expected postfix, or consists of the postfix only.
int32_t llama_split_prefix(
        char * split_prefix,
        size_t maxlen,
        const char * split_path,
        int32_t split_no,
        int32_t split_count) {

    // std::string(nullptr) and a NULL destination with a non-zero size are undefined behaviour
    if (split_path == nullptr || (split_prefix == nullptr && maxlen > 0)) {
        return 0;
    }

    // worst case for 32-bit values is 32 characters + NUL, so leave some headroom
    char postfix[64];
    const int postfix_len = snprintf(postfix, sizeof(postfix), "-%05d-of-%05d.gguf", split_no + 1, split_count);
    if (postfix_len < 0 || (size_t) postfix_len >= sizeof(postfix)) {
        return 0;
    }

    const std::string str_split_path(split_path);
    const std::string str_postfix(postfix, (size_t) postfix_len);

    // a path that is not longer than the postfix cannot have a non-empty prefix
    if (str_split_path.size() <= str_postfix.size()) {
        return 0;
    }

    const size_t size_prefix = str_split_path.size() - str_postfix.size();

    if (str_split_path.compare(size_prefix, std::string::npos, str_postfix) == 0) {
        // +1 for the NUL terminator written by snprintf
        const size_t copy_len = std::min(size_prefix + 1, maxlen);
        snprintf(split_prefix, copy_len, "%s", split_path);

        return (int32_t) size_prefix;
    }

    return 0;
}
558
559	0	const char * llama_print_system_info(void) {
560	0	static std::string s;
561	0	s.clear(); // Clear the string, since it's static, otherwise it will accumulate data from previous calls.
562
563	0	for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
564	0	auto * reg = ggml_backend_reg_get(i);
565	0	auto * get_features_fn = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
566	0	if (get_features_fn) {
567	0	ggml_backend_feature * features = get_features_fn(reg);
568	0	s += ggml_backend_reg_name(reg);
569	0	s += " : ";
570	0	for (; features->name; features++) {
571	0	s += features->name;
572	0	s += " = ";
573	0	s += features->value;
574	0	s += " \| ";
575	0	}
576	0	}
577	0	}
578
579	0	return s.c_str();
580	0	}
581