Coverage Report

Created: 2026-01-11 07:13

/src/llama.cpp/src/llama-adapter.cpp

All execution counts in this report are 0 (no line of llama-adapter.cpp was executed), so the per-line Line/Count columns are collapsed below and only the Source column is shown.

#include "llama-adapter.h"

#include "llama-impl.h"
#include "llama-mmap.h"
#include "llama-model.h"

#include <map>
#include <cassert>
#include <sstream>
#include <stdexcept>

// vec

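// Returns the control-vector direction tensor for layer il, or nullptr when il is
// out of bounds or outside the active [layer_start, layer_end] window.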
ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
    if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
        return nullptr;
    }

    return tensors[il];
}

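// Adds the layer's control-vector direction to the current hidden state when a vector
// is active for layer il; otherwise returns cur unchanged.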
ggml_tensor * llama_adapter_cvec::apply_to(ggml_context * ctx, ggml_tensor * cur, int il) const {
    ggml_tensor * layer_dir = tensor_for(il);
    if (layer_dir != nullptr) {
        cur = ggml_add(ctx, cur, layer_dir);
    }

    return cur;
}

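// Allocates one F32 tensor of n_embd elements per layer (layer 0 never has one), using
// one no-alloc ggml context and one zero-initialized backend buffer per buffer type.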
bool llama_adapter_cvec::init(const llama_model & model) {
    const auto & hparams = model.hparams;

    GGML_ASSERT(tensors.empty());
    GGML_ASSERT(ctxs.empty());
    GGML_ASSERT(bufs.empty());

    // create a context for each buffer type
    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
        auto it = ctx_map.find(buft);
        if (it == ctx_map.end()) {
            ggml_init_params params = {
                /*.mem_size   =*/ hparams.n_layer*ggml_tensor_overhead(),
                /*.mem_buffer =*/ NULL,
                /*.no_alloc   =*/ true,
            };

            ggml_context * ctx = ggml_init(params);
            if (!ctx) {
                return nullptr;
            }

            ctx_map[buft] = ctx;
            ctxs.emplace_back(ctx);

            return ctx;
        }

        return it->second;
    };

    // make tensors
    tensors.reserve(hparams.n_layer);
    tensors.push_back(nullptr); // there's never a tensor for layer 0
    for (size_t il = 1; il < hparams.n_layer; il++) {
        ggml_backend_buffer_type_t buft = model.select_buft(il);
        ggml_context * ctx = ctx_for_buft(buft);
        if (!ctx) {
            LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
            return false;
        }
        ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
        tensors.push_back(tensor);
    }

    // allocate tensors / buffers and zero
    bufs.reserve(ctx_map.size());
    for (auto it : ctx_map) {
        ggml_backend_buffer_type_t buft = it.first;
        ggml_context * ctx = it.second;
        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
        if (!buf) {
            LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__);
            return false;
        }
        ggml_backend_buffer_clear(buf, 0);
        bufs.emplace_back(buf);
    }

    return true;
}

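// Loads (or, when data == nullptr, disables) a control vector: data holds n_embd floats
// per layer starting at layer 1, and il_start/il_end select the layers it is applied to.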
bool llama_adapter_cvec::apply(
        const llama_model & model,
        const float * data,
        size_t len,
        int32_t n_embd,
        int32_t il_start,
        int32_t il_end) {
    const auto & hparams = model.hparams;

    if (data == nullptr) {
        // disable the current control vector (but leave allocated for later)
        layer_start = -1;
        layer_end   = -1;
        return true;
    }

    if (n_embd != (int) hparams.n_embd) {
        LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
        return false;
    }

    if (tensors.empty()) {
        if (!init(model)) {
            return false;
        }
    }

    layer_start = il_start;
    layer_end   = il_end;

    for (size_t il = 1; il < hparams.n_layer; il++) {
        assert(tensors[il] != nullptr);

        const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
        if (off + n_embd <= len) {
            ggml_backend_tensor_set(tensors[il], data + off, 0, n_embd * ggml_element_size(tensors[il]));
        }
    }

    return true;
}
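
For orientation, a minimal caller-side sketch of how this path is reached through the public API (assuming the llama_apply_adapter_cvec() declaration in llama.h; ctx is an existing llama_context, and the sizes and layer range are illustrative):

    const int32_t n_embd  = 4096;                   // model embedding size (illustrative)
    const int32_t n_layer = 32;                     // model layer count (illustrative)
    std::vector<float> cvec(n_embd * (n_layer - 1), 0.0f);   // one direction per layer 1..n_layer-1
    // ... fill cvec with the control-vector directions ...
    llama_apply_adapter_cvec(ctx, cvec.data(), cvec.size(), n_embd, /*il_start=*/10, /*il_end=*/20);
    // passing nullptr disables the vector again without freeing the buffers:
    llama_apply_adapter_cvec(ctx, nullptr, 0, 0, 0, 0);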

// lora

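// Looks up the LoRA A/B pair that targets the given base-model tensor (matched by name),
// or nullptr if this adapter carries no weights for it.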
llama_adapter_lora_weight * llama_adapter_lora::get_weight(ggml_tensor * w) {
    const std::string name(w->name);

    const auto pos = ab_map.find(name);
    if (pos != ab_map.end()) {
        return &pos->second;
    }

    return nullptr;
}

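// Loads a LoRA adapter from a GGUF file: validates the metadata, pairs up *.lora_a/*.lora_b
// tensors, allocates them on the same backends as the base-model tensors they target, and
// copies the weights in. Throws std::runtime_error on any validation or allocation failure.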
static void llama_adapter_lora_init_impl(const char * path_lora, llama_adapter_lora & adapter) {
    LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);

    llama_model & model = adapter.model;

    ggml_context * ctx_init;
    gguf_init_params meta_gguf_params = {
        /* .no_alloc = */ true,
        /* .ctx      = */ &ctx_init,
    };

    gguf_context_ptr ctx_gguf { gguf_init_from_file(path_lora, meta_gguf_params) };
    if (!ctx_gguf) {
        throw std::runtime_error("failed to load lora adapter file from " + std::string(path_lora));
    }

    ggml_context_ptr ctx { ctx_init };

    // check metadata
    {
        const gguf_context * gguf_ctx = ctx_gguf.get();

        LLAMA_LOG_INFO("%s: Dumping metadata keys/values.\n", __func__);

        // get metadata as string
        for (int i = 0; i < gguf_get_n_kv(gguf_ctx); i++) {
            gguf_type type = gguf_get_kv_type(gguf_ctx, i);
            const std::string type_name =
                type == GGUF_TYPE_ARRAY
                ? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(gguf_ctx, i)), gguf_get_arr_n(gguf_ctx, i))
                : gguf_type_name(type);
            const char * name = gguf_get_key(gguf_ctx, i);
            const std::string value = gguf_kv_to_str(gguf_ctx, i);

            if (type != GGUF_TYPE_ARRAY) {
                adapter.gguf_kv.emplace(name, value);
            }

            const size_t MAX_VALUE_LEN = 40;
            std::string print_value = value.size() > MAX_VALUE_LEN ? format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str()) : value;
            replace_all(print_value, "\n", "\\n");

            LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), print_value.c_str());
        }

        auto get_kv_str = [&](const std::string & key) -> std::string {
            int id = gguf_find_key(gguf_ctx, key.c_str());
            return id < 0 ? "" : std::string(gguf_get_val_str(gguf_ctx, id));
        };
        auto get_kv_f32 = [&](const std::string & key) -> float {
            int id = gguf_find_key(gguf_ctx, key.c_str());
            return id < 0 ? 0.0f : gguf_get_val_f32(gguf_ctx, id);
        };
        LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);

        auto general_type = get_kv_str(llm_kv(LLM_KV_GENERAL_TYPE));
        if (general_type != "adapter") {
            throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type);
        }

        auto general_arch_str = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE));
        auto general_arch = llm_arch_from_string(general_arch_str);
        if (general_arch != model.arch) {
            throw std::runtime_error("model arch and LoRA arch mismatch");
        }

        auto adapter_type = get_kv_str(llm_kv(LLM_KV_ADAPTER_TYPE));
        if (adapter_type != "lora") {
            throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type);
        }

        adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA));

        // parse alora invocation sequence vector
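        // (aLoRA = activated LoRA; the invocation token sequence is only stored here and
        //  exposed via llama_adapter_get_alora_invocation_tokens() for callers to act on)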
        const auto & key = llm_kv(LLM_KV_ADAPTER_ALORA_INVOCATION_TOKENS);
        const int kid = gguf_find_key(ctx_gguf.get(), key.c_str());
        if (kid >= 0) {
            if (gguf_get_kv_type(ctx_gguf.get(), kid) != GGUF_TYPE_ARRAY) {
                throw std::runtime_error("invalid gguf type for " + key);
            }
            const auto arr_type = gguf_get_arr_type(ctx_gguf.get(), kid);
            if (arr_type != GGUF_TYPE_UINT32) {
                throw std::runtime_error("invalid gguf element type for " + key);
            }
            const size_t seq_len = gguf_get_arr_n(ctx_gguf.get(), kid);
            const void * data = gguf_get_arr_data(ctx_gguf.get(), kid);
            adapter.alora_invocation_tokens.resize(seq_len);
            std::copy(
                (const llama_token *)data,
                (const llama_token *)data + seq_len,
                adapter.alora_invocation_tokens.begin());
        }
    }

    int n_tensors = gguf_get_n_tensors(ctx_gguf.get());

    // contexts for each buffer type
    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
        auto it = ctx_map.find(buft);
        if (it == ctx_map.end()) {
            // add a new context
            ggml_init_params params = {
                /*.mem_size   =*/ n_tensors*ggml_tensor_overhead(),
                /*.mem_buffer =*/ NULL,
                /*.no_alloc   =*/ true,
            };
            ggml_context * buft_ctx = ggml_init(params);
            if (!buft_ctx) {
                return nullptr;
            }
            ctx_map[buft] = buft_ctx;
            adapter.ctxs.emplace_back(buft_ctx);
            return buft_ctx;
        };
        return it->second;
    };

    // bundle lora_a and lora_b into pairs
    std::map<std::string, llama_adapter_lora_weight> ab_map;
    auto str_endswith = [](const std::string & str, const std::string & suffix) {
        return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
    };

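    // adapter tensors are named after the base-model tensor they modify, plus a .lora_a / .lora_b
    // suffix (e.g. "blk.0.attn_q.weight.lora_a"); the suffix is stripped so each pair can be keyed
    // by the base tensor name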
    for (ggml_tensor * cur = ggml_get_first_tensor(ctx.get()); cur; cur = ggml_get_next_tensor(ctx.get(), cur)) {
        std::string name(cur->name);
        if (str_endswith(name, ".lora_a")) {
            replace_all(name, ".lora_a", "");
            if (ab_map.find(name) == ab_map.end()) {
                ab_map[name] = llama_adapter_lora_weight(cur, nullptr);
            } else {
                ab_map[name].a = cur;
            }
        } else if (str_endswith(name, ".lora_b")) {
            replace_all(name, ".lora_b", "");
            if (ab_map.find(name) == ab_map.end()) {
                ab_map[name] = llama_adapter_lora_weight(nullptr, cur);
            } else {
                ab_map[name].b = cur;
            }
        } else if (str_endswith(name, "_norm.weight")) {
            // TODO: add support for norm vector
            // for now, we don't really care because most adapters still work fine without it
            continue;
        } else {
            throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
        }
    }

    // get extra buffer types of the CPU
    // TODO: a more general solution for non-CPU extra buft should be implemented in the future
    //       ref: https://github.com/ggml-org/llama.cpp/pull/12593#pullrequestreview-2718659948
    std::vector<ggml_backend_buffer_type_t> buft_extra;
    {
        auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
        if (!cpu_dev) {
            throw std::runtime_error(format("%s: no CPU backend found", __func__));
        }
        auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);

        auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
            ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");

        if (ggml_backend_dev_get_extra_bufts_fn) {
            ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
            while (extra_bufts && *extra_bufts) {
                buft_extra.emplace_back(*extra_bufts);
                ++extra_bufts;
            }
        }
    }

    // add tensors
    for (auto & it : ab_map) {
        const std::string & name = it.first;
        llama_adapter_lora_weight & w = it.second;
        bool is_token_embd = str_endswith(name, "token_embd.weight");

        if (!w.a || !w.b) {
            throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
        }

        // device buft and device ctx
        const auto * model_tensor = model.get_tensor(name.c_str());
        if (!model_tensor) {
            throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
        }

        auto * buft = ggml_backend_buffer_get_type(model_tensor->buffer);

        // do not load loras to extra buffer types (i.e. bufts for repacking) -> use the CPU in that case
        for (auto & ex : buft_extra) {
            if (ex == buft) {
                LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));

                auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
                if (!cpu_dev) {
                    throw std::runtime_error(format("%s: no CPU backend found", __func__));
                }
                buft = ggml_backend_dev_buffer_type(cpu_dev);

                break;
            }
        }

        LLAMA_LOG_DEBUG("%s: lora for '%s' -> '%s'\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));

        ggml_context * dev_ctx = ctx_for_buft(buft);
        // validate tensor shape
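        // (in the regular case, lora_a shares ne[0] with the base tensor and lora_b shares ne[1];
        //  their common inner dimension a->ne[1] == b->ne[0] is the LoRA rank)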
        if (is_token_embd) {
            // expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()
            if (model_tensor->ne[0] != w.b->ne[1] || model_tensor->ne[1] != w.a->ne[1]) {
                throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
            }
        } else {
            if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
                throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
            }
            if (w.a->ne[1] != w.b->ne[0]) {
                throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
            }
        }

        // save tensor to adapter
        ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a);
        ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
        ggml_set_name(tensor_a, w.a->name);
        ggml_set_name(tensor_b, w.b->name);
        adapter.ab_map[name] = llama_adapter_lora_weight(tensor_a, tensor_b);
    }

    // allocate tensors / buffers and zero
    {
        adapter.ctxs.reserve(ctx_map.size());
        adapter.bufs.reserve(ctx_map.size());
        for (auto & it : ctx_map) {
            ggml_backend_buffer_type_t buft = it.first;
            ggml_context * ctx_dev = it.second;
            ggml_backend_buffer_ptr buf { ggml_backend_alloc_ctx_tensors_from_buft(ctx_dev, buft) };
            if (!buf) {
                throw std::runtime_error("failed to allocate buffer for lora adapter\n");
            }
            LLAMA_LOG_INFO("%s: %10s LoRA buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get())/1024.0/1024.0);
            adapter.bufs.emplace_back(std::move(buf));
        }
    }

    // set tensor data
    {
        llama_file gguf_file(path_lora, "rb");
        std::vector<uint8_t> read_buf;
        auto set_tensor = [&](ggml_tensor * orig, ggml_tensor * dev) {
            size_t offs = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), gguf_find_tensor(ctx_gguf.get(), orig->name));
            size_t size = ggml_nbytes(orig);
            read_buf.resize(size);
            gguf_file.seek(offs, SEEK_SET);
            gguf_file.read_raw(read_buf.data(), size);
            ggml_backend_tensor_set(dev, read_buf.data(), 0, size);
        };
        for (auto & it : adapter.ab_map) {
            auto orig = ab_map[it.first];
            auto dev  = it.second;
            set_tensor(orig.a, dev.a);
            set_tensor(orig.b, dev.b);
        }
    }

    // update number of nodes used
    model.n_lora_nodes += adapter.get_n_nodes();

    LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
}

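// Public entry point: wraps llama_adapter_lora_init_impl and converts any exception into a
// logged error plus a nullptr return.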
llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) {
    llama_adapter_lora * adapter = new llama_adapter_lora(*model);

    try {
        llama_adapter_lora_init_impl(path_lora, *adapter);
        return adapter;
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());

        delete adapter;
    }

    return nullptr;
}
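
Typical caller-side usage, as a minimal sketch (assuming the llama_set_adapter_lora() and llama_rm_adapter_lora() declarations in llama.h; model, ctx and the adapter path are illustrative, and error handling is reduced to a null check):

    llama_adapter_lora * adapter = llama_adapter_lora_init(model, "adapter.gguf");
    if (adapter) {
        llama_set_adapter_lora(ctx, adapter, /*scale=*/1.0f);   // attach to the context
        // ... run inference with the adapter active ...
        llama_rm_adapter_lora(ctx, adapter);                    // detach again
        llama_adapter_lora_free(adapter);
    }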
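// GGUF metadata accessors for a loaded adapter: look up a value by key or enumerate
// keys/values by index; on a miss they write an empty string (if there is room) and return -1.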
int32_t llama_adapter_meta_val_str(const llama_adapter_lora * adapter, const char * key, char * buf, size_t buf_size) {
    const auto & it = adapter->gguf_kv.find(key);
    if (it == adapter->gguf_kv.end()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }
    return snprintf(buf, buf_size, "%s", it->second.c_str());
}

int32_t llama_adapter_meta_count(const llama_adapter_lora * adapter) {
    return (int)adapter->gguf_kv.size();
}

int32_t llama_adapter_meta_key_by_index(const llama_adapter_lora * adapter, int i, char * buf, size_t buf_size) {
    if (i < 0 || i >= (int)adapter->gguf_kv.size()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }
    auto it = adapter->gguf_kv.begin();
    std::advance(it, i);
    return snprintf(buf, buf_size, "%s", it->first.c_str());
}

int32_t llama_adapter_meta_val_str_by_index(const llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size) {
    if (i < 0 || i >= (int)adapter->gguf_kv.size()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }
    auto it = adapter->gguf_kv.begin();
    std::advance(it, i);
    return snprintf(buf, buf_size, "%s", it->second.c_str());
}

void llama_adapter_lora_free(llama_adapter_lora * adapter) {
    // update number of nodes used
    GGML_ASSERT(adapter->model.n_lora_nodes >= adapter->get_n_nodes());
    adapter->model.n_lora_nodes -= adapter->get_n_nodes();

    delete adapter;
}

uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter) {
    if (!adapter) {
        return 0;
    }
    return adapter->alora_invocation_tokens.size();
}

const llama_token * llama_adapter_get_alora_invocation_tokens(const llama_adapter_lora * adapter) {
    GGML_ASSERT(adapter);
    return adapter->alora_invocation_tokens.data();
}