Coverage Report

Created: 2026-01-11 07:13

/src/llama.cpp/src/llama-adapter.cpp

All execution counts in this report are 0 (no line of llama-adapter.cpp was executed), so the per-line Line/Count columns are collapsed below and only the Source column is shown.

#include "llama-adapter.h"

#include "llama-impl.h"
#include "llama-mmap.h"
#include "llama-model.h"

#include <map>
#include <cassert>
#include <sstream>
#include <stdexcept>

// vec

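// Returns the control-vector direction tensor for layer il, or nullptr when il is
// out of bounds or outside the active [layer_start, layer_end] window.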
ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
    if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
        return nullptr;
    }

    return tensors[il];
}

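// Adds the layer's control-vector direction to the current hidden state when a vector
// is active for layer il; otherwise returns cur unchanged.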
ggml_tensor * llama_adapter_cvec::apply_to(ggml_context * ctx, ggml_tensor * cur, int il) const {
    ggml_tensor * layer_dir = tensor_for(il);
    if (layer_dir != nullptr) {
        cur = ggml_add(ctx, cur, layer_dir);
    }

    return cur;
}

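// Allocates one F32 tensor of n_embd elements per layer (layer 0 never has one), using
// one no-alloc ggml context and one zero-initialized backend buffer per buffer type.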
bool llama_adapter_cvec::init(const llama_model & model) {
    const auto & hparams = model.hparams;

    GGML_ASSERT(tensors.empty());
    GGML_ASSERT(ctxs.empty());
    GGML_ASSERT(bufs.empty());

    // create a context for each buffer type
    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
        auto it = ctx_map.find(buft);
        if (it == ctx_map.end()) {
            ggml_init_params params = {
                /*.mem_size   =*/ hparams.n_layer*ggml_tensor_overhead(),
                /*.mem_buffer =*/ NULL,
                /*.no_alloc   =*/ true,
            };

            ggml_context * ctx = ggml_init(params);
            if (!ctx) {
                return nullptr;
            }

            ctx_map[buft] = ctx;
            ctxs.emplace_back(ctx);

            return ctx;
        }

        return it->second;
    };

    // make tensors
    tensors.reserve(hparams.n_layer);
    tensors.push_back(nullptr); // there's never a tensor for layer 0
    for (size_t il = 1; il < hparams.n_layer; il++) {
        ggml_backend_buffer_type_t buft = model.select_buft(il);
        ggml_context * ctx = ctx_for_buft(buft);
        if (!ctx) {
            LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
            return false;
        }
        ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
        tensors.push_back(tensor);
    }

    // allocate tensors / buffers and zero
    bufs.reserve(ctx_map.size());
    for (auto it : ctx_map) {
        ggml_backend_buffer_type_t buft = it.first;
        ggml_context * ctx = it.second;
        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
        if (!buf) {
            LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__);
            return false;
        }
        ggml_backend_buffer_clear(buf, 0);
        bufs.emplace_back(buf);
    }

    return true;
}

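// Loads (or, when data == nullptr, disables) a control vector: data holds n_embd floats
// per layer starting at layer 1, and il_start/il_end select the layers it is applied to.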
bool llama_adapter_cvec::apply(
        const llama_model & model,
        const float * data,
        size_t len,
        int32_t n_embd,
        int32_t il_start,
        int32_t il_end) {
    const auto & hparams = model.hparams;

    if (data == nullptr) {
        // disable the current control vector (but leave allocated for later)
        layer_start = -1;
        layer_end   = -1;
        return true;
    }

    if (n_embd != (int) hparams.n_embd) {
        LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
        return false;
    }

    if (tensors.empty()) {
        if (!init(model)) {
            return false;
        }
    }

    layer_start = il_start;
    layer_end   = il_end;

    for (size_t il = 1; il < hparams.n_layer; il++) {
        assert(tensors[il] != nullptr);

        const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
        if (off + n_embd <= len) {
            ggml_backend_tensor_set(tensors[il], data + off, 0, n_embd * ggml_element_size(tensors[il]));
        }
    }

    return true;
}
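
For orientation, a minimal caller-side sketch of how this path is reached through the public API (assuming the llama_apply_adapter_cvec() declaration in llama.h; ctx is an existing llama_context, and the sizes and layer range are illustrative):

    const int32_t n_embd  = 4096;                   // model embedding size (illustrative)
    const int32_t n_layer = 32;                     // model layer count (illustrative)
    std::vector<float> cvec(n_embd * (n_layer - 1), 0.0f);   // one direction per layer 1..n_layer-1
    // ... fill cvec with the control-vector directions ...
    llama_apply_adapter_cvec(ctx, cvec.data(), cvec.size(), n_embd, /*il_start=*/10, /*il_end=*/20);
    // passing nullptr disables the vector again without freeing the buffers:
    llama_apply_adapter_cvec(ctx, nullptr, 0, 0, 0, 0);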

// lora

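// Looks up the LoRA A/B pair that targets the given base-model tensor (matched by name),
// or nullptr if this adapter carries no weights for it.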
llama_adapter_lora_weight * llama_adapter_lora::get_weight(ggml_tensor * w) {
    const std::string name(w->name);

    const auto pos = ab_map.find(name);
    if (pos != ab_map.end()) {
        return &pos->second;
    }

    return nullptr;
}

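// Loads a LoRA adapter from a GGUF file: validates the metadata, pairs up *.lora_a/*.lora_b
// tensors, allocates them on the same backends as the base-model tensors they target, and
// copies the weights in. Throws std::runtime_error on any validation or allocation failure.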
static void llama_adapter_lora_init_impl(const char * path_lora, llama_adapter_lora & adapter) {
    LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);

    llama_model & model = adapter.model;

    ggml_context * ctx_init;
    gguf_init_params meta_gguf_params = {
        /* .no_alloc = */ true,
        /* .ctx      = */ &ctx_init,
    };

    gguf_context_ptr ctx_gguf { gguf_init_from_file(path_lora, meta_gguf_params) };
    if (!ctx_gguf) {
        throw std::runtime_error("failed to load lora adapter file from " + std::string(path_lora));
    }

    ggml_context_ptr ctx { ctx_init };

    // check metadata
    {
        const gguf_context * gguf_ctx = ctx_gguf.get();

        LLAMA_LOG_INFO("%s: Dumping metadata keys/values.\n", __func__);

        // get metadata as string
        for (int i = 0; i < gguf_get_n_kv(gguf_ctx); i++) {
            gguf_type type = gguf_get_kv_type(gguf_ctx, i);
            const std::string type_name =
                type == GGUF_TYPE_ARRAY
                ? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(gguf_ctx, i)), gguf_get_arr_n(gguf_ctx, i))
                : gguf_type_name(type);
            const char * name = gguf_get_key(gguf_ctx, i);
            const std::string value = gguf_kv_to_str(gguf_ctx, i);

            if (type != GGUF_TYPE_ARRAY) {
                adapter.gguf_kv.emplace(name, value);
            }

            const size_t MAX_VALUE_LEN = 40;
            std::string print_value = value.size() > MAX_VALUE_LEN ? format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str()) : value;
            replace_all(print_value, "\n", "\\n");

            LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), print_value.c_str());
        }

        auto get_kv_str = [&](const std::string & key) -> std::string {
            int id = gguf_find_key(gguf_ctx, key.c_str());
            return id < 0 ? "" : std::string(gguf_get_val_str(gguf_ctx, id));
        };
        auto get_kv_f32 = [&](const std::string & key) -> float {
            int id = gguf_find_key(gguf_ctx, key.c_str());
            return id < 0 ? 0.0f : gguf_get_val_f32(gguf_ctx, id);
        };
        LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);

        auto general_type = get_kv_str(llm_kv(LLM_KV_GENERAL_TYPE));
        if (general_type != "adapter") {
            throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type);
        }

        auto general_arch_str = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE));
        auto general_arch = llm_arch_from_string(general_arch_str);
        if (general_arch != model.arch) {
            throw std::runtime_error("model arch and LoRA arch mismatch");
        }

        auto adapter_type = get_kv_str(llm_kv(LLM_KV_ADAPTER_TYPE));
        if (adapter_type != "lora") {
            throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type);
        }

        adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA));

        // parse alora invocation sequence vector
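        // (aLoRA = activated LoRA; the invocation token sequence is only stored here and
        //  exposed via llama_adapter_get_alora_invocation_tokens() for callers to act on)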
        const auto & key = llm_kv(LLM_KV_ADAPTER_ALORA_INVOCATION_TOKENS);
        const int kid = gguf_find_key(ctx_gguf.get(), key.c_str());
        if (kid >= 0) {
            if (gguf_get_kv_type(ctx_gguf.get(), kid) != GGUF_TYPE_ARRAY) {
                throw std::runtime_error("invalid gguf type for " + key);
            }
            const auto arr_type = gguf_get_arr_type(ctx_gguf.get(), kid);
            if (arr_type != GGUF_TYPE_UINT32) {
                throw std::runtime_error("invalid gguf element type for " + key);
            }
            const size_t seq_len = gguf_get_arr_n(ctx_gguf.get(), kid);
            const void * data = gguf_get_arr_data(ctx_gguf.get(), kid);
            adapter.alora_invocation_tokens.resize(seq_len);
            std::copy(
                (const llama_token *)data,
                (const llama_token *)data + seq_len,
                adapter.alora_invocation_tokens.begin());
        }
    }

    int n_tensors = gguf_get_n_tensors(ctx_gguf.get());

    // contexts for each buffer type
    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
        auto it = ctx_map.find(buft);
        if (it == ctx_map.end()) {
            // add a new context
            ggml_init_params params = {
                /*.mem_size   =*/ n_tensors*ggml_tensor_overhead(),
                /*.mem_buffer =*/ NULL,
                /*.no_alloc   =*/ true,
            };
            ggml_context * buft_ctx = ggml_init(params);
            if (!buft_ctx) {
                return nullptr;
            }
            ctx_map[buft] = buft_ctx;
            adapter.ctxs.emplace_back(buft_ctx);
            return buft_ctx;
        };
        return it->second;
    };

    // bundle lora_a and lora_b into pairs
    std::map<std::string, llama_adapter_lora_weight> ab_map;
    auto str_endswith = [](const std::string & str, const std::string & suffix) {
        return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
    };

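    // adapter tensors are named after the base-model tensor they modify, plus a .lora_a / .lora_b
    // suffix (e.g. "blk.0.attn_q.weight.lora_a"); the suffix is stripped so each pair can be keyed
    // by the base tensor name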
    for (ggml_tensor * cur = ggml_get_first_tensor(ctx.get()); cur; cur = ggml_get_next_tensor(ctx.get(), cur)) {
        std::string name(cur->name);
        if (str_endswith(name, ".lora_a")) {
            replace_all(name, ".lora_a", "");
            if (ab_map.find(name) == ab_map.end()) {
                ab_map[name] = llama_adapter_lora_weight(cur, nullptr);
            } else {
                ab_map[name].a = cur;
            }
        } else if (str_endswith(name, ".lora_b")) {
            replace_all(name, ".lora_b", "");
            if (ab_map.find(name) == ab_map.end()) {
                ab_map[name] = llama_adapter_lora_weight(nullptr, cur);
            } else {
                ab_map[name].b = cur;
            }
        } else if (str_endswith(name, "_norm.weight")) {
            // TODO: add support for norm vector
            // for now, we don't really care because most adapters still work fine without it
            continue;
        } else {
            throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
        }
    }

    // get extra buffer types of the CPU
    // TODO: a more general solution for non-CPU extra buft should be implemented in the future
    //       ref: https://github.com/ggml-org/llama.cpp/pull/12593#pullrequestreview-2718659948
    std::vector<ggml_backend_buffer_type_t> buft_extra;
    {
        auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
        if (!cpu_dev) {
            throw std::runtime_error(format("%s: no CPU backend found", __func__));
        }
        auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);

        auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
            ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");

        if (ggml_backend_dev_get_extra_bufts_fn) {
            ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
            while (extra_bufts && *extra_bufts) {
                buft_extra.emplace_back(*extra_bufts);
                ++extra_bufts;
            }
        }
    }

    // add tensors
    for (auto & it : ab_map) {
        const std::string & name = it.first;
        llama_adapter_lora_weight & w = it.second;
        bool is_token_embd = str_endswith(name, "token_embd.weight");

        if (!w.a || !w.b) {
            throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
        }

        // device buft and device ctx
        const auto * model_tensor = model.get_tensor(name.c_str());
        if (!model_tensor) {
            throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
        }

        auto * buft = ggml_backend_buffer_get_type(model_tensor->buffer);

        // do not load loras to extra buffer types (i.e. bufts for repacking) -> use the CPU in that case
        for (auto & ex : buft_extra) {
            if (ex == buft) {
                LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));

                auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
                if (!cpu_dev) {
                    throw std::runtime_error(format("%s: no CPU backend found", __func__));
                }
                buft = ggml_backend_dev_buffer_type(cpu_dev);

                break;
            }
        }

        LLAMA_LOG_DEBUG("%s: lora for '%s' -> '%s'\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));

        ggml_context * dev_ctx = ctx_for_buft(buft);
        // validate tensor shape
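        // (in the regular case, lora_a shares ne[0] with the base tensor and lora_b shares ne[1];
        //  their common inner dimension a->ne[1] == b->ne[0] is the LoRA rank)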
        if (is_token_embd) {
            // expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()
            if (model_tensor->ne[0] != w.b->ne[1] || model_tensor->ne[1] != w.a->ne[1]) {
                throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
            }
        } else {
            if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
                throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
            }
            if (w.a->ne[1] != w.b->ne[0]) {
                throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
            }
        }

        // save tensor to adapter
        ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a);
        ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
        ggml_set_name(tensor_a, w.a->name);
        ggml_set_name(tensor_b, w.b->name);
        adapter.ab_map[name] = llama_adapter_lora_weight(tensor_a, tensor_b);
    }

    // allocate tensors / buffers and zero
    {
        adapter.ctxs.reserve(ctx_map.size());
        adapter.bufs.reserve(ctx_map.size());
        for (auto & it : ctx_map) {
            ggml_backend_buffer_type_t buft = it.first;
            ggml_context * ctx_dev = it.second;
            ggml_backend_buffer_ptr buf { ggml_backend_alloc_ctx_tensors_from_buft(ctx_dev, buft) };
            if (!buf) {
                throw std::runtime_error("failed to allocate buffer for lora adapter\n");
            }
            LLAMA_LOG_INFO("%s: %10s LoRA buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get())/1024.0/1024.0);
            adapter.bufs.emplace_back(std::move(buf));
        }
    }

    // set tensor data
    {
        llama_file gguf_file(path_lora, "rb");
        std::vector<uint8_t> read_buf;
        auto set_tensor = [&](ggml_tensor * orig, ggml_tensor * dev) {
            size_t offs = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), gguf_find_tensor(ctx_gguf.get(), orig->name));
            size_t size = ggml_nbytes(orig);
            read_buf.resize(size);
            gguf_file.seek(offs, SEEK_SET);
            gguf_file.read_raw(read_buf.data(), size);
            ggml_backend_tensor_set(dev, read_buf.data(), 0, size);
        };
        for (auto & it : adapter.ab_map) {
            auto orig = ab_map[it.first];
            auto dev  = it.second;
            set_tensor(orig.a, dev.a);
            set_tensor(orig.b, dev.b);
        }
    }

    // update number of nodes used
    model.n_lora_nodes += adapter.get_n_nodes();

    LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
}

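// Public entry point: wraps llama_adapter_lora_init_impl and converts any exception into a
// logged error plus a nullptr return.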
llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) {
    llama_adapter_lora * adapter = new llama_adapter_lora(*model);

    try {
        llama_adapter_lora_init_impl(path_lora, *adapter);
        return adapter;
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());

        delete adapter;
    }

    return nullptr;
}
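
Typical caller-side usage, as a minimal sketch (assuming the llama_set_adapter_lora() and llama_rm_adapter_lora() declarations in llama.h; model, ctx and the adapter path are illustrative, and error handling is reduced to a null check):

    llama_adapter_lora * adapter = llama_adapter_lora_init(model, "adapter.gguf");
    if (adapter) {
        llama_set_adapter_lora(ctx, adapter, /*scale=*/1.0f);   // attach to the context
        // ... run inference with the adapter active ...
        llama_rm_adapter_lora(ctx, adapter);                    // detach again
        llama_adapter_lora_free(adapter);
    }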
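// GGUF metadata accessors for a loaded adapter: look up a value by key or enumerate
// keys/values by index; on a miss they write an empty string (if there is room) and return -1.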
int32_t llama_adapter_meta_val_str(const llama_adapter_lora * adapter, const char * key, char * buf, size_t buf_size) {
    const auto & it = adapter->gguf_kv.find(key);
    if (it == adapter->gguf_kv.end()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }
    return snprintf(buf, buf_size, "%s", it->second.c_str());
}

int32_t llama_adapter_meta_count(const llama_adapter_lora * adapter) {
    return (int)adapter->gguf_kv.size();
}

int32_t llama_adapter_meta_key_by_index(const llama_adapter_lora * adapter, int i, char * buf, size_t buf_size) {
    if (i < 0 || i >= (int)adapter->gguf_kv.size()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }
    auto it = adapter->gguf_kv.begin();
    std::advance(it, i);
    return snprintf(buf, buf_size, "%s", it->first.c_str());
}

int32_t llama_adapter_meta_val_str_by_index(const llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size) {
    if (i < 0 || i >= (int)adapter->gguf_kv.size()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }
    auto it = adapter->gguf_kv.begin();
    std::advance(it, i);
    return snprintf(buf, buf_size, "%s", it->second.c_str());
}

void llama_adapter_lora_free(llama_adapter_lora * adapter) {
    // update number of nodes used
    GGML_ASSERT(adapter->model.n_lora_nodes >= adapter->get_n_nodes());
    adapter->model.n_lora_nodes -= adapter->get_n_nodes();

    delete adapter;
}

uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter) {
    if (!adapter) {
        return 0;
    }
    return adapter->alora_invocation_tokens.size();
}

const llama_token * llama_adapter_get_alora_invocation_tokens(const llama_adapter_lora * adapter) {
    GGML_ASSERT(adapter);
    return adapter->alora_invocation_tokens.data();
}