/src/llama.cpp/src/llama-adapter.cpp
#include "llama-adapter.h"

#include "llama-impl.h"
#include "llama-mmap.h"
#include "llama-model.h"

#include <map>
#include <cassert>
#include <sstream>
#include <stdexcept>

// vec

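// returns the control-vector direction tensor for layer il, or nullptr when the layer
// is outside the active [layer_start, layer_end] range (layer 0 never has a tensor)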
ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
    if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
        return nullptr;
    }

    return tensors[il];
}

ggml_tensor * llama_adapter_cvec::apply_to(ggml_context * ctx, ggml_tensor * cur, int il) const {
    ggml_tensor * layer_dir = tensor_for(il);
    if (layer_dir != nullptr) {
        cur = ggml_add(ctx, cur, layer_dir);
    }

    return cur;
}

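// lazily allocate one F32 tensor of n_embd elements per layer (layer 0 never gets one),
// grouping the tensors into one ggml context and backend buffer per buffer type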
bool llama_adapter_cvec::init(const llama_model & model) {
    const auto & hparams = model.hparams;

    GGML_ASSERT(tensors.empty());
    GGML_ASSERT(ctxs.empty());
    GGML_ASSERT(bufs.empty());

    // create a context for each buffer type
    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
        auto it = ctx_map.find(buft);
        if (it == ctx_map.end()) {
            ggml_init_params params = {
                /*.mem_size   =*/ hparams.n_layer*ggml_tensor_overhead(),
                /*.mem_buffer =*/ NULL,
                /*.no_alloc   =*/ true,
            };

            ggml_context * ctx = ggml_init(params);
            if (!ctx) {
                return nullptr;
            }

            ctx_map[buft] = ctx;
            ctxs.emplace_back(ctx);

            return ctx;
        }

        return it->second;
    };

    // make tensors
    tensors.reserve(hparams.n_layer);
    tensors.push_back(nullptr); // there's never a tensor for layer 0
    for (size_t il = 1; il < hparams.n_layer; il++) {
        ggml_backend_buffer_type_t buft = model.select_buft(il);
        ggml_context * ctx = ctx_for_buft(buft);
        if (!ctx) {
            LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
            return false;
        }
        ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
        tensors.push_back(tensor);
    }

    // allocate tensors / buffers and zero
    bufs.reserve(ctx_map.size());
    for (auto it : ctx_map) {
        ggml_backend_buffer_type_t buft = it.first;
        ggml_context * ctx = it.second;
        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
        if (!buf) {
            LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__);
            return false;
        }
        ggml_backend_buffer_clear(buf, 0);
        bufs.emplace_back(buf);
    }

    return true;
}

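// copy a host control vector into the per-layer tensors; `data` is expected to hold
// n_embd floats per layer, starting at layer 1 (there is no data for layer 0); passing
// data == nullptr disables the control vector but keeps the tensors allocated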
bool llama_adapter_cvec::apply(
        const llama_model & model,
        const float * data,
        size_t len,
        int32_t n_embd,
        int32_t il_start,
        int32_t il_end) {
    const auto & hparams = model.hparams;

    if (data == nullptr) {
        // disable the current control vector (but leave allocated for later)
        layer_start = -1;
        layer_end   = -1;
        return true;
    }

    if (n_embd != (int) hparams.n_embd) {
        LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
        return false;
    }

    if (tensors.empty()) {
        if (!init(model)) {
            return false;
        }
    }

    layer_start = il_start;
    layer_end   = il_end;

    for (size_t il = 1; il < hparams.n_layer; il++) {
        assert(tensors[il] != nullptr);

        const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
        if (off + n_embd <= len) {
            ggml_backend_tensor_set(tensors[il], data + off, 0, n_embd * ggml_element_size(tensors[il]));
        }
    }

    return true;
}

// lora

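// look up the LoRA A/B pair that patches the given base-model tensor, keyed by the
// tensor's name; returns nullptr if this adapter does not modify that tensor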
llama_adapter_lora_weight * llama_adapter_lora::get_weight(ggml_tensor * w) {
    const std::string name(w->name);

    const auto pos = ab_map.find(name);
    if (pos != ab_map.end()) {
        return &pos->second;
    }

    return nullptr;
}

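// load a LoRA adapter from a GGUF file: validate the metadata, pair up the *.lora_a /
// *.lora_b tensors, create device copies next to the corresponding base-model tensors,
// and finally stream the tensor data in from the file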
static void llama_adapter_lora_init_impl(llama_model & model, const char * path_lora, llama_adapter_lora & adapter) {
    LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);

    ggml_context * ctx_init;
    gguf_init_params meta_gguf_params = {
        /* .no_alloc = */ true,
        /* .ctx      = */ &ctx_init,
    };

    gguf_context_ptr ctx_gguf { gguf_init_from_file(path_lora, meta_gguf_params) };
    if (!ctx_gguf) {
        throw std::runtime_error("failed to load lora adapter file from " + std::string(path_lora));
    }

    ggml_context_ptr ctx { ctx_init };

    // check metadata
    {
        const gguf_context * gguf_ctx = ctx_gguf.get();

        LLAMA_LOG_INFO("%s: Dumping metadata keys/values.\n", __func__);

        // get metadata as string
        for (int i = 0; i < gguf_get_n_kv(gguf_ctx); i++) {
            gguf_type type = gguf_get_kv_type(gguf_ctx, i);
            const std::string type_name =
                type == GGUF_TYPE_ARRAY
                ? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(gguf_ctx, i)), gguf_get_arr_n(gguf_ctx, i))
                : gguf_type_name(type);
            const char * name = gguf_get_key(gguf_ctx, i);
            const std::string value = gguf_kv_to_str(gguf_ctx, i);

            if (type != GGUF_TYPE_ARRAY) {
                adapter.gguf_kv.emplace(name, value);
            }

            const size_t MAX_VALUE_LEN = 40;
            std::string print_value = value.size() > MAX_VALUE_LEN ? format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str()) : value;
            replace_all(print_value, "\n", "\\n");

            LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), print_value.c_str());
        }

        auto get_kv_str = [&](const std::string & key) -> std::string {
            int id = gguf_find_key(gguf_ctx, key.c_str());
            return id < 0 ? "" : std::string(gguf_get_val_str(gguf_ctx, id));
        };
        auto get_kv_f32 = [&](const std::string & key) -> float {
            int id = gguf_find_key(gguf_ctx, key.c_str());
            return id < 0 ? 0.0f : gguf_get_val_f32(gguf_ctx, id);
        };
        LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);

        auto general_type = get_kv_str(llm_kv(LLM_KV_GENERAL_TYPE));
        if (general_type != "adapter") {
            throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type);
        }

        auto general_arch_str = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE));
        auto general_arch = llm_arch_from_string(general_arch_str);
        if (general_arch != model.arch) {
            throw std::runtime_error("model arch and LoRA arch mismatch");
        }

        auto adapter_type = get_kv_str(llm_kv(LLM_KV_ADAPTER_TYPE));
        if (adapter_type != "lora") {
            throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type);
        }

        adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA));

        // parse alora invocation sequence vector
        const auto & key = llm_kv(LLM_KV_ADAPTER_ALORA_INVOCATION_TOKENS);
        const int kid = gguf_find_key(ctx_gguf.get(), key.c_str());
        if (kid >= 0) {
            if (gguf_get_kv_type(ctx_gguf.get(), kid) != GGUF_TYPE_ARRAY) {
                throw std::runtime_error("invalid gguf type for " + key);
            }
            const auto arr_type = gguf_get_arr_type(ctx_gguf.get(), kid);
            if (arr_type != GGUF_TYPE_UINT32) {
                throw std::runtime_error("invalid gguf element type for " + key);
            }
            const size_t seq_len = gguf_get_arr_n(ctx_gguf.get(), kid);
            const void * data = gguf_get_arr_data(ctx_gguf.get(), kid);
            adapter.alora_invocation_tokens.resize(seq_len);
            std::copy(
                (const llama_token *) data,
                (const llama_token *) data + seq_len,
                adapter.alora_invocation_tokens.begin());
        }
    }

    int n_tensors = gguf_get_n_tensors(ctx_gguf.get());

    // contexts for each buffer type
    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
        auto it = ctx_map.find(buft);
        if (it == ctx_map.end()) {
            // add a new context
            ggml_init_params params = {
                /*.mem_size   =*/ n_tensors*ggml_tensor_overhead(),
                /*.mem_buffer =*/ NULL,
                /*.no_alloc   =*/ true,
            };
            ggml_context * buft_ctx = ggml_init(params);
            if (!buft_ctx) {
                return nullptr;
            }
            ctx_map[buft] = buft_ctx;
            adapter.ctxs.emplace_back(buft_ctx);
            return buft_ctx;
        }
        return it->second;
    };

    // bundle lora_a and lora_b into pairs
    std::map<std::string, llama_adapter_lora_weight> ab_map;
    auto str_endswith = [](const std::string & str, const std::string & suffix) {
        return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
    };

    for (ggml_tensor * cur = ggml_get_first_tensor(ctx.get()); cur; cur = ggml_get_next_tensor(ctx.get(), cur)) {
        std::string name(cur->name);
        if (str_endswith(name, ".lora_a")) {
            replace_all(name, ".lora_a", "");
            if (ab_map.find(name) == ab_map.end()) {
                ab_map[name] = llama_adapter_lora_weight(cur, nullptr);
            } else {
                ab_map[name].a = cur;
            }
        } else if (str_endswith(name, ".lora_b")) {
            replace_all(name, ".lora_b", "");
            if (ab_map.find(name) == ab_map.end()) {
                ab_map[name] = llama_adapter_lora_weight(nullptr, cur);
            } else {
                ab_map[name].b = cur;
            }
        } else if (str_endswith(name, "_norm.weight")) {
            // TODO: add support for norm vector
            // for now, we don't really care because most adapters still work fine without it
            continue;
        } else {
            throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
        }
    }

    // get extra buffer types of the CPU
    // TODO: a more general solution for non-CPU extra buft should be implemented in the future
    // ref: https://github.com/ggml-org/llama.cpp/pull/12593#pullrequestreview-2718659948
    std::vector<ggml_backend_buffer_type_t> buft_extra;
    {
        auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
        if (!cpu_dev) {
            throw std::runtime_error(format("%s: no CPU backend found", __func__));
        }
        auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);

        auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
            ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");

        if (ggml_backend_dev_get_extra_bufts_fn) {
            ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
            while (extra_bufts && *extra_bufts) {
                buft_extra.emplace_back(*extra_bufts);
                ++extra_bufts;
            }
        }
    }

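    // expected shapes: for a base weight W with ne = [n_in, n_out], lora_a is [n_in, r]
    // and lora_b is [r, n_out]; token_embd is the exception, where A and B are flipped
    // (see llm_build_inp_embd())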
    // add tensors
    for (auto & it : ab_map) {
        const std::string & name = it.first;
        llama_adapter_lora_weight & w = it.second;
        bool is_token_embd = str_endswith(name, "token_embd.weight");

        if (!w.a || !w.b) {
            throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
        }

        // device buft and device ctx
        const auto * model_tensor = model.get_tensor(name.c_str());
        if (!model_tensor) {
            throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
        }

        auto * buft = ggml_backend_buffer_get_type(model_tensor->buffer);

        // do not load loras to extra buffer types (i.e. bufts for repacking) -> use the CPU in that case
        for (auto & ex : buft_extra) {
            if (ex == buft) {
                LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));

                auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
                if (!cpu_dev) {
                    throw std::runtime_error(format("%s: no CPU backend found", __func__));
                }
                buft = ggml_backend_dev_buffer_type(cpu_dev);

                break;
            }
        }

        LLAMA_LOG_DEBUG("%s: lora for '%s' -> '%s'\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));

        ggml_context * dev_ctx = ctx_for_buft(buft);
        // validate tensor shape
        if (is_token_embd) {
            // expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()
            if (model_tensor->ne[0] != w.b->ne[1] || model_tensor->ne[1] != w.a->ne[1]) {
                throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
            }
        } else {
            if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
                throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
            }
            if (w.a->ne[1] != w.b->ne[0]) {
                throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
            }
        }

        // save tensor to adapter
        ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a);
        ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
        ggml_set_name(tensor_a, w.a->name);
        ggml_set_name(tensor_b, w.b->name);
        adapter.ab_map[name] = llama_adapter_lora_weight(tensor_a, tensor_b);
    }

    // allocate tensors / buffers and zero
    {
        adapter.ctxs.reserve(ctx_map.size());
        adapter.bufs.reserve(ctx_map.size());
        for (auto & it : ctx_map) {
            ggml_backend_buffer_type_t buft = it.first;
            ggml_context * ctx_dev = it.second;
            ggml_backend_buffer_ptr buf { ggml_backend_alloc_ctx_tensors_from_buft(ctx_dev, buft) };
            if (!buf) {
                throw std::runtime_error("failed to allocate buffer for lora adapter\n");
            }
            LLAMA_LOG_INFO("%s: %10s LoRA buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get())/1024.0/1024.0);
            adapter.bufs.emplace_back(std::move(buf));
        }
    }

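    // the GGUF context above only holds metadata (no_alloc was set), so the tensor
    // payloads are read from the adapter file at each tensor's recorded offset and
    // copied into the freshly allocated device tensors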
    // set tensor data
    {
        llama_file gguf_file(path_lora, "rb");
        std::vector<uint8_t> read_buf;
        auto set_tensor = [&](ggml_tensor * orig, ggml_tensor * dev) {
            size_t offs = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), gguf_find_tensor(ctx_gguf.get(), orig->name));
            size_t size = ggml_nbytes(orig);
            read_buf.resize(size);
            gguf_file.seek(offs, SEEK_SET);
            gguf_file.read_raw(read_buf.data(), size);
            ggml_backend_tensor_set(dev, read_buf.data(), 0, size);
        };
        for (auto & it : adapter.ab_map) {
            auto orig = ab_map[it.first];
            auto dev  = it.second;
            set_tensor(orig.a, dev.a);
            set_tensor(orig.b, dev.b);
        }
    }

    LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
}

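// public entry point: wraps the loader above and converts exceptions into a nullptr
// return. Rough usage sketch (attaching the adapter to a context is done through the
// context-side API in llama.h, e.g. llama_set_adapter_lora()):
//
//   llama_adapter_lora * adapter = llama_adapter_lora_init(model, "adapter.gguf");
//   if (adapter) {
//       // ... use the adapter with one or more contexts ...
//       llama_adapter_lora_free(adapter);
//   }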
llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) {
    llama_adapter_lora * adapter = new llama_adapter_lora();

    try {
        llama_adapter_lora_init_impl(*model, path_lora, *adapter);
        return adapter;
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());

        delete adapter;
    }

    return nullptr;
}

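// metadata accessors: write the key/value into `buf` and return the snprintf result
// (the full untruncated length, so callers can detect truncation), or -1 if the key
// or index is not present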
int32_t llama_adapter_meta_val_str(const llama_adapter_lora * adapter, const char * key, char * buf, size_t buf_size) {
    const auto & it = adapter->gguf_kv.find(key);
    if (it == adapter->gguf_kv.end()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }
    return snprintf(buf, buf_size, "%s", it->second.c_str());
}

int32_t llama_adapter_meta_count(const llama_adapter_lora * adapter) {
    return (int)adapter->gguf_kv.size();
}

int32_t llama_adapter_meta_key_by_index(const llama_adapter_lora * adapter, int i, char * buf, size_t buf_size) {
    if (i < 0 || i >= (int)adapter->gguf_kv.size()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }
    auto it = adapter->gguf_kv.begin();
    std::advance(it, i);
    return snprintf(buf, buf_size, "%s", it->first.c_str());
}

int32_t llama_adapter_meta_val_str_by_index(const llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size) {
    if (i < 0 || i >= (int)adapter->gguf_kv.size()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }
    auto it = adapter->gguf_kv.begin();
    std::advance(it, i);
    return snprintf(buf, buf_size, "%s", it->second.c_str());
}

void llama_adapter_lora_free(llama_adapter_lora * adapter) {
    delete adapter;
}

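// alora ("activated LoRA") helpers: expose the invocation token sequence parsed from
// the adapter metadata, which callers use to detect where the adapter should activate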
uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter) {
    if (!adapter) {
        return 0;
    }
    return adapter->alora_invocation_tokens.size();
}

const llama_token * llama_adapter_get_alora_invocation_tokens(const llama_adapter_lora * adapter) {
    GGML_ASSERT(adapter);
    return adapter->alora_invocation_tokens.data();
}