/src/llama.cpp/src/llama-adapter.h
#pragma once

#include "llama.h"

#include "ggml-cpp.h"

#include <string>
#include <unordered_map>
#include <vector>

// TODO: pimpl

//
// llama_adapter_cvec
//

struct llama_adapter_cvec {
    // return the control vector tensor for layer il, or nullptr if that layer has none
    ggml_tensor * tensor_for(int il) const;

    // add the layer's control vector to cur (returns cur unchanged when the layer has no vector)
    ggml_tensor * apply_to(ggml_context * ctx, ggml_tensor * cur, int il) const;

    // load the control-vector data and limit its application to layers [il_start, il_end]
    bool apply(
            const llama_model & model,
            const float * data,
            size_t len,
            int32_t n_embd,
            int32_t il_start,
            int32_t il_end);

private:
    bool init(const llama_model & model);

    int32_t layer_start = -1;
    int32_t layer_end   = -1;

    std::vector<ggml_context_ptr>        ctxs;
    std::vector<ggml_backend_buffer_ptr> bufs;

    std::vector<ggml_tensor *> tensors; // per layer
};
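
// A minimal sketch (not the actual call site; build_layer, ctx0, cur, il and
// n_layer are illustrative placeholders) of how the control vector is meant
// to be used while constructing the compute graph: after each layer produces
// its output, that layer's vector, if any, is added to the hidden state.
#if 0
for (int il = 0; il < n_layer; ++il) {
    cur = build_layer(ctx0, cur, il);   // hypothetical per-layer graph builder
    cur = cvec.apply_to(ctx0, cur, il); // no-op for layers without a vector
}
#endif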

//
// llama_adapter_lora
//

struct llama_adapter_lora_weight {
    ggml_tensor * a = nullptr;
    ggml_tensor * b = nullptr;

    // effective scale: adapter_scale * alpha / rank when alpha is set, plain adapter_scale otherwise;
    // the rank is the inner dimension shared by a and b, read here from b->ne[0]
    float get_scale(float alpha, float adapter_scale) const {
        const float rank  = (float) b->ne[0];
        const float scale = alpha ? adapter_scale * alpha / rank : adapter_scale;
        return scale;
    }

    llama_adapter_lora_weight() = default;
    llama_adapter_lora_weight(ggml_tensor * a, ggml_tensor * b) : a(a), b(b) {}
};
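
// Worked example with illustrative numbers: for alpha = 16, rank = b->ne[0] = 8
// and a user-requested adapter_scale of 1.0f, get_scale returns
// 1.0f * 16 / 8 = 2.0f; with alpha == 0 the rank-based scaling is skipped and
// get_scale returns adapter_scale unchanged.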

struct llama_adapter_lora {
    // map a base-model tensor name to its LoRA A/B weight pair
    std::unordered_map<std::string, llama_adapter_lora_weight> ab_map;

    std::vector<ggml_context_ptr>        ctxs;
    std::vector<ggml_backend_buffer_ptr> bufs;

    // lora alpha, read from the adapter's gguf metadata
    float alpha;

    // gguf metadata
    std::unordered_map<std::string, std::string> gguf_kv;

    // activated lora (aLoRA): token sequence that triggers this adapter
    std::vector<llama_token> alora_invocation_tokens;

    llama_adapter_lora() = default;
    ~llama_adapter_lora() = default;

    // look up the A/B pair for a base-model weight tensor; nullptr if the tensor is not adapted
    llama_adapter_lora_weight * get_weight(ggml_tensor * w);
};
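
// A sketch of how the low-rank update W*x + scale * B*(A*x) can be expressed
// with ggml. Assumptions: ctx0 is the graph context, cur holds the input
// activations, w is the base weight tensor, lora is an active adapter and
// adapter_scale its user-requested scale; the real graph code lives in the
// model builder, not in this header.
#if 0
llama_adapter_lora_weight * lw = lora.get_weight(w);
ggml_tensor * wx = ggml_mul_mat(ctx0, w, cur); // base projection W*x
if (lw != nullptr) {
    ggml_tensor * ax  = ggml_mul_mat(ctx0, lw->a, cur); // A*x (down to rank r)
    ggml_tensor * bax = ggml_mul_mat(ctx0, lw->b, ax);  // B*(A*x) (back up)
    wx = ggml_add(ctx0, wx, ggml_scale(ctx0, bax, lw->get_scale(lora.alpha, adapter_scale)));
}
#endif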

// set of active adapters, each mapped to its user-requested scale
using llama_adapter_loras = std::unordered_map<llama_adapter_lora *, float>;
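
// Illustrative use of the map (adapter_a and adapter_b are hypothetical
// pointers to loaded adapters): several adapters can be active at once,
// each with its own strength.
#if 0
llama_adapter_loras loras;
loras[adapter_a] = 1.0f; // full strength
loras[adapter_b] = 0.5f; // half strength
#endif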