/src/llama.cpp/src/llama-model-loader.h
Line | Count | Source |
1 | | #pragma once |
2 | | |
3 | | #include "llama.h" |
4 | | |
5 | | #include "llama-impl.h" |
6 | | #include "llama-arch.h" |
7 | | #include "llama-hparams.h" |
8 | | #include "llama-mmap.h" |
9 | | |
10 | | #include "ggml-cpp.h" |
11 | | |
#include <climits>
#include <cstddef>
#include <cstdlib>
#include <cstring>
#include <map>
#include <stdexcept>
#include <unordered_map>
17 | | |
// unordered map from a uint32_t key to a backend buffer handle
// NOTE(review): the key is presumably the index of the model file the buffer belongs to - confirm against the users of load_all_data()
using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;

// lists of buffer types used for each layer
// each entry pairs a backend device with a buffer type
using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;
22 | | |
// GGUF file versions; the numeric value of each enumerator equals the version number in its name
enum llama_fver {
    GGUF_FILE_VERSION_V1 = 1, // version 1
    GGUF_FILE_VERSION_V2 = 2, // version 2
    GGUF_FILE_VERSION_V3 = 3, // version 3
};
28 | | |
29 | | const char * llama_file_version_name(llama_fver version); |
30 | | |
31 | | struct llama_model_loader { |
32 | | // Holds information on a model weight |
33 | | struct llama_tensor_weight { |
34 | | uint16_t idx; // source file index |
35 | | size_t offs; // tensor data offset in the original file |
36 | | |
37 | | ggml_tensor * tensor; |
38 | | |
39 | 0 | llama_tensor_weight(const llama_file * file, uint16_t idx, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) { |
40 | 0 | const int tensor_idx = gguf_find_tensor(gguf_ctx, ggml_get_name(tensor)); |
41 | 0 | if (tensor_idx < 0) { |
42 | 0 | throw std::runtime_error(format("tensor '%s' not found in the model", ggml_get_name(tensor))); |
43 | 0 | } |
44 | | |
45 | 0 | offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx); |
46 | 0 | if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size()) { |
47 | 0 | throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", ggml_get_name(tensor))); |
48 | 0 | } |
49 | 0 | } |
50 | | }; |
51 | | |
// custom comparator to sort weights more nicely by layer
//
// names are ordered by the integer that follows a leading "blk." and, for equal layers, by plain
// string comparison; names without such an integer are treated as layer -1
struct weight_name_comparer {
    // Returns the integer that follows a leading "blk." in `name`, or -1 if there is none.
    // The number is parsed with strtol (base 10), which accepts the same input as the "%d"
    // conversion of sscanf but reports out-of-range values instead of invoking undefined behavior;
    // such values are saturated to the limits of int.
    static int layer_index(const std::string & name) {
        static const char prefix[]   = "blk.";
        const size_t      prefix_len = sizeof(prefix) - 1;

        if (name.compare(0, prefix_len, prefix) != 0) {
            return -1;
        }

        const char * digits = name.c_str() + prefix_len;
        char *       end    = nullptr;
        const long   value  = std::strtol(digits, &end, 10);
        if (end == digits) {
            // nothing that looks like a number after the prefix
            return -1;
        }

        if (value > INT_MAX) {
            return INT_MAX;
        }
        if (value < INT_MIN) {
            return INT_MIN;
        }
        return static_cast<int>(value);
    }

    bool operator()(const std::string & a, const std::string & b) const {
        const int a_layer = layer_index(a);
        const int b_layer = layer_index(b);
        if (a_layer != b_layer) {
            return a_layer < b_layer;
        }
        return a < b;
    }
};
65 | | |
// single-bit flags with distinct bit positions, so they can be combined with bitwise OR
// NOTE(review): presumably these are the values accepted by the `flags` argument of create_tensor() - confirm
//
// declared constexpr rather than plain const: in C++17 a constexpr static data member is implicitly
// inline, so odr-using a flag (e.g. binding it to a const reference) does not need a separate
// out-of-class definition
static constexpr int TENSOR_NOT_REQUIRED    = 1 << 0;
static constexpr int TENSOR_DUPLICATED      = 1 << 1;
static constexpr int TENSOR_SKIP            = 1 << 2;
static constexpr int TENSOR_SKIP_IF_VIRTUAL = 1 << 3;
70 | | |
71 | | int n_kv = 0; |
72 | | int n_tensors = 0; |
73 | | int n_created = 0; |
74 | | |
75 | | uint64_t n_elements = 0; |
76 | | size_t n_bytes = 0; |
77 | | |
78 | | bool use_mmap = false; |
79 | | bool use_direct_io = false; |
80 | | bool check_tensors; |
81 | | bool no_alloc; |
82 | | |
83 | | llama_files files; |
84 | | llama_ftype ftype; |
85 | | llama_fver fver; |
86 | | |
87 | | llama_mmaps mappings; |
88 | | |
89 | | std::map<std::string, llama_tensor_weight, weight_name_comparer> weights_map; |
90 | | std::unordered_map<std::string, llama_model_kv_override> kv_overrides; |
91 | | const llama_model_tensor_buft_override * tensor_buft_overrides; |
92 | | |
93 | | gguf_context_ptr metadata_ptr; |
94 | | struct gguf_context * metadata; // either metadata_ptr.get() or externally set |
95 | | llama_model_set_tensor_data_t set_tensor_data; |
96 | | void * set_tensor_data_ud; |
97 | | std::vector<ggml_context_ptr> contexts; |
98 | | |
99 | | std::string arch_name; |
100 | | LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN); |
101 | | |
102 | | size_t size_done = 0; |
103 | | size_t size_data = 0; |
104 | | std::vector<std::pair<size_t, size_t>> mmaps_used; |
105 | | |
106 | | // define a comparator for the buft -> ctx map to ensure that the order is well-defined: |
107 | | struct ggml_backend_buft_comparator { |
108 | 0 | bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const { |
109 | 0 | return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0; |
110 | 0 | } |
111 | | }; |
112 | | |
113 | | std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map; |
114 | | |
115 | | // track tensors that had to be moved for debugging: |
116 | | size_t n_tensors_moved = 0; |
117 | | std::string first_tensor_moved_name; |
118 | | std::string first_tensor_moved_type_name; |
119 | | ggml_backend_buffer_type_t first_moved_from_buft = nullptr; |
120 | | ggml_backend_buffer_type_t first_moved_to_buft = nullptr; |
121 | | |
122 | | llama_model_loader( |
123 | | struct gguf_context * metadata, |
124 | | llama_model_set_tensor_data_t set_tensor_data, |
125 | | void * set_tensor_data_ud, |
126 | | const std::string & fname, |
127 | | std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme |
128 | | FILE * file, |
129 | | bool use_mmap, |
130 | | bool use_direct_io, |
131 | | bool check_tensors, |
132 | | bool no_alloc, |
133 | | const llama_model_kv_override * param_overrides_p, |
134 | | const llama_model_tensor_buft_override * param_tensor_buft_overrides_p); |
135 | | |
136 | | template<typename T> |
137 | | typename std::enable_if<std::is_integral<T>::value, bool>::type |
138 | | get_arr_n(const std::string & key, T & result, bool required = true); |
139 | | |
140 | | template<typename T> |
141 | | typename std::enable_if<std::is_integral<T>::value, bool>::type |
142 | | get_arr_n(enum llm_kv kid, T & result, bool required = true); |
143 | | |
144 | | template<typename T> |
145 | | bool get_arr(const std::string & key, std::vector<T> & result, bool required = true); |
146 | | |
147 | | template<typename T, size_t N_MAX> |
148 | | bool get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required = true); |
149 | | |
150 | | template<typename T> |
151 | | bool get_arr(enum llm_kv kid, T & result, bool required = true); |
152 | | |
153 | | template<typename T> |
154 | | bool get_key(const std::string & key, T & result, bool required = true); |
155 | | |
156 | | template<typename T> |
157 | | bool get_key(enum llm_kv kid, T & result, bool required = true); |
158 | | |
159 | | template<typename T, size_t N_MAX> |
160 | | bool get_key_or_arr(const std::string & key, std::array<T, N_MAX> & result, uint32_t n, bool required = true); |
161 | | |
162 | | template<typename T> |
163 | | bool get_key_or_arr(enum llm_kv kid, T & result, uint32_t n, bool required = true); |
164 | | |
165 | | bool get_key_or_arr(enum llm_kv kid, uint32_t & result, bool required = true); |
166 | | |
167 | | std::string get_arch_name() const; |
168 | | |
169 | | enum llm_arch get_arch() const; |
170 | | |
171 | | const llama_tensor_weight * get_weight(const char * name) const; |
172 | | |
173 | | const llama_tensor_weight & require_weight(const char * name) const; |
174 | | |
175 | | struct ggml_tensor * get_tensor_meta(const char * name) const; |
176 | | |
177 | | struct ggml_tensor * require_tensor_meta(const std::string & name) const; |
178 | | |
179 | | const struct ggml_tensor * check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const; |
180 | | |
181 | | struct ggml_tensor * create_tensor( |
182 | | const llama_hparams & hparams, const buft_list_t * buft_list_cpu, const buft_list_t * buft_list_input, const buft_list_t * buft_list_output, |
183 | | const buft_list_t * buft_list_layer, const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags); |
184 | | |
185 | | struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list<int64_t> & ne, size_t offset, bool required = true); |
186 | | |
187 | | void done_getting_tensors(bool partial = false) const; |
188 | | |
189 | | void init_mappings(bool prefetch = true, llama_mlocks * mlock_mmaps = nullptr); |
190 | | |
191 | | void get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const; |
192 | | |
193 | | // for backwards compatibility, does not support ggml-backend |
194 | | void load_data_for(struct ggml_tensor * cur) const; |
195 | | |
196 | | // Returns false if cancelled by progress_callback |
197 | | bool load_all_data( |
198 | | struct ggml_context * ctx, |
199 | | llama_buf_map & bufs, |
200 | | llama_mlocks * lmlocks, |
201 | | llama_progress_callback progress_callback, |
202 | | void * progress_callback_user_data); |
203 | | |
204 | | std::string ftype_name() const; |
205 | | |
206 | | void print_info() const; |
207 | | }; |