/src/llama.cpp/src/llama.cpp
Line | Count | Source |
1 | | #include "llama.h" |
2 | | |
3 | | #include "llama-impl.h" |
4 | | |
5 | | #include "llama-chat.h" |
6 | | #include "llama-context.h" |
7 | | #include "llama-mmap.h" |
8 | | #include "llama-vocab.h" |
9 | | #include "llama-model-loader.h" |
10 | | #include "llama-model-saver.h" |
11 | | #include "llama-model.h" |
12 | | |
13 | | #include "ggml.h" |
14 | | #include "ggml-backend.h" |
15 | | |
16 | | #include <algorithm> |
17 | | #include <cassert> |
18 | | #include <cinttypes> |
19 | | #include <cstddef> |
20 | | #include <cstdint> |
21 | | #include <cstdio> |
22 | | #include <cstring> |
23 | | #include <ctime> |
24 | | #include <stdexcept> |
25 | | |
26 | | #if defined(_MSC_VER) |
27 | | #pragma warning(disable: 4244 4267) // possible loss of data |
28 | | #endif |
29 | | |
30 | | // |
31 | | // interface implementation |
32 | | // |
33 | | |
34 | 0 | const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type) { |
35 | 0 | switch (flash_attn_type) { |
36 | 0 | case LLAMA_FLASH_ATTN_TYPE_AUTO: |
37 | 0 | return "auto"; |
38 | 0 | case LLAMA_FLASH_ATTN_TYPE_DISABLED: |
39 | 0 | return "disabled"; |
40 | 0 | case LLAMA_FLASH_ATTN_TYPE_ENABLED: |
41 | 0 | return "enabled"; |
42 | 0 | } |
43 | 0 | GGML_ABORT("fatal error"); |
44 | 0 | } |
45 | | |
46 | | struct llama_device_memory_data { |
47 | | int64_t total; |
48 | | int64_t free; |
49 | | llama_memory_breakdown_data mb; |
50 | | }; |
51 | | |
52 | | static std::vector<llama_device_memory_data> llama_get_device_memory_data( |
53 | | const char * path_model, const llama_model_params * mparams, const llama_context_params * cparams, |
54 | | std::vector<ggml_backend_dev_t> & devs, uint32_t & hp_ngl, uint32_t & hp_n_ctx_train, uint32_t & hp_n_expert, |
55 | 0 | const ggml_log_level log_level) { |
56 | 0 | struct user_data_t { |
57 | 0 | struct { |
58 | 0 | ggml_log_callback callback; |
59 | 0 | void * user_data; |
60 | 0 | } original_logger; |
61 | 0 | ggml_log_level min_level; // prints below this log level go to debug log |
62 | 0 | }; |
63 | 0 | user_data_t ud; |
64 | 0 | llama_log_get(&ud.original_logger.callback, &ud.original_logger.user_data); |
65 | 0 | ud.min_level = log_level; |
66 | |
67 | 0 | llama_log_set([](ggml_log_level level, const char * text, void * user_data) { |
68 | 0 | const user_data_t * ud = (const user_data_t *) user_data; |
69 | 0 | const ggml_log_level level_eff = level >= ud->min_level ? level : GGML_LOG_LEVEL_DEBUG; |
70 | 0 | ud->original_logger.callback(level_eff, text, ud->original_logger.user_data); |
71 | 0 | }, &ud); |
72 | |
73 | 0 | llama_model_params mparams_copy = *mparams; |
74 | 0 | mparams_copy.no_alloc = true; |
75 | 0 | mparams_copy.use_mmap = false; |
76 | 0 | mparams_copy.use_mlock = false; |
77 | |
78 | 0 | llama_model * model = llama_model_load_from_file(path_model, mparams_copy); |
79 | 0 | if (model == nullptr) { |
80 | 0 | llama_log_set(ud.original_logger.callback, ud.original_logger.user_data); |
81 | 0 | throw std::runtime_error("failed to load model"); |
82 | 0 | } |
83 | | |
84 | 0 | llama_context * ctx = llama_init_from_model(model, *cparams); |
85 | 0 | if (ctx == nullptr) { |
86 | 0 | llama_model_free(model); |
87 | 0 | llama_log_set(ud.original_logger.callback, ud.original_logger.user_data); |
88 | 0 | throw std::runtime_error("failed to create llama_context from model"); |
89 | 0 | } |
90 | | |
91 | 0 | std::vector<llama_device_memory_data> ret(model->devices.size()); |
92 | |
93 | 0 | std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown = ctx->memory_breakdown(); |
94 | |
95 | 0 | for (const auto & [buft, mb] : memory_breakdown) { |
96 | 0 | if (ggml_backend_buft_is_host(buft)) { |
97 | 0 | continue; |
98 | 0 | } |
99 | | |
100 | 0 | ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft); |
101 | 0 | if (!dev) { |
102 | 0 | continue; |
103 | 0 | } |
104 | 0 | for (size_t i = 0; i < ret.size(); i++) { |
105 | 0 | if (model->devices[i] == dev) { |
106 | 0 | ret[i].mb.model += mb.model; |
107 | 0 | ret[i].mb.context += mb.context; |
108 | 0 | ret[i].mb.compute += mb.compute; |
109 | 0 | break; |
110 | 0 | } |
111 | 0 | } |
112 | 0 | } |
113 | 0 | for (size_t i = 0; i < ret.size(); i++) { |
114 | 0 | size_t free; |
115 | 0 | size_t total; |
116 | 0 | ggml_backend_dev_memory(model->devices[i], &free, &total); |
117 | | |
118 | | // devices can return 0 bytes for free and total memory if they do not |
119 | | // have any to report. in this case, we will use the host memory as a fallback |
120 | | // fixes: https://github.com/ggml-org/llama.cpp/issues/18577 |
121 | 0 | if (free == 0 && total == 0) { |
122 | 0 | ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); |
123 | 0 | if (cpu_dev == nullptr) { |
124 | 0 | throw std::runtime_error(format("%s: no CPU backend found", __func__)); |
125 | 0 | } |
126 | 0 | ggml_backend_dev_memory(cpu_dev, &free, &total); |
127 | 0 | } |
128 | 0 | ret[i].free = free; |
129 | 0 | ret[i].total = total; |
130 | 0 | } |
131 | | |
132 | 0 | devs = model->devices; |
133 | 0 | hp_ngl = model->hparams.n_layer; |
134 | 0 | hp_n_ctx_train = model->hparams.n_ctx_train; |
135 | 0 | hp_n_expert = model->hparams.n_expert; |
136 | |
137 | 0 | llama_memory_breakdown_print(ctx); // goes to debug log |
138 | |
139 | 0 | llama_free(ctx); |
140 | 0 | llama_model_free(model); |
141 | 0 | llama_log_set(ud.original_logger.callback, ud.original_logger.user_data); |
142 | 0 | return ret; |
143 | 0 | } |
144 | | |
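
The per-device loop above falls back to the host (CPU) device whenever a backend reports 0 bytes for both free and total memory (see the linked issue 18577). A hedged standalone sketch of just that fallback; the helper name device_memory_with_fallback is illustrative and not part of llama.cpp:

#include "ggml-backend.h"
#include <stdexcept>

// sketch: query free/total device memory, falling back to the CPU device
// when the backend has nothing to report (returns 0/0)
static void device_memory_with_fallback(ggml_backend_dev_t dev, size_t * free_mem, size_t * total_mem) {
    ggml_backend_dev_memory(dev, free_mem, total_mem);
    if (*free_mem == 0 && *total_mem == 0) {
        ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
        if (cpu_dev == nullptr) {
            throw std::runtime_error("no CPU backend found");
        }
        ggml_backend_dev_memory(cpu_dev, free_mem, total_mem);
    }
}
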
145 | | // enum to identify part of a layer for distributing its tensors: |
146 | | enum layer_fraction_t { |
147 | | LAYER_FRACTION_NONE = 0, // nothing |
148 | | LAYER_FRACTION_ATTN = 1, // attention |
149 | | LAYER_FRACTION_UP = 2, // attention + up |
150 | | LAYER_FRACTION_GATE = 3, // attention + up + gate |
151 | | LAYER_FRACTION_MOE = 4, // everything but sparse MoE weights |
152 | | }; |
153 | | // this enum is only used in llama_params_fit_impl but needs to be defined outside of it to fix a Windows compilation issue |
154 | | |
155 | | class llama_params_fit_exception : public std::runtime_error { |
156 | | using std::runtime_error::runtime_error; |
157 | | }; |
158 | | |
159 | | static void llama_params_fit_impl( |
160 | | const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams, |
161 | | float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides, |
162 | 0 | size_t * margins_s, uint32_t n_ctx_min, enum ggml_log_level log_level) { |
163 | 0 | constexpr int64_t MiB = 1024*1024; |
164 | 0 | typedef std::vector<llama_device_memory_data> dmds_t; |
165 | 0 | const llama_model_params default_mparams = llama_model_default_params(); |
166 | |
167 | 0 | std::vector<ggml_backend_dev_t> devs; |
168 | 0 | uint32_t hp_ngl = 0; // hparams.n_gpu_layers |
169 | 0 | uint32_t hp_nct = 0; // hparams.n_ctx_train |
170 | 0 | uint32_t hp_nex = 0; // hparams.n_expert |
171 | | |
172 | | // step 1: get data for default parameters and check whether any changes are necessary in the first place |
173 | |
174 | 0 | LLAMA_LOG_DEBUG("%s: getting device memory data for initial parameters:\n", __func__); |
175 | 0 | const dmds_t dmds_full = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level); |
176 | 0 | const size_t nd = devs.size(); // number of devices |
177 | 0 | if (nd == 0) { |
178 | 0 | LLAMA_LOG_INFO("%s: no devices with dedicated memory found\n", __func__); |
179 | 0 | return; |
180 | 0 | } |
181 | | |
182 | 0 | std::vector<int64_t> margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits |
183 | 0 | margins.reserve(nd); |
184 | 0 | for (size_t id = 0; id < nd; id++) { |
185 | 0 | margins.push_back(margins_s[id]); |
186 | 0 | } |
187 | |
188 | 0 | std::vector<std::string> dev_names; |
189 | 0 | { |
190 | 0 | dev_names.reserve(nd); |
191 | 0 | size_t max_length = 0; |
192 | 0 | for (ggml_backend_dev_t dev : devs) { |
193 | 0 | std::string name = ggml_backend_dev_name(dev); |
194 | 0 | name += " ("; |
195 | 0 | name += ggml_backend_dev_description(dev); |
196 | 0 | name += ")"; |
197 | 0 | dev_names.push_back(name); |
198 | 0 | max_length = std::max(max_length, name.length()); |
199 | 0 | } |
200 | 0 | for (std::string & dn : dev_names) { |
201 | 0 | dn.insert(dn.end(), max_length - dn.length(), ' '); |
202 | 0 | } |
203 | 0 | } |
204 | |
205 | 0 | int64_t sum_free = 0; |
206 | 0 | int64_t sum_projected_free = 0; |
207 | 0 | int64_t sum_projected_used = 0; |
208 | 0 | int64_t sum_projected_model = 0; |
209 | 0 | std::vector<int64_t> projected_free_per_device; |
210 | 0 | projected_free_per_device.reserve(nd); |
211 | |
212 | 0 | if (nd > 1) { |
213 | 0 | LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__); |
214 | 0 | } |
215 | 0 | for (size_t id = 0; id < nd; id++) { |
216 | 0 | const llama_device_memory_data & dmd = dmds_full[id]; |
217 | |
218 | 0 | const int64_t projected_used = dmd.mb.total(); |
219 | 0 | const int64_t projected_free = dmd.free - projected_used; |
220 | 0 | projected_free_per_device.push_back(projected_free); |
221 | |
222 | 0 | sum_free += dmd.free; |
223 | 0 | sum_projected_used += projected_used; |
224 | 0 | sum_projected_free += projected_free; |
225 | 0 | sum_projected_model += dmd.mb.model; |
226 | |
227 | 0 | if (nd > 1) { |
228 | 0 | LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n", |
229 | 0 | __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB); |
230 | 0 | } |
231 | 0 | } |
232 | 0 | assert(sum_free >= 0 && sum_projected_used >= 0); |
233 | 0 | LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n", |
234 | 0 | __func__, sum_projected_used/MiB, sum_free/MiB); |
235 | 0 | if (nd == 1) { |
236 | 0 | if (projected_free_per_device[0] >= margins[0]) { |
237 | 0 | LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n", |
238 | 0 | __func__, projected_free_per_device[0]/MiB, margins[0]/MiB); |
239 | 0 | return; |
240 | 0 | } |
241 | 0 | } else { |
242 | 0 | bool changes_needed = false; |
243 | 0 | for (size_t id = 0; id < nd; id++) { |
244 | 0 | if (projected_free_per_device[id] < margins[id]) { |
245 | 0 | changes_needed = true; |
246 | 0 | break; |
247 | 0 | } |
248 | 0 | } |
249 | 0 | if (!changes_needed) { |
250 | 0 | LLAMA_LOG_INFO("%s: targets for free memory can be met on all devices, no changes needed\n", __func__); |
251 | 0 | return; |
252 | 0 | } |
253 | 0 | } |
254 | | |
255 | | // step 2: try reducing memory use by reducing the context size |
256 | | |
257 | 0 | { |
258 | 0 | int64_t global_surplus = sum_projected_free; |
259 | 0 | for (size_t id = 0; id < nd; id++) { |
260 | 0 | global_surplus -= margins[id]; |
261 | 0 | } |
262 | 0 | if (global_surplus < 0) { |
263 | 0 | if (nd == 1) { |
264 | 0 | LLAMA_LOG_INFO("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n", |
265 | 0 | __func__, margins[0]/MiB, -global_surplus/MiB); |
266 | 0 | } else { |
267 | 0 | LLAMA_LOG_INFO( |
268 | 0 | "%s: cannot meet free memory targets on all devices, need to use %" PRId64 " MiB less in total\n", |
269 | 0 | __func__, -global_surplus/MiB); |
270 | 0 | } |
271 | 0 | if (cparams->n_ctx == 0) { |
272 | 0 | if (hp_nct > n_ctx_min) { |
273 | 0 | int64_t sum_used_target = sum_free; |
274 | 0 | for (size_t id = 0; id < nd; id++) { |
275 | 0 | sum_used_target -= margins[id]; |
276 | 0 | } |
277 | 0 | if (nd > 1) { |
278 | | // for multiple devices we need to be more conservative in terms of how much context we think can fit: |
279 | | // - for dense models only whole layers can be assigned to devices |
280 | | // - for MoE models only whole tensors can be assigned to devices, which we estimate to be <= 1/3 of a layer |
281 | | // - on average we expect a waste of 0.5 layers/tensors per device |
282 | | // - use slightly more than the expected average for nd devices to be safe |
283 | 0 | const int64_t model_per_layer = sum_projected_model / std::min(uint32_t(mparams->n_gpu_layers), hp_ngl); |
284 | 0 | sum_used_target -= (nd + 1) * model_per_layer / (hp_nex == 0 ? 2 : 6); |
285 | 0 | } |
286 | |
287 | 0 | int64_t sum_projected_used_min_ctx = 0; |
288 | 0 | cparams->n_ctx = n_ctx_min; |
289 | 0 | const dmds_t dmds_min_ctx = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level); |
290 | 0 | for (const auto & dmd : dmds_min_ctx) { |
291 | 0 | sum_projected_used_min_ctx += dmd.mb.total(); |
292 | 0 | } |
293 | 0 | if (sum_used_target > sum_projected_used_min_ctx) { |
294 | | // linear interpolation between minimum and maximum context size: |
295 | 0 | cparams->n_ctx += (hp_nct - n_ctx_min) * (sum_used_target - sum_projected_used_min_ctx) |
296 | 0 | / (sum_projected_used - sum_projected_used_min_ctx); |
297 | 0 | cparams->n_ctx = std::max(cparams->n_ctx - cparams->n_ctx % 256, n_ctx_min); // round down context for CUDA backend |
298 | |
299 | 0 | const int64_t bytes_per_ctx = (sum_projected_used - sum_projected_used_min_ctx) / (hp_nct - n_ctx_min); |
300 | 0 | const int64_t memory_reduction = (hp_nct - cparams->n_ctx) * bytes_per_ctx; |
301 | 0 | LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n", |
302 | 0 | __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB); |
303 | 0 | if (nd == 1) { |
304 | 0 | LLAMA_LOG_INFO("%s: entire model can be fit by reducing context\n", __func__); |
305 | 0 | return; |
306 | 0 | } |
307 | 0 | LLAMA_LOG_INFO("%s: entire model should be fit across devices by reducing context\n", __func__); |
308 | 0 | } else { |
309 | 0 | const int64_t memory_reduction = sum_projected_used - sum_projected_used_min_ctx; |
310 | 0 | LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n", |
311 | 0 | __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB); |
312 | 0 | } |
313 | 0 | } else { |
314 | 0 | LLAMA_LOG_INFO("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n", |
315 | 0 | __func__, hp_nct, n_ctx_min); |
316 | 0 | } |
317 | 0 | } else { |
318 | 0 | LLAMA_LOG_INFO("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx); |
319 | 0 | } |
320 | 0 | } |
321 | 0 | } |
322 | | |
323 | 0 | if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) { |
324 | 0 | throw llama_params_fit_exception("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort"); |
325 | 0 | } |
326 | 0 | if (nd > 1) { |
327 | 0 | if (!tensor_split) { |
328 | 0 | throw llama_params_fit_exception("did not provide a buffer to write the tensor_split to, abort"); |
329 | 0 | } |
330 | 0 | if (mparams->tensor_split) { |
331 | 0 | for (size_t id = 0; id < nd; id++) { |
332 | 0 | if (mparams->tensor_split[id] != 0.0f) { |
333 | 0 | throw llama_params_fit_exception("model_params::tensor_split already set by user, abort"); |
334 | 0 | } |
335 | 0 | } |
336 | 0 | } |
337 | 0 | if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) { |
338 | 0 | throw llama_params_fit_exception("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort"); |
339 | 0 | } |
340 | 0 | } |
341 | 0 | if (!tensor_buft_overrides) { |
342 | 0 | throw llama_params_fit_exception("did not provide buffer to set tensor_buft_overrides, abort"); |
343 | 0 | } |
344 | 0 | if (mparams->tensor_buft_overrides && (mparams->tensor_buft_overrides->pattern || mparams->tensor_buft_overrides->buft)) { |
345 | 0 | throw llama_params_fit_exception("model_params::tensor_buft_overrides already set by user, abort"); |
346 | 0 | } |
347 | | |
348 | | // step 3: iteratively fill the devices back to front with "dense" layers |
349 | | // - for a dense model simply fill full layers, giving each device a contiguous slice of the model |
350 | | // - for a MoE model, same as dense model but with all MoE tensors in system memory |
351 | | |
352 | | // utility function that returns a static C string matching the tensors for a specific layer index and layer fraction: |
353 | 0 | auto get_overflow_pattern = [&](const size_t il, const layer_fraction_t lf) -> const char * { |
354 | 0 | constexpr size_t n_strings = 1000; |
355 | 0 | if (il >= n_strings) { |
356 | 0 | throw std::runtime_error("at most " + std::to_string(n_strings) + " model layers are supported"); |
357 | 0 | } |
358 | 0 | switch (lf) { |
359 | 0 | case LAYER_FRACTION_ATTN: { |
360 | 0 | static std::array<std::string, n_strings> patterns; |
361 | 0 | if (patterns[il].empty()) { |
362 | 0 | patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(up|gate|down).*"; |
363 | 0 | } |
364 | 0 | return patterns[il].c_str(); |
365 | 0 | } |
366 | 0 | case LAYER_FRACTION_UP: { |
367 | 0 | static std::array<std::string, n_strings> patterns; |
368 | 0 | if (patterns[il].empty()) { |
369 | 0 | patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(gate|down).*"; |
370 | 0 | } |
371 | 0 | return patterns[il].c_str(); |
372 | 0 | } |
373 | 0 | case LAYER_FRACTION_GATE: { |
374 | 0 | static std::array<std::string, n_strings> patterns; |
375 | 0 | if (patterns[il].empty()) { |
376 | 0 | patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_down.*"; |
377 | 0 | } |
378 | 0 | return patterns[il].c_str(); |
379 | 0 | } |
380 | 0 | case LAYER_FRACTION_MOE: { |
381 | 0 | static std::array<std::string, n_strings> patterns; |
382 | 0 | if (patterns[il].empty()) { |
383 | 0 | patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(up|down|gate)_(ch|)exps"; |
384 | 0 | } |
385 | 0 | return patterns[il].c_str(); |
386 | 0 | } |
387 | 0 | default: |
388 | 0 | GGML_ABORT("fatal error"); |
389 | 0 | } |
390 | 0 | }; |
391 | |
392 | 0 | struct ngl_t { |
393 | 0 | uint32_t n_layer = 0; // number of total layers |
394 | 0 | uint32_t n_part = 0; // number of partial layers, <= n_layer |
395 | | |
396 | | // for the first partial layer varying parts can overflow, all further layers use LAYER_FRACTION_MOE: |
397 | 0 | layer_fraction_t overflow_type = LAYER_FRACTION_MOE; |
398 | |
399 | 0 | uint32_t n_full() const { |
400 | 0 | assert(n_layer >= n_part); |
401 | 0 | return n_layer - n_part; |
402 | 0 | } |
403 | 0 | }; |
404 | |
405 | 0 | const size_t ntbo = llama_max_tensor_buft_overrides(); |
406 | | |
407 | | // utility function to set n_gpu_layers, tensor_split, and tensor_buft_overrides |
408 | 0 | auto set_ngl_tensor_split_tbo = [&]( |
409 | 0 | const std::vector<ngl_t> & ngl_per_device, |
410 | 0 | const std::vector<ggml_backend_buffer_type_t> & overflow_bufts, |
411 | 0 | llama_model_params & mparams) { |
412 | 0 | mparams.n_gpu_layers = 0; |
413 | 0 | for (size_t id = 0; id < nd; id++) { |
414 | 0 | mparams.n_gpu_layers += ngl_per_device[id].n_layer; |
415 | 0 | if (nd > 1) { |
416 | 0 | tensor_split[id] = ngl_per_device[id].n_layer; |
417 | 0 | } |
418 | 0 | } |
419 | 0 | assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl + 1); |
420 | 0 | uint32_t il0 = hp_ngl + 1 - mparams.n_gpu_layers; // start index for tensor buft overrides |
421 | |
422 | 0 | mparams.tensor_split = tensor_split; |
423 | |
424 | 0 | size_t itbo = 0; |
425 | 0 | for (size_t id = 0; id < nd; id++) { |
426 | 0 | il0 += ngl_per_device[id].n_full(); |
427 | 0 | for (uint32_t il = il0; il < il0 + ngl_per_device[id].n_part; il++) { |
428 | 0 | if (itbo + 1 >= ntbo) { |
429 | 0 | tensor_buft_overrides[itbo].pattern = nullptr; |
430 | 0 | tensor_buft_overrides[itbo].buft = nullptr; |
431 | 0 | itbo++; |
432 | 0 | mparams.tensor_buft_overrides = tensor_buft_overrides; |
433 | 0 | throw llama_params_fit_exception("llama_max_tensor_buft_overrides() == " |
434 | 0 | + std::to_string(ntbo) + " is insufficient for model"); |
435 | 0 | } |
436 | 0 | tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE); |
437 | 0 | tensor_buft_overrides[itbo].buft = il == il0 ? overflow_bufts[id] : ggml_backend_cpu_buffer_type(); |
438 | 0 | itbo++; |
439 | 0 | } |
440 | 0 | il0 += ngl_per_device[id].n_part; |
441 | 0 | } |
442 | 0 | tensor_buft_overrides[itbo].pattern = nullptr; |
443 | 0 | tensor_buft_overrides[itbo].buft = nullptr; |
444 | 0 | itbo++; |
445 | 0 | mparams.tensor_buft_overrides = tensor_buft_overrides; |
446 | 0 | }; |
447 | | |
448 | | // utility function that returns the memory use per device for given numbers of layers per device |
449 | 0 | auto get_memory_for_layers = [&]( |
450 | 0 | const char * func_name, |
451 | 0 | const std::vector<ngl_t> & ngl_per_device, |
452 | 0 | const std::vector<ggml_backend_buffer_type_t> & overflow_bufts) -> std::vector<int64_t> { |
453 | 0 | llama_model_params mparams_copy = *mparams; |
454 | 0 | set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy); |
455 | |
456 | 0 | const dmds_t dmd_nl = llama_get_device_memory_data( |
457 | 0 | path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level); |
458 | |
459 | 0 | LLAMA_LOG_DEBUG("%s: memory for test allocation by device:\n", func_name); |
460 | 0 | for (size_t id = 0; id < nd; id++) { |
461 | 0 | const ngl_t & n = ngl_per_device[id]; |
462 | 0 | LLAMA_LOG_DEBUG( |
463 | 0 | "%s: id=%zu, n_layer=%2" PRIu32 ", n_part=%2" PRIu32 ", overflow_type=%d, mem=%6" PRId64 " MiB\n", |
464 | 0 | func_name, id, n.n_layer, n.n_part, int(n.overflow_type), dmd_nl[id].mb.total()/MiB); |
465 | 0 | } |
466 | |
467 | 0 | std::vector<int64_t> ret; |
468 | 0 | ret.reserve(nd); |
469 | 0 | for (const llama_device_memory_data & dmd : dmd_nl) { |
470 | 0 | ret.push_back(dmd.mb.total()); |
471 | 0 | } |
472 | 0 | return ret; |
473 | 0 | }; |
474 | |
475 | 0 | int64_t global_surplus_cpu_moe = 0; |
476 | 0 | if (hp_nex > 0) { |
477 | 0 | const static std::string pattern_moe_all = "blk\\.\\d+\\.ffn_(up|down|gate)_(ch|)exps"; // matches all MoE tensors |
478 | 0 | ggml_backend_buffer_type_t cpu_buft = ggml_backend_cpu_buffer_type(); |
479 | 0 | tensor_buft_overrides[0] = {pattern_moe_all.c_str(), cpu_buft}; |
480 | 0 | tensor_buft_overrides[1] = {nullptr, nullptr}; |
481 | 0 | mparams->tensor_buft_overrides = tensor_buft_overrides; |
482 | |
483 | 0 | LLAMA_LOG_DEBUG("%s: getting device memory data with all MoE tensors moved to system memory:\n", __func__); |
484 | 0 | const dmds_t dmds_cpu_moe = llama_get_device_memory_data( |
485 | 0 | path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level); |
486 | |
487 | 0 | for (size_t id = 0; id < nd; id++) { |
488 | 0 | global_surplus_cpu_moe += dmds_cpu_moe[id].free; |
489 | 0 | global_surplus_cpu_moe -= int64_t(dmds_cpu_moe[id].mb.total()) + margins[id]; |
490 | 0 | } |
491 | |
492 | 0 | if (global_surplus_cpu_moe > 0) { |
493 | 0 | LLAMA_LOG_INFO("%s: with only dense weights in device memory there is a total surplus of %" PRId64 " MiB\n", |
494 | 0 | __func__, global_surplus_cpu_moe/MiB); |
495 | 0 | } else { |
496 | 0 | LLAMA_LOG_INFO("%s: with only dense weights in device memory there is still a total deficit of %" PRId64 " MiB\n", |
497 | 0 | __func__, -global_surplus_cpu_moe/MiB); |
498 | 0 | } |
499 | | |
500 | | // reset |
501 | 0 | tensor_buft_overrides[0] = {nullptr, nullptr}; |
502 | 0 | mparams->tensor_buft_overrides = tensor_buft_overrides; |
503 | 0 | } |
504 | |
505 | 0 | std::vector<int64_t> targets; // maximum acceptable memory use per device |
506 | 0 | targets.reserve(nd); |
507 | 0 | for (size_t id = 0; id < nd; id++) { |
508 | 0 | targets.push_back(dmds_full[id].free - margins[id]); |
509 | 0 | LLAMA_LOG_DEBUG("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB); |
510 | 0 | } |
511 | |
512 | 0 | std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the first partial layer of a device overflows to: |
513 | 0 | overflow_bufts.reserve(nd); |
514 | 0 | for (size_t id = 0; id < nd; id++) { |
515 | 0 | overflow_bufts.push_back(ggml_backend_cpu_buffer_type()); |
516 | 0 | } |
517 | |
518 | 0 | std::vector<ngl_t> ngl_per_device(nd); |
519 | 0 | std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts); |
520 | | |
521 | | // optimize the number of layers per device using the method of false position: |
522 | | // - ngl_per_device has 0 layers for each device, lower bound |
523 | | // - try a "high" configuration where a device is given all unassigned layers |
524 | | // - interpolate the memory use / layer between low and high linearly to get a guess where it meets our target |
525 | | // - check memory use of our guess, replace either the low or high bound |
526 | | // - once we only have a difference of a single layer, stop and return the lower bound that just barely still fits |
527 | | // - the last device has the output layer, which cannot be a partial layer |
528 | 0 | if (hp_nex == 0) { |
529 | 0 | LLAMA_LOG_INFO("%s: filling dense layers back-to-front:\n", __func__); |
530 | 0 | } else { |
531 | 0 | LLAMA_LOG_INFO("%s: filling dense-only layers back-to-front:\n", __func__); |
532 | 0 | } |
533 | 0 | for (int id = nd - 1; id >= 0; id--) { |
534 | 0 | uint32_t n_unassigned = hp_ngl + 1; |
535 | 0 | for (size_t jd = id + 1; jd < nd; ++jd) { |
536 | 0 | assert(n_unassigned >= ngl_per_device[jd].n_layer); |
537 | 0 | n_unassigned -= ngl_per_device[jd].n_layer; |
538 | 0 | } |
539 | |
540 | 0 | std::vector<ngl_t> ngl_per_device_high = ngl_per_device; |
541 | 0 | ngl_per_device_high[id].n_layer = n_unassigned; |
542 | 0 | if (hp_nex > 0) { |
543 | 0 | ngl_per_device_high[id].n_part = size_t(id) < nd - 1 ? ngl_per_device_high[id].n_layer : ngl_per_device_high[id].n_layer - 1; |
544 | 0 | } |
545 | 0 | if (ngl_per_device_high[id].n_layer > 0) { |
546 | 0 | std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts); |
547 | 0 | if (mem_high[id] > targets[id]) { |
548 | 0 | assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer); |
549 | 0 | uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer; |
550 | 0 | LLAMA_LOG_DEBUG("%s: start filling device %" PRIu32 ", delta=%" PRIu32 "\n", __func__, id, delta); |
551 | 0 | while (delta > 1) { |
552 | 0 | uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]); |
553 | 0 | step_size = std::max(step_size, uint32_t(1)); |
554 | 0 | step_size = std::min(step_size, delta - 1); |
555 | |
556 | 0 | std::vector<ngl_t> ngl_per_device_test = ngl_per_device; |
557 | 0 | ngl_per_device_test[id].n_layer += step_size; |
558 | 0 | if (hp_nex) { |
559 | 0 | ngl_per_device_test[id].n_part += size_t(id) == nd - 1 && ngl_per_device_test[id].n_part == 0 ? |
560 | 0 | step_size - 1 : step_size; // the first layer is the output layer which must always be full |
561 | 0 | } |
562 | 0 | const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts); |
563 | |
564 | 0 | if (mem_test[id] <= targets[id]) { |
565 | 0 | ngl_per_device = ngl_per_device_test; |
566 | 0 | mem = mem_test; |
567 | 0 | LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer); |
568 | 0 | } else { |
569 | 0 | ngl_per_device_high = ngl_per_device_test; |
570 | 0 | mem_high = mem_test; |
571 | 0 | LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device_high[id].n_layer); |
572 | 0 | } |
573 | 0 | delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer; |
574 | 0 | } |
575 | 0 | } else { |
576 | 0 | assert(ngl_per_device_high[id].n_layer == n_unassigned); |
577 | 0 | ngl_per_device = ngl_per_device_high; |
578 | 0 | mem = mem_high; |
579 | 0 | LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer); |
580 | 0 | } |
581 | 0 | } |
582 | |
583 | 0 | const int64_t projected_margin = dmds_full[id].free - mem[id]; |
584 | 0 | LLAMA_LOG_INFO( |
585 | 0 | "%s: - %s: %2" PRIu32 " layers, %6" PRId64 " MiB used, %6" PRId64 " MiB free\n", |
586 | 0 | __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, mem[id]/MiB, projected_margin/MiB); |
587 | 0 | } |
588 | 0 | if (hp_nex == 0 || global_surplus_cpu_moe <= 0) { |
589 | 0 | set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams); |
590 | 0 | return; |
591 | 0 | } |
592 | | |
593 | | // step 4: for a MoE model where all dense tensors fit, |
594 | | // convert the dense-only layers in the back to full layers in the front until all devices are full |
595 | | // essentially the same procedure as for the dense-only layers except front-to-back |
596 | | // also, try fitting at least part of one more layer to reduce waste for "small" GPUs with e.g. 24 GiB VRAM |
597 | | |
598 | 0 | size_t id_dense_start = nd; |
599 | 0 | for (int id = nd - 1; id >= 0; id--) { |
600 | 0 | if (ngl_per_device[id].n_layer > 0) { |
601 | 0 | id_dense_start = id; |
602 | 0 | continue; |
603 | 0 | } |
604 | 0 | break; |
605 | 0 | } |
606 | 0 | assert(id_dense_start < nd); |
607 | |
608 | 0 | LLAMA_LOG_INFO("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__); |
609 | 0 | for (size_t id = 0; id <= id_dense_start && id_dense_start < nd; id++) { |
610 | 0 | std::vector<ngl_t> ngl_per_device_high = ngl_per_device; |
611 | 0 | for (size_t jd = id_dense_start; jd < nd; jd++) { |
612 | 0 | const uint32_t n_layer_move = jd < nd - 1 ? ngl_per_device_high[jd].n_layer : ngl_per_device_high[jd].n_layer - 1; |
613 | 0 | ngl_per_device_high[id].n_layer += n_layer_move; |
614 | 0 | ngl_per_device_high[jd].n_layer -= n_layer_move; |
615 | 0 | ngl_per_device_high[jd].n_part = 0; |
616 | 0 | } |
617 | 0 | size_t id_dense_start_high = nd - 1; |
618 | 0 | std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts); |
619 | |
620 | 0 | if (mem_high[id] > targets[id]) { |
621 | 0 | assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full()); |
622 | 0 | uint32_t delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full(); |
623 | 0 | while (delta > 1) { |
624 | 0 | uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]); |
625 | 0 | step_size = std::max(step_size, uint32_t(1)); |
626 | 0 | step_size = std::min(step_size, delta - 1); |
627 | |
628 | 0 | std::vector<ngl_t> ngl_per_device_test = ngl_per_device; |
629 | 0 | size_t id_dense_start_test = id_dense_start; |
630 | 0 | uint32_t n_converted_test = 0; |
631 | 0 | for (;id_dense_start_test < nd; id_dense_start_test++) { |
632 | 0 | const uint32_t n_convert_jd = std::min(step_size - n_converted_test, ngl_per_device_test[id_dense_start_test].n_part); |
633 | 0 | ngl_per_device_test[id_dense_start_test].n_layer -= n_convert_jd; |
634 | 0 | ngl_per_device_test[id_dense_start_test].n_part -= n_convert_jd; |
635 | 0 | ngl_per_device_test[id].n_layer += n_convert_jd; |
636 | 0 | n_converted_test += n_convert_jd; |
637 | |
638 | 0 | if (ngl_per_device_test[id_dense_start_test].n_part > 0) { |
639 | 0 | break; |
640 | 0 | } |
641 | 0 | } |
642 | 0 | const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts); |
643 | |
644 | 0 | if (mem_test[id] <= targets[id]) { |
645 | 0 | ngl_per_device = ngl_per_device_test; |
646 | 0 | mem = mem_test; |
647 | 0 | id_dense_start = id_dense_start_test; |
648 | 0 | LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n", |
649 | 0 | __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start); |
650 | 0 | } else { |
651 | 0 | ngl_per_device_high = ngl_per_device_test; |
652 | 0 | mem_high = mem_test; |
653 | 0 | id_dense_start_high = id_dense_start_test; |
654 | 0 | LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n", |
655 | 0 | __func__, id, ngl_per_device_high[id].n_layer, ngl_per_device_high[id].n_part, id_dense_start_high); |
656 | 0 | } |
657 | 0 | assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full()); |
658 | 0 | delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full(); |
659 | 0 | } |
660 | 0 | } else { |
661 | 0 | ngl_per_device = ngl_per_device_high; |
662 | 0 | mem = mem_high; |
663 | 0 | id_dense_start = id_dense_start_high; |
664 | 0 | LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n", |
665 | 0 | __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start); |
666 | 0 | } |
667 | | |
668 | | // try to fit at least part of one more layer |
669 | 0 | if (ngl_per_device[id_dense_start].n_layer > (id < nd - 1 ? 0 : 1)) { |
670 | 0 | std::vector<ngl_t> ngl_per_device_test = ngl_per_device; |
671 | 0 | size_t id_dense_start_test = id_dense_start; |
672 | 0 | ngl_per_device_test[id_dense_start_test].n_layer--; |
673 | 0 | ngl_per_device_test[id_dense_start_test].n_part--; |
674 | 0 | ngl_per_device_test[id].n_layer++; |
675 | 0 | ngl_per_device_test[id].n_part++; |
676 | 0 | if (ngl_per_device_test[id_dense_start_test].n_part == 0) { |
677 | 0 | id_dense_start_test++; |
678 | 0 | } |
679 | 0 | ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP; |
680 | 0 | std::vector<ggml_backend_buffer_type_t> overflow_bufts_test = overflow_bufts; |
681 | 0 | if (id < nd - 1) { |
682 | 0 | overflow_bufts_test[id] = ggml_backend_dev_buffer_type(devs[id + 1]); |
683 | 0 | } |
684 | 0 | LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__); |
685 | 0 | std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test); |
686 | 0 | if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) { |
687 | 0 | ngl_per_device = ngl_per_device_test; |
688 | 0 | overflow_bufts = overflow_bufts_test; |
689 | 0 | mem = mem_test; |
690 | 0 | id_dense_start = id_dense_start_test; |
691 | 0 | LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n", |
692 | 0 | __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start); |
693 | |
694 | 0 | ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE; |
695 | 0 | LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__); |
696 | 0 | mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test); |
697 | 0 | if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) { |
698 | 0 | ngl_per_device = ngl_per_device_test; |
699 | 0 | overflow_bufts = overflow_bufts_test; |
700 | 0 | mem = mem_test; |
701 | 0 | id_dense_start = id_dense_start_test; |
702 | 0 | LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n", |
703 | 0 | __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start); |
704 | 0 | } |
705 | 0 | } else { |
706 | 0 | ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN; |
707 | 0 | LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__); |
708 | 0 | mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test); |
709 | 0 | if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) { |
710 | 0 | ngl_per_device = ngl_per_device_test; |
711 | 0 | overflow_bufts = overflow_bufts_test; |
712 | 0 | mem = mem_test; |
713 | 0 | id_dense_start = id_dense_start_test; |
714 | 0 | LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n", |
715 | 0 | __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start); |
716 | 0 | } |
717 | 0 | } |
718 | 0 | } |
719 | |
720 | 0 | const int64_t projected_margin = dmds_full[id].free - mem[id]; |
721 | 0 | LLAMA_LOG_INFO( |
722 | 0 | "%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n", |
723 | 0 | __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB); |
724 | 0 | } |
725 | | |
726 | | // print info for devices that were not changed during the conversion from dense only to full layers: |
727 | 0 | for (size_t id = id_dense_start + 1; id < nd; id++) { |
728 | 0 | const int64_t projected_margin = dmds_full[id].free - mem[id]; |
729 | 0 | LLAMA_LOG_INFO( |
730 | 0 | "%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n", |
731 | 0 | __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB); |
732 | 0 | } |
733 | |
734 | 0 | set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams); |
735 | 0 | } |
736 | | |
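
The layer-filling loops above are a discrete form of the method of false position: memory use is measured for a low and a high layer count, the next guess is placed by linear interpolation toward the per-device target, and the bracket shrinks until only one layer separates the bounds. A minimal standalone sketch of that search, assuming a monotonically increasing cost() callback that stands in for get_memory_for_layers(); the function name is illustrative, not part of llama.cpp:

#include <algorithm>
#include <cstdint>
#include <functional>

// sketch: largest n in [n_lo, n_hi] with cost(n) <= target, assuming cost() is
// monotonically increasing and cost(n_lo) <= target < cost(n_hi)
static uint32_t fit_layers_false_position(
        uint32_t n_lo, uint32_t n_hi, int64_t target,
        const std::function<int64_t(uint32_t)> & cost) {
    int64_t mem_lo = cost(n_lo);
    int64_t mem_hi = cost(n_hi);
    while (n_hi - n_lo > 1) {
        // interpolate linearly between the bounds, then clamp so that progress is guaranteed
        uint32_t step = uint32_t(int64_t(n_hi - n_lo) * (target - mem_lo) / (mem_hi - mem_lo));
        step = std::max(step, uint32_t(1));
        step = std::min(step, n_hi - n_lo - 1);

        const uint32_t n_test   = n_lo + step;
        const int64_t  mem_test = cost(n_test);
        if (mem_test <= target) {
            n_lo   = n_test; // still fits -> becomes the new lower bound
            mem_lo = mem_test;
        } else {
            n_hi   = n_test; // over budget -> becomes the new upper bound
            mem_hi = mem_test;
        }
    }
    return n_lo; // the largest count that was measured to fit
}
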
737 | | enum llama_params_fit_status llama_params_fit( |
738 | | const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams, |
739 | | float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides, |
740 | 0 | size_t * margins, uint32_t n_ctx_min, enum ggml_log_level log_level) { |
741 | 0 | const int64_t t0_us = llama_time_us(); |
742 | 0 | llama_params_fit_status status = LLAMA_PARAMS_FIT_STATUS_SUCCESS; |
743 | 0 | try { |
744 | 0 | llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margins, n_ctx_min, log_level); |
745 | 0 | LLAMA_LOG_INFO("%s: successfully fit params to free device memory\n", __func__); |
746 | 0 | } catch (const llama_params_fit_exception & e) { |
747 | 0 | LLAMA_LOG_WARN("%s: failed to fit params to free device memory: %s\n", __func__, e.what()); |
748 | 0 | status = LLAMA_PARAMS_FIT_STATUS_FAILURE; |
749 | 0 | } catch (const std::runtime_error & e) { |
750 | 0 | LLAMA_LOG_ERROR("%s: encountered an error while trying to fit params to free device memory: %s\n", __func__, e.what()); |
751 | 0 | status = LLAMA_PARAMS_FIT_STATUS_ERROR; |
752 | 0 | } |
753 | 0 | const int64_t t1_us = llama_time_us(); |
754 | 0 | LLAMA_LOG_INFO("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6); |
755 | 0 | return status; |
756 | 0 | } |
757 | | |
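
A hedged caller-side sketch of llama_params_fit above: the caller provides scratch buffers sized via llama_max_devices() and llama_max_tensor_buft_overrides() plus per-device free-memory margins, then loads the model with the (possibly adjusted) parameters. The helper name, the ~1 GiB margin, and the n_ctx_min value are illustrative assumptions:

#include "llama.h"
#include <vector>

// sketch: fit the default parameters to the available device memory before loading
static llama_model * load_fitted(const char * path_model) {
    llama_model_params   mparams = llama_model_default_params();
    llama_context_params cparams = llama_context_default_params();

    // scratch buffers that llama_params_fit may fill in
    std::vector<float> tensor_split(llama_max_devices(), 0.0f);
    std::vector<llama_model_tensor_buft_override> tbo(llama_max_tensor_buft_overrides());
    std::vector<size_t> margins(llama_max_devices(), 1024ull*1024ull*1024ull); // leave ~1 GiB free per device

    const enum llama_params_fit_status status = llama_params_fit(
        path_model, &mparams, &cparams,
        tensor_split.data(), tbo.data(), margins.data(),
        /*n_ctx_min =*/ 4096, GGML_LOG_LEVEL_INFO);
    if (status == LLAMA_PARAMS_FIT_STATUS_ERROR) {
        return nullptr; // measuring memory use failed, e.g. the model could not be loaded at all
    }
    // on LLAMA_PARAMS_FIT_STATUS_FAILURE the parameters could not be adjusted; loading may still succeed

    return llama_model_load_from_file(path_model, mparams); // cparams would then go to llama_init_from_model()
}
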
758 | 0 | struct llama_sampler_chain_params llama_sampler_chain_default_params() { |
759 | 0 | struct llama_sampler_chain_params result = { |
760 | 0 | /*.no_perf =*/ true, |
761 | 0 | }; |
762 | |
763 | 0 | return result; |
764 | 0 | } |
765 | | |
766 | 0 | size_t llama_max_devices(void) { |
767 | 0 | return 16; |
768 | 0 | } |
769 | | |
770 | 0 | size_t llama_max_tensor_buft_overrides() { |
771 | 0 | return 4096; |
772 | 0 | } |
773 | | |
774 | 0 | bool llama_supports_mmap(void) { |
775 | 0 | return llama_mmap::SUPPORTED; |
776 | 0 | } |
777 | | |
778 | 0 | bool llama_supports_mlock(void) { |
779 | 0 | return llama_mlock::SUPPORTED; |
780 | 0 | } |
781 | | |
782 | 0 | bool llama_supports_gpu_offload(void) { |
783 | 0 | return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr || |
784 | 0 | ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU) != nullptr || |
785 | 0 | llama_supports_rpc(); |
786 | 0 | } |
787 | | |
788 | 0 | bool llama_supports_rpc(void) { |
789 | 0 | return ggml_backend_reg_by_name("RPC") != nullptr; |
790 | 0 | } |
791 | | |
792 | 0 | void llama_backend_init(void) { |
793 | 0 | ggml_time_init(); |
794 | | |
795 | | // needed to initialize f16 tables |
796 | 0 | { |
797 | 0 | struct ggml_init_params params = { 0, NULL, false }; |
798 | 0 | struct ggml_context * ctx = ggml_init(params); |
799 | 0 | ggml_free(ctx); |
800 | 0 | } |
801 | 0 | } |
802 | | |
803 | 0 | void llama_numa_init(enum ggml_numa_strategy numa) { |
804 | 0 | if (numa != GGML_NUMA_STRATEGY_DISABLED) { |
805 | 0 | auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); |
806 | 0 | GGML_ASSERT(dev && "CPU backend is not loaded"); |
807 | 0 | auto * reg = ggml_backend_dev_backend_reg(dev); |
808 | 0 | auto * numa_init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_numa_init"); |
809 | 0 | if (numa_init_fn) { |
810 | 0 | numa_init_fn(numa); |
811 | 0 | } |
812 | 0 | } |
813 | 0 | } |
814 | | |
815 | 0 | void llama_backend_free(void) { |
816 | 0 | ggml_quantize_free(); |
817 | 0 | } |
818 | | |
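
A hedged sketch of the typical process-wide setup and teardown order for the three entry points above; GGML_NUMA_STRATEGY_DISTRIBUTE is shown only as an example value of ggml_numa_strategy:

#include "llama.h"

// sketch: typical process-wide setup and teardown around model usage
static void run(void) {
    llama_backend_init();                           // initializes ggml timing and the f16 tables
    llama_numa_init(GGML_NUMA_STRATEGY_DISTRIBUTE); // optional, forwarded to the CPU backend

    // ... load models, create contexts, run inference ...

    llama_backend_free();                           // frees quantization scratch state
}
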
819 | 0 | int64_t llama_time_us(void) { |
820 | 0 | return ggml_time_us(); |
821 | 0 | } |
822 | | |
823 | | // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback |
824 | 0 | static int llama_model_load(const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) { |
825 | | // loading time will be recalculated after the first eval, so |
826 | | // we take page faults deferred by mmap() into consideration |
827 | 0 | model.t_load_us = 0; |
828 | 0 | time_meas tm(model.t_load_us); |
829 | |
830 | 0 | model.t_start_us = tm.t_start_us; |
831 | |
832 | 0 | try { |
833 | 0 | llama_model_loader ml(fname, splits, params.use_mmap, params.use_direct_io, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides); |
834 | |
835 | 0 | ml.print_info(); |
836 | |
837 | 0 | model.hparams.vocab_only = params.vocab_only; |
838 | 0 | model.hparams.no_alloc = params.no_alloc; |
839 | |
840 | 0 | try { |
841 | 0 | model.load_arch(ml); |
842 | 0 | } catch(const std::exception & e) { |
843 | 0 | throw std::runtime_error("error loading model architecture: " + std::string(e.what())); |
844 | 0 | } |
845 | 0 | try { |
846 | 0 | model.load_hparams(ml); |
847 | 0 | } catch(const std::exception & e) { |
848 | 0 | throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what())); |
849 | 0 | } |
850 | 0 | if (model.arch == LLM_ARCH_CLIP) { |
851 | 0 | throw std::runtime_error("CLIP cannot be used as main model, use it with --mmproj instead"); |
852 | 0 | } |
853 | 0 | try { |
854 | 0 | model.load_vocab(ml); |
855 | 0 | } catch(const std::exception & e) { |
856 | 0 | throw std::runtime_error("error loading model vocabulary: " + std::string(e.what())); |
857 | 0 | } |
858 | | |
859 | 0 | model.load_stats(ml); |
860 | 0 | model.print_info(); |
861 | |
862 | 0 | if (params.vocab_only) { |
863 | 0 | LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__); |
864 | 0 | return 0; |
865 | 0 | } |
866 | | |
867 | 0 | if (!model.load_tensors(ml)) { |
868 | 0 | return -2; |
869 | 0 | } |
870 | 0 | } catch (const std::exception & err) { |
871 | 0 | LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what()); |
872 | 0 | return -1; |
873 | 0 | } |
874 | | |
875 | 0 | return 0; |
876 | 0 | } |
877 | | |
878 | | static struct llama_model * llama_model_load_from_file_impl( |
879 | | const std::string & path_model, |
880 | | std::vector<std::string> & splits, |
881 | 0 | struct llama_model_params params) { |
882 | 0 | ggml_time_init(); |
883 | |
884 | 0 | if (!params.vocab_only && ggml_backend_reg_count() == 0) { |
885 | 0 | LLAMA_LOG_ERROR("%s: no backends are loaded. hint: use ggml_backend_load() or ggml_backend_load_all() to load a backend before calling this function\n", __func__); |
886 | 0 | return nullptr; |
887 | 0 | } |
888 | | |
889 | 0 | unsigned cur_percentage = 0; |
890 | 0 | if (params.progress_callback == NULL) { |
891 | 0 | params.progress_callback_user_data = &cur_percentage; |
892 | 0 | params.progress_callback = [](float progress, void * ctx) { |
893 | 0 | unsigned * cur_percentage_p = (unsigned *) ctx; |
894 | 0 | unsigned percentage = (unsigned) (100 * progress); |
895 | 0 | while (percentage > *cur_percentage_p) { |
896 | 0 | *cur_percentage_p = percentage; |
897 | 0 | LLAMA_LOG_CONT("."); |
898 | 0 | if (percentage >= 100) { |
899 | 0 | LLAMA_LOG_CONT("\n"); |
900 | 0 | } |
901 | 0 | } |
902 | 0 | return true; |
903 | 0 | }; |
904 | 0 | } |
905 | |
906 | 0 | llama_model * model = new llama_model(params); |
907 | | |
908 | | // create list of devices to use with this model |
909 | 0 | if (params.devices) { |
910 | 0 | for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) { |
911 | 0 | model->devices.push_back(*dev); |
912 | 0 | } |
913 | 0 | } else { |
914 | | // default device selection |
915 | | |
916 | | // build list of available devices |
917 | 0 | std::vector<ggml_backend_dev_t> gpus; |
918 | 0 | std::vector<ggml_backend_dev_t> igpus; |
919 | 0 | std::vector<ggml_backend_dev_t> rpc_servers; |
920 | |
921 | 0 | for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { |
922 | 0 | ggml_backend_dev_t dev = ggml_backend_dev_get(i); |
923 | 0 | switch (ggml_backend_dev_type(dev)) { |
924 | 0 | case GGML_BACKEND_DEVICE_TYPE_CPU: |
925 | 0 | case GGML_BACKEND_DEVICE_TYPE_ACCEL: |
926 | | // skip CPU backends since they are handled separately |
927 | 0 | break; |
928 | | |
929 | 0 | case GGML_BACKEND_DEVICE_TYPE_GPU: { |
930 | 0 | ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev); |
931 | 0 | if (ggml_backend_reg_name(reg) == std::string("RPC")) { |
932 | 0 | rpc_servers.push_back(dev); |
933 | 0 | } else { |
934 | | // check if there is already a GPU with the same device id |
935 | 0 | ggml_backend_dev_props props; |
936 | 0 | ggml_backend_dev_get_props(dev, &props); |
937 | 0 | auto it = std::find_if(gpus.begin(), gpus.end(), [&props](ggml_backend_dev_t d) { |
938 | 0 | ggml_backend_dev_props d_props; |
939 | 0 | ggml_backend_dev_get_props(d, &d_props); |
940 | 0 | if (props.device_id && d_props.device_id) { |
941 | 0 | return strcmp(props.device_id, d_props.device_id) == 0; |
942 | 0 | } |
943 | 0 | return false; |
944 | 0 | }); |
945 | |
946 | 0 | if (it != gpus.end()) { |
947 | 0 | LLAMA_LOG_INFO("%s: skipping device %s (%s) with id %s - already using device %s (%s) with the same id\n", |
948 | 0 | __func__, |
949 | 0 | ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), |
950 | 0 | props.device_id ? props.device_id : "unknown id", |
951 | 0 | ggml_backend_dev_name(*it), ggml_backend_dev_description(*it)); |
952 | 0 | } else { |
953 | 0 | gpus.push_back(dev); |
954 | 0 | } |
955 | 0 | } |
956 | 0 | break; |
957 | 0 | } |
958 | | |
959 | 0 | case GGML_BACKEND_DEVICE_TYPE_IGPU: |
960 | 0 | igpus.push_back(dev); |
961 | 0 | break; |
962 | 0 | } |
963 | 0 | } |
964 | | |
965 | | // add RPC servers at the front of the list to minimize network transfers |
966 | 0 | model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end()); |
967 | | |
968 | | // add GPUs |
969 | 0 | model->devices.insert(model->devices.end(), gpus.begin(), gpus.end()); |
970 | | |
971 | | // add integrated GPUs only if no other devices were found |
972 | 0 | if (model->devices.empty()) { |
973 | 0 | model->devices.insert(model->devices.end(), igpus.begin(), igpus.end()); |
974 | 0 | } |
975 | 0 | } |
976 | | |
977 | | // if using single GPU mode, remove all except the main GPU |
978 | 0 | if (params.split_mode == LLAMA_SPLIT_MODE_NONE) { |
979 | 0 | if (params.main_gpu < 0) { |
980 | 0 | model->devices.clear(); |
981 | 0 | } else { |
982 | 0 | if (params.main_gpu >= (int)model->devices.size()) { |
983 | 0 | LLAMA_LOG_ERROR("%s: invalid value for main_gpu: %d (available devices: %zu)\n", __func__, params.main_gpu, model->devices.size()); |
984 | 0 | llama_model_free(model); |
985 | 0 | return nullptr; |
986 | 0 | } |
987 | 0 | ggml_backend_dev_t main_gpu = model->devices[params.main_gpu]; |
988 | 0 | model->devices.clear(); |
989 | 0 | model->devices.push_back(main_gpu); |
990 | 0 | } |
991 | 0 | } |
992 | | |
993 | 0 | for (auto * dev : model->devices) { |
994 | 0 | ggml_backend_dev_props props; |
995 | 0 | ggml_backend_dev_get_props(dev, &props); |
996 | 0 | LLAMA_LOG_INFO("%s: using device %s (%s) (%s) - %zu MiB free\n", __func__, |
997 | 0 | ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), |
998 | 0 | props.device_id ? props.device_id : "unknown id", |
999 | 0 | props.memory_free/1024/1024); |
1000 | 0 | } |
1001 | |
1002 | 0 | const int status = llama_model_load(path_model, splits, *model, params); |
1003 | 0 | GGML_ASSERT(status <= 0); |
1004 | 0 | if (status < 0) { |
1005 | 0 | if (status == -1) { |
1006 | 0 | LLAMA_LOG_ERROR("%s: failed to load model\n", __func__); |
1007 | 0 | } else if (status == -2) { |
1008 | 0 | LLAMA_LOG_INFO("%s: cancelled model load\n", __func__); |
1009 | 0 | } |
1010 | |
1011 | 0 | llama_model_free(model); |
1012 | 0 | return nullptr; |
1013 | 0 | } |
1014 | | |
1015 | 0 | return model; |
1016 | 0 | } |
1017 | | |
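
The default device selection above (RPC servers first, then discrete GPUs, with integrated GPUs only as a fallback) can be bypassed by the caller, since llama_model_params::devices takes a NULL-terminated list. A hedged sketch with an illustrative helper name:

#include "llama.h"
#include "ggml-backend.h"
#include <vector>

// sketch: restrict the model to explicitly selected GPU devices instead of the default selection
static llama_model * load_on_gpus_only(const char * path_model) {
    std::vector<ggml_backend_dev_t> devs;
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
            devs.push_back(dev);
        }
    }
    devs.push_back(nullptr); // the list is NULL-terminated

    llama_model_params mparams = llama_model_default_params();
    mparams.devices = devs.data();
    return llama_model_load_from_file(path_model, mparams);
}
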
1018 | | // deprecated |
1019 | | struct llama_model * llama_load_model_from_file( |
1020 | | const char * path_model, |
1021 | 0 | struct llama_model_params params) { |
1022 | 0 | return llama_model_load_from_file(path_model, params); |
1023 | 0 | } |
1024 | | |
1025 | | struct llama_model * llama_model_load_from_file( |
1026 | | const char * path_model, |
1027 | 0 | struct llama_model_params params) { |
1028 | 0 | std::vector<std::string> splits = {}; |
1029 | 0 | return llama_model_load_from_file_impl(path_model, splits, params); |
1030 | 0 | } |
1031 | | |
1032 | | struct llama_model * llama_model_load_from_splits( |
1033 | | const char ** paths, |
1034 | | size_t n_paths, |
1035 | 0 | struct llama_model_params params) { |
1036 | 0 | std::vector<std::string> splits; |
1037 | 0 | if (n_paths == 0) { |
1038 | 0 | LLAMA_LOG_ERROR("%s: list of splits is empty\n", __func__); |
1039 | 0 | return nullptr; |
1040 | 0 | } |
1041 | 0 | splits.reserve(n_paths); |
1042 | 0 | for (size_t i = 0; i < n_paths; ++i) { |
1043 | 0 | splits.push_back(paths[i]); |
1044 | 0 | } |
1045 | 0 | return llama_model_load_from_file_impl(splits.front(), splits, params); |
1046 | 0 | } |
1047 | | |
1048 | 0 | void llama_model_save_to_file(const struct llama_model * model, const char * path_model) { |
1049 | 0 | llama_model_saver ms(*model); |
1050 | 0 | ms.add_kv_from_model(); |
1051 | 0 | ms.add_tensors_from_model(); |
1052 | 0 | ms.save(path_model); |
1053 | 0 | } |
1054 | | |
1055 | | // |
1056 | | // chat templates |
1057 | | // |
1058 | | |
1059 | | int32_t llama_chat_apply_template( |
1060 | | const char * tmpl, |
1061 | | const struct llama_chat_message * chat, |
1062 | | size_t n_msg, |
1063 | | bool add_ass, |
1064 | | char * buf, |
1065 | 1.67k | int32_t length) { |
1066 | 1.67k | const std::string curr_tmpl(tmpl == nullptr ? "chatml" : tmpl); |
1067 | | |
1068 | | // format the chat to string |
1069 | 1.67k | std::vector<const llama_chat_message *> chat_vec; |
1070 | 1.67k | chat_vec.resize(n_msg); |
1071 | 11.6k | for (size_t i = 0; i < n_msg; i++) { |
1072 | 10.0k | chat_vec[i] = &chat[i]; |
1073 | 10.0k | } |
1074 | | |
1075 | 1.67k | std::string formatted_chat; |
1076 | 1.67k | llm_chat_template detected_tmpl = llm_chat_detect_template(curr_tmpl); |
1077 | 1.67k | if (detected_tmpl == LLM_CHAT_TEMPLATE_UNKNOWN) { |
1078 | 805 | return -1; |
1079 | 805 | } |
1080 | 865 | int32_t res = llm_chat_apply_template(detected_tmpl, chat_vec, formatted_chat, add_ass); |
1081 | 865 | if (res < 0) { |
1082 | 0 | return res; |
1083 | 0 | } |
1084 | 865 | if (buf && length > 0) { |
1085 | 865 | strncpy(buf, formatted_chat.c_str(), length); |
1086 | 865 | } |
1087 | 865 | return res; |
1088 | 865 | } |
1089 | | |
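
A hedged usage sketch of llama_chat_apply_template above: the return value is the length of the formatted prompt (negative if the template cannot be detected), so a caller can retry with a larger buffer when the first one is too small; passing tmpl == NULL selects the chatml fallback, as in the code above. The helper name is illustrative:

#include "llama.h"
#include <string>
#include <vector>

// sketch: format a two-message chat with a given template (or chatml when tmpl == NULL)
static std::string format_chat(const char * tmpl) {
    const llama_chat_message msgs[] = {
        { "system", "You are a helpful assistant." },
        { "user",   "Hello!" },
    };
    std::vector<char> buf(1024);
    int32_t n = llama_chat_apply_template(tmpl, msgs, 2, /*add_ass =*/ true, buf.data(), (int32_t) buf.size());
    if (n < 0) {
        return ""; // template could not be detected
    }
    if (n > (int32_t) buf.size()) {
        buf.resize(n + 1); // the formatted chat did not fit, retry with the required size
        n = llama_chat_apply_template(tmpl, msgs, 2, true, buf.data(), (int32_t) buf.size());
    }
    return std::string(buf.data(), n);
}
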
1090 | | // |
1091 | | // model split |
1092 | | // |
1093 | | |
1094 | 0 | int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) { |
1095 | 0 | static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf"; |
1096 | 0 | if (snprintf(split_path, maxlen, SPLIT_PATH_FORMAT, path_prefix, split_no + 1, split_count)) { |
1097 | 0 | return strlen(split_path); |
1098 | 0 | } |
1099 | 0 | return 0; |
1100 | 0 | } |
1101 | | |
1102 | 0 | int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count) { |
1103 | 0 | std::string str_split_path(split_path); |
1104 | 0 | char postfix[32]; |
1105 | 0 | snprintf(postfix, 32, "-%05d-of-%05d.gguf", split_no + 1, split_count); |
1106 | 0 | std::string str_postfix(postfix); |
1107 | | |
1108 | | // check if split_prefix ends with postfix |
1109 | 0 | int size_prefix = str_split_path.size() - str_postfix.size(); |
1110 | 0 | if (size_prefix > 0 && str_split_path.find(str_postfix, size_prefix) != std::string::npos) { |
1111 | 0 | snprintf(split_prefix, std::min((size_t) size_prefix + 1, maxlen), "%s", split_path); |
1112 | 0 | return size_prefix; |
1113 | 0 | } |
1114 | | |
1115 | 0 | return 0; |
1116 | 0 | } |
1117 | | |
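
A hedged sketch of the split helpers above: llama_split_path builds the canonical "-%05d-of-%05d.gguf" shard name (split_no is 0-based), and llama_split_prefix recovers the path prefix from such a name, returning 0 when the name does not match:

#include "llama.h"
#include <cstdio>

// sketch: round-trip the canonical split naming scheme
static void split_path_demo(void) {
    char path[512];
    llama_split_path(path, sizeof(path), "models/ggml-model", /*split_no =*/ 0, /*split_count =*/ 4);
    // path is now "models/ggml-model-00001-of-00004.gguf"

    char prefix[512];
    const int n = llama_split_prefix(prefix, sizeof(prefix), path, 0, 4);
    printf("%s -> prefix '%s' (length %d)\n", path, prefix, n);
}
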
1118 | 0 | const char * llama_print_system_info(void) { |
1119 | 0 | static std::string s; |
1120 | 0 | s.clear(); // Clear the string, since it's static, otherwise it will accumulate data from previous calls. |
1121 | |
1122 | 0 | for (size_t i = 0; i < ggml_backend_reg_count(); i++) { |
1123 | 0 | auto * reg = ggml_backend_reg_get(i); |
1124 | 0 | auto * get_features_fn = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features"); |
1125 | 0 | if (get_features_fn) { |
1126 | 0 | ggml_backend_feature * features = get_features_fn(reg); |
1127 | 0 | s += ggml_backend_reg_name(reg); |
1128 | 0 | s += " : "; |
1129 | 0 | for (; features->name; features++) { |
1130 | 0 | s += features->name; |
1131 | 0 | s += " = "; |
1132 | 0 | s += features->value; |
1133 | 0 | s += " | "; |
1134 | 0 | } |
1135 | 0 | } |
1136 | 0 | } |
1137 | |
1138 | 0 | return s.c_str(); |
1139 | 0 | } |
1140 | | |