/src/llama.cpp/common/fit.cpp
Line | Count | Source |
1 | | #include "fit.h" |
2 | | |
3 | | #include "log.h" |
4 | | |
5 | | #include "../src/llama-ext.h" |
6 | | |
7 | | #include <array> |
8 | | #include <cassert> |
9 | | #include <stdexcept> |
10 | | #include <cinttypes> |
11 | | #include <set> |
12 | | #include <string> |
13 | | #include <vector> |
14 | | |
// Identifies how much of a layer is covered when distributing the layer's tensors.
// Each value is cumulative: it includes everything covered by the smaller values.
// NOTE: only common_params_fit_impl uses this enum, but it is declared at file scope
//       to work around a Windows compilation issue.
enum common_layer_fraction_t {
    // nothing
    LAYER_FRACTION_NONE = 0,
    // attention
    LAYER_FRACTION_ATTN = 1,
    // attention + up
    LAYER_FRACTION_UP   = 2,
    // attention + up + gate
    LAYER_FRACTION_GATE = 3,
    // everything but sparse MoE weights
    LAYER_FRACTION_MOE  = 4,
};
24 | | |
// Exception type thrown by the parameter fitting code in this file.
// It adds no state of its own: it only reuses the std::runtime_error constructors,
// so it can be caught either by its own type or as std::runtime_error.
class common_params_fit_exception : public std::runtime_error {
  public:
    using std::runtime_error::runtime_error;
};
28 | | |
29 | | static std::vector<llama_device_memory_data> common_get_device_memory_data_impl( |
30 | | const char * path_model, |
31 | | const llama_model_params * mparams, |
32 | | const llama_context_params * cparams, |
33 | | std::vector<ggml_backend_dev_t> & devs, |
34 | | uint32_t & hp_ngl, |
35 | | uint32_t & hp_n_ctx_train, |
36 | | uint32_t & hp_n_expert, |
37 | 0 | ggml_log_level log_level) { |
38 | 0 | struct user_data_t { |
39 | 0 | struct { |
40 | 0 | ggml_log_callback callback; |
41 | 0 | void * user_data; |
42 | 0 | } original_logger; |
43 | 0 | ggml_log_level min_level; // prints below this log level go to debug log |
44 | 0 | }; |
45 | 0 | user_data_t ud; |
46 | 0 | llama_log_get(&ud.original_logger.callback, &ud.original_logger.user_data); |
47 | 0 | ud.min_level = log_level; |
48 | |
|
49 | 0 | llama_log_set([](ggml_log_level level, const char * text, void * user_data) { |
50 | 0 | const user_data_t * ud = (const user_data_t *) user_data; |
51 | 0 | const ggml_log_level level_eff = level >= ud->min_level ? level : GGML_LOG_LEVEL_DEBUG; |
52 | 0 | ud->original_logger.callback(level_eff, text, ud->original_logger.user_data); |
53 | 0 | }, &ud); |
54 | |
|
55 | 0 | llama_model_params mparams_copy = *mparams; |
56 | 0 | mparams_copy.no_alloc = true; |
57 | 0 | mparams_copy.use_mmap = false; |
58 | 0 | mparams_copy.use_mlock = false; |
59 | |
|
60 | 0 | llama_model * model = llama_model_load_from_file(path_model, mparams_copy); |
61 | 0 | if (model == nullptr) { |
62 | 0 | llama_log_set(ud.original_logger.callback, ud.original_logger.user_data); |
63 | 0 | throw std::runtime_error("failed to load model"); |
64 | 0 | } |
65 | | |
66 | 0 | llama_context * ctx = llama_init_from_model(model, *cparams); |
67 | 0 | if (ctx == nullptr) { |
68 | 0 | llama_model_free(model); |
69 | 0 | llama_log_set(ud.original_logger.callback, ud.original_logger.user_data); |
70 | 0 | throw std::runtime_error("failed to create llama_context from model"); |
71 | 0 | } |
72 | | |
73 | 0 | const size_t nd = llama_model_n_devices(model); |
74 | 0 | std::vector<llama_device_memory_data> ret(nd + 1); |
75 | |
|
76 | 0 | llama_memory_breakdown memory_breakdown = llama_get_memory_breakdown(ctx); |
77 | |
|
78 | 0 | for (const auto & [buft, mb] : memory_breakdown) { |
79 | 0 | if (ggml_backend_buft_is_host(buft)) { |
80 | 0 | ret.back().mb.model += mb.model; |
81 | 0 | ret.back().mb.context += mb.context; |
82 | 0 | ret.back().mb.compute += mb.compute; |
83 | 0 | continue; |
84 | 0 | } |
85 | | |
86 | 0 | ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft); |
87 | 0 | if (!dev) { |
88 | 0 | continue; |
89 | 0 | } |
90 | 0 | for (size_t i = 0; i < nd; i++) { |
91 | 0 | if (dev == llama_model_get_device(model, i)) { |
92 | 0 | ret[i].mb.model += mb.model; |
93 | 0 | ret[i].mb.context += mb.context; |
94 | 0 | ret[i].mb.compute += mb.compute; |
95 | 0 | break; |
96 | 0 | } |
97 | 0 | } |
98 | 0 | } |
99 | |
|
100 | 0 | { |
101 | 0 | ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); |
102 | 0 | if (cpu_dev == nullptr) { |
103 | 0 | throw std::runtime_error("no CPU backend found"); |
104 | 0 | } |
105 | 0 | size_t free; |
106 | 0 | size_t total; |
107 | 0 | ggml_backend_dev_memory(cpu_dev, &free, &total); |
108 | 0 | ret.back().free = free; |
109 | 0 | ret.back().total = total; |
110 | 0 | } |
111 | 0 | for (size_t i = 0; i < nd; i++) { |
112 | 0 | ggml_backend_dev_t dev = llama_model_get_device(model, i); |
113 | |
|
114 | 0 | size_t free; |
115 | 0 | size_t total; |
116 | 0 | ggml_backend_dev_memory(dev, &free, &total); |
117 | | |
118 | | // Some non-GPU accelerator backends, such as BLAS, report 0/0 and rely on |
119 | | // the host-memory fallback. For GPU-like backends, keep 0/0 so --fit does |
120 | | // not assign anything to a device with an unknown memory budget. |
121 | 0 | if (free == 0 && total == 0) { |
122 | 0 | const enum ggml_backend_dev_type type = ggml_backend_dev_type(dev); |
123 | 0 | if (type == GGML_BACKEND_DEVICE_TYPE_GPU || type == GGML_BACKEND_DEVICE_TYPE_IGPU) { |
124 | 0 | LOG_WRN("%s: device %s did not report memory; --fit will not use it\n", |
125 | 0 | __func__, ggml_backend_dev_name(dev)); |
126 | 0 | } else { |
127 | 0 | free = ret.back().free; |
128 | 0 | total = ret.back().total; |
129 | 0 | } |
130 | 0 | } |
131 | 0 | ret[i].free = free; |
132 | 0 | ret[i].total = total; |
133 | 0 | } |
134 | |
|
135 | 0 | devs.clear(); |
136 | 0 | for (int i = 0; i < llama_model_n_devices(model); i++) { |
137 | 0 | devs.push_back(llama_model_get_device(model, i)); |
138 | 0 | } |
139 | |
|
140 | 0 | hp_ngl = llama_model_n_layer(model); |
141 | 0 | hp_n_ctx_train = llama_model_n_ctx_train(model); |
142 | 0 | hp_n_expert = llama_model_n_expert(model); |
143 | |
|
144 | 0 | common_memory_breakdown_print(ctx); |
145 | |
|
146 | 0 | llama_free(ctx); |
147 | 0 | llama_model_free(model); |
148 | 0 | llama_log_set(ud.original_logger.callback, ud.original_logger.user_data); |
149 | |
|
150 | 0 | return ret; |
151 | 0 | } |
152 | | |
153 | | common_device_memory_data_vec common_get_device_memory_data( |
154 | | const char * path_model, |
155 | | const llama_model_params * mparams, |
156 | | const llama_context_params * cparams, |
157 | | std::vector<ggml_backend_dev_t> & devs, |
158 | | uint32_t & hp_ngl, |
159 | | uint32_t & hp_n_ctx_train, |
160 | | uint32_t & hp_n_expert, |
161 | 0 | ggml_log_level log_level) { |
162 | 0 | std::vector<llama_device_memory_data> impl = common_get_device_memory_data_impl( |
163 | 0 | path_model, mparams, cparams, devs, hp_ngl, hp_n_ctx_train, hp_n_expert, log_level); |
164 | |
|
165 | 0 | common_device_memory_data_vec ret(impl.size()); |
166 | 0 | for (size_t i = 0; i < impl.size(); i++) { |
167 | 0 | ret[i].total = impl[i].total; |
168 | 0 | ret[i].free = impl[i].free; |
169 | 0 | ret[i].model = impl[i].mb.model; |
170 | 0 | ret[i].context = impl[i].mb.context; |
171 | 0 | ret[i].compute = impl[i].mb.compute; |
172 | 0 | } |
173 | 0 | return ret; |
174 | 0 | } |
175 | | |
176 | | static void common_params_fit_impl( |
177 | | const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams, |
178 | | float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides, |
179 | 0 | size_t * margins_s, uint32_t n_ctx_min, enum ggml_log_level log_level) { |
180 | 0 | if (mparams->split_mode == LLAMA_SPLIT_MODE_TENSOR) { |
181 | 0 | throw common_params_fit_exception("llama_params_fit is not implemented for SPLIT_MODE_TENSOR, abort"); |
182 | 0 | } |
183 | 0 | constexpr int64_t MiB = 1024*1024; |
184 | 0 | typedef std::vector<llama_device_memory_data> dmds_t; |
185 | 0 | const llama_model_params default_mparams = llama_model_default_params(); |
186 | |
|
187 | 0 | std::vector<ggml_backend_dev_t> devs; |
188 | 0 | uint32_t hp_ngl = 0; // hparams.n_gpu_layers |
189 | 0 | uint32_t hp_nct = 0; // hparams.n_ctx_train |
190 | 0 | uint32_t hp_nex = 0; // hparams.n_expert |
191 | | |
192 | | // step 1: get data for default parameters and check whether any changes are necessary in the first place |
193 | |
|
194 | 0 | LOG_TRC("%s: getting device memory data for initial parameters:\n", __func__); |
195 | 0 | const dmds_t dmds_full = common_get_device_memory_data_impl(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level); |
196 | 0 | const size_t nd = devs.size(); // number of devices |
197 | |
|
198 | 0 | std::vector<int64_t> margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits |
199 | 0 | margins.reserve(nd); |
200 | 0 | if (nd == 0) { |
201 | 0 | margins.push_back(margins_s[0]); |
202 | 0 | } else { |
203 | 0 | for (size_t id = 0; id < nd; id++) { |
204 | 0 | margins.push_back(margins_s[id]); |
205 | 0 | } |
206 | 0 | } |
207 | |
|
208 | 0 | std::vector<std::string> dev_names; |
209 | 0 | { |
210 | 0 | dev_names.reserve(nd); |
211 | 0 | size_t max_length = 0; |
212 | 0 | for (const auto & dev : devs) { |
213 | 0 | std::string name = ggml_backend_dev_name(dev); |
214 | 0 | name += " ("; |
215 | 0 | name += ggml_backend_dev_description(dev); |
216 | 0 | name += ")"; |
217 | 0 | dev_names.push_back(name); |
218 | 0 | max_length = std::max(max_length, name.length()); |
219 | 0 | } |
220 | 0 | for (std::string & dn : dev_names) { |
221 | 0 | dn.insert(dn.end(), max_length - dn.length(), ' '); |
222 | 0 | } |
223 | 0 | } |
224 | |
|
225 | 0 | int64_t sum_free = 0; |
226 | 0 | int64_t sum_projected_free = 0; |
227 | 0 | int64_t sum_projected_used = 0; |
228 | 0 | int64_t sum_projected_model = 0; |
229 | 0 | std::vector<int64_t> projected_free_per_device; |
230 | 0 | projected_free_per_device.reserve(nd); |
231 | |
|
232 | 0 | if (nd == 0) { |
233 | 0 | sum_projected_used = dmds_full.back().mb.total(); |
234 | 0 | sum_free = dmds_full.back().total; |
235 | 0 | sum_projected_free = sum_free - sum_projected_used; |
236 | 0 | LOG_INF("%s: projected to use %" PRId64 " MiB of host memory vs. %" PRId64 " MiB of total host memory\n", |
237 | 0 | __func__, sum_projected_used/MiB, sum_free/MiB); |
238 | 0 | if (sum_projected_free >= margins[0]) { |
239 | 0 | LOG_TRC("%s: will leave %" PRId64 " >= %" PRId64 " MiB of system memory, no changes needed\n", |
240 | 0 | __func__, sum_projected_free/MiB, margins[0]/MiB); |
241 | 0 | return; |
242 | 0 | } |
243 | 0 | } else { |
244 | 0 | if (nd > 1) { |
245 | 0 | LOG_TRC("%s: projected memory use with initial parameters [MiB]:\n", __func__); |
246 | 0 | } |
247 | 0 | for (size_t id = 0; id < nd; id++) { |
248 | 0 | const llama_device_memory_data & dmd = dmds_full[id]; |
249 | |
|
250 | 0 | const int64_t projected_used = dmd.mb.total(); |
251 | 0 | const int64_t projected_free = dmd.free - projected_used; |
252 | 0 | projected_free_per_device.push_back(projected_free); |
253 | |
|
254 | 0 | sum_free += dmd.free; |
255 | 0 | sum_projected_used += projected_used; |
256 | 0 | sum_projected_free += projected_free; |
257 | 0 | sum_projected_model += dmd.mb.model; |
258 | |
|
259 | 0 | if (nd > 1) { |
260 | 0 | LOG_TRC("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n", |
261 | 0 | __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB); |
262 | 0 | } |
263 | 0 | } |
264 | 0 | assert(sum_free >= 0 && sum_projected_used >= 0); |
265 | 0 | LOG_TRC("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n", |
266 | 0 | __func__, sum_projected_used/MiB, sum_free/MiB); |
267 | 0 | if (nd == 1) { |
268 | 0 | if (projected_free_per_device[0] >= margins[0]) { |
269 | 0 | LOG_TRC("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n", |
270 | 0 | __func__, projected_free_per_device[0]/MiB, margins[0]/MiB); |
271 | 0 | return; |
272 | 0 | } |
273 | 0 | } else { |
274 | 0 | bool changes_needed = false; |
275 | 0 | for (size_t id = 0; id < nd; id++) { |
276 | 0 | if (projected_free_per_device[id] < margins[id]) { |
277 | 0 | changes_needed = true; |
278 | 0 | break; |
279 | 0 | } |
280 | 0 | } |
281 | 0 | if (!changes_needed) { |
282 | 0 | LOG_TRC("%s: targets for free memory can be met on all devices, no changes needed\n", __func__); |
283 | 0 | return; |
284 | 0 | } |
285 | 0 | } |
286 | 0 | } |
287 | | |
288 | | // step 2: try reducing memory use by reducing the context size |
289 | | |
290 | 0 | { |
291 | 0 | int64_t global_surplus = sum_projected_free; |
292 | 0 | if (nd == 0) { |
293 | 0 | global_surplus -= margins[0]; |
294 | 0 | } else { |
295 | 0 | for (size_t id = 0; id < nd; id++) { |
296 | 0 | global_surplus -= margins[id]; |
297 | 0 | } |
298 | 0 | } |
299 | 0 | if (global_surplus < 0) { |
300 | 0 | if (nd <= 1) { |
301 | 0 | LOG_TRC("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n", |
302 | 0 | __func__, margins[0]/MiB, -global_surplus/MiB); |
303 | 0 | } else { |
304 | 0 | LOG_TRC( |
305 | 0 | "%s: cannot meet free memory targets on all devices, need to use %" PRId64 " MiB less in total\n", |
306 | 0 | __func__, -global_surplus/MiB); |
307 | 0 | } |
308 | 0 | if (cparams->n_ctx == 0) { |
309 | 0 | if (hp_nct > n_ctx_min) { |
310 | 0 | int64_t sum_used_target = sum_free; |
311 | 0 | if (nd == 0) { |
312 | 0 | sum_used_target -= margins[0]; |
313 | 0 | } else { |
314 | 0 | for (size_t id = 0; id < nd; id++) { |
315 | 0 | sum_used_target -= margins[id]; |
316 | 0 | } |
317 | 0 | } |
318 | 0 | if (nd > 1) { |
319 | | // for multiple devices we need to be more conservative in terms of how much context we think can fit: |
320 | | // - for dense models only whole layers can be assigned to devices |
321 | | // - for MoE models only whole tensors can be assigned to devices, which we estimate to be <= 1/3 of a layer |
322 | | // - on average we expect a waste of 0.5 layers/tensors per device |
323 | | // - use slightly more than the expected average for nd devices to be safe |
324 | 0 | const int64_t model_per_layer = sum_projected_model / std::min(uint32_t(mparams->n_gpu_layers), hp_ngl); |
325 | 0 | sum_used_target -= (nd + 1) * model_per_layer / (hp_nex == 0 ? 2 : 6); |
326 | 0 | } |
327 | |
|
328 | 0 | int64_t sum_projected_used_min_ctx = 0; |
329 | 0 | cparams->n_ctx = n_ctx_min; |
330 | 0 | const dmds_t dmds_min_ctx = common_get_device_memory_data_impl(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level); |
331 | 0 | if (nd == 0) { |
332 | 0 | sum_projected_used_min_ctx = dmds_min_ctx.back().mb.total(); |
333 | 0 | } else { |
334 | 0 | for (size_t id = 0; id < nd; id++) { |
335 | 0 | sum_projected_used_min_ctx += dmds_min_ctx[id].mb.total(); |
336 | 0 | } |
337 | 0 | } |
338 | 0 | if (sum_used_target > sum_projected_used_min_ctx) { |
339 | | // linear interpolation between minimum and maximum context size: |
340 | 0 | cparams->n_ctx += (hp_nct - n_ctx_min) * (sum_used_target - sum_projected_used_min_ctx) |
341 | 0 | / (sum_projected_used - sum_projected_used_min_ctx); |
342 | 0 | cparams->n_ctx = std::max(cparams->n_ctx - cparams->n_ctx % 256, n_ctx_min); // round down context for CUDA backend |
343 | |
|
344 | 0 | const int64_t bytes_per_ctx = (sum_projected_used - sum_projected_used_min_ctx) / (hp_nct - n_ctx_min); |
345 | 0 | const int64_t memory_reduction = (hp_nct - cparams->n_ctx) * bytes_per_ctx; |
346 | 0 | LOG_TRC("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n", |
347 | 0 | __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB); |
348 | 0 | if (nd <= 1) { |
349 | 0 | LOG_TRC("%s: entire model can be fit by reducing context\n", __func__); |
350 | 0 | return; |
351 | 0 | } |
352 | 0 | LOG_TRC("%s: entire model should be fit across devices by reducing context\n", __func__); |
353 | 0 | } else { |
354 | 0 | const int64_t memory_reduction = sum_projected_used - sum_projected_used_min_ctx; |
355 | 0 | LOG_TRC("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n", |
356 | 0 | __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB); |
357 | 0 | } |
358 | 0 | } else { |
359 | 0 | if (n_ctx_min == UINT32_MAX) { |
360 | 0 | LOG_TRC("%s: user has requested full context size of %" PRIu32 " -> no change\n", __func__, hp_nct); |
361 | 0 | } else { |
362 | 0 | LOG_TRC("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n", |
363 | 0 | __func__, hp_nct, n_ctx_min); |
364 | 0 | } |
365 | 0 | } |
366 | 0 | } else { |
367 | 0 | LOG_TRC("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx); |
368 | 0 | } |
369 | 0 | } |
370 | 0 | } |
371 | 0 | if (nd == 0) { |
372 | 0 | throw common_params_fit_exception("was unable to fit model into system memory by reducing context, abort"); |
373 | 0 | } |
374 | | |
375 | 0 | if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) { |
376 | 0 | throw common_params_fit_exception("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort"); |
377 | 0 | } |
378 | 0 | if (nd > 1) { |
379 | 0 | if (!tensor_split) { |
380 | 0 | throw common_params_fit_exception("did not provide a buffer to write the tensor_split to, abort"); |
381 | 0 | } |
382 | 0 | if (mparams->tensor_split) { |
383 | 0 | for (size_t id = 0; id < nd; id++) { |
384 | 0 | if (mparams->tensor_split[id] != 0.0f) { |
385 | 0 | throw common_params_fit_exception("model_params::tensor_split already set by user, abort"); |
386 | 0 | } |
387 | 0 | } |
388 | 0 | } |
389 | 0 | if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) { |
390 | 0 | throw common_params_fit_exception("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort"); |
391 | 0 | } |
392 | 0 | } |
393 | 0 | if (!tensor_buft_overrides) { |
394 | 0 | throw common_params_fit_exception("did not provide buffer to set tensor_buft_overrides, abort"); |
395 | 0 | } |
396 | 0 | if (mparams->tensor_buft_overrides && (mparams->tensor_buft_overrides->pattern || mparams->tensor_buft_overrides->buft)) { |
397 | 0 | throw common_params_fit_exception("model_params::tensor_buft_overrides already set by user, abort"); |
398 | 0 | } |
399 | | |
400 | | // step 3: iteratively fill the back to front with "dense" layers |
401 | | // - for a dense model simply fill full layers, giving each device a contiguous slice of the model |
402 | | // - for a MoE model, same as dense model but with all MoE tensors in system memory |
403 | | |
404 | | // utility function that returns a static C string matching the tensors for a specific layer index and layer fraction: |
405 | 0 | auto get_overflow_pattern = [&](const size_t il, const common_layer_fraction_t lf) -> const char * { |
406 | 0 | constexpr size_t n_strings = 1000; |
407 | 0 | if (il >= n_strings) { |
408 | 0 | throw std::runtime_error("at most " + std::to_string(n_strings) + " model layers are supported"); |
409 | 0 | } |
410 | 0 | switch (lf) { |
411 | 0 | case LAYER_FRACTION_ATTN: { |
412 | 0 | static std::array<std::string, n_strings> patterns; |
413 | 0 | if (patterns[il].empty()) { |
414 | 0 | patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(gate|up|gate_up|down).*"; |
415 | 0 | } |
416 | 0 | return patterns[il].c_str(); |
417 | 0 | } |
418 | 0 | case LAYER_FRACTION_UP: { |
419 | 0 | static std::array<std::string, n_strings> patterns; |
420 | 0 | if (patterns[il].empty()) { |
421 | 0 | patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(gate|gate_up|down).*"; |
422 | 0 | } |
423 | 0 | return patterns[il].c_str(); |
424 | 0 | } |
425 | 0 | case LAYER_FRACTION_GATE: { |
426 | 0 | static std::array<std::string, n_strings> patterns; |
427 | 0 | if (patterns[il].empty()) { |
428 | 0 | patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_down.*"; |
429 | 0 | } |
430 | 0 | return patterns[il].c_str(); |
431 | 0 | } |
432 | 0 | case LAYER_FRACTION_MOE: { |
433 | 0 | static std::array<std::string, n_strings> patterns; |
434 | 0 | if (patterns[il].empty()) { |
435 | 0 | patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(up|down|gate_up|gate)_(ch|)exps"; |
436 | 0 | } |
437 | 0 | return patterns[il].c_str(); |
438 | 0 | } |
439 | 0 | default: |
440 | 0 | GGML_ABORT("fatal error"); |
441 | 0 | } |
442 | 0 | }; |
443 | |
|
444 | 0 | struct ngl_t { |
445 | 0 | uint32_t n_layer = 0; // number of total layers |
446 | 0 | uint32_t n_part = 0; // number of partial layers, <= n_layer |
447 | | |
448 | | // for the first partial layer varying parts can overflow, all further layers use LAYER_FRACTION_MOE: |
449 | 0 | common_layer_fraction_t overflow_type = LAYER_FRACTION_MOE; |
450 | |
|
451 | 0 | uint32_t n_full() const { |
452 | 0 | assert(n_layer >= n_part); |
453 | 0 | return n_layer - n_part; |
454 | 0 | } |
455 | 0 | }; |
456 | |
|
457 | 0 | const size_t ntbo = llama_max_tensor_buft_overrides(); |
458 | | |
459 | | // utility function to set n_gpu_layers and tensor_split |
460 | 0 | auto set_ngl_tensor_split_tbo = [&]( |
461 | 0 | const std::vector<ngl_t> & ngl_per_device, |
462 | 0 | const std::vector<ggml_backend_buffer_type_t> & overflow_bufts, |
463 | 0 | llama_model_params & mparams) { |
464 | 0 | mparams.n_gpu_layers = 0; |
465 | 0 | for (size_t id = 0; id < nd; id++) { |
466 | 0 | mparams.n_gpu_layers += ngl_per_device[id].n_layer; |
467 | 0 | if (nd > 1) { |
468 | 0 | tensor_split[id] = ngl_per_device[id].n_layer; |
469 | 0 | } |
470 | 0 | } |
471 | 0 | assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl + 1); |
472 | 0 | uint32_t il0 = hp_ngl + 1 - mparams.n_gpu_layers; // start index for tensor buft overrides |
473 | |
|
474 | 0 | mparams.tensor_split = tensor_split; |
475 | |
|
476 | 0 | size_t itbo = 0; |
477 | 0 | for (size_t id = 0; id < nd; id++) { |
478 | 0 | il0 += ngl_per_device[id].n_full(); |
479 | 0 | for (uint32_t il = il0; il < il0 + ngl_per_device[id].n_part; il++) { |
480 | 0 | if (itbo + 1 >= ntbo) { |
481 | 0 | tensor_buft_overrides[itbo].pattern = nullptr; |
482 | 0 | tensor_buft_overrides[itbo].buft = nullptr; |
483 | 0 | itbo++; |
484 | 0 | mparams.tensor_buft_overrides = tensor_buft_overrides; |
485 | 0 | throw common_params_fit_exception("llama_max_tensor_buft_overrides() == " |
486 | 0 | + std::to_string(ntbo) + " is insufficient for model"); |
487 | 0 | } |
488 | 0 | tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE); |
489 | 0 | tensor_buft_overrides[itbo].buft = il == il0 ? overflow_bufts[id] : ggml_backend_cpu_buffer_type(); |
490 | 0 | itbo++; |
491 | 0 | } |
492 | 0 | il0 += ngl_per_device[id].n_part; |
493 | 0 | } |
494 | 0 | tensor_buft_overrides[itbo].pattern = nullptr; |
495 | 0 | tensor_buft_overrides[itbo].buft = nullptr; |
496 | 0 | itbo++; |
497 | 0 | mparams.tensor_buft_overrides = tensor_buft_overrides; |
498 | 0 | }; |
499 | | |
500 | | // utility function that returns the memory use per device for given numbers of layers per device |
501 | 0 | auto get_memory_for_layers = [&]( |
502 | 0 | const char * func_name, |
503 | 0 | const std::vector<ngl_t> & ngl_per_device, |
504 | 0 | const std::vector<ggml_backend_buffer_type_t> & overflow_bufts) -> std::vector<int64_t> { |
505 | 0 | llama_model_params mparams_copy = *mparams; |
506 | 0 | set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy); |
507 | |
|
508 | 0 | const dmds_t dmd_nl = common_get_device_memory_data_impl( |
509 | 0 | path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level); |
510 | |
|
511 | 0 | LOG_TRC("%s: memory for test allocation by device:\n", func_name); |
512 | 0 | for (size_t id = 0; id < nd; id++) { |
513 | 0 | const ngl_t & n = ngl_per_device[id]; |
514 | 0 | LOG_TRC( |
515 | 0 | "%s: id=%zu, n_layer=%2" PRIu32 ", n_part=%2" PRIu32 ", overflow_type=%d, mem=%6" PRId64 " MiB\n", |
516 | 0 | func_name, id, n.n_layer, n.n_part, int(n.overflow_type), dmd_nl[id].mb.total()/MiB); |
517 | 0 | } |
518 | |
|
519 | 0 | std::vector<int64_t> ret; |
520 | 0 | ret.reserve(nd); |
521 | 0 | for (size_t id = 0; id < nd; id++) { |
522 | 0 | ret.push_back(dmd_nl[id].mb.total()); |
523 | 0 | } |
524 | 0 | return ret; |
525 | 0 | }; |
526 | |
|
527 | 0 | int64_t global_surplus_cpu_moe = 0; |
528 | 0 | if (hp_nex > 0) { |
529 | 0 | const static std::string pattern_moe_all = "blk\\.\\d+\\.ffn_(up|down|gate_up|gate)_(ch|)exps"; // matches all MoE tensors |
530 | 0 | ggml_backend_buffer_type_t cpu_buft = ggml_backend_cpu_buffer_type(); |
531 | 0 | tensor_buft_overrides[0] = {pattern_moe_all.c_str(), cpu_buft}; |
532 | 0 | tensor_buft_overrides[1] = {nullptr, nullptr}; |
533 | 0 | mparams->tensor_buft_overrides = tensor_buft_overrides; |
534 | |
|
535 | 0 | LOG_TRC("%s: getting device memory data with all MoE tensors moved to system memory:\n", __func__); |
536 | 0 | const dmds_t dmds_cpu_moe = common_get_device_memory_data_impl( |
537 | 0 | path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level); |
538 | |
|
539 | 0 | for (size_t id = 0; id < nd; id++) { |
540 | 0 | global_surplus_cpu_moe += dmds_cpu_moe[id].free; |
541 | 0 | global_surplus_cpu_moe -= int64_t(dmds_cpu_moe[id].mb.total()) + margins[id]; |
542 | 0 | } |
543 | |
|
544 | 0 | if (global_surplus_cpu_moe > 0) { |
545 | 0 | LOG_TRC("%s: with only dense weights in device memory there is a total surplus of %" PRId64 " MiB\n", |
546 | 0 | __func__, global_surplus_cpu_moe/MiB); |
547 | 0 | } else { |
548 | 0 | LOG_TRC("%s: with only dense weights in device memory there is still a total deficit of %" PRId64 " MiB\n", |
549 | 0 | __func__, -global_surplus_cpu_moe/MiB); |
550 | 0 | } |
551 | | |
552 | | // reset |
553 | 0 | tensor_buft_overrides[0] = {nullptr, nullptr}; |
554 | 0 | mparams->tensor_buft_overrides = tensor_buft_overrides; |
555 | 0 | } |
556 | |
|
557 | 0 | std::vector<int64_t> targets; // maximum acceptable memory use per device |
558 | 0 | targets.reserve(nd); |
559 | 0 | for (size_t id = 0; id < nd; id++) { |
560 | 0 | targets.push_back(dmds_full[id].free - margins[id]); |
561 | 0 | LOG_TRC("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB); |
562 | 0 | } |
563 | |
|
564 | 0 | std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the first partial layer of a device overflows to: |
565 | 0 | overflow_bufts.reserve(nd); |
566 | 0 | for (size_t id = 0; id < nd; id++) { |
567 | 0 | overflow_bufts.push_back(ggml_backend_cpu_buffer_type()); |
568 | 0 | } |
569 | |
|
570 | 0 | std::vector<ngl_t> ngl_per_device(nd); |
571 | 0 | std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts); |
572 | | |
573 | | // optimize the number of layers per device using the method of false position: |
574 | | // - ngl_per_device has 0 layers for each device, lower bound |
575 | | // - try a "high" configuration where a device is given all unassigned layers |
576 | | // - interpolate the memory use / layer between low and high linearly to get a guess where it meets our target |
577 | | // - check memory use of our guess, replace either the low or high bound |
578 | | // - once we only have a difference of a single layer, stop and return the lower bound that just barely still fits |
579 | | // - the last device has the output layer, which cannot be a partial layer |
580 | 0 | if (hp_nex == 0) { |
581 | 0 | LOG_TRC("%s: filling dense layers back-to-front:\n", __func__); |
582 | 0 | } else { |
583 | 0 | LOG_TRC("%s: filling dense-only layers back-to-front:\n", __func__); |
584 | 0 | } |
585 | 0 | for (int id = nd - 1; id >= 0; id--) { |
586 | 0 | uint32_t n_unassigned = hp_ngl + 1; |
587 | 0 | for (size_t jd = id + 1; jd < nd; ++jd) { |
588 | 0 | assert(n_unassigned >= ngl_per_device[jd].n_layer); |
589 | 0 | n_unassigned -= ngl_per_device[jd].n_layer; |
590 | 0 | } |
591 | |
|
592 | 0 | std::vector<ngl_t> ngl_per_device_high = ngl_per_device; |
593 | 0 | ngl_per_device_high[id].n_layer = n_unassigned; |
594 | 0 | if (hp_nex > 0) { |
595 | 0 | ngl_per_device_high[id].n_part = size_t(id) < nd - 1 ? ngl_per_device_high[id].n_layer : ngl_per_device_high[id].n_layer - 1; |
596 | 0 | } |
597 | 0 | if (ngl_per_device_high[id].n_layer > 0) { |
598 | 0 | std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts); |
599 | 0 | if (mem_high[id] > targets[id]) { |
600 | 0 | assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer); |
601 | 0 | uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer; |
602 | 0 | LOG_TRC("%s: start filling device %" PRIu32 ", delta=%" PRIu32 "\n", __func__, id, delta); |
603 | 0 | while (delta > 1) { |
604 | 0 | uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]); |
605 | 0 | step_size = std::max(step_size, uint32_t(1)); |
606 | 0 | step_size = std::min(step_size, delta - 1); |
607 | |
|
608 | 0 | std::vector<ngl_t> ngl_per_device_test = ngl_per_device; |
609 | 0 | ngl_per_device_test[id].n_layer += step_size; |
610 | 0 | if (hp_nex) { |
611 | 0 | ngl_per_device_test[id].n_part += size_t(id) == nd - 1 && ngl_per_device_test[id].n_part == 0 ? |
612 | 0 | step_size - 1 : step_size; // the first layer is the output layer which must always be full |
613 | 0 | } |
614 | 0 | const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts); |
615 | |
|
616 | 0 | if (mem_test[id] <= targets[id]) { |
617 | 0 | ngl_per_device = ngl_per_device_test; |
618 | 0 | mem = mem_test; |
619 | 0 | LOG_TRC("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer); |
620 | 0 | } else { |
621 | 0 | ngl_per_device_high = ngl_per_device_test; |
622 | 0 | mem_high = mem_test; |
623 | 0 | LOG_TRC("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device_high[id].n_layer); |
624 | 0 | } |
625 | 0 | delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer; |
626 | 0 | } |
627 | 0 | } else { |
628 | 0 | assert(ngl_per_device_high[id].n_layer == n_unassigned); |
629 | 0 | ngl_per_device = ngl_per_device_high; |
630 | 0 | mem = mem_high; |
631 | 0 | LOG_TRC("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer); |
632 | 0 | } |
633 | 0 | } |
634 | |
|
635 | 0 | const int64_t projected_margin = dmds_full[id].free - mem[id]; |
636 | 0 | LOG_TRC( |
637 | 0 | "%s: - %s: %2" PRIu32 " layers, %6" PRId64 " MiB used, %6" PRId64 " MiB free\n", |
638 | 0 | __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, mem[id]/MiB, projected_margin/MiB); |
639 | 0 | } |
640 | 0 | if (hp_nex == 0 || global_surplus_cpu_moe <= 0) { |
641 | 0 | set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams); |
642 | 0 | return; |
643 | 0 | } |
644 | | |
645 | | // step 4: for a MoE model where all dense tensors fit, |
646 | | // convert the dense-only layers in the back to full layers in the front until all devices are full |
647 | | // essentially the same procedure as for the dense-only layers except front-to-back |
648 | | // also, try fitting at least part of one more layer to reduce waste for "small" GPUs with e.g. 24 GiB VRAM |
649 | | |
650 | 0 | size_t id_dense_start = nd; |
651 | 0 | for (int id = nd - 1; id >= 0; id--) { |
652 | 0 | if (ngl_per_device[id].n_layer > 0) { |
653 | 0 | id_dense_start = id; |
654 | 0 | continue; |
655 | 0 | } |
656 | 0 | break; |
657 | 0 | } |
658 | 0 | assert(id_dense_start < nd); |
659 | |
|
660 | 0 | LOG_TRC("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__); |
661 | 0 | for (size_t id = 0; id <= id_dense_start && id_dense_start < nd; id++) { |
662 | 0 | std::vector<ngl_t> ngl_per_device_high = ngl_per_device; |
663 | 0 | for (size_t jd = id_dense_start; jd < nd; jd++) { |
664 | 0 | const uint32_t n_layer_move = jd < nd - 1 ? ngl_per_device_high[jd].n_layer : ngl_per_device_high[jd].n_layer - 1; |
665 | 0 | ngl_per_device_high[id].n_layer += n_layer_move; |
666 | 0 | ngl_per_device_high[jd].n_layer -= n_layer_move; |
667 | 0 | ngl_per_device_high[jd].n_part = 0; |
668 | 0 | } |
669 | 0 | size_t id_dense_start_high = nd - 1; |
670 | 0 | std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts); |
671 | |
|
672 | 0 | if (mem_high[id] > targets[id]) { |
673 | 0 | assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full()); |
674 | 0 | uint32_t delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full(); |
675 | 0 | while (delta > 1) { |
676 | 0 | uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]); |
677 | 0 | step_size = std::max(step_size, uint32_t(1)); |
678 | 0 | step_size = std::min(step_size, delta - 1); |
679 | |
|
680 | 0 | std::vector<ngl_t> ngl_per_device_test = ngl_per_device; |
681 | 0 | size_t id_dense_start_test = id_dense_start; |
682 | 0 | uint32_t n_converted_test = 0; |
683 | 0 | for (;id_dense_start_test < nd; id_dense_start_test++) { |
684 | 0 | const uint32_t n_convert_jd = std::min(step_size - n_converted_test, ngl_per_device_test[id_dense_start_test].n_part); |
685 | 0 | ngl_per_device_test[id_dense_start_test].n_layer -= n_convert_jd; |
686 | 0 | ngl_per_device_test[id_dense_start_test].n_part -= n_convert_jd; |
687 | 0 | ngl_per_device_test[id].n_layer += n_convert_jd; |
688 | 0 | n_converted_test += n_convert_jd; |
689 | |
|
690 | 0 | if (ngl_per_device_test[id_dense_start_test].n_part > 0) { |
691 | 0 | break; |
692 | 0 | } |
693 | 0 | } |
694 | 0 | const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts); |
695 | |
|
696 | 0 | if (mem_test[id] <= targets[id]) { |
697 | 0 | ngl_per_device = ngl_per_device_test; |
698 | 0 | mem = mem_test; |
699 | 0 | id_dense_start = id_dense_start_test; |
700 | 0 | LOG_TRC("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n", |
701 | 0 | __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start); |
702 | 0 | } else { |
703 | 0 | ngl_per_device_high = ngl_per_device_test; |
704 | 0 | mem_high = mem_test; |
705 | 0 | id_dense_start_high = id_dense_start_test; |
706 | 0 | LOG_TRC("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n", |
707 | 0 | __func__, id, ngl_per_device_high[id].n_layer, ngl_per_device_high[id].n_part, id_dense_start_high); |
708 | 0 | } |
709 | 0 | assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full()); |
710 | 0 | delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full(); |
711 | 0 | } |
712 | 0 | } else { |
713 | 0 | ngl_per_device = ngl_per_device_high; |
714 | 0 | mem = mem_high; |
715 | 0 | id_dense_start = id_dense_start_high; |
716 | 0 | LOG_TRC("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n", |
717 | 0 | __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start); |
718 | 0 | } |
719 | | |
720 | | // try to fit at least part of one more layer |
721 | 0 | if (ngl_per_device[id_dense_start].n_layer > (id < nd - 1 ? 0 : 1)) { |
722 | 0 | std::vector<ngl_t> ngl_per_device_test = ngl_per_device; |
723 | 0 | size_t id_dense_start_test = id_dense_start; |
724 | 0 | ngl_per_device_test[id_dense_start_test].n_layer--; |
725 | 0 | ngl_per_device_test[id_dense_start_test].n_part--; |
726 | 0 | ngl_per_device_test[id].n_layer++; |
727 | 0 | ngl_per_device_test[id].n_part++; |
728 | 0 | if (ngl_per_device_test[id_dense_start_test].n_part == 0) { |
729 | 0 | id_dense_start_test++; |
730 | 0 | } |
731 | 0 | ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP; |
732 | 0 | std::vector<ggml_backend_buffer_type_t> overflow_bufts_test = overflow_bufts; |
733 | 0 | if (id < nd - 1) { |
734 | 0 | overflow_bufts_test[id] = ggml_backend_dev_buffer_type(devs[id + 1]); |
735 | 0 | } |
736 | 0 | LOG_TRC("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__); |
737 | 0 | std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test); |
738 | 0 | if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) { |
739 | 0 | ngl_per_device = ngl_per_device_test; |
740 | 0 | overflow_bufts = overflow_bufts_test; |
741 | 0 | mem = mem_test; |
742 | 0 | id_dense_start = id_dense_start_test; |
743 | 0 | LOG_TRC("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n", |
744 | 0 | __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start); |
745 | |
|
746 | 0 | ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE; |
747 | 0 | LOG_TRC("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__); |
748 | 0 | mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test); |
749 | 0 | if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) { |
750 | 0 | ngl_per_device = ngl_per_device_test; |
751 | 0 | overflow_bufts = overflow_bufts_test; |
752 | 0 | mem = mem_test; |
753 | 0 | id_dense_start = id_dense_start_test; |
754 | 0 | LOG_TRC("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n", |
755 | 0 | __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start); |
756 | 0 | } |
757 | 0 | } else { |
758 | 0 | ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN; |
759 | 0 | LOG_TRC("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__); |
760 | 0 | mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test); |
761 | 0 | if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) { |
762 | 0 | ngl_per_device = ngl_per_device_test; |
763 | 0 | overflow_bufts = overflow_bufts_test; |
764 | 0 | mem = mem_test; |
765 | 0 | id_dense_start = id_dense_start_test; |
766 | 0 | LOG_TRC("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n", |
767 | 0 | __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start); |
768 | 0 | } |
769 | 0 | } |
770 | 0 | } |
771 | |
|
772 | 0 | const int64_t projected_margin = dmds_full[id].free - mem[id]; |
773 | 0 | LOG_TRC( |
774 | 0 | "%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n", |
775 | 0 | __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB); |
776 | 0 | } |
777 | | |
778 | | // print info for devices that were not changed during the conversion from dense only to full layers: |
779 | 0 | for (size_t id = id_dense_start + 1; id < nd; id++) { |
780 | 0 | const int64_t projected_margin = dmds_full[id].free - mem[id]; |
781 | 0 | LOG_TRC( |
782 | 0 | "%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n", |
783 | 0 | __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB); |
784 | 0 | } |
785 | |
|
786 | 0 | set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams); |
787 | 0 | } |
788 | | |
789 | | enum common_params_fit_status common_fit_params( |
790 | | const char * path_model, |
791 | | llama_model_params * mparams, |
792 | | llama_context_params * cparams, |
793 | | float * tensor_split, |
794 | | llama_model_tensor_buft_override * tensor_buft_overrides, |
795 | | size_t * margins, |
796 | | uint32_t n_ctx_min, |
797 | 0 | ggml_log_level log_level) { |
798 | 0 | const int64_t t0_us = llama_time_us(); |
799 | 0 | common_params_fit_status status = COMMON_PARAMS_FIT_STATUS_SUCCESS; |
800 | 0 | try { |
801 | 0 | common_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margins, n_ctx_min, log_level); |
802 | 0 | LOG_TRC("%s: successfully fit params to free device memory\n", __func__); |
803 | 0 | } catch (const common_params_fit_exception & e) { |
804 | 0 | LOG_WRN("%s: failed to fit params to free device memory: %s\n", __func__, e.what()); |
805 | 0 | status = COMMON_PARAMS_FIT_STATUS_FAILURE; |
806 | 0 | } catch (const std::runtime_error & e) { |
807 | 0 | LOG_ERR("%s: encountered an error while trying to fit params to free device memory: %s\n", __func__, e.what()); |
808 | 0 | status = COMMON_PARAMS_FIT_STATUS_ERROR; |
809 | 0 | } |
810 | 0 | const int64_t t1_us = llama_time_us(); |
811 | 0 | LOG_TRC("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6); |
812 | 0 | return status; |
813 | 0 | } |
814 | | |
815 | 0 | void common_memory_breakdown_print(const struct llama_context * ctx) { |
816 | | //const auto & devices = ctx->get_model().devices; |
817 | 0 | const auto * model = llama_get_model(ctx); |
818 | |
|
819 | 0 | std::vector<ggml_backend_dev_t> devices; |
820 | 0 | for (int i = 0; i < llama_model_n_devices(model); i++) { |
821 | 0 | devices.push_back(llama_model_get_device(model, i)); |
822 | 0 | } |
823 | |
|
824 | 0 | llama_memory_breakdown memory_breakdown = llama_get_memory_breakdown(ctx); |
825 | |
|
826 | 0 | std::vector<std::array<std::string, 9>> table_data; |
827 | 0 | table_data.reserve(devices.size()); |
828 | 0 | const std::string template_header = "%s: | %s | %s %s %s %s %s %s %s |\n"; |
829 | 0 | const std::string template_gpu = "%s: | %s | %s = %s + (%s = %s + %s + %s) + %s |\n"; |
830 | 0 | const std::string template_other = "%s: | %s | %s %s %s = %s + %s + %s %s |\n"; |
831 | |
|
832 | 0 | table_data.push_back({template_header, "memory breakdown [MiB]", "total", "free", "self", "model", "context", "compute", "unaccounted"}); |
833 | |
|
834 | 0 | constexpr size_t MiB = 1024 * 1024; |
835 | 0 | const std::vector<std::string> desc_prefixes_strip = {"NVIDIA ", "GeForce ", "Tesla ", "AMD ", "Radeon ", "Instinct "}; |
836 | | |
837 | | // track seen buffer types to avoid double counting: |
838 | 0 | std::set<ggml_backend_buffer_type_t> seen_buffer_types; |
839 | | |
840 | | // accumulative memory breakdown for each device and for host: |
841 | 0 | std::vector<llama_memory_breakdown_data> mb_dev(devices.size()); |
842 | 0 | llama_memory_breakdown_data mb_host; |
843 | |
|
844 | 0 | for (const auto & buft_mb : memory_breakdown) { |
845 | 0 | ggml_backend_buffer_type_t buft = buft_mb.first; |
846 | 0 | const llama_memory_breakdown_data & mb = buft_mb.second; |
847 | 0 | if (ggml_backend_buft_is_host(buft)) { |
848 | 0 | mb_host.model += mb.model; |
849 | 0 | mb_host.context += mb.context; |
850 | 0 | mb_host.compute += mb.compute; |
851 | 0 | seen_buffer_types.insert(buft); |
852 | 0 | continue; |
853 | 0 | } |
854 | 0 | ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft); |
855 | 0 | if (dev) { |
856 | 0 | int i_dev = -1; |
857 | 0 | for (size_t i = 0; i < devices.size(); i++) { |
858 | 0 | if (devices[i] == dev) { |
859 | 0 | i_dev = i; |
860 | 0 | break; |
861 | 0 | } |
862 | 0 | } |
863 | 0 | if (i_dev != -1) { |
864 | 0 | mb_dev[i_dev].model += mb.model; |
865 | 0 | mb_dev[i_dev].context += mb.context; |
866 | 0 | mb_dev[i_dev].compute += mb.compute; |
867 | 0 | seen_buffer_types.insert(buft); |
868 | 0 | continue; |
869 | 0 | } |
870 | 0 | } |
871 | 0 | } |
872 | | |
873 | | // print memory breakdown for each device: |
874 | 0 | for (size_t i = 0; i < devices.size(); i++) { |
875 | 0 | ggml_backend_dev_t dev = devices[i]; |
876 | 0 | llama_memory_breakdown_data mb = mb_dev[i]; |
877 | |
|
878 | 0 | const std::string name = ggml_backend_dev_name(dev); |
879 | 0 | std::string desc = ggml_backend_dev_description(dev); |
880 | 0 | for (const std::string & prefix : desc_prefixes_strip) { |
881 | 0 | if (desc.length() >= prefix.length() && desc.substr(0, prefix.length()) == prefix) { |
882 | 0 | desc = desc.substr(prefix.length()); |
883 | 0 | } |
884 | 0 | } |
885 | |
|
886 | 0 | size_t free, total; |
887 | 0 | ggml_backend_dev_memory(dev, &free, &total); |
888 | |
|
889 | 0 | const size_t self = mb.model + mb.context + mb.compute; |
890 | 0 | const int64_t unaccounted = static_cast<int64_t>(total) - static_cast<int64_t>(free) - static_cast<int64_t>(self); |
891 | |
|
892 | 0 | table_data.push_back({ |
893 | 0 | template_gpu, |
894 | 0 | " - " + name + " (" + desc + ")", |
895 | 0 | std::to_string(total / MiB), |
896 | 0 | std::to_string(free / MiB), |
897 | 0 | std::to_string(self / MiB), |
898 | 0 | std::to_string(mb.model / MiB), |
899 | 0 | std::to_string(mb.context / MiB), |
900 | 0 | std::to_string(mb.compute / MiB), |
901 | 0 | std::to_string(unaccounted / static_cast<int64_t>(MiB))}); |
902 | 0 | } |
903 | | |
904 | | // print memory breakdown for host: |
905 | 0 | { |
906 | 0 | const size_t self = mb_host.model + mb_host.context + mb_host.compute; |
907 | 0 | table_data.push_back({ |
908 | 0 | template_other, |
909 | 0 | " - Host", |
910 | 0 | "", // total |
911 | 0 | "", // free |
912 | 0 | std::to_string(self / MiB), |
913 | 0 | std::to_string(mb_host.model / MiB), |
914 | 0 | std::to_string(mb_host.context / MiB), |
915 | 0 | std::to_string(mb_host.compute / MiB), |
916 | 0 | ""}); // unaccounted |
917 | 0 | } |
918 | | |
919 | | // print memory breakdown for all remaining buffer types: |
920 | 0 | for (const auto & buft_mb : memory_breakdown) { |
921 | 0 | ggml_backend_buffer_type_t buft = buft_mb.first; |
922 | 0 | const llama_memory_breakdown_data & mb = buft_mb.second; |
923 | 0 | if (seen_buffer_types.count(buft) == 1) { |
924 | 0 | continue; |
925 | 0 | } |
926 | 0 | const std::string name = ggml_backend_buft_name(buft); |
927 | 0 | const size_t self = mb.model + mb.context + mb.compute; |
928 | 0 | table_data.push_back({ |
929 | 0 | template_other, |
930 | 0 | " - " + name, |
931 | 0 | "", // total |
932 | 0 | "", // free |
933 | 0 | std::to_string(self / MiB), |
934 | 0 | std::to_string(mb.model / MiB), |
935 | 0 | std::to_string(mb.context / MiB), |
936 | 0 | std::to_string(mb.compute / MiB), |
937 | 0 | ""}); // unaccounted |
938 | 0 | seen_buffer_types.insert(buft); |
939 | 0 | } |
940 | |
|
941 | 0 | for (size_t j = 1; j < table_data[0].size(); j++) { |
942 | 0 | size_t max_len = 0; |
943 | 0 | for (const auto & td : table_data) { |
944 | 0 | max_len = std::max(max_len, td[j].length()); |
945 | 0 | } |
946 | 0 | for (auto & td : table_data) { |
947 | 0 | td[j].insert(j == 1 ? td[j].length() : 0, max_len - td[j].length(), ' '); |
948 | 0 | } |
949 | 0 | } |
950 | 0 | for (const auto & td : table_data) { |
951 | 0 | LOG_TRC(td[0].c_str(), |
952 | 0 | __func__, td[1].c_str(), td[2].c_str(), td[3].c_str(), td[4].c_str(), td[5].c_str(), |
953 | 0 | td[6].c_str(), td[7].c_str(), td[8].c_str()); |
954 | 0 | } |
955 | 0 | } |
956 | | |
957 | | void common_fit_print( |
958 | | const char * path_model, |
959 | | llama_model_params * mparams, |
960 | 0 | llama_context_params * cparams) { |
961 | 0 | std::vector<ggml_backend_dev_t> devs; |
962 | 0 | uint32_t hp_ngl = 0; // hparams.n_gpu_layers |
963 | 0 | uint32_t hp_nct = 0; // hparams.n_ctx_train |
964 | 0 | uint32_t hp_nex = 0; // hparams.n_expert |
965 | |
|
966 | 0 | auto dmd = common_get_device_memory_data_impl(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, GGML_LOG_LEVEL_ERROR); |
967 | 0 | GGML_ASSERT(dmd.size() == devs.size() + 1); |
968 | |
|
969 | 0 | for (size_t id = 0; id < devs.size(); id++) { |
970 | 0 | printf("%s ", ggml_backend_dev_name(devs[id])); |
971 | 0 | printf("%zu ", dmd[id].mb.model/1024/1024); |
972 | 0 | printf("%zu ", dmd[id].mb.context/1024/1024); |
973 | 0 | printf("%zu ", dmd[id].mb.compute/1024/1024); |
974 | 0 | printf("\n"); |
975 | 0 | } |
976 | |
|
977 | 0 | printf("Host "); |
978 | 0 | printf("%zu ", dmd.back().mb.model/1024/1024); |
979 | 0 | printf("%zu ", dmd.back().mb.context/1024/1024); |
980 | 0 | printf("%zu ", dmd.back().mb.compute/1024/1024); |
981 | 0 | printf("\n"); |
982 | 0 | } |