/src/llama.cpp/src/llama.cpp
Line | Count | Source |
1 | | #include "llama-impl.h" |
2 | | |
3 | | #include "llama-chat.h" |
4 | | #include "llama-mmap.h" |
5 | | #include "llama-vocab.h" |
6 | | #include "llama-model-loader.h" |
7 | | #include "llama-model-saver.h" |
8 | | #include "llama-model.h" |
9 | | |
10 | | #include "ggml.h" |
11 | | #include "ggml-backend.h" |
12 | | |
13 | | #include <algorithm> |
14 | | #include <cstddef> |
15 | | #include <cstdint> |
16 | | #include <cstdio> |
17 | | #include <cstring> |
18 | | #include <ctime> |
19 | | |
20 | | #if defined(_MSC_VER) |
21 | | #pragma warning(disable: 4244 4267) // possible loss of data |
22 | | #endif |
23 | | |
24 | | // |
25 | | // interface implementation |
26 | | // |
27 | | |
28 | 0 | const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type) { |
29 | 0 | switch (flash_attn_type) { |
30 | 0 | case LLAMA_FLASH_ATTN_TYPE_AUTO: |
31 | 0 | return "auto"; |
32 | 0 | case LLAMA_FLASH_ATTN_TYPE_DISABLED: |
33 | 0 | return "disabled"; |
34 | 0 | case LLAMA_FLASH_ATTN_TYPE_ENABLED: |
35 | 0 | return "enabled"; |
36 | 0 | } |
37 | 0 | GGML_ABORT("fatal error"); |
38 | 0 | } |
39 | | |
40 | 0 | struct llama_sampler_chain_params llama_sampler_chain_default_params() { |
41 | 0 | struct llama_sampler_chain_params result = { |
42 | 0 | /*.no_perf =*/ true, |
43 | 0 | }; |
44 | | 
45 | 0 | return result; |
46 | 0 | } |
47 | | |
48 | 0 | size_t llama_max_devices(void) { |
49 | 0 | return 16; |
50 | 0 | } |
51 | | |
52 | 0 | bool llama_supports_mmap(void) { |
53 | 0 | return llama_mmap::SUPPORTED; |
54 | 0 | } |
55 | | |
56 | 0 | bool llama_supports_mlock(void) { |
57 | 0 | return llama_mlock::SUPPORTED; |
58 | 0 | } |
59 | | |
60 | 0 | bool llama_supports_gpu_offload(void) { |
61 | 0 | return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr || |
62 | 0 | ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU) != nullptr || |
63 | 0 | llama_supports_rpc(); |
64 | 0 | } |
65 | | |
66 | 0 | bool llama_supports_rpc(void) { |
67 | 0 | return ggml_backend_reg_by_name("RPC") != nullptr; |
68 | 0 | } |
69 | | |
70 | 0 | void llama_backend_init(void) { |
71 | 0 | ggml_time_init(); |
72 | | |
73 | | // needed to initialize f16 tables |
74 | 0 | { |
75 | 0 | struct ggml_init_params params = { 0, NULL, false }; |
76 | 0 | struct ggml_context * ctx = ggml_init(params); |
77 | 0 | ggml_free(ctx); |
78 | 0 | } |
79 | 0 | } |
80 | | |
81 | 0 | void llama_numa_init(enum ggml_numa_strategy numa) { |
82 | 0 | if (numa != GGML_NUMA_STRATEGY_DISABLED) { |
83 | 0 | auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); |
84 | 0 | GGML_ASSERT(dev && "CPU backend is not loaded"); |
85 | 0 | auto * reg = ggml_backend_dev_backend_reg(dev); |
86 | 0 | auto * numa_init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_numa_init"); |
87 | 0 | if (numa_init_fn) { |
88 | 0 | numa_init_fn(numa); |
89 | 0 | } |
90 | 0 | } |
91 | 0 | } |
92 | | |
93 | 0 | void llama_backend_free(void) { |
94 | 0 | ggml_quantize_free(); |
95 | 0 | } |
96 | | |
97 | 0 | int64_t llama_time_us(void) { |
98 | 0 | return ggml_time_us(); |
99 | 0 | } |
100 | | |
101 | | // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback |
102 | 0 | static int llama_model_load(const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) { |
103 | | // loading time will be recalculated after the first eval, so |
104 | | // we take page faults deferred by mmap() into consideration |
105 | 0 | model.t_load_us = 0; |
106 | 0 | time_meas tm(model.t_load_us); |
107 | | 
108 | 0 | model.t_start_us = tm.t_start_us; |
109 | | 
110 | 0 | try { |
111 | 0 | llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides, params.tensor_buft_overrides); |
112 | | 
113 | 0 | ml.print_info(); |
114 | | 
115 | 0 | model.hparams.vocab_only = params.vocab_only; |
116 | | 
117 | 0 | try { |
118 | 0 | model.load_arch(ml); |
119 | 0 | } catch(const std::exception & e) { |
120 | 0 | throw std::runtime_error("error loading model architecture: " + std::string(e.what())); |
121 | 0 | } |
122 | 0 | try { |
123 | 0 | model.load_hparams(ml); |
124 | 0 | } catch(const std::exception & e) { |
125 | 0 | throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what())); |
126 | 0 | } |
127 | 0 | if (model.arch == LLM_ARCH_CLIP) { |
128 | 0 | throw std::runtime_error("CLIP cannot be used as main model, use it with --mmproj instead"); |
129 | 0 | } |
130 | 0 | try { |
131 | 0 | model.load_vocab(ml); |
132 | 0 | } catch(const std::exception & e) { |
133 | 0 | throw std::runtime_error("error loading model vocabulary: " + std::string(e.what())); |
134 | 0 | } |
135 | | |
136 | 0 | model.load_stats(ml); |
137 | 0 | model.print_info(); |
138 | | 
139 | 0 | if (params.vocab_only) { |
140 | 0 | LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__); |
141 | 0 | return 0; |
142 | 0 | } |
143 | | |
144 | 0 | if (!model.load_tensors(ml)) { |
145 | 0 | return -2; |
146 | 0 | } |
147 | 0 | } catch (const std::exception & err) { |
148 | 0 | LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what()); |
149 | 0 | return -1; |
150 | 0 | } |
151 | | |
152 | 0 | return 0; |
153 | 0 | } |
154 | | |
155 | | static struct llama_model * llama_model_load_from_file_impl( |
156 | | const std::string & path_model, |
157 | | std::vector<std::string> & splits, |
158 | 0 | struct llama_model_params params) { |
159 | 0 | ggml_time_init(); |
160 | | 
161 | 0 | if (!params.vocab_only && ggml_backend_reg_count() == 0) { |
162 | 0 | LLAMA_LOG_ERROR("%s: no backends are loaded. hint: use ggml_backend_load() or ggml_backend_load_all() to load a backend before calling this function\n", __func__); |
163 | 0 | return nullptr; |
164 | 0 | } |
165 | | |
166 | 0 | unsigned cur_percentage = 0; |
167 | 0 | if (params.progress_callback == NULL) { |
168 | 0 | params.progress_callback_user_data = &cur_percentage; |
169 | 0 | params.progress_callback = [](float progress, void * ctx) { |
170 | 0 | unsigned * cur_percentage_p = (unsigned *) ctx; |
171 | 0 | unsigned percentage = (unsigned) (100 * progress); |
172 | 0 | while (percentage > *cur_percentage_p) { |
173 | 0 | *cur_percentage_p = percentage; |
174 | 0 | LLAMA_LOG_CONT("."); |
175 | 0 | if (percentage >= 100) { |
176 | 0 | LLAMA_LOG_CONT("\n"); |
177 | 0 | } |
178 | 0 | } |
179 | 0 | return true; |
180 | 0 | }; |
181 | 0 | } |
182 | | 
183 | 0 | llama_model * model = new llama_model(params); |
184 | | |
185 | | // create list of devices to use with this model |
186 | 0 | if (params.devices) { |
187 | 0 | for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) { |
188 | 0 | model->devices.push_back(*dev); |
189 | 0 | } |
190 | 0 | } else { |
191 | | // default device selection |
192 | | |
193 | | // build list of available devices |
194 | 0 | std::vector<ggml_backend_dev_t> gpus; |
195 | 0 | std::vector<ggml_backend_dev_t> igpus; |
196 | 0 | std::vector<ggml_backend_dev_t> rpc_servers; |
197 | | 
198 | 0 | for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { |
199 | 0 | ggml_backend_dev_t dev = ggml_backend_dev_get(i); |
200 | 0 | switch (ggml_backend_dev_type(dev)) { |
201 | 0 | case GGML_BACKEND_DEVICE_TYPE_CPU: |
202 | 0 | case GGML_BACKEND_DEVICE_TYPE_ACCEL: |
203 | | // skip CPU backends since they are handled separately |
204 | 0 | break; |
205 | | |
206 | 0 | case GGML_BACKEND_DEVICE_TYPE_GPU: { |
207 | 0 | ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev); |
208 | 0 | if (ggml_backend_reg_name(reg) == std::string("RPC")) { |
209 | 0 | rpc_servers.push_back(dev); |
210 | 0 | } else { |
211 | | // check if there is already a GPU with the same device id |
212 | 0 | ggml_backend_dev_props props; |
213 | 0 | ggml_backend_dev_get_props(dev, &props); |
214 | 0 | auto it = std::find_if(gpus.begin(), gpus.end(), [&props](ggml_backend_dev_t d) { |
215 | 0 | ggml_backend_dev_props d_props; |
216 | 0 | ggml_backend_dev_get_props(d, &d_props); |
217 | 0 | if (props.device_id && d_props.device_id) { |
218 | 0 | return strcmp(props.device_id, d_props.device_id) == 0; |
219 | 0 | } |
220 | 0 | return false; |
221 | 0 | }); |
222 | | 
223 | 0 | if (it != gpus.end()) { |
224 | 0 | LLAMA_LOG_INFO("%s: skipping device %s (%s) with id %s - already using device %s (%s) with the same id\n", |
225 | 0 | __func__, |
226 | 0 | ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), |
227 | 0 | props.device_id ? props.device_id : "unknown id", |
228 | 0 | ggml_backend_dev_name(*it), ggml_backend_dev_description(*it)); |
229 | 0 | } else { |
230 | 0 | gpus.push_back(dev); |
231 | 0 | } |
232 | 0 | } |
233 | 0 | break; |
234 | 0 | } |
235 | | |
236 | 0 | case GGML_BACKEND_DEVICE_TYPE_IGPU: |
237 | 0 | igpus.push_back(dev); |
238 | 0 | break; |
239 | 0 | } |
240 | 0 | } |
241 | | |
242 | | // add RPC servers at the front of the list to minimize network transfers |
243 | 0 | model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end()); |
244 | | |
245 | | // add GPUs |
246 | 0 | model->devices.insert(model->devices.end(), gpus.begin(), gpus.end()); |
247 | | |
248 | | // add integrated GPUs only if no other devices were found |
249 | 0 | if (model->devices.empty()) { |
250 | 0 | model->devices.insert(model->devices.end(), igpus.begin(), igpus.end()); |
251 | 0 | } |
252 | 0 | } |
253 | | |
254 | | // if using single GPU mode, remove all except the main GPU |
255 | 0 | if (params.split_mode == LLAMA_SPLIT_MODE_NONE) { |
256 | 0 | if (params.main_gpu < 0) { |
257 | 0 | model->devices.clear(); |
258 | 0 | } else { |
259 | 0 | if (params.main_gpu >= (int)model->devices.size()) { |
260 | 0 | LLAMA_LOG_ERROR("%s: invalid value for main_gpu: %d (available devices: %zu)\n", __func__, params.main_gpu, model->devices.size()); |
261 | 0 | llama_model_free(model); |
262 | 0 | return nullptr; |
263 | 0 | } |
264 | 0 | ggml_backend_dev_t main_gpu = model->devices[params.main_gpu]; |
265 | 0 | model->devices.clear(); |
266 | 0 | model->devices.push_back(main_gpu); |
267 | 0 | } |
268 | 0 | } |
269 | | |
270 | 0 | for (auto * dev : model->devices) { |
271 | 0 | ggml_backend_dev_props props; |
272 | 0 | ggml_backend_dev_get_props(dev, &props); |
273 | 0 | LLAMA_LOG_INFO("%s: using device %s (%s) (%s) - %zu MiB free\n", __func__, |
274 | 0 | ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), |
275 | 0 | props.device_id ? props.device_id : "unknown id", |
276 | 0 | props.memory_free/1024/1024); |
277 | 0 | } |
278 | | 
279 | 0 | const int status = llama_model_load(path_model, splits, *model, params); |
280 | 0 | GGML_ASSERT(status <= 0); |
281 | 0 | if (status < 0) { |
282 | 0 | if (status == -1) { |
283 | 0 | LLAMA_LOG_ERROR("%s: failed to load model\n", __func__); |
284 | 0 | } else if (status == -2) { |
285 | 0 | LLAMA_LOG_INFO("%s: cancelled model load\n", __func__); |
286 | 0 | } |
287 | | 
288 | 0 | llama_model_free(model); |
289 | 0 | return nullptr; |
290 | 0 | } |
291 | | |
292 | 0 | return model; |
293 | 0 | } |
294 | | |
295 | | // deprecated |
296 | | struct llama_model * llama_load_model_from_file( |
297 | | const char * path_model, |
298 | 0 | struct llama_model_params params) { |
299 | 0 | return llama_model_load_from_file(path_model, params); |
300 | 0 | } |
301 | | |
302 | | struct llama_model * llama_model_load_from_file( |
303 | | const char * path_model, |
304 | 0 | struct llama_model_params params) { |
305 | 0 | std::vector<std::string> splits = {}; |
306 | 0 | return llama_model_load_from_file_impl(path_model, splits, params); |
307 | 0 | } |
308 | | |
309 | | struct llama_model * llama_model_load_from_splits( |
310 | | const char ** paths, |
311 | | size_t n_paths, |
312 | 0 | struct llama_model_params params) { |
313 | 0 | std::vector<std::string> splits; |
314 | 0 | if (n_paths == 0) { |
315 | 0 | LLAMA_LOG_ERROR("%s: list of splits is empty\n", __func__); |
316 | 0 | return nullptr; |
317 | 0 | } |
318 | 0 | splits.reserve(n_paths); |
319 | 0 | for (size_t i = 0; i < n_paths; ++i) { |
320 | 0 | splits.push_back(paths[i]); |
321 | 0 | } |
322 | 0 | return llama_model_load_from_file_impl(splits.front(), splits, params); |
323 | 0 | } |
324 | | |
325 | 0 | void llama_model_save_to_file(const struct llama_model * model, const char * path_model) { |
326 | 0 | llama_model_saver ms(*model); |
327 | 0 | ms.add_kv_from_model(); |
328 | 0 | ms.add_tensors_from_model(); |
329 | 0 | ms.save(path_model); |
330 | 0 | } |
331 | | |
332 | | // |
333 | | // chat templates |
334 | | // |
335 | | |
336 | | int32_t llama_chat_apply_template( |
337 | | const char * tmpl, |
338 | | const struct llama_chat_message * chat, |
339 | | size_t n_msg, |
340 | | bool add_ass, |
341 | | char * buf, |
342 | 1.51k | int32_t length) { |
343 | 1.51k | const std::string curr_tmpl(tmpl == nullptr ? "chatml" : tmpl); |
344 | | |
345 | | // format the chat to string |
346 | 1.51k | std::vector<const llama_chat_message *> chat_vec; |
347 | 1.51k | chat_vec.resize(n_msg); |
348 | 10.5k | for (size_t i = 0; i < n_msg; i++) { |
349 | 9.07k | chat_vec[i] = &chat[i]; |
350 | 9.07k | } |
351 | | |
352 | 1.51k | std::string formatted_chat; |
353 | 1.51k | llm_chat_template detected_tmpl = llm_chat_detect_template(curr_tmpl); |
354 | 1.51k | if (detected_tmpl == LLM_CHAT_TEMPLATE_UNKNOWN) { |
355 | 595 | return -1; |
356 | 595 | } |
357 | 917 | int32_t res = llm_chat_apply_template(detected_tmpl, chat_vec, formatted_chat, add_ass); |
358 | 917 | if (res < 0) { |
359 | 0 | return res; |
360 | 0 | } |
361 | 917 | if (buf && length > 0) { |
362 | 917 | strncpy(buf, formatted_chat.c_str(), length); |
363 | 917 | } |
364 | 917 | return res; |
365 | 917 | } |
366 | | |
367 | | // |
368 | | // model split |
369 | | // |
370 | | |
371 | 0 | int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) { |
372 | 0 | static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf"; |
373 | 0 | if (snprintf(split_path, maxlen, SPLIT_PATH_FORMAT, path_prefix, split_no + 1, split_count)) { |
374 | 0 | return strlen(split_path); |
375 | 0 | } |
376 | 0 | return 0; |
377 | 0 | } |
378 | | |
379 | 0 | int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count) { |
380 | 0 | std::string str_split_path(split_path); |
381 | 0 | char postfix[32]; |
382 | 0 | snprintf(postfix, 32, "-%05d-of-%05d.gguf", split_no + 1, split_count); |
383 | 0 | std::string str_postfix(postfix); |
384 | | |
385 | | // check if split_prefix ends with postfix |
386 | 0 | int size_prefix = str_split_path.size() - str_postfix.size(); |
387 | 0 | if (size_prefix > 0 && str_split_path.find(str_postfix, size_prefix) != std::string::npos) { |
388 | 0 | snprintf(split_prefix, std::min((size_t) size_prefix + 1, maxlen), "%s", split_path); |
389 | 0 | return size_prefix; |
390 | 0 | } |
391 | | |
392 | 0 | return 0; |
393 | 0 | } |
394 | | |
395 | 0 | const char * llama_print_system_info(void) { |
396 | 0 | static std::string s; |
397 | 0 | s.clear(); // Clear the string, since it's static, otherwise it will accumulate data from previous calls. |
398 | | 
399 | 0 | for (size_t i = 0; i < ggml_backend_reg_count(); i++) { |
400 | 0 | auto * reg = ggml_backend_reg_get(i); |
401 | 0 | auto * get_features_fn = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features"); |
402 | 0 | if (get_features_fn) { |
403 | 0 | ggml_backend_feature * features = get_features_fn(reg); |
404 | 0 | s += ggml_backend_reg_name(reg); |
405 | 0 | s += " : "; |
406 | 0 | for (; features->name; features++) { |
407 | 0 | s += features->name; |
408 | 0 | s += " = "; |
409 | 0 | s += features->value; |
410 | 0 | s += " | "; |
411 | 0 | } |
412 | 0 | } |
413 | 0 | } |
414 | | 
415 | 0 | return s.c_str(); |
416 | 0 | } |
417 | | |
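End of listing. For orientation only, the following is a minimal, hypothetical driver that exercises the public entry points defined in this file (backend init, default model loading, system info). It assumes the matching declarations in llama.h and ggml-backend.h; it is an illustrative sketch, not part of the instrumented source or the coverage data above.

// hypothetical usage sketch, assuming the public llama.h / ggml-backend.h API
#include "llama.h"
#include "ggml-backend.h"

#include <cstdio>

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s <model.gguf>\n", argv[0]);
        return 1;
    }

    // load available backends first; otherwise llama_model_load_from_file_impl
    // logs "no backends are loaded" and returns nullptr
    ggml_backend_load_all();
    llama_backend_init(); // ggml_time_init() + f16 table initialization

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file(argv[1], mparams);
    if (model == nullptr) {
        fprintf(stderr, "failed to load model '%s'\n", argv[1]);
        llama_backend_free();
        return 1;
    }

    printf("%s\n", llama_print_system_info());

    llama_model_free(model);
    llama_backend_free();
    return 0;
}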