Coverage Report

Created: 2026-03-21 06:50

Source lines that were never executed are marked with // [uncovered] at the start of each uncovered region.

/src/llama.cpp/common/common.h
// Various helper functions and utilities

#pragma once

#include "ggml-opt.h"
#include "ggml.h"
#include "llama-cpp.h"

#include <set>
#include <sstream>
#include <string>
#include <string_view>
#include <variant>
#include <vector>
#include <map>

#if defined(_WIN32) && !defined(_WIN32_WINNT)
#define _WIN32_WINNT 0x0A00
#endif

#ifdef _WIN32
#define DIRECTORY_SEPARATOR '\\'
#else
#define DIRECTORY_SEPARATOR '/' // [uncovered]
#endif // _WIN32

#define die(msg)          do { fputs("error: " msg "\n", stderr);                exit(1); } while (0)
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)

#define print_build_info() do {                                                                     \
    fprintf(stderr, "%s: build = %d (%s)\n",      __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);      \
    fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);    \
} while(0)

struct common_time_meas {
    common_time_meas(int64_t & t_acc, bool disable = false);
    ~common_time_meas();

    const int64_t t_start_us;

    int64_t & t_acc;
};
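
common_time_meas is an RAII timer. A minimal usage sketch, assuming the destructor accumulates the microseconds elapsed since construction into the referenced counter (the actual behavior is defined in common.cpp):

    int64_t t_decode_us = 0;
    {
        common_time_meas tm(t_decode_us); // timing starts here
        // ... work to be measured ...
    } // tm is destroyed; the elapsed time is added to t_decode_us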

struct common_adapter_lora_info {
    std::string path;
    float scale;

    std::string task_name;
    std::string prompt_prefix;

    struct llama_adapter_lora * ptr;
};

using llama_tokens = std::vector<llama_token>;

// build info
extern int LLAMA_BUILD_NUMBER;
extern const char * LLAMA_COMMIT;
extern const char * LLAMA_COMPILER;
extern const char * LLAMA_BUILD_TARGET;

const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);

struct common_control_vector_load_info;

//
// CPU utils
//
struct cpu_params {
    int      n_threads                   = -1;
    bool     cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask
    bool     mask_valid                  = false;   // Default: any CPU
    enum ggml_sched_priority  priority   = GGML_SCHED_PRIO_NORMAL;  // Scheduling priority (0 = normal, 1 = medium, 2 = high, 3 = realtime)
    bool     strict_cpu                  = false;   // Use strict CPU placement
    uint32_t poll                        = 50;      // Polling (busywait) level (0 = no polling, 100 = mostly polling)
};

int32_t cpu_get_num_physical_cores();
int32_t cpu_get_num_math();

//
// Common params
//

enum llama_example {
    LLAMA_EXAMPLE_BATCHED,
    LLAMA_EXAMPLE_DEBUG,
    LLAMA_EXAMPLE_COMMON,
    LLAMA_EXAMPLE_SPECULATIVE,
    LLAMA_EXAMPLE_COMPLETION,
    LLAMA_EXAMPLE_CLI,
    LLAMA_EXAMPLE_EMBEDDING,
    LLAMA_EXAMPLE_PERPLEXITY,
    LLAMA_EXAMPLE_RETRIEVAL,
    LLAMA_EXAMPLE_PASSKEY,
    LLAMA_EXAMPLE_IMATRIX,
    LLAMA_EXAMPLE_BENCH,
    LLAMA_EXAMPLE_SERVER,
    LLAMA_EXAMPLE_CVECTOR_GENERATOR,
    LLAMA_EXAMPLE_EXPORT_LORA,
    LLAMA_EXAMPLE_MTMD,
    LLAMA_EXAMPLE_LOOKUP,
    LLAMA_EXAMPLE_PARALLEL,
    LLAMA_EXAMPLE_TTS,
    LLAMA_EXAMPLE_DIFFUSION,
    LLAMA_EXAMPLE_FINETUNE,
    LLAMA_EXAMPLE_FIT_PARAMS,
    LLAMA_EXAMPLE_RESULTS,
    LLAMA_EXAMPLE_EXPORT_GRAPH_OPS,

    LLAMA_EXAMPLE_COUNT,
};

enum common_sampler_type {
    COMMON_SAMPLER_TYPE_NONE        = 0,
    COMMON_SAMPLER_TYPE_DRY         = 1,
    COMMON_SAMPLER_TYPE_TOP_K       = 2,
    COMMON_SAMPLER_TYPE_TOP_P       = 3,
    COMMON_SAMPLER_TYPE_MIN_P       = 4,
  //COMMON_SAMPLER_TYPE_TFS_Z       = 5,
    COMMON_SAMPLER_TYPE_TYPICAL_P   = 6,
    COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
    COMMON_SAMPLER_TYPE_XTC         = 8,
    COMMON_SAMPLER_TYPE_INFILL      = 9,
    COMMON_SAMPLER_TYPE_PENALTIES   = 10,
    COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
    COMMON_SAMPLER_TYPE_ADAPTIVE_P  = 12,
};

// dimensionality reduction methods, used by cvector-generator
enum dimre_method {
    DIMRE_METHOD_PCA,
    DIMRE_METHOD_MEAN,
};

enum common_conversation_mode {
    COMMON_CONVERSATION_MODE_DISABLED = 0,
    COMMON_CONVERSATION_MODE_ENABLED  = 1,
    COMMON_CONVERSATION_MODE_AUTO     = 2,
};

enum common_grammar_trigger_type {
    COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN,
    COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
    COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
    COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
};

struct common_grammar_trigger {
    common_grammar_trigger_type type;
    std::string value;
    llama_token token = LLAMA_TOKEN_NULL;
};

enum common_params_sampling_config : uint64_t {
    COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS        = 1 << 0,
    COMMON_PARAMS_SAMPLING_CONFIG_TOP_K           = 1 << 1,
    COMMON_PARAMS_SAMPLING_CONFIG_TOP_P           = 1 << 2,
    COMMON_PARAMS_SAMPLING_CONFIG_MIN_P           = 1 << 3,
    COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY = 1 << 4,
    COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD   = 1 << 5,
    COMMON_PARAMS_SAMPLING_CONFIG_TEMP            = 1 << 6,
    COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N  = 1 << 7,
    COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT  = 1 << 8,
    COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT        = 1 << 9,
    COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU    = 1 << 10,
    COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA    = 1 << 11,
};

enum common_speculative_type {
    COMMON_SPECULATIVE_TYPE_NONE,          // no speculative decoding
    COMMON_SPECULATIVE_TYPE_DRAFT,         // draft model
    COMMON_SPECULATIVE_TYPE_EAGLE3,        // eagle draft model
    COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE,  // simple self-speculative decoding
    COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K,   // self-speculative decoding with n-gram keys only
    COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, // self-speculative decoding with n-gram keys and 4 m-gram values
    COMMON_SPECULATIVE_TYPE_NGRAM_MOD,
    COMMON_SPECULATIVE_TYPE_NGRAM_CACHE,   // self-speculative decoding with 3-level n-gram cache
    COMMON_SPECULATIVE_TYPE_COUNT          // number of types, unknown type
};

// Grammar type enumeration
enum common_grammar_type {
    COMMON_GRAMMAR_TYPE_NONE,           // no grammar set
    COMMON_GRAMMAR_TYPE_USER,           // user-provided GBNF (--grammar / "grammar" API field)
    COMMON_GRAMMAR_TYPE_OUTPUT_FORMAT,  // auto-generated from JSON schema (--json-schema / "json_schema" API field)
    COMMON_GRAMMAR_TYPE_TOOL_CALLS,     // auto-generated by chat template parser for function calling
};

// Grammar variant struct with type and grammar string
struct common_grammar {
    common_grammar_type type = COMMON_GRAMMAR_TYPE_NONE;
    std::string grammar;

    // Default constructor - no grammar
    common_grammar() = default;

    // Constructor with type and grammar string
    common_grammar(common_grammar_type t, std::string g) : type(t), grammar(std::move(g)) { // [uncovered]
        GGML_ASSERT(type != COMMON_GRAMMAR_TYPE_NONE || !grammar.empty());
    }

    // Check if a grammar is set
    bool empty() const { return type == COMMON_GRAMMAR_TYPE_NONE || grammar.empty(); } // [uncovered]
};

// Returns the raw grammar string, or empty string if no grammar is set.
inline const std::string & common_grammar_value(const common_grammar & g) { // [uncovered]
    return g.grammar;
}

// Returns true when the generation_prompt should be prefilled into the grammar sampler.
// Only output-format and tool-call grammars need prefill; user-supplied grammars must not be prefilled.
inline bool common_grammar_needs_prefill(const common_grammar & g) { // [uncovered]
    return g.type == COMMON_GRAMMAR_TYPE_OUTPUT_FORMAT
        || g.type == COMMON_GRAMMAR_TYPE_TOOL_CALLS;
}
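
A minimal sketch of how these pieces fit together (the GBNF string below is a placeholder, not a real grammar):

    common_grammar g(COMMON_GRAMMAR_TYPE_TOOL_CALLS, "root ::= ..."); // placeholder GBNF
    if (!g.empty() && common_grammar_needs_prefill(g)) {
        // tool-call and output-format grammars first advance the grammar
        // sampler past the already-prefilled generation_prompt tokens
    }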

// sampling parameters
struct common_params_sampling {
    uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler

    int32_t n_prev             = 64;     // number of previous tokens to remember
    int32_t n_probs            = 0;      // if greater than 0, output the probabilities of the top n_probs tokens
    int32_t min_keep           = 0;      // 0 = disabled, otherwise samplers should return at least min_keep tokens
    int32_t top_k              = 40;     // <= 0 to use vocab size
    float   top_p              = 0.95f;  // 1.0 = disabled
    float   min_p              = 0.05f;  // 0.0 = disabled
    float   xtc_probability    = 0.00f;  // 0.0 = disabled
    float   xtc_threshold      = 0.10f;  // > 0.5 disables XTC
    float   typ_p              = 1.00f;  // typical_p, 1.0 = disabled
    float   temp               = 0.80f;  // <= 0.0 to sample greedily; at 0.0 probabilities are not output
    float   dynatemp_range     = 0.00f;  // 0.0 = disabled
    float   dynatemp_exponent  = 1.00f;  // controls how entropy maps to temperature in dynamic temperature sampler
    int32_t penalty_last_n     = 64;     // last n tokens to penalize (0 = disable penalty, -1 = context size)
    float   penalty_repeat     = 1.00f;  // 1.0 = disabled
    float   penalty_freq       = 0.00f;  // 0.0 = disabled
    float   penalty_present    = 0.00f;  // 0.0 = disabled
    float   dry_multiplier     = 0.0f;   // 0.0 = disabled;      DRY repetition penalty for tokens extending repetition:
    float   dry_base           = 1.75f;  // 0.0 = disabled;      multiplier * base ^ (length of sequence before token - allowed length)
    int32_t dry_allowed_length = 2;      // tokens extending repetitions beyond this receive penalty
    int32_t dry_penalty_last_n = -1;     // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
    float   adaptive_target    = -1.0f;  // select tokens near this probability (valid range 0.0 to 1.0; negative = disabled)
    float   adaptive_decay     = 0.90f;  // EMA decay for adaptation; history ≈ 1/(1-decay) tokens (0.0 - 0.99)
    int32_t mirostat           = 0;      // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
    float   top_n_sigma        = -1.00f; // -1.0 = disabled
    float   mirostat_tau       = 5.00f;  // target entropy
    float   mirostat_eta       = 0.10f;  // learning rate
    bool    ignore_eos         = false;
    bool    no_perf            = false;  // disable performance metrics
    bool    timing_per_token   = false;

    uint64_t user_sampling_config = 0; // bitfield tracking which sampling parameters the user explicitly set

    std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"};     // default sequence breakers for DRY

    std::vector<enum common_sampler_type> samplers = {
        COMMON_SAMPLER_TYPE_PENALTIES,
        COMMON_SAMPLER_TYPE_DRY,
        COMMON_SAMPLER_TYPE_TOP_N_SIGMA,
        COMMON_SAMPLER_TYPE_TOP_K,
        COMMON_SAMPLER_TYPE_TYPICAL_P,
        COMMON_SAMPLER_TYPE_TOP_P,
        COMMON_SAMPLER_TYPE_MIN_P,
        COMMON_SAMPLER_TYPE_XTC,
        COMMON_SAMPLER_TYPE_TEMPERATURE,
    };

    common_grammar                      grammar;      // optional grammar constraint (user / output-format / tool-calls)
    bool                                grammar_lazy = false;
    std::vector<common_grammar_trigger> grammar_triggers; // optional triggers (for lazy grammars)
    std::set<llama_token>               preserved_tokens;

    std::vector<llama_logit_bias> logit_bias;     // logit biases to apply
    std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens

    // The assistant generation prompt already prefilled into the prompt.
    // Fed to the grammar sampler (to advance past pre-existing tokens) and used
    // to determine the reasoning budget sampler's initial state.
    // Only applied when the grammar is of output-format or tool-calls type.
    std::string generation_prompt;

    // reasoning budget sampler parameters
    // these are populated by the server/CLI based on chat template params
    int32_t                  reasoning_budget_tokens   = -1;   // -1 = disabled, >= 0 = token budget
    std::vector<llama_token> reasoning_budget_start;           // start tag token sequence
    std::vector<llama_token> reasoning_budget_end;             // end tag token sequence
    std::vector<llama_token> reasoning_budget_forced;          // forced sequence (message + end tag)

    bool backend_sampling = false;

    bool has_logit_bias() const { // [uncovered]
        return !logit_bias.empty();
    }

    // print the parameters into a string
    std::string print() const;
};
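
A sketch of typical use: override a few defaults and record which fields the user explicitly set via the user_sampling_config bitfield above:

    common_params_sampling sparams;
    sparams.temp  = 0.7f;
    sparams.top_p = 0.90f;
    sparams.user_sampling_config |= COMMON_PARAMS_SAMPLING_CONFIG_TEMP
                                 |  COMMON_PARAMS_SAMPLING_CONFIG_TOP_P;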

struct common_params_model {
    std::string path        = ""; // model local path                                       // NOLINT
    std::string url         = ""; // model url to download                                  // NOLINT
    std::string hf_repo     = ""; // HF repo                                                // NOLINT
    std::string hf_file     = ""; // HF file                                                // NOLINT
    std::string docker_repo = ""; // Docker repo                                            // NOLINT
    std::string name        = ""; // in format <user>/<model>[:<tag>] (tag is optional)     // NOLINT
};

struct common_ngram_mod;

struct common_params_speculative {
    common_speculative_type type = COMMON_SPECULATIVE_TYPE_NONE; // type of speculative decoding

    // general-purpose speculative decoding parameters

    int32_t n_max   = 16;    // maximum number of tokens to draft during speculative decoding
    int32_t n_min   = 0;     // minimum number of draft tokens to use for speculative decoding
    float   p_split = 0.1f;  // speculative decoding split probability
    float   p_min   = 0.75f; // minimum speculative decoding probability (greedy)

    // ngram-based speculative decoding

    uint16_t ngram_size_n   = 12; // ngram size for lookup
    uint16_t ngram_size_m   = 48; // mgram size for speculative tokens
    uint16_t ngram_min_hits =  1; // minimum hits at ngram/mgram lookup for an mgram to be proposed

    std::shared_ptr<common_ngram_mod> ngram_mod;

    std::string lookup_cache_static;  // path of static ngram cache file for lookup decoding           // NOLINT
    std::string lookup_cache_dynamic; // path of dynamic ngram cache file for lookup decoding          // NOLINT

    // draft-model speculative decoding

    struct common_params_model mparams_dft;

    llama_model * model_dft = nullptr; // a llama_model that can be shared by multiple speculative contexts

    llama_context_params cparams_dft; // these are the parameters for the draft llama_context

    int32_t n_ctx        = 0;  // draft context size
    int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 = use default)

    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for K
    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for V

    struct cpu_params cpuparams;
    struct cpu_params cpuparams_batch;

    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading

    std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;

    bool has_dft() const { // [uncovered]
        return !mparams_dft.path.empty() || !mparams_dft.hf_repo.empty();
    }
};

struct common_params_vocoder {
    struct common_params_model model;

    std::string speaker_file = ""; // speaker file path                                      // NOLINT

    bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy            // NOLINT
};

struct common_params_diffusion {
    int32_t steps         = 128;
    bool    visual_mode   = false;

    float   eps           = 0;        // epsilon for timesteps
    int32_t block_length  = 0;        // block length for generation

    int32_t algorithm     = 4;        // default algorithm: low-confidence
    float   alg_temp      = 0.0f;     // algorithm temperature

    float   cfg_scale     = 0;        // classifier-free guidance scale
    bool    add_gumbel_noise = false; // add gumbel noise to the logits if temp > 0.0
};

// reasoning API response format (not to be confused with the chat template's reasoning format)
// only used by the server
enum common_reasoning_format {
    COMMON_REASONING_FORMAT_NONE,
    COMMON_REASONING_FORMAT_AUTO,            // Same as deepseek, using `message.reasoning_content`
    COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
    COMMON_REASONING_FORMAT_DEEPSEEK,        // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
    // do not extend this enum unless you absolutely have to
    // in most cases, use COMMON_REASONING_FORMAT_AUTO
    // see: https://github.com/ggml-org/llama.cpp/pull/15408
};


struct lr_opt {
    float    lr0          = 1e-5; // learning rate at first epoch
    float    lr_min       = -1;
    float    decay_epochs = -1;   // if > 0, the learning rate starts at lr0 and decays to lr_min after this many epochs
    float    scale_epoch  = 0;
    float    wd           = 0;
    unsigned epochs       = 2;

    unsigned epoch; // set by the optimizer's outer (epochs) loop
    // learning rate decay - constant LR per epoch only for now
    float get_lr(float e) const;
    float get_lr() const { return get_lr(epoch); } // [uncovered]
    // must be called after argument parsing, before get_lr
    void init();
};
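
A usage sketch based on the comments above (the numeric values are hypothetical):

    lr_opt lr;
    lr.lr0          = 1e-4f;
    lr.lr_min       = 1e-5f;
    lr.decay_epochs = 10;
    lr.init();               // must be called after argument parsing
    lr.epoch = 3;            // set by the outer training loop
    const float cur_lr = lr.get_lr();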

struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);

struct common_params {
    int32_t n_predict             =    -1; // max. number of new tokens to predict, -1 == no limit
    int32_t n_ctx                 =     0; // context size, 0 == context the model was trained with
    int32_t n_batch               =  2048; // logical batch size for prompt processing (must be >= 32 to use BLAS)
    int32_t n_ubatch              =   512; // physical batch size for prompt processing (must be >= 32 to use BLAS)
    int32_t n_keep                =     0; // number of tokens to keep from the initial prompt
    int32_t n_chunks              =    -1; // max number of chunks to process (-1 = unlimited)
    int32_t n_parallel            =     1; // number of parallel sequences to decode
    int32_t n_sequences           =     1; // number of sequences to decode
    int32_t grp_attn_n            =     1; // group-attention factor
    int32_t grp_attn_w            =   512; // group-attention width
    int32_t n_print               =    -1; // print token count every n tokens (-1 = disabled)
    float   rope_freq_base        =  0.0f; // RoPE base frequency
    float   rope_freq_scale       =  0.0f; // RoPE frequency scaling factor
    float   yarn_ext_factor       = -1.0f; // YaRN extrapolation mix factor
    float   yarn_attn_factor      = -1.0f; // YaRN magnitude scaling factor
    float   yarn_beta_fast        = -1.0f; // YaRN low correction dim
    float   yarn_beta_slow        = -1.0f; // YaRN high correction dim
    int32_t yarn_orig_ctx         =     0; // YaRN original context length

    // offload params
    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading

    int32_t n_gpu_layers       = -1;   // number of layers to store in VRAM, -1 = auto, <= -2 = all
    int32_t main_gpu           = 0;    // the GPU that is used for scratch and small tensors
    float   tensor_split[128]  = {0};  // how split tensors should be distributed across GPUs
    bool    fit_params         = true; // whether to fit unset model/context parameters to free device memory
    int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use

    // margin per device in bytes for fitting parameters to free memory:
    std::vector<size_t> fit_params_target = std::vector<size_t>(llama_max_devices(), 1024 * 1024 * 1024);

    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs

    struct cpu_params cpuparams;
    struct cpu_params cpuparams_batch;

    ggml_backend_sched_eval_callback cb_eval = nullptr;
    void * cb_eval_user_data                 = nullptr;

    ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;

    enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
    enum llama_pooling_type      pooling_type      = LLAMA_POOLING_TYPE_UNSPECIFIED;   // pooling type for embeddings
    enum llama_attention_type    attention_type    = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
    enum llama_flash_attn_type   flash_attn_type   = LLAMA_FLASH_ATTN_TYPE_AUTO;       // whether to use Flash Attention

    struct common_params_sampling    sampling;
    struct common_params_speculative speculative;
    struct common_params_vocoder     vocoder;
    struct common_params_diffusion   diffusion;

    struct common_params_model model;

    std::set<std::string> model_alias;     // model aliases                                                 // NOLINT
    std::set<std::string> model_tags;      // model tags (informational, not used for routing)              // NOLINT
    std::string hf_token             = ""; // HF token                                                      // NOLINT
    std::string prompt               = "";                                                                  // NOLINT
    std::string system_prompt        = "";                                                                  // NOLINT
    std::string prompt_file          = ""; // store the external prompt file name                           // NOLINT
    std::string path_prompt_cache    = ""; // path to file for saving/loading prompt eval state             // NOLINT
    std::string input_prefix         = ""; // string to prefix user inputs with                             // NOLINT
    std::string input_suffix         = ""; // string to suffix user inputs with                             // NOLINT
    std::string logits_file          = ""; // file for saving *all* logits                                  // NOLINT

    // llama-debug specific options
    std::string logits_output_dir = "data"; // directory for saving logits output files                     // NOLINT
    bool        save_logits       = false;  // whether to save logits to files                              // NOLINT
    std::vector<std::string> tensor_filter; // filter tensor names for debug output (regex)                 // NOLINT

    std::vector<std::string> in_files;   // all input files
    std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
    std::vector<llama_model_kv_override> kv_overrides;
    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;

    bool lora_init_without_apply = false; // only load lora into memory, but do not apply it to ctx (the user can apply it manually later using llama_adapter_lora_apply)
    std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user-defined scale

    std::vector<common_control_vector_load_info> control_vectors; // control vector with user-defined scale

    int32_t verbosity                  = 3;  // LOG_LEVEL_INFO
    int32_t control_vector_layer_start = -1; // layer range for control vector
    int32_t control_vector_layer_end   = -1; // layer range for control vector
    bool    offline                    = false;

    int32_t ppl_stride      = 0;     // stride for perplexity calculations; if left at 0, the pre-existing approach will be used
    int32_t ppl_output_type = 0;     // 0 -> ppl output is as usual; 1 -> ppl output is num_tokens, ppl, one per line
                                     //     (which is more convenient to use for plotting)
    bool   hellaswag        = false; // compute HellaSwag score over random tasks from the datafile supplied in prompt
    size_t hellaswag_tasks  = 400;   // number of tasks to use when computing the HellaSwag score

    bool   winogrande       = false; // compute Winogrande score over random tasks from the datafile supplied in prompt
    size_t winogrande_tasks = 0;     // number of tasks to use when computing the Winogrande score; if 0, all tasks will be computed

    bool   multiple_choice  = false;  // compute TruthfulQA score over random tasks from the datafile supplied in prompt
    size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score; if 0, all tasks will be computed

    bool   kl_divergence    = false; // compute KL divergence

    bool check             = false; // check rather than generate results for llama-results

    bool usage             = false; // print usage
    bool completion        = false; // print source-able completion script
    bool use_color         = false; // use color to distinguish generations and inputs
    bool special           = false; // enable special token output
    bool interactive       = false; // interactive mode
    bool interactive_first = false; // wait for user input immediately
    bool prompt_cache_all  = false; // save user input and generations to prompt cache
    bool prompt_cache_ro   = false; // open the prompt cache read-only and do not update it

    bool escape            = true;  // escape "\n", "\r", "\t", "\'", "\"", and "\\"
    bool multiline_input   = false; // reverse the usage of `\`
    bool simple_io         = false; // improves compatibility with subprocesses and limited consoles
    bool cont_batching     = true;  // insert new sequences for decoding on-the-fly
    bool no_perf           = false; // disable performance metrics
    bool show_timings      = true;  // show timing information on CLI
    bool ctx_shift         = false; // context shift on infinite text generation
    bool swa_full          = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
    bool kv_unified        = false; // enable unified KV cache

    bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
    bool use_mmap          = true;  // enable mmap to use filesystem cache
    bool use_direct_io     = false; // read from disk without buffering
    bool use_mlock         = false; // use mlock to keep model in memory
    bool verbose_prompt    = false; // print prompt tokens before generation
    bool display_prompt    = true;  // print prompt before generation
    bool no_kv_offload     = false; // disable KV offloading
    bool warmup            = true;  // warmup run
    bool check_tensors     = false; // validate tensor data
    bool no_op_offload     = false; // globally disable offloading host tensor operations to device
    bool no_extra_bufts    = false; // disable extra buffer types (used for weight repacking)
    bool no_host           = false; // bypass the host buffer, allowing extra buffers to be used

    bool single_turn       = false; // single-turn chat conversation

    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for K
    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for V

    common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;

    // multimodal models (see tools/mtmd)
    struct common_params_model mmproj;
    bool mmproj_use_gpu = true;     // use GPU for the multimodal model
    bool no_mmproj = false;         // explicitly disable the multimodal model
    std::vector<std::string> image; // path to image file(s)
    int image_min_tokens = -1;
    int image_max_tokens = -1;

    // finetune
    struct lr_opt lr;
    enum ggml_opt_optimizer_type optimizer = GGML_OPT_OPTIMIZER_TYPE_ADAMW;
    float val_split = 0.05f; // fraction of the data used for the validation set

    // embedding
    bool embedding         = false; // get only sentence embedding
    int32_t embd_normalize = 2;     // normalisation for embeddings (-1 = none, 0 = max absolute int16, 1 = taxicab, 2 = euclidean, > 2 = p-norm)
    std::string embd_out   = "";    // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same as "json" plus cosine similarity matrix
    std::string embd_sep   = "\n";  // separator of embeddings
    std::string cls_sep    = "\t";  // separator of classification sequences

    // server params
    int32_t port                = 8080;          // server listens on this network port
    int32_t timeout_read        = 600;           // http read timeout in seconds
    int32_t timeout_write       = timeout_read;  // http write timeout in seconds
    int32_t n_threads_http      = -1;   // number of threads to process HTTP requests (TODO: support threadpool)
    int32_t n_cache_reuse       = 0;    // min chunk size to reuse from the cache via KV shifting
    bool    cache_prompt        = true; // whether to enable prompt caching
    int32_t n_ctx_checkpoints   = 32;   // max number of context checkpoints per slot
    int32_t checkpoint_every_nt = 8192; // make a checkpoint every n tokens during prefill
    int32_t cache_ram_mib       = 8192; // -1 = no limit, 0 = disabled, 1 = 1 MiB, etc.

    std::string hostname      = "127.0.0.1";
    std::string public_path   = "";                                                                         // NOLINT
    std::string api_prefix    = "";                                                                         // NOLINT
    std::string chat_template = "";                                                                         // NOLINT
    bool use_jinja = true;                                                                                  // NOLINT
    bool enable_chat_template = true;
    bool force_pure_content_parser = false;
    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
    int enable_reasoning = -1; // -1 = auto, 0 = disable, 1 = enable
    int reasoning_budget = -1;
    std::string reasoning_budget_message; // message injected before the end tag when the budget is exhausted
    bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
    int sleep_idle_seconds = -1;   // if > 0, the server will sleep after this many seconds of idle time

    std::vector<std::string> api_keys;

    std::string ssl_file_key  = "";                                                                         // NOLINT
    std::string ssl_file_cert = "";                                                                         // NOLINT

    std::map<std::string, std::string> default_template_kwargs;

    // webui configs
    bool webui = true;
    bool webui_mcp_proxy = false;
    std::string webui_config_json;

    // "advanced" endpoints are disabled by default for better security
    bool endpoint_slots   = true;
    bool endpoint_props   = false; // only controls POST requests, not GET
    bool endpoint_metrics = false;

    // router server configs
    std::string models_dir    = ""; // directory containing models for the router server
    std::string models_preset = ""; // directory containing model presets for the router server
    int models_max = 4;             // maximum number of models to load simultaneously
    bool models_autoload = true;    // automatically load models when requested via the router server

    bool log_json = false;

    std::string slot_save_path;
    std::string media_path; // path to directory for loading media files

    float slot_prompt_similarity = 0.1f;

    // batched-bench params
    bool is_pp_shared   = false;
    bool is_tg_separate = false;

    std::vector<int32_t> n_pp;
    std::vector<int32_t> n_tg;
    std::vector<int32_t> n_pl;

    // retrieval params
    std::vector<std::string> context_files; // context files to embed

    int32_t chunk_size = 64; // chunk size for context embedding

    std::string chunk_separator = "\n"; // chunk separator for context embedding

    // passkey params
    int32_t n_junk = 250; // number of times to repeat the junk text
    int32_t i_pos  = -1;  // position of the passkey in the junk text

    // imatrix params
    int32_t n_out_freq  = 10; // output the imatrix every n_out_freq iterations
    int32_t n_save_freq =  0; // save the imatrix every n_save_freq iterations
    int32_t i_chunk     =  0; // start processing from this chunk
    int8_t  imat_dat    =  0; // whether the legacy imatrix.dat format should be output (gguf <= 0 < dat)

    bool process_output  = false; // collect data for the output tensor
    bool compute_ppl     = true;  // whether to compute perplexity
    bool show_statistics = false; // show imatrix statistics per tensor
    bool parse_special   = false; // whether to parse special tokens during imatrix tokenization

    // cvector-generator params
    int n_pca_batch = 100;
    int n_pca_iterations = 1000;
    dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
    std::string cvector_positive_file = "tools/cvector-generator/positive.txt";
    std::string cvector_negative_file = "tools/cvector-generator/negative.txt";

    bool spm_infill = false; // suffix/prefix/middle pattern for infill

    // batched-bench params
    bool batched_bench_output_jsonl = false;

    // common params
    std::string out_file; // output filename for all example programs
    // optional callback for model loading progress and cancellation:
    // called with a progress value between 0.0 and 1.0;
    // return false from the callback to abort model loading, or true to continue
    llama_progress_callback load_progress_callback = NULL;
    void *                  load_progress_callback_user_data = NULL;
};
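
A minimal sketch of configuring the struct before initialization (the model path is hypothetical):

    common_params params;
    params.model.path    = "models/model.gguf"; // hypothetical path
    params.n_ctx         = 8192;
    params.port          = 8081;
    params.sampling.temp = 0.6f;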

// call once at the start of a program if it uses libcommon
// initializes the logging system and prints info about the build
void common_init();

std::string common_params_get_system_info(const common_params & params);

bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]);
bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
bool set_process_priority(enum ggml_sched_priority prio);
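
A sketch of wiring these together; the exact range/mask syntax accepted is defined in common.cpp, so the "0-7" string below is an assumption:

    cpu_params cpuparams;
    if (parse_cpu_range("0-7", cpuparams.cpumask)) { // pin threads to CPUs 0..7 (assumed syntax)
        cpuparams.mask_valid = true;
    }
    postprocess_cpu_params(cpuparams); // resolve defaults (implementation in common.cpp)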

//
// String utils
//

#ifdef __GNUC__
#    if defined(__MINGW32__) && !defined(__clang__)
#        define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
#    else
#        define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
#    endif
#else
#    define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
#endif

LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
std::string string_format(const char * fmt, ...);

std::string string_strip(const std::string & str);
std::string string_get_sortable_timestamp();

std::string string_join(const std::vector<std::string> & values, const std::string & separator);
std::vector<std::string> string_split(const std::string & str, const std::string & delimiter);
std::string string_repeat(const std::string & str, size_t n);

void string_replace_all(std::string & s, const std::string & search, const std::string & replace);

std::string regex_escape(const std::string & s);
template<class T>
static std::vector<T> string_split(const std::string & str, char delim) {
    static_assert(!std::is_same<T, std::string>::value, "Please use the specialized version for std::string");
    std::vector<T> values;
    std::istringstream str_stream(str);
    std::string token;
    while (std::getline(str_stream, token, delim)) {
        T value;
        std::istringstream token_stream(token);
        token_stream >> value;
        values.push_back(value);
    }
    return values;
}
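
For example:

    std::vector<int>         ids   = string_split<int>("1,2,3", ',');         // {1, 2, 3}
    std::vector<std::string> parts = string_split<std::string>("a/b/c", '/'); // uses the specialization below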

template<>
inline std::vector<std::string> string_split<std::string>(const std::string & str, char delim)
{ // [uncovered]
    std::vector<std::string> parts;
    size_t begin_pos = 0;
    size_t delim_pos = str.find(delim);
    while (delim_pos != std::string::npos) {
        std::string part = str.substr(begin_pos, delim_pos - begin_pos);
        parts.emplace_back(part);
        begin_pos = delim_pos + 1;
        delim_pos = str.find(delim, begin_pos);
    }
    parts.emplace_back(str.substr(begin_pos));
    return parts;
}
Unexecuted instantiation of string_split<std::string> in: json-schema-to-grammar.cpp, common.cpp, log.cpp, sampling.cpp, reasoning-budget.cpp

// remove when moving to c++20
inline bool string_starts_with(std::string_view str, std::string_view prefix) { // [uncovered]
    return str.size() >= prefix.size() &&
           str.compare(0, prefix.size(), prefix) == 0;
}

// remove when moving to c++20
inline bool string_ends_with(std::string_view str, std::string_view suffix) { // [uncovered]
    return str.size() >= suffix.size() &&
           str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0;
}

inline bool string_remove_suffix(std::string & str, std::string_view suffix) { // [uncovered]
    if (string_ends_with(str, suffix)) {
        str.resize(str.size() - suffix.size());
        return true;
    }
    return false;
}

inline size_t string_find_partial_stop(std::string_view str, std::string_view stop) { // [uncovered]
    if (!str.empty() && !stop.empty()) {
        const size_t max_len = std::min(str.size(), stop.size());
        const char last_char = str.back();
        for (size_t len = max_len; len > 0; --len) {
            if (stop[len - 1] == last_char) {
                if (string_ends_with(str, stop.substr(0, len))) {
                    return str.size() - len;
                }
            }
        }
    }
    return std::string::npos;
}
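
string_find_partial_stop is useful when streaming output: text that might be the beginning of a stop word must be held back. A small worked example:

    // str = "Hello, wor", stop = "world": the trailing "wor" is a partial match,
    // so the function returns the index where the partial stop begins.
    const size_t pos = string_find_partial_stop("Hello, wor", "world");
    // pos == 7; the substring "wor" should not be emitted yet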

bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
void string_process_escapes(std::string & input);

std::string string_from(bool value);
std::string string_from(const std::vector<int> & values);
std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);

//
// Filesystem utils
//

bool fs_validate_filename(const std::string & filename, bool allow_subdirs = false);
bool fs_create_directory_with_parents(const std::string & path);
bool fs_is_directory(const std::string & path);

std::string fs_get_cache_directory();
std::string fs_get_cache_file(const std::string & filename);

struct common_file_info {
    std::string path;
    std::string name;
    size_t      size = 0; // in bytes
    bool        is_dir = false;
};
std::vector<common_file_info> fs_list(const std::string & path, bool include_directories);

//
// TTY utils
//

// Auto-detect whether colors can be enabled based on the terminal and environment
bool tty_can_use_colors();

//
// Model utils
//

struct common_sampler;

// note: defines the lifetimes of the model, context, samplers, etc.
struct common_init_result {
    common_init_result(common_params & params);
    ~common_init_result();

    llama_model * model();
    llama_context * context();

    common_sampler * sampler(llama_seq_id seq_id);
    void reset_samplers();

    std::vector<llama_adapter_lora_ptr> & lora();

private:
    struct impl;
    std::unique_ptr<impl> pimpl;
};

using common_init_result_ptr = std::unique_ptr<common_init_result>;

common_init_result_ptr common_init_from_params(common_params & params);
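
A minimal sketch of program startup (the model path is hypothetical):

    common_params params;
    params.model.path = "models/model.gguf"; // hypothetical

    common_init(); // logging + build info

    common_init_result_ptr init = common_init_from_params(params);
    llama_model   * model = init->model();
    llama_context * lctx  = init->context();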

struct llama_model_params     common_model_params_to_llama  (      common_params & params);
struct llama_context_params   common_context_params_to_llama(const common_params & params);
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);

// clear LoRA adapters from the context, then apply the new list of adapters
void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);

std::string get_model_endpoint();

//
// Batch utils
//

void common_batch_clear(struct llama_batch & batch);

void common_batch_add(
                 struct llama_batch & batch,
                        llama_token   id,
                          llama_pos   pos,
    const std::vector<llama_seq_id> & seq_ids,
                               bool   logits);
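
A sketch of the usual fill pattern: clear the batch, append each prompt token, and request logits only for the last one (llama_batch_init comes from llama.h; the token vector is assumed to come from common_tokenize):

    std::vector<llama_token> tokens = /* from common_tokenize(...) */ {};
    llama_batch batch = llama_batch_init(512, 0, 1); // capacity, no embeddings, 1 sequence
    common_batch_clear(batch);
    for (size_t i = 0; i < tokens.size(); ++i) {
        const bool need_logits = (i == tokens.size() - 1);
        common_batch_add(batch, tokens[i], (llama_pos) i, { 0 }, need_logits);
    }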

// decodes a single batch of tokens for a prompt and manages session tokens
//
// Note: We save state before the last token so that we can replay it to ensure
// compatibility with all memory types. Recurrent/hybrid models cannot remove
// tokens from memory, so this approach works across all model architectures.
bool common_prompt_batch_decode(
              struct llama_context * ctx,
    const std::vector<llama_token> & embd,
                               int & n_past,
                               int   n_batch,
                  std::string_view   state_path,
                              bool   save_state);

// replays the last token after loading state to regenerate logits
// used after loading session state to ensure the sampling context has valid logits
bool common_replay_last_token(struct llama_context * ctx, llama_token last_token, int32_t pos);

//
// Vocab utils
//

// tokenizes a string into a vector of tokens
// should work similarly to Python's `tokenizer.encode`
std::vector<llama_token> common_tokenize(
  const struct llama_context * ctx,
           const std::string & text,
                        bool   add_special,
                        bool   parse_special = false);

std::vector<llama_token> common_tokenize(
    const struct llama_vocab * vocab,
           const std::string & text,
                        bool   add_special,
                        bool   parse_special = false);

// converts a token into a text piece, optionally rendering special/control tokens
// should work similarly to Python's `tokenizer.id_to_piece`
std::string common_token_to_piece(
        const struct llama_context * ctx,
                       llama_token   token,
                       bool          special = true);

std::string common_token_to_piece(
          const struct llama_vocab * vocab,
                       llama_token   token,
                       bool          special = true);

// detokenizes a vector of tokens into a string
// should work similarly to Python's `tokenizer.decode`
// optionally renders special/control tokens
std::string common_detokenize(
            const struct llama_context * ctx,
        const std::vector<llama_token> & tokens,
                                  bool   special = true);

std::string common_detokenize(
              const struct llama_vocab * vocab,
        const std::vector<llama_token> & tokens,
                                  bool   special = true);
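
A round-trip sketch, assuming an initialized llama_context * lctx:

    std::vector<llama_token> toks = common_tokenize(lctx, "Hello world", /*add_special=*/true);
    std::string piece = common_token_to_piece(lctx, toks[0]);
    std::string text  = common_detokenize(lctx, toks, /*special=*/false);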

//
// Embedding utils
//

// TODO: replace embd_norm with an enum
void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);

float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
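
A sketch of normalizing two raw embeddings and comparing them, assuming n_embd holds the model's embedding size and the input buffers are filled elsewhere (e.g. from llama_get_embeddings):

    std::vector<float> a(n_embd), b(n_embd);   // filled elsewhere
    std::vector<float> an(n_embd), bn(n_embd);
    common_embd_normalize(a.data(), an.data(), n_embd, 2); // 2 = euclidean (L2)
    common_embd_normalize(b.data(), bn.data(), n_embd, 2);
    const float sim = common_embd_similarity_cos(an.data(), bn.data(), n_embd);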

//
// Control vector utils
//

struct common_control_vector_data {
    int n_embd;

    // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
    std::vector<float> data;
};

struct common_control_vector_load_info {
    float strength;

    std::string fname;
};

// Load control vectors, scale each by strength, and add them together.
// On error, returns {-1, empty}.
common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos);

//
// Split utils
//

namespace {

const char * const LLM_KV_SPLIT_NO            = "split.no";
const char * const LLM_KV_SPLIT_COUNT         = "split.count";
const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";

}

//
// MoE utils
//

const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate|gate_up)_(ch|)exps";

inline std::string llm_ffn_exps_block_regex(int idx) { // [uncovered]
    return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);
}

inline llama_model_tensor_buft_override llm_ffn_exps_cpu_override() { // [uncovered]
    return { LLM_FFN_EXPS_REGEX, ggml_backend_cpu_buffer_type() };
}
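
A sketch of using these helpers to keep MoE expert tensors in CPU memory; how the override list is ultimately consumed is defined elsewhere:

    common_params params;
    // match all expert FFN tensors and pin them to the CPU buffer type:
    params.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
    // or build a pattern for a single block, e.g. block 12 only:
    const std::string pat = llm_ffn_exps_block_regex(12);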

//
// training utils
//

ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride);

// "adamw" or "sgd" (case-insensitive)
enum ggml_opt_optimizer_type common_opt_get_optimizer(const char *);