Coverage Report

Created: 2026-06-13 06:24

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/llama.cpp/common/common.h
Line
Count
Source
1
// Various helper functions and utilities
2
3
#pragma once
4
5
#include "llama-cpp.h"
6
7
#include "ggml-opt.h"
8
#include "ggml.h"
9
10
#include <set>
11
#include <sstream>
12
#include <string>
13
#include <string_view>
14
#include <vector>
15
#include <map>
16
#include <algorithm>
17
18
#if defined(_WIN32) && !defined(_WIN32_WINNT)
19
#define _WIN32_WINNT 0x0A00
20
#endif
21
22
#ifdef _WIN32
23
#define DIRECTORY_SEPARATOR '\\'
24
#else
25
0
#define DIRECTORY_SEPARATOR '/'
26
#endif // _WIN32
27
28
// Fatal-error helpers: write "error: <message>" followed by a newline to stderr, then exit(1).
// `msg` / `fmt` must be string literals, because they are concatenated with the adjacent literals.
// Each macro expands to one do { ... } while (0) statement, so it can be used in an unbraced if/else.
#define die(msg)                            \
    do {                                    \
        fputs("error: " msg "\n", stderr);  \
        exit(1);                            \
    } while (0)

// printf-style variant of die(); at least one argument has to follow `fmt`.
#define die_fmt(fmt, ...)                                   \
    do {                                                    \
        fprintf(stderr, "error: " fmt "\n", __VA_ARGS__);   \
        exit(1);                                            \
    } while (0)
30
31
// Scoped time-measurement helper; constructor and destructor are defined out of line.
// Stores a start timestamp (microseconds, going by the `_us` suffix) and a reference to the
// accumulator handed to the constructor.
// NOTE(review): presumably the destructor adds the elapsed time to `t_acc` unless `disable`
// was passed - confirm against the implementation.
struct common_time_meas {
    common_time_meas(int64_t & t_acc, bool disable = false);
    ~common_time_meas();

    // const: can only be set while constructing
    const int64_t t_start_us;

    // reference member: the referenced accumulator must outlive this object
    int64_t & t_acc;
};
39
40
// Description of one LoRA adapter: where it comes from, how strongly it is applied,
// optional metadata strings, and the loaded adapter handle.
//
// `scale` and `ptr` carry default member initializers so that a default-initialized
// object (`common_adapter_lora_info info;`) never holds indeterminate values. The defaults
// equal what value-initialization (`common_adapter_lora_info{}`) already produced, and
// positional aggregate initialization keeps working unchanged.
struct common_adapter_lora_info {
    std::string path;
    float scale = 0.0f;

    std::string task_name;
    std::string prompt_prefix;

    struct llama_adapter_lora * ptr = nullptr; // nullptr until an adapter has been attached
};
49
50
using llama_tokens = std::vector<llama_token>;
51
52
struct common_control_vector_load_info;
53
54
//
55
// CPU utils
56
//
57
58
struct common_cpu_params {
59
    int      n_threads                   = -1;
60
    bool     cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
61
    bool     mask_valid                  = false;   // Default: any CPU
62
    enum ggml_sched_priority  priority   = GGML_SCHED_PRIO_NORMAL;  // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
63
    bool     strict_cpu                  = false;   // Use strict CPU placement
64
    uint32_t poll                        = 50;      // Polling (busywait) level (0 - no polling, 100 - mostly polling)
65
};
66
67
int32_t common_cpu_get_num_physical_cores();
68
int32_t common_cpu_get_num_math();
69
70
//
71
// Common params
72
//
73
74
// Enumerates the example / tool kinds.
// Values are assigned consecutively starting at 0; LLAMA_EXAMPLE_COUNT has to stay last
// so that it equals the number of entries.
enum llama_example {
    LLAMA_EXAMPLE_BATCHED = 0,
    LLAMA_EXAMPLE_DEBUG,
    LLAMA_EXAMPLE_COMMON,
    LLAMA_EXAMPLE_SPECULATIVE,
    LLAMA_EXAMPLE_COMPLETION,
    LLAMA_EXAMPLE_CLI,
    LLAMA_EXAMPLE_EMBEDDING,
    LLAMA_EXAMPLE_PERPLEXITY,
    LLAMA_EXAMPLE_RETRIEVAL,
    LLAMA_EXAMPLE_PASSKEY,
    LLAMA_EXAMPLE_IMATRIX,
    LLAMA_EXAMPLE_BENCH,
    LLAMA_EXAMPLE_SERVER,
    LLAMA_EXAMPLE_CVECTOR_GENERATOR,
    LLAMA_EXAMPLE_EXPORT_LORA,
    LLAMA_EXAMPLE_MTMD,
    LLAMA_EXAMPLE_LOOKUP,
    LLAMA_EXAMPLE_PARALLEL,
    LLAMA_EXAMPLE_TTS,
    LLAMA_EXAMPLE_DIFFUSION,
    LLAMA_EXAMPLE_FINETUNE,
    LLAMA_EXAMPLE_FIT_PARAMS,
    LLAMA_EXAMPLE_RESULTS,
    LLAMA_EXAMPLE_EXPORT_GRAPH_OPS,

    LLAMA_EXAMPLE_COUNT, // number of entries above
};
102
103
// Sampler kinds. Every value is pinned explicitly; 5 is intentionally unused
// (it belonged to the TFS_Z entry that is kept below as a comment).
enum common_sampler_type {
    COMMON_SAMPLER_TYPE_NONE        =  0,
    COMMON_SAMPLER_TYPE_DRY         =  1,
    COMMON_SAMPLER_TYPE_TOP_K       =  2,
    COMMON_SAMPLER_TYPE_TOP_P       =  3,
    COMMON_SAMPLER_TYPE_MIN_P       =  4,
  //COMMON_SAMPLER_TYPE_TFS_Z       =  5,
    COMMON_SAMPLER_TYPE_TYPICAL_P   =  6,
    COMMON_SAMPLER_TYPE_TEMPERATURE =  7,
    COMMON_SAMPLER_TYPE_XTC         =  8,
    COMMON_SAMPLER_TYPE_INFILL      =  9,
    COMMON_SAMPLER_TYPE_PENALTIES   = 10,
    COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
    COMMON_SAMPLER_TYPE_ADAPTIVE_P  = 12,
};
118
119
// Dimensionality-reduction method selector (consumed by cvector-generator).
enum dimre_method {
    DIMRE_METHOD_PCA = 0,
    DIMRE_METHOD_MEAN,
};
124
125
// Tri-state switch for conversation mode: off, on, or decided automatically.
enum common_conversation_mode {
    COMMON_CONVERSATION_MODE_DISABLED = 0, // off
    COMMON_CONVERSATION_MODE_ENABLED  = 1, // on
    COMMON_CONVERSATION_MODE_AUTO     = 2, // automatic
};
130
131
// Kind of a grammar trigger; values are consecutive starting at 0.
enum common_grammar_trigger_type {
    COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN = 0,
    COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
    COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
    COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
};
137
138
struct common_grammar_trigger {
139
    common_grammar_trigger_type type;
140
    std::string value;
141
    llama_token token = LLAMA_TOKEN_NULL;
142
};
143
144
// Bit flags, one per sampling setting; each enumerator occupies a distinct bit (0..11)
// of a 64-bit value so that flags can be OR-ed together.
enum common_params_sampling_config : uint64_t {
    COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS        = 1ull <<  0,
    COMMON_PARAMS_SAMPLING_CONFIG_TOP_K           = 1ull <<  1,
    COMMON_PARAMS_SAMPLING_CONFIG_TOP_P           = 1ull <<  2,
    COMMON_PARAMS_SAMPLING_CONFIG_MIN_P           = 1ull <<  3,
    COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY = 1ull <<  4,
    COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD   = 1ull <<  5,
    COMMON_PARAMS_SAMPLING_CONFIG_TEMP            = 1ull <<  6,
    COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N  = 1ull <<  7,
    COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT  = 1ull <<  8,
    COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT        = 1ull <<  9,
    COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU    = 1ull << 10,
    COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA    = 1ull << 11,
};
158
159
// Speculative decoding strategies; values are consecutive starting at 0 and
// COMMON_SPECULATIVE_TYPE_COUNT must remain the final entry.
enum common_speculative_type {
    COMMON_SPECULATIVE_TYPE_NONE = 0,      // speculative decoding is off
    COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE,  // speculative decoding with a standalone draft model
    COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3,  // speculative decoding using Eagle3
    COMMON_SPECULATIVE_TYPE_DRAFT_MTP,     // multi-token prediction
    COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE,  // n-gram based simple self-speculative decoding
    COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K,   // self-speculative decoding, n-gram keys only
    COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, // self-speculative decoding, n-gram keys plus 4 m-gram values
    COMMON_SPECULATIVE_TYPE_NGRAM_MOD,
    COMMON_SPECULATIVE_TYPE_NGRAM_CACHE,   // self-speculative decoding backed by a 3-level n-gram cache
    COMMON_SPECULATIVE_TYPE_COUNT          // count of types; also stands for "unknown type"
};
171
172
// Origin of a grammar constraint.
enum common_grammar_type {
    COMMON_GRAMMAR_TYPE_NONE = 0,       // nothing set
    COMMON_GRAMMAR_TYPE_USER,           // GBNF supplied by the user (--grammar / "grammar" API field)
    COMMON_GRAMMAR_TYPE_OUTPUT_FORMAT,  // generated automatically from a JSON schema (--json-schema / "json_schema" API field)
    COMMON_GRAMMAR_TYPE_TOOL_CALLS,     // generated automatically by the chat template parser for function calling
};
179
180
// Grammar variant struct with type and grammar string
181
struct common_grammar {
182
    common_grammar_type type = COMMON_GRAMMAR_TYPE_NONE;
183
    std::string grammar;
184
185
    // Default constructor - no grammar
186
    common_grammar() = default;
187
188
    // Constructor with type and grammar string
189
0
    common_grammar(common_grammar_type t, std::string g) : type(t), grammar(std::move(g)) {
190
0
        GGML_ASSERT(type != COMMON_GRAMMAR_TYPE_NONE || !grammar.empty());
191
0
    }
192
193
    // Check if a grammar is set
194
0
    bool empty() const { return type == COMMON_GRAMMAR_TYPE_NONE || grammar.empty(); }
195
};
196
197
// Returns the raw grammar string, or empty string if no grammar is set.
198
0
inline const std::string & common_grammar_value(const common_grammar & g) {
199
0
    return g.grammar;
200
0
}
201
202
// Returns true when the generation_prompt should be prefilled into the grammar sampler.
203
// Only output-format and tool-call grammars need prefill; user-supplied grammars must not be prefilled.
204
0
inline bool common_grammar_needs_prefill(const common_grammar & g) {
205
0
    return g.type == COMMON_GRAMMAR_TYPE_OUTPUT_FORMAT
206
0
        || g.type == COMMON_GRAMMAR_TYPE_TOOL_CALLS;
207
0
}
208
209
// sampling parameters
210
struct common_params_sampling {
211
    uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
212
213
    int32_t n_prev             = 64;     // number of previous tokens to remember
214
    int32_t n_probs            = 0;      // if greater than 0, output the probabilities of top n_probs tokens.
215
    int32_t min_keep           = 0;      // 0 = disabled, otherwise samplers should return at least min_keep tokens
216
    int32_t top_k              = 40;     // <= 0 to use vocab size
217
    float   top_p              = 0.95f;  // 1.0 = disabled
218
    float   min_p              = 0.05f;  // 0.0 = disabled
219
    float   xtc_probability    = 0.00f;  // 0.0 = disabled
220
    float   xtc_threshold      = 0.10f;  // > 0.5 disables XTC
221
    float   typ_p              = 1.00f;  // typical_p, 1.0 = disabled
222
    float   temp               = 0.80f;  // <= 0.0 to sample greedily, 0.0 to not output probabilities
223
    float   dynatemp_range     = 0.00f;  // 0.0 = disabled
224
    float   dynatemp_exponent  = 1.00f;  // controls how entropy maps to temperature in dynamic temperature sampler
225
    int32_t penalty_last_n     = 64;     // last n tokens to penalize (0 = disable penalty, -1 = context size)
226
    float   penalty_repeat     = 1.00f;  // 1.0 = disabled
227
    float   penalty_freq       = 0.00f;  // 0.0 = disabled
228
    float   penalty_present    = 0.00f;  // 0.0 = disabled
229
    float   dry_multiplier     = 0.0f;   // 0.0 = disabled;      DRY repetition penalty for tokens extending repetition:
230
    float   dry_base           = 1.75f;  // 0.0 = disabled;      multiplier * base ^ (length of sequence before token - allowed length)
231
    int32_t dry_allowed_length = 2;      // tokens extending repetitions beyond this receive penalty
232
    int32_t dry_penalty_last_n = -1;     // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
233
    float   adaptive_target    = -1.0f;  // select tokens near this probability (valid range 0.0 to 1.0; negative = disabled)
234
    float   adaptive_decay     = 0.90f;  // EMA decay for adaptation; history ≈ 1/(1-decay) tokens (0.0 - 0.99)
235
    int32_t mirostat           = 0;      // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
236
    float   top_n_sigma        = -1.00f; // -1.0 = disabled
237
    float   mirostat_tau       = 5.00f;  // target entropy
238
    float   mirostat_eta       = 0.10f;  // learning rate
239
    bool    ignore_eos         = false;
240
    bool    no_perf            = false;  // disable performance metrics
241
    bool    timing_per_token   = false;
242
243
    uint64_t user_sampling_config = 0; // bitfield to track user-specified samplers
244
245
    std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"};     // default sequence breakers for DRY
246
247
    std::vector<enum common_sampler_type> samplers = {
248
        COMMON_SAMPLER_TYPE_PENALTIES,
249
        COMMON_SAMPLER_TYPE_DRY,
250
        COMMON_SAMPLER_TYPE_TOP_N_SIGMA,
251
        COMMON_SAMPLER_TYPE_TOP_K,
252
        COMMON_SAMPLER_TYPE_TYPICAL_P,
253
        COMMON_SAMPLER_TYPE_TOP_P,
254
        COMMON_SAMPLER_TYPE_MIN_P,
255
        COMMON_SAMPLER_TYPE_XTC,
256
        COMMON_SAMPLER_TYPE_TEMPERATURE,
257
    };
258
259
    common_grammar              grammar;      // optional grammar constraint (user / output-format / tool-calls)
260
    bool                                grammar_lazy = false;
261
    std::vector<common_grammar_trigger> grammar_triggers; // optional triggers (for lazy grammars)
262
    std::set<llama_token>               preserved_tokens;
263
264
    std::vector<llama_logit_bias> logit_bias;     // logit biases to apply
265
    std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens
266
267
    // The assistant generation prompt already prefilled into the prompt.
268
    // Fed to the grammar sampler (to advance past pre-existing tokens) and used
269
    // to determine the reasoning budget sampler's initial state.
270
    // Only applied when the grammar is of output-format or tool-calls type.
271
    std::string generation_prompt;
272
273
    // reasoning budget sampler parameters
274
    // these are populated by the server/CLI based on chat template params
275
    int32_t                  reasoning_budget_tokens   = -1;   // -1 = disabled, >= 0 = token budget
276
    std::vector<llama_token> reasoning_budget_start;           // start tag token sequence
277
    std::vector<llama_token> reasoning_budget_end;             // end tag token sequence
278
    std::vector<llama_token> reasoning_budget_forced;          // forced sequence (message + end tag)
279
    std::string              reasoning_budget_message;         // message injected before end tag when budget exhausted
280
    bool                     reasoning_control = false;        // create the budget sampler on demand so reasoning can be ended at runtime
281
282
    bool backend_sampling = false;
283
284
0
    bool has_logit_bias() const {
285
0
        return !logit_bias.empty();
286
0
    }
287
288
    // print the parameters into a string
289
    std::string print() const;
290
};
291
292
// Ways of locating a model. Every field starts out as an empty string
// (std::string's default), so no explicit initializers are needed.
struct common_params_model {
    std::string path;        // local path of the model
    std::string url;         // url the model is downloaded from
    std::string hf_repo;     // HF repo
    std::string hf_file;     // HF file
    std::string docker_repo; // Docker repo
    std::string name;        // formatted as <user>/<model>[:<tag>]; the tag is optional
};
300
301
// draft-model-based speculative decoding parameters
302
struct common_params_speculative_draft {
303
    int32_t n_max = 3; // maximum number of tokens to draft during speculative decoding
304
    int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
305
306
    float p_split = 0.1f; // speculative decoding split probability
307
    float p_min   = 0.0f; // minimum speculative decoding probability (greedy)
308
309
    bool backend_sampling = true; // offload draft sampling to the backend (default: on)
310
311
    common_params_model mparams;
312
313
    llama_context * ctx_tgt = nullptr;
314
    llama_context * ctx_dft = nullptr;
315
316
    int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
317
318
    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
319
    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
320
321
    common_cpu_params cpuparams;
322
    common_cpu_params cpuparams_batch;
323
324
    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
325
326
    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
327
};
328
329
// Tunables for the NGRAM_MOD speculative type (defaults: match 24, max 64, min 48).
struct common_params_speculative_ngram_mod {
    int32_t n_match{24};

    int32_t n_max{64};
    int32_t n_min{48};
};
335
336
// Tunables shared by the n-gram map based speculative types.
struct common_params_speculative_ngram_map {
    uint16_t size_n{12};  // size of the ngram used for lookup
    uint16_t size_m{48};  // size of the mgram holding the speculative tokens
    uint16_t min_hits{1}; // hits required at ngram/mgram lookup before an mgram is proposed
};
341
342
// File locations of the n-gram caches used by lookup decoding; both are empty by default.
struct common_params_speculative_ngram_cache {
    std::string lookup_cache_static;  // static ngram cache file (path) for lookup decoding
    std::string lookup_cache_dynamic; // dynamic ngram cache file (path) for lookup decoding
};
346
347
struct common_params_speculative {
348
    std::vector<enum common_speculative_type> types = { COMMON_SPECULATIVE_TYPE_NONE };
349
350
    // used by Simple, MTP, Eagle3, etc. - all methods that require some kind of draft model
351
    common_params_speculative_draft draft;
352
353
    common_params_speculative_ngram_mod ngram_mod;
354
    common_params_speculative_ngram_map ngram_simple;
355
    common_params_speculative_ngram_map ngram_map_k;
356
    common_params_speculative_ngram_map ngram_map_k4v;
357
358
    common_params_speculative_ngram_cache ngram_cache;
359
360
0
    bool has_dft() const {
361
0
        return !draft.mparams.path.empty() || !draft.mparams.hf_repo.empty();
362
0
    }
363
364
0
    uint32_t need_n_rs_seq() const {
365
0
        bool needs_rs_seq = std::any_of(types.begin(), types.end(), [&](auto t) {
366
0
            return t == COMMON_SPECULATIVE_TYPE_DRAFT_MTP;
367
0
        });
368
369
0
        return needs_rs_seq ? draft.n_max : 0u;
370
0
    }
371
};
372
373
struct common_params_vocoder {
374
    struct common_params_model model;
375
376
    std::string speaker_file; // speaker file path
377
378
    bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy
379
};
380
381
// Options for diffusion-based generation.
struct common_params_diffusion {
    int32_t steps         = 128;
    bool    visual_mode   = false;

    float   eps           = 0.0f;     // timestep epsilon
    int32_t block_length  = 0;        // generation block length

    int32_t algorithm     = 4;        // algorithm selector; the default (4) is low-confidence
    float   alg_temp      = 0.0f;     // temperature of the algorithm

    float   cfg_scale     = 0.0f;     // scale for classifier-free guidance
    bool    add_gumbel_noise = false; // when temp > 0.0, add gumbel noise to the logits
};
394
395
// Response format of the reasoning API (this is not the chat template's reasoning format).
// Used by the server only.
enum common_reasoning_format {
    COMMON_REASONING_FORMAT_NONE = 0,
    COMMON_REASONING_FORMAT_AUTO,            // behaves like deepseek and uses `message.reasoning_content`
    COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // thinking tag contents are extracted into `message.reasoning_content`; in stream mode they stay inline in <think> tags
    COMMON_REASONING_FORMAT_DEEPSEEK,        // thinking tag contents are extracted into `message.reasoning_content`, streaming deltas included
    // avoid adding entries here unless it is absolutely necessary;
    // COMMON_REASONING_FORMAT_AUTO is the right choice in most cases
    // see: https://github.com/ggml-org/llama.cpp/pull/15408
};
406
407
408
// Learning-rate schedule options together with the current-epoch cursor.
struct lr_opt {
    float    lr0          = 1e-5; // learning rate at first epoch
    float    lr_min       = -1;
    float    decay_epochs = -1;   // if >0, the learning rate starts at lr0 and decays to lr_min after this many epochs
    float    scale_epoch  = 0;
    float    wd           = 0;
    unsigned epochs       = 2;

    // Set by the optimizer's outer (epochs) loop. Initialized to 0 so that get_lr()
    // reads a defined value even if it is called before that loop has run.
    unsigned epoch = 0;

    // learning rate decay - constant LR per epoch only for now
    float get_lr(float e) const;

    // Learning rate for the current `epoch`.
    float get_lr() const { return get_lr(epoch); }

    // must call after arg parse, before get_lr
    void init();
};
423
424
struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);
425
426
struct common_params {
427
    int32_t n_predict             =    -1; // max. number of new tokens to predict, -1 == no limit
428
    int32_t n_ctx                 =     0; // context size, 0 == context the model was trained with
429
    int32_t n_batch               =  2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
430
    int32_t n_ubatch              =   512; // physical batch size for prompt processing (must be >=32 to use BLAS)
431
    int32_t n_keep                =     0; // number of tokens to keep from initial prompt
432
    int32_t n_chunks              =    -1; // max number of chunks to process (-1 = unlimited)
433
    int32_t n_parallel            =     1; // number of parallel sequences to decode
434
    int32_t n_sequences           =     1; // number of sequences to decode
435
    int32_t n_outputs_max         =     0; // max outputs in a batch (0 = n_batch)
436
    int32_t grp_attn_n            =     1; // group-attention factor
437
    int32_t grp_attn_w            =   512; // group-attention width
438
    int32_t n_print               =    -1; // print token count every n tokens (-1 = disabled)
439
    float   rope_freq_base        =  0.0f; // RoPE base frequency
440
    float   rope_freq_scale       =  0.0f; // RoPE frequency scaling factor
441
    float   yarn_ext_factor       = -1.0f; // YaRN extrapolation mix factor
442
    float   yarn_attn_factor      = -1.0f; // YaRN magnitude scaling factor
443
    float   yarn_beta_fast        = -1.0f; // YaRN low correction dim
444
    float   yarn_beta_slow        = -1.0f; // YaRN high correction dim
445
    int32_t yarn_orig_ctx         =     0; // YaRN original context length
446
447
    // offload params
448
    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
449
450
    int32_t n_gpu_layers       = -1;    // number of layers to store in VRAM, -1 is auto, <= -2 is all
451
    int32_t main_gpu           = 0;     // the GPU that is used for scratch and small tensors
452
    float   tensor_split[128]  = {0};   // how split tensors should be distributed across GPUs
453
    bool    fit_params         = true;  // whether to fit unset model/context parameters to free device memory
454
    bool    fit_params_print   = false; // print the estimated required memory to run the model
455
    int32_t fit_params_min_ctx = 4096;  // minimum context size to set when trying to reduce memory use
456
457
    // margin per device in bytes for fitting parameters to free memory:
458
    std::vector<size_t> fit_params_target = std::vector<size_t>(llama_max_devices(), 1024 * 1024*1024);
459
460
    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
461
462
    common_cpu_params cpuparams;
463
    common_cpu_params cpuparams_batch;
464
465
    ggml_backend_sched_eval_callback cb_eval = nullptr;
466
    void * cb_eval_user_data                 = nullptr;
467
468
    ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
469
470
    enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
471
    enum llama_pooling_type      pooling_type      = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
472
    enum llama_attention_type    attention_type    = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
473
    enum llama_flash_attn_type   flash_attn_type   = LLAMA_FLASH_ATTN_TYPE_AUTO; // whether to use Flash Attention
474
475
    struct common_params_sampling    sampling;
476
    struct common_params_speculative speculative;
477
    struct common_params_vocoder     vocoder;
478
    struct common_params_diffusion   diffusion;
479
480
    struct common_params_model model;
481
482
    std::set<std::string> model_alias;     // model aliases                                                 // NOLINT
483
    std::set<std::string> model_tags;      // model tags (informational, not used for routing)              // NOLINT
484
    std::string hf_token             = ""; // HF token (aka bearer token)                                   // NOLINT
485
    std::string prompt               = "";                                                                  // NOLINT
486
    std::string system_prompt        = "";                                                                  // NOLINT
487
    std::string prompt_file          = ""; // store the external prompt file name                           // NOLINT
488
    std::string path_prompt_cache    = ""; // path to file for saving/loading prompt eval state             // NOLINT
489
    std::string input_prefix         = ""; // string to prefix user inputs with                             // NOLINT
490
    std::string input_suffix         = ""; // string to suffix user inputs with                             // NOLINT
491
    std::string logits_file          = ""; // file for saving *all* logits                                  // NOLINT
492
    std::string path_prompts_log_dir = ""; // directory with logged prompts                                 // NOLINT
493
494
    // llama-debug specific options
495
    std::string logits_output_dir = "data"; // directory for saving logits output files                     // NOLINT
496
    bool        save_logits       = false;  // whether to save logits to files                              // NOLINT
497
    std::vector<std::string> tensor_filter; // filter tensor names for debug output (regex)                 // NOLINT
498
499
    std::vector<std::string> in_files;   // all input files
500
    std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
501
    std::vector<llama_model_kv_override> kv_overrides;
502
    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
503
504
    bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
505
    std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
506
507
    std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
508
509
    int32_t verbosity                  = 3;  // LOG_LEVEL_INFO
510
    int32_t control_vector_layer_start = -1; // layer range for control vector
511
    int32_t control_vector_layer_end   = -1; // layer range for control vector
512
    bool    offline                    = false;
513
    bool    skip_download              = false; // skip model file downloading
514
515
    int32_t ppl_stride      = 0;     // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
516
    int32_t ppl_output_type = 0;     // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
517
                                     //                                       (which is more convenient to use for plotting)
518
                                     //
519
    bool   hellaswag        = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
520
    size_t hellaswag_tasks  = 400;   // number of tasks to use when computing the HellaSwag score
521
522
    bool   winogrande       = false; // compute Winogrande score over random tasks from datafile supplied in prompt
523
    size_t winogrande_tasks = 0;     // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
524
525
    bool   multiple_choice  = false;  // compute TruthfulQA score over random tasks from datafile supplied in prompt
526
    size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
527
528
    bool   kl_divergence    = false; // compute KL divergence
529
530
    bool check             = false; // check rather than generate results for llama-results
531
532
    bool usage             = false; // print usage
533
    bool completion        = false; // print source-able completion script
534
    bool use_color         = false; // use color to distinguish generations and inputs
535
    bool special           = false; // enable special token output
536
    bool interactive       = false; // interactive mode
537
    bool interactive_first = false; // wait for user input immediately
538
    bool prompt_cache_all  = false; // save user input and generations to prompt cache
539
    bool prompt_cache_ro   = false; // open the prompt cache read-only and do not update it
540
541
    bool escape            = true;  // escape "\n", "\r", "\t", "\'", "\"", and "\\"
542
    bool multiline_input   = false; // reverse the usage of `\`
543
    bool simple_io         = false; // improves compatibility with subprocesses and limited consoles
544
    bool cont_batching     = true;  // insert new sequences for decoding on-the-fly
545
    bool no_perf           = false; // disable performance metrics
546
    bool show_timings      = true;  // show timing information on CLI
547
    bool ctx_shift         = false; // context shift on infinite text generation
548
    bool swa_full          = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
549
    bool kv_unified        = false; // enable unified KV cache
550
551
    bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
552
    bool use_mmap          = true;  // enable mmap to use filesystem cache
553
    bool use_direct_io     = false; // read from disk without buffering
554
    bool use_mlock         = false; // use mlock to keep model in memory
555
    bool verbose_prompt    = false; // print prompt tokens before generation
556
    bool display_prompt    = true;  // print prompt before generation
557
    bool no_kv_offload     = false; // disable KV offloading
558
    bool warmup            = true;  // warmup run
559
    bool check_tensors     = false; // validate tensor data
560
    bool no_op_offload     = false; // globally disable offload host tensor operations to device
561
    bool no_extra_bufts    = false; // disable extra buffer types (used for weight repacking)
562
    bool no_host           = false; // bypass host buffer allowing extra buffers to be used
563
564
    bool single_turn       = false; // single turn chat conversation
565
566
    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
567
    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
568
569
    common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
570
571
    // multimodal models (see tools/mtmd)
572
    struct common_params_model mmproj;
573
    bool mmproj_use_gpu = true;     // use GPU for multimodal model
574
    bool no_mmproj = false;         // explicitly disable multimodal model
575
    std::vector<std::string> image; // path to image file(s) ; TODO: change the name to "media"
576
    int image_min_tokens = -1;
577
    int image_max_tokens = -1;
578
    int mtmd_batch_max_tokens = 1024;
579
580
    // finetune
581
    struct lr_opt lr;
582
    enum ggml_opt_optimizer_type optimizer = GGML_OPT_OPTIMIZER_TYPE_ADAMW;
583
    float val_split = 0.05f; // fraction of the data used for the validation set
584
585
    // embedding
586
    bool embedding         = false; // get only sentence embedding
587
    int32_t embd_normalize = 2;     // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
588
    std::string embd_out   = "";    // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
589
    std::string embd_sep   = "\n";  // separator of embeddings
590
    std::string cls_sep    = "\t";  // separator of classification sequences
591
592
    // server params
593
    int32_t port                = 8080;          // server listens on this network port
594
    bool    reuse_port          = false;         // allow multiple sockets to bind to the same port
595
    int32_t timeout_read        = 3600;          // http read timeout in seconds
596
    int32_t timeout_write       = timeout_read;  // http write timeout in seconds
597
    int32_t sse_ping_interval   = 30;            // SSE ping interval in seconds
598
    int32_t n_threads_http      = -1;    // number of threads to process HTTP requests (TODO: support threadpool)
599
    int32_t n_cache_reuse       = 0;     // min chunk size to reuse from the cache via KV shifting
600
    bool    cache_prompt        = true;  // whether to enable prompt caching
601
    bool    cache_idle_slots    = true;  // save and clear idle slots upon starting a new task
602
    int32_t n_ctx_checkpoints   = 32;    // max number of context checkpoints per slot
603
    int32_t checkpoint_min_step = 256;   // minimum spacing between context checkpoints
604
    int32_t cache_ram_mib       = 8192;  // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.
605
606
    std::string hostname      = "127.0.0.1";
607
    std::string public_path   = "";                                                                         // NOLINT
608
    std::string api_prefix    = "";                                                                         // NOLINT
609
    std::string chat_template = "";                                                                         // NOLINT
610
    bool use_jinja = true;                                                                                  // NOLINT
611
    bool enable_chat_template = true;
612
    bool force_pure_content_parser = false;
613
    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
614
    int enable_reasoning = -1; // -1 = auto, 0 = disable, 1 = enable
615
    bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
616
    int sleep_idle_seconds = -1;   // if >0, server will sleep after this many seconds of idle time
617
618
    std::vector<std::string> api_keys;
619
620
    std::string ssl_file_key  = "";                                                                         // NOLINT
621
    std::string ssl_file_cert = "";                                                                         // NOLINT
622
623
    std::map<std::string, std::string> default_template_kwargs;
624
625
    // UI configs
626
    bool ui = true;
627
628
    // Deprecated: use ui, ui_mcp_proxy, ui_config_json instead
629
    bool webui = ui;
630
    bool webui_mcp_proxy = false;
631
    std::string webui_config_json;
632
633
    bool ui_mcp_proxy = false;
634
    std::string ui_config_json;
635
636
    // "advanced" endpoints are disabled by default for better security
637
    bool endpoint_slots   = true;
638
    bool endpoint_props   = false; // only control POST requests, not GET
639
    bool endpoint_metrics = false;
640
641
    // enable built-in tools
642
    std::vector<std::string> server_tools;
643
644
    // router server configs
645
    std::string models_dir    = ""; // directory containing models for the router server
646
    std::string models_preset = ""; // directory containing model presets for the router server
647
    int models_max = 4;             // maximum number of models to load simultaneously
648
    bool models_autoload = true;    // automatically load models when requested via the router server
649
650
    bool log_json = false;
651
652
    std::string slot_save_path;
653
    std::string media_path; // path to directory for loading media files
654
655
    float slot_prompt_similarity = 0.1f;
656
657
    // batched-bench params
658
    bool is_pp_shared   = false;
659
    bool is_tg_separate = false;
660
661
    std::vector<int32_t> n_pp;
662
    std::vector<int32_t> n_tg;
663
    std::vector<int32_t> n_pl;
664
665
    // retrieval params
666
    std::vector<std::string> context_files; // context files to embed
667
668
    int32_t chunk_size = 64; // chunk size for context embedding
669
670
    std::string chunk_separator = "\n"; // chunk separator for context embedding
671
672
    // passkey params
673
    int32_t n_junk = 250; // number of times to repeat the junk text
674
    int32_t i_pos  = -1;  // position of the passkey in the junk text
675
676
    // imatrix params
677
    int32_t n_out_freq  = 10; // output the imatrix every n_out_freq iterations
678
    int32_t n_save_freq =  0; // save the imatrix every n_save_freq iterations
679
    int32_t i_chunk     =  0; // start processing from this chunk
680
    int8_t  imat_dat    =  0; // whether the legacy imatrix.dat format should be output (gguf <= 0 < dat)
681
682
    bool process_output  = false; // collect data for the output tensor
683
    bool compute_ppl     = true;  // whether to compute perplexity
684
    bool show_statistics = false; // show imatrix statistics per tensor
685
    bool parse_special   = false; // whether to parse special tokens during imatrix tokenization
686
687
    // cvector-generator params
688
    int n_pca_batch = 100;
689
    int n_pca_iterations = 1000;
690
    dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
691
    std::string cvector_positive_file = "tools/cvector-generator/positive.txt";
692
    std::string cvector_negative_file = "tools/cvector-generator/negative.txt";
693
694
    bool spm_infill = false; // suffix/prefix/middle pattern for infill
695
696
    // batched-bench params
697
    bool batched_bench_output_jsonl = false;
698
699
    // common params
700
    std::string out_file; // output filename for all example programs
701
    // optional callback for model loading progress and cancellation:
702
    // called with a progress value between 0.0 and 1.0.
703
    // return false from callback to abort model loading or true to continue
704
    llama_progress_callback load_progress_callback = NULL;
705
    void *                  load_progress_callback_user_data = NULL;
706
    bool no_alloc = false; // Don't allocate model buffers
707
};
708
709
// call once at the start of a program if it uses libcommon
710
// initializes the logging system and prints info about the build
711
void common_init();
712
713
void common_params_print_info(const common_params & params, bool print_devices = true);
714
std::string common_params_get_system_info(const common_params & params);
715
716
bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]);
717
bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
718
void postprocess_cpu_params(common_cpu_params & cpuparams, const common_cpu_params * role_model = nullptr);
719
bool set_process_priority(enum ggml_sched_priority prio);
720
721
//
722
// String utils
723
//
724
725
#ifdef __GNUC__
726
#    if defined(__MINGW32__) && !defined(__clang__)
727
#        define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
728
#    else
729
#        define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
730
#    endif
731
#else
732
#    define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
733
#endif
734
735
LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
736
std::string string_format(const char * fmt, ...);
737
738
std::string string_strip(const std::string & str);
739
std::string string_get_sortable_timestamp();
740
std::string string_lcs(std::string_view a, std::string_view b);
741
742
std::string string_join(const std::vector<std::string> & values, const std::string & separator);
743
std::vector<std::string> string_split(const std::string & str, const std::string & delimiter);
744
std::string string_repeat(const std::string & str, size_t n);
745
746
void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
747
748
std::string regex_escape(const std::string & s);
749
750
// Splits `str` on `delim` and converts each token to T via operator>>.
//
// Every token is parsed through its own std::istringstream, so leading
// whitespace inside a token is skipped and anything after the first parsed
// value is ignored. An empty input yields an empty vector.
//
// `value` is value-initialized before extraction: for an empty token
// (e.g. the middle of "1,,2") the stream is already at EOF, operator>> never
// writes to its argument, and without the initializer an indeterminate T
// would be pushed into the result.
//
// std::string is rejected here on purpose; use the specialization below.
template<class T>
static std::vector<T> string_split(const std::string & str, char delim) {
    static_assert(!std::is_same<T, std::string>::value, "Please use the specialized version for std::string");
    std::vector<T> values;
    std::istringstream str_stream(str);
    std::string token;
    while (std::getline(str_stream, token, delim)) {
        T value{};
        std::istringstream token_stream(token);
        token_stream >> value;
        values.push_back(value);
    }
    return values;
}

// Specialization for std::string: splits `str` on `delim` without any
// conversion. Empty fields are preserved and the result always contains at
// least one element (the text after the last delimiter, possibly empty).
template<>
inline std::vector<std::string> string_split<std::string>(const std::string & str, char delim)
{
    std::vector<std::string> parts;
    size_t begin_pos = 0;
    size_t delim_pos = str.find(delim);
    while (delim_pos != std::string::npos) {
        // construct the substring directly inside the vector (no temporary + copy)
        parts.emplace_back(str, begin_pos, delim_pos - begin_pos);
        begin_pos = delim_pos + 1;
        delim_pos = str.find(delim, begin_pos);
    }
    // remainder after the last delimiter (the whole string if there was none)
    parts.emplace_back(str, begin_pos);
    return parts;
}
Unexecuted instantiation: json-schema-to-grammar.cpp:std::__1::vector<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::allocator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > > > string_split<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, char)
Unexecuted instantiation: common.cpp:std::__1::vector<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::allocator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > > > string_split<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, char)
Unexecuted instantiation: log.cpp:std::__1::vector<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::allocator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > > > string_split<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, char)
Unexecuted instantiation: sampling.cpp:std::__1::vector<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::allocator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > > > string_split<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, char)
Unexecuted instantiation: reasoning-budget.cpp:std::__1::vector<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::allocator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > > > string_split<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, char)
780
781
// remove when moving to c++20
782
0
// true when `str` begins with `prefix` (an empty prefix always matches)
inline bool string_starts_with(std::string_view str, std::string_view prefix) {
    // substr clamps its length to the end of `str`, so a too-short `str`
    // produces a shorter view that can never compare equal to `prefix`
    return str.substr(0, prefix.size()) == prefix;
}
786
787
// remove when moving to c++20
788
0
// single-character overload: true when `str` is non-empty and its first character is `prefix`
inline bool string_starts_with(std::string_view str, char prefix) {
    if (str.empty()) {
        return false;
    }
    return str[0] == prefix;
}
791
792
// remove when moving to c++20
793
0
// true when `str` ends with `suffix` (an empty suffix always matches)
inline bool string_ends_with(std::string_view str, std::string_view suffix) {
    if (suffix.size() > str.size()) {
        return false;
    }
    // compare only the trailing suffix.size() characters
    return str.substr(str.size() - suffix.size()) == suffix;
}
797
798
0
inline bool string_remove_suffix(std::string & str, std::string_view suffix) {
799
0
    if (string_ends_with(str, suffix)) {
800
0
        str.resize(str.size() - suffix.size());
801
0
        return true;
802
0
    }
803
0
    return false;
804
0
}
805
806
0
inline size_t string_find_partial_stop(std::string_view str, std::string_view stop) {
807
0
    if (!str.empty() && !stop.empty()) {
808
0
        const size_t max_len = std::min(str.size(), stop.size());
809
0
        const char last_char = str.back();
810
0
        for (size_t len = max_len; len > 0; --len) {
811
0
            if (stop[len - 1] == last_char) {
812
0
                if (string_ends_with(str, stop.substr(0, len))) {
813
0
                    return str.size() - len;
814
0
                }
815
0
            }
816
0
        }
817
0
    }
818
0
    return std::string::npos;
819
0
}
820
821
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
822
void string_process_escapes(std::string & input);
823
824
std::string string_from(bool value);
825
std::string string_from(const std::vector<int> & values);
826
std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
827
std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
828
829
bool glob_match(const std::string & pattern, const std::string & str);
830
831
//
832
// Filesystem utils
833
//
834
835
bool fs_validate_filename(const std::string & filename, bool allow_subdirs = false);
836
bool fs_create_directory_with_parents(const std::string & path);
837
bool fs_is_directory(const std::string & path);
838
839
std::string fs_get_cache_directory();
840
std::string fs_get_cache_file(const std::string & filename);
841
842
// element type of the list returned by fs_list()
struct common_file_info {
    std::string path;
    std::string name;
    size_t      size{0};       // in bytes
    bool        is_dir{false};
};
848
std::vector<common_file_info> fs_list(const std::string & path, bool include_directories);
849
850
//
851
// TTY utils
852
//
853
854
// Auto-detect if colors can be enabled based on terminal and environment
855
bool tty_can_use_colors();
856
857
//
858
// Model utils
859
//
860
861
struct common_sampler;
862
863
// note: defines the lifetimes of the model, context, samplers, etc.
864
struct common_init_result {
865
    // `params` is taken by non-const reference; `model_only` defaults to false
    common_init_result(common_params & params, bool model_only = false);
866
    ~common_init_result();
867
868
    // accessors hand out raw (non-smart) pointers
    // NOTE(review): presumably the pointees stay owned by this object - confirm in the .cpp
    llama_model * model();
869
    llama_context * context();
870
871
    // sampler lookup is keyed by sequence id
    common_sampler * sampler(llama_seq_id seq_id);
872
    void reset_samplers();
873
874
    // returns a mutable reference to the LoRA adapter list
    std::vector<llama_adapter_lora_ptr> & lora();
875
876
private:
877
    // pimpl: `impl` is only forward-declared here, its definition is not part of this header
    struct impl;
878
    std::unique_ptr<impl> pimpl;
879
};
880
881
using common_init_result_ptr = std::unique_ptr<common_init_result>;
882
883
common_init_result_ptr common_init_from_params(common_params & params, bool model_only = false);
884
885
struct llama_model_params     common_model_params_to_llama  (      common_params & params);
886
struct llama_context_params   common_context_params_to_llama(const common_params & params);
887
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const common_cpu_params & params);
888
889
// clear LoRA adapters from context, then apply new list of adapters
890
void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
891
892
// model endpoint from env
893
std::string common_get_model_endpoint();
894
895
//
896
// Context utils
897
//
898
899
// result type of common_context_can_seq_rm(): how much seq_rm support a context offers
enum common_context_seq_rm_type {
    COMMON_CONTEXT_SEQ_RM_TYPE_NO   = 0, // seq_rm not supported (e.g. no memory module)
    COMMON_CONTEXT_SEQ_RM_TYPE_PART = 1, // can seq_rm partial sequences
    COMMON_CONTEXT_SEQ_RM_TYPE_FULL = 2, // can seq_rm full sequences only
    COMMON_CONTEXT_SEQ_RM_TYPE_RS   = 3, // can seq_rm partial sequences, bounded by n_rs_seq
};
905
906
// check if the llama_context can remove sequences
907
// note: clears the memory of the context
908
common_context_seq_rm_type common_context_can_seq_rm(llama_context * ctx);
909
910
// aborts execution on failure
911
void common_context_seq_rm (llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1);
912
void common_context_seq_add(llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta);
913
void common_context_seq_cp (llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1);
914
915
//
916
// Batch utils
917
//
918
919
void common_batch_clear(struct llama_batch & batch);
920
921
void common_batch_add(
922
                 struct llama_batch & batch,
923
                        llama_token   id,
924
                          llama_pos   pos,
925
    const std::vector<llama_seq_id> & seq_ids,
926
                               bool   logits);
927
928
// decodes a single batch of tokens for a prompt and manages session tokens
929
//
930
// Note: We save state before the last token so that we can replay it to ensure
931
// compatibility with all memory types. Recurrent/hybrid models cannot remove
932
// tokens from memory, so this approach works across all model architectures.
933
bool common_prompt_batch_decode(
934
              struct llama_context * ctx,
935
    const std::vector<llama_token> & all_tokens,
936
                               int   n_new,
937
                               int & n_past,
938
                               int   n_batch,
939
                  std::string_view   state_path,
940
                              bool   save_state);
941
942
// replays the last token after loading state to regenerate logits
943
// used after loading session state to ensure the sampling context has valid logits
944
bool common_replay_last_token(struct llama_context * ctx, llama_token last_token, int32_t pos);
945
946
//
947
// Vocab utils
948
//
949
950
// tokenizes a string into a vector of tokens
951
// should work similar to Python's `tokenizer.encode`
952
std::vector<llama_token> common_tokenize(
953
  const struct llama_context * ctx,
954
           const std::string & text,
955
                        bool   add_special,
956
                        bool   parse_special = false);
957
958
std::vector<llama_token> common_tokenize(
959
    const struct llama_vocab * vocab,
960
           const std::string & text,
961
                        bool   add_special,
962
                        bool   parse_special = false);
963
964
// tokenizes a token into a piece, optionally renders special/control tokens
965
// should work similar to Python's `tokenizer.id_to_piece`
966
std::string common_token_to_piece(
967
        const struct llama_context * ctx,
968
                       llama_token   token,
969
                       bool          special = true);
970
971
std::string common_token_to_piece(
972
          const struct llama_vocab * vocab,
973
                       llama_token   token,
974
                       bool          special = true);
975
976
// detokenizes a vector of tokens into a string
977
// should work similar to Python's `tokenizer.decode`
978
// optionally renders special/control tokens
979
std::string common_detokenize(
980
            const struct llama_context * ctx,
981
        const std::vector<llama_token> & tokens,
982
                                  bool   special = true);
983
984
std::string common_detokenize(
985
              const struct llama_vocab * vocab,
986
        const std::vector<llama_token> & tokens,
987
                                  bool   special = true);
988
989
//
990
// Embedding utils
991
//
992
993
// TODO: replace embd_norm with an enum
994
void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);
995
996
float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
997
998
//
999
// Control vector utils
1000
//
1001
1002
// Control vector payload: a flat float buffer plus the per-layer width used to slice it.
// The struct stays an aggregate, so `{ n_embd, data }` initialization keeps working.
struct common_control_vector_data {
    // width of one layer's slice in `data`
    // defaults to 0 so that a default-constructed object never holds an indeterminate value
    int n_embd = 0;

    // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
    std::vector<float> data;
};
1008
1009
// One input of common_control_vector_load(): a file name and the factor its vector is scaled by.
// The struct stays an aggregate, so `{ strength, fname }` initialization keeps working.
struct common_control_vector_load_info {
    // scale factor applied to the loaded vector
    // defaults to 0 so that a default-constructed object never holds an indeterminate value
    float strength = 0.0f;

    std::string fname;
};
1014
1015
// Load control vectors, scale each by strength, and add them together.
1016
// On error, returns {-1, empty}
1017
common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos);
1018
1019
//
1020
// Split utils
1021
//
1022
1023
// split-related key strings
// kept in an unnamed namespace: every translation unit that includes this header gets its own copy
namespace {

constexpr const char * LLM_KV_SPLIT_NO            = "split.no";
constexpr const char * LLM_KV_SPLIT_COUNT         = "split.count";
constexpr const char * LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";

} // namespace
1030
1031
//
1032
// MoE utils
1033
//
1034
1035
// regex fragment: a literal ".ffn_", one of up/down/gate/gate_up, "_", an optional "ch", then "exps"
// (e.g. matches ".ffn_up_exps" and ".ffn_gate_chexps"); presumably these are the MoE expert tensors - confirm
const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate|gate_up)_(ch|)exps";
1036
1037
0
inline std::string llm_ffn_exps_block_regex(int idx) {
1038
0
    return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);
1039
0
}
1040
1041
0
// pairs LLM_FFN_EXPS_REGEX with the buffer type returned by ggml_backend_cpu_buffer_type()
inline llama_model_tensor_buft_override llm_ffn_exps_cpu_override() {
    llama_model_tensor_buft_override cpu_override = { LLM_FFN_EXPS_REGEX, ggml_backend_cpu_buffer_type() };
    return cpu_override;
}
1044
1045
//
1046
// training utils
1047
//
1048
1049
ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride);
1050
1051
// "adamw" or "sgd" (case insensitive)
1052
enum ggml_opt_optimizer_type common_opt_get_optimizer(const char *);
1053
1054
//
1055
// prompt utils
1056
//
1057
1058
// Snapshot record used by the prompt utilities: a token count, a position range and two byte buffers.
// Only declarations are visible here; the member functions are defined outside this header.
struct common_prompt_checkpoint {
1059
    // NOTE: no default member initializers - the scalar fields below are indeterminate until assigned
    int64_t n_tokens;
1060
1061
    llama_pos pos_min;
1062
    llama_pos pos_max;
1063
1064
    // raw byte buffers
    // NOTE(review): presumably serialized sequence state of the target ("tgt") and draft ("dft") contexts - confirm
    std::vector<uint8_t> data_tgt;
1065
    std::vector<uint8_t> data_dft;
1066
1067
    size_t size() const;
1068
1069
    bool empty() const;
1070
    void clear();
1071
1072
    // parameters mirror the three scalar fields above
    void update_pos(
1073
            int64_t n_tokens,
1074
            llama_pos pos_min,
1075
            llama_pos pos_max);
1076
1077
    // update_* are non-const, load_* are const; all four take (ctx, seq_id, flags)
    // NOTE(review): presumably update_* capture state from `ctx` into data_tgt/data_dft and load_* restore it - confirm
    void update_tgt(
1078
            llama_context * ctx,
1079
            llama_seq_id seq_id,
1080
            llama_state_seq_flags flags);
1081
1082
    void update_dft(
1083
            llama_context * ctx,
1084
            llama_seq_id seq_id,
1085
            llama_state_seq_flags flags);
1086
1087
    void load_tgt(
1088
            llama_context * ctx,
1089
            llama_seq_id seq_id,
1090
            llama_state_seq_flags flags) const;
1091
1092
    void load_dft(
1093
            llama_context * ctx,
1094
            llama_seq_id seq_id,
1095
            llama_state_seq_flags flags) const;
1096
1097
    void clear_tgt();
1098
    void clear_dft();
1099
};