Coverage Report

Created: 2025-11-28 06:56

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/llama.cpp/common/common.h
Line
Count
Source
1
// Various helper functions and utilities
2
3
#pragma once
4
5
#include "ggml-opt.h"
6
#include "llama-cpp.h"
7
8
#include <set>
9
#include <sstream>
10
#include <string>
11
#include <string_view>
12
#include <vector>
13
#include <map>
14
15
#ifdef _WIN32
16
#define DIRECTORY_SEPARATOR '\\'
17
#else
18
0
#define DIRECTORY_SEPARATOR '/'
19
#endif // _WIN32
20
21
#define die(msg)          do { fputs("error: " msg "\n", stderr);                exit(1); } while (0)
22
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
23
24
#define print_build_info() do {                                                                     \
25
    fprintf(stderr, "%s: build = %d (%s)\n",      __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);      \
26
    fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);    \
27
} while(0)
28
29
#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
30
31
// Scoped time-measurement helper; only the constructor/destructor are declared here,
// their definitions live outside this header.
// NOTE(review): presumably the destructor adds the elapsed microseconds to `t_acc`
// unless `disable` was passed - confirm against the definition.
// The const member and the reference member make this type non-assignable.
struct common_time_meas {
32
    common_time_meas(int64_t & t_acc, bool disable = false);
33
    ~common_time_meas();
34
35
    const int64_t t_start_us; // fixed at construction (const member)
36
37
    int64_t & t_acc; // refers to a caller-owned accumulator, which must outlive this object
38
};
39
40
// Description of one LoRA adapter (stored in common_params::lora_adapters as
// "lora adapter path with user defined scale").
// Note: `scale` and `ptr` have no default member initializers, so they are
// indeterminate after plain default-initialization.
struct common_adapter_lora_info {
41
    std::string path;  // adapter path
42
    float scale;       // user-defined scale
43
44
    std::string task_name;     // NOTE(review): presumably metadata describing the adapter - confirm where it is filled in
45
    std::string prompt_prefix; // NOTE(review): presumably metadata describing the adapter - confirm where it is filled in
46
47
    struct llama_adapter_lora * ptr; // non-owning-looking raw pointer to the adapter object - TODO confirm ownership
48
};
49
50
using llama_tokens = std::vector<llama_token>;
51
52
// build info
53
extern int LLAMA_BUILD_NUMBER;
54
extern const char * LLAMA_COMMIT;
55
extern const char * LLAMA_COMPILER;
56
extern const char * LLAMA_BUILD_TARGET;
57
58
struct common_control_vector_load_info;
59
60
//
61
// CPU utils
62
//
63
64
// CPU settings: thread count, affinity mask, scheduling priority and polling level.
// Every member has a default, so a default-constructed cpu_params is fully initialized.
// Instances appear in pairs elsewhere in this header (`cpuparams` / `cpuparams_batch`).
struct cpu_params {
65
    int      n_threads                   = -1; // NOTE(review): -1 presumably means "not set by the user" - confirm in postprocess_cpu_params
66
    bool     cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
67
    bool     mask_valid                  = false;   // Default: any CPU
68
    enum ggml_sched_priority  priority   = GGML_SCHED_PRIO_NORMAL;  // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
69
    bool     strict_cpu                  = false;   // Use strict CPU placement
70
    uint32_t poll                        = 50;      // Polling (busywait) level (0 - no polling, 100 - mostly polling)
71
};
72
73
int32_t cpu_get_num_physical_cores();
74
int32_t cpu_get_num_math();
75
76
//
77
// Common params
78
//
79
80
// Identifiers for the individual example programs.
// NOTE(review): presumably used to select which command-line options apply to which program - confirm in the argument parser.
// The enumerators use consecutive implicit values starting at 0, so LLAMA_EXAMPLE_COUNT
// equals the number of entries before it; it must remain the last enumerator.
enum llama_example {
81
    LLAMA_EXAMPLE_COMMON,
82
    LLAMA_EXAMPLE_SPECULATIVE,
83
    LLAMA_EXAMPLE_MAIN,
84
    LLAMA_EXAMPLE_EMBEDDING,
85
    LLAMA_EXAMPLE_PERPLEXITY,
86
    LLAMA_EXAMPLE_RETRIEVAL,
87
    LLAMA_EXAMPLE_PASSKEY,
88
    LLAMA_EXAMPLE_IMATRIX,
89
    LLAMA_EXAMPLE_BENCH,
90
    LLAMA_EXAMPLE_SERVER,
91
    LLAMA_EXAMPLE_CVECTOR_GENERATOR,
92
    LLAMA_EXAMPLE_EXPORT_LORA,
93
    LLAMA_EXAMPLE_MTMD,
94
    LLAMA_EXAMPLE_LOOKUP,
95
    LLAMA_EXAMPLE_PARALLEL,
96
    LLAMA_EXAMPLE_TTS,
97
    LLAMA_EXAMPLE_DIFFUSION,
98
    LLAMA_EXAMPLE_FINETUNE,
99
100
    LLAMA_EXAMPLE_COUNT, // number of examples above; keep last
101
};
102
103
// Sampler kinds that can appear in common_params_sampling::samplers.
// The numeric values are spelled out explicitly; value 5 is intentionally left
// unused (its former enumerator is commented out below), so do not reuse it
// or renumber the remaining entries.
enum common_sampler_type {
104
    COMMON_SAMPLER_TYPE_NONE        = 0,
105
    COMMON_SAMPLER_TYPE_DRY         = 1,
106
    COMMON_SAMPLER_TYPE_TOP_K       = 2,
107
    COMMON_SAMPLER_TYPE_TOP_P       = 3,
108
    COMMON_SAMPLER_TYPE_MIN_P       = 4,
109
  //COMMON_SAMPLER_TYPE_TFS_Z       = 5,
110
    COMMON_SAMPLER_TYPE_TYPICAL_P   = 6,
111
    COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
112
    COMMON_SAMPLER_TYPE_XTC         = 8,
113
    COMMON_SAMPLER_TYPE_INFILL      = 9,
114
    COMMON_SAMPLER_TYPE_PENALTIES   = 10,
115
    COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
116
};
117
118
// dimensionality reduction methods, used by cvector-generator
119
enum dimre_method {
120
    DIMRE_METHOD_PCA,
121
    DIMRE_METHOD_MEAN,
122
};
123
124
enum common_conversation_mode {
125
    COMMON_CONVERSATION_MODE_DISABLED = 0,
126
    COMMON_CONVERSATION_MODE_ENABLED  = 1,
127
    COMMON_CONVERSATION_MODE_AUTO     = 2,
128
};
129
130
enum common_grammar_trigger_type {
131
    COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN,
132
    COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
133
    COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
134
    COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
135
};
136
137
// One trigger for a lazy grammar (see common_params_sampling::grammar_triggers).
// NOTE(review): presumably `token` is used by TOKEN triggers and `value` by the
// WORD / PATTERN kinds - confirm against the code that consumes the triggers.
struct common_grammar_trigger {
138
    common_grammar_trigger_type type; // no default member initializer: indeterminate after plain default-initialization
139
    std::string value;
140
    llama_token token = LLAMA_TOKEN_NULL;
141
};
142
143
// Bit flags, one distinct bit per sampling parameter.
// NOTE(review): presumably OR-ed into common_params_sampling::user_sampling_config
// (documented there as a "bitfield to track user-specified samplers") - confirm at the call sites.
// The underlying type is uint64_t, but each `1 << n` initializer is evaluated as an int
// expression; a flag at bit 31 or above would need a 64-bit literal (e.g. `1ull << n`).
enum common_params_sampling_config : uint64_t {
144
    COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS        = 1 << 0,
145
    COMMON_PARAMS_SAMPLING_CONFIG_TOP_K           = 1 << 1,
146
    COMMON_PARAMS_SAMPLING_CONFIG_TOP_P           = 1 << 2,
147
    COMMON_PARAMS_SAMPLING_CONFIG_MIN_P           = 1 << 3,
148
    COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY = 1 << 4,
149
    COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD   = 1 << 5,
150
    COMMON_PARAMS_SAMPLING_CONFIG_TEMP            = 1 << 6,
151
    COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N  = 1 << 7,
152
    COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT  = 1 << 8,
153
    COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT        = 1 << 9,
154
    COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU    = 1 << 10,
155
    COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA    = 1 << 11,
156
};
157
158
159
// sampling parameters
160
struct common_params_sampling {
161
    uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
162
163
    int32_t n_prev             = 64;    // number of previous tokens to remember
164
    int32_t n_probs            = 0;     // if greater than 0, output the probabilities of top n_probs tokens.
165
    int32_t min_keep           = 0;     // 0 = disabled, otherwise samplers should return at least min_keep tokens
166
    int32_t top_k              = 40;    // <= 0 to use vocab size
167
    float   top_p              = 0.95f; // 1.0 = disabled
168
    float   min_p              = 0.05f; // 0.0 = disabled
169
    float   xtc_probability    = 0.00f; // 0.0 = disabled
170
    float   xtc_threshold      = 0.10f; // > 0.5 disables XTC
171
    float   typ_p              = 1.00f; // typical_p, 1.0 = disabled
172
    float   temp               = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
173
    float   dynatemp_range     = 0.00f; // 0.0 = disabled
174
    float   dynatemp_exponent  = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
175
    int32_t penalty_last_n     = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
176
    float   penalty_repeat     = 1.00f; // 1.0 = disabled
177
    float   penalty_freq       = 0.00f; // 0.0 = disabled
178
    float   penalty_present    = 0.00f; // 0.0 = disabled
179
    float   dry_multiplier     = 0.0f;  // 0.0 = disabled;      DRY repetition penalty for tokens extending repetition:
180
    float   dry_base           = 1.75f; // 0.0 = disabled;      multiplier * base ^ (length of sequence before token - allowed length)
181
    int32_t dry_allowed_length = 2;     // tokens extending repetitions beyond this receive penalty
182
    int32_t dry_penalty_last_n = -1;    // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
183
    int32_t mirostat           = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
184
    float   top_n_sigma        = -1.00f;// -1.0 = disabled
185
    float   mirostat_tau       = 5.00f; // target entropy
186
    float   mirostat_eta       = 0.10f; // learning rate
187
    bool    ignore_eos         = false;
188
    bool    no_perf            = false; // disable performance metrics
189
    bool    timing_per_token   = false;
190
191
    uint64_t user_sampling_config = 0; // bitfield to track user-specified samplers
192
193
    std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"};     // default sequence breakers for DRY
194
195
196
    std::vector<enum common_sampler_type> samplers = {
197
        COMMON_SAMPLER_TYPE_PENALTIES,
198
        COMMON_SAMPLER_TYPE_DRY,
199
        COMMON_SAMPLER_TYPE_TOP_N_SIGMA,
200
        COMMON_SAMPLER_TYPE_TOP_K,
201
        COMMON_SAMPLER_TYPE_TYPICAL_P,
202
        COMMON_SAMPLER_TYPE_TOP_P,
203
        COMMON_SAMPLER_TYPE_MIN_P,
204
        COMMON_SAMPLER_TYPE_XTC,
205
        COMMON_SAMPLER_TYPE_TEMPERATURE,
206
    };
207
208
    std::string                         grammar; // optional BNF-like grammar to constrain sampling
209
    bool                                grammar_lazy = false;
210
    std::vector<common_grammar_trigger> grammar_triggers; // optional triggers (for lazy grammars)
211
    std::set<llama_token>               preserved_tokens;
212
213
    std::vector<llama_logit_bias> logit_bias;     // logit biases to apply
214
    std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens
215
216
    // print the parameters into a string
217
    std::string print() const;
218
};
219
220
struct common_params_model {
221
    std::string path        = ""; // model local path                                       // NOLINT
222
    std::string url         = ""; // model url to download                                  // NOLINT
223
    std::string hf_repo     = ""; // HF repo                                                // NOLINT
224
    std::string hf_file     = ""; // HF file                                                // NOLINT
225
    std::string docker_repo = ""; // Docker repo                                            // NOLINT
226
};
227
228
struct common_params_speculative {
229
    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
230
231
    int32_t n_ctx        =     0; // draft context size
232
    int32_t n_max        =    16; // maximum number of tokens to draft during speculative decoding
233
    int32_t n_min        =     0; // minimum number of draft tokens to use for speculative decoding
234
    int32_t n_gpu_layers =    -1; // number of layers to store in VRAM for the draft model (-1 - use default)
235
    float   p_split      =  0.1f; // speculative decoding split probability
236
    float   p_min        = 0.75f; // minimum speculative decoding probability (greedy)
237
    std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
238
    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
239
240
    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
241
    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
242
243
    struct cpu_params cpuparams;
244
    struct cpu_params cpuparams_batch;
245
246
    struct common_params_model model;
247
};
248
249
struct common_params_vocoder {
250
    struct common_params_model model;
251
252
    std::string speaker_file = ""; // speaker file path                                      // NOLINT
253
254
    bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy            // NOLINT
255
};
256
257
struct common_params_diffusion {
258
    int32_t steps         = 128;
259
    bool    visual_mode   = false;
260
261
    float   eps           = 0;        // epsilon for timesteps
262
    int32_t block_length  = 0;        // block length for generation
263
264
    int32_t algorithm     = 4;        // default algorithm: low-confidence
265
    float   alg_temp      = 0.0f;     // algorithm temperature
266
267
    float   cfg_scale     = 0;        // classifier-free guidance scale
268
    bool    add_gumbel_noise = false; // add gumbel noise to the logits if temp > 0.0
269
};
270
271
// reasoning API response format (not to be confused as chat template's reasoning format)
272
enum common_reasoning_format {
273
    COMMON_REASONING_FORMAT_NONE,
274
    COMMON_REASONING_FORMAT_AUTO,            // Same as deepseek, using `message.reasoning_content`
275
    COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
276
    COMMON_REASONING_FORMAT_DEEPSEEK,        // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
277
    // do not extend this enum unless you absolutely have to
278
    // in most cases, use COMMON_REASONING_FORMAT_AUTO
279
    // see: https://github.com/ggml-org/llama.cpp/pull/15408
280
};
281
282
283
// Learning-rate options for fine-tuning.
//
// Usage contract (from the member comments): call init() after argument parsing
// and before any get_lr() call; the optimizer's outer loop updates `epoch`.
struct lr_opt {
    float    lr0          = 1e-5; // learning rate at first epoch
    float    lr_min       = -1;
    float    decay_epochs = -1;   // if >0, the learning rate starts at lr0 and decays to lr_min after this many epochs
    float    scale_epoch  = 0;
    float    wd           = 0;
    unsigned epochs       = 2;

    // set by optimizer outer (epochs) loop;
    // starts at 0 so that get_lr() never reads an indeterminate value
    unsigned epoch = 0;

    // learning rate decay - constant LR per epoch only for now
    float get_lr(float e) const;

    // learning rate for the current `epoch`
    float get_lr() const { return get_lr(static_cast<float>(epoch)); }

    // must call after arg parse, before get_lr
    void init();
};
298
299
struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);
300
301
struct common_params {
302
    int32_t n_predict             =    -1; // new tokens to predict
303
    int32_t n_ctx                 =  4096; // context size
304
    int32_t n_batch               =  2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
305
    int32_t n_ubatch              =   512; // physical batch size for prompt processing (must be >=32 to use BLAS)
306
    int32_t n_keep                =     0; // number of tokens to keep from initial prompt
307
    int32_t n_chunks              =    -1; // max number of chunks to process (-1 = unlimited)
308
    int32_t n_parallel            =     1; // number of parallel sequences to decode
309
    int32_t n_sequences           =     1; // number of sequences to decode
310
    int32_t grp_attn_n            =     1; // group-attention factor
311
    int32_t grp_attn_w            =   512; // group-attention width
312
    int32_t n_print               =    -1; // print token count every n tokens (-1 = disabled)
313
    float   rope_freq_base        =  0.0f; // RoPE base frequency
314
    float   rope_freq_scale       =  0.0f; // RoPE frequency scaling factor
315
    float   yarn_ext_factor       = -1.0f; // YaRN extrapolation mix factor
316
    float   yarn_attn_factor      = -1.0f; // YaRN magnitude scaling factor
317
    float   yarn_beta_fast        = -1.0f; // YaRN low correction dim
318
    float   yarn_beta_slow        = -1.0f; // YaRN high correction dim
319
    int32_t yarn_orig_ctx         =     0; // YaRN original context length
320
321
    // offload params
322
    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
323
324
    int32_t n_gpu_layers      = -1;  // number of layers to store in VRAM (-1 - use default)
325
    int32_t main_gpu          = 0;   // the GPU that is used for scratch and small tensors
326
    float   tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
327
328
    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
329
330
    struct cpu_params cpuparams;
331
    struct cpu_params cpuparams_batch;
332
333
    ggml_backend_sched_eval_callback cb_eval = nullptr;
334
    void * cb_eval_user_data                 = nullptr;
335
336
    ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
337
338
    enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
339
    enum llama_pooling_type      pooling_type      = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
340
    enum llama_attention_type    attention_type    = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
341
    enum llama_flash_attn_type   flash_attn_type   = LLAMA_FLASH_ATTN_TYPE_AUTO; // whether to use Flash Attention
342
343
    struct common_params_sampling    sampling;
344
    struct common_params_speculative speculative;
345
    struct common_params_vocoder     vocoder;
346
    struct common_params_diffusion   diffusion;
347
348
    struct common_params_model model;
349
350
    std::string model_alias          = ""; // model alias                                                   // NOLINT
351
    std::string hf_token             = ""; // HF token                                                      // NOLINT
352
    std::string prompt               = "";                                                                  // NOLINT
353
    std::string system_prompt        = "";                                                                  // NOLINT
354
    std::string prompt_file          = ""; // store the external prompt file name                           // NOLINT
355
    std::string path_prompt_cache    = ""; // path to file for saving/loading prompt eval state             // NOLINT
356
    std::string input_prefix         = ""; // string to prefix user inputs with                             // NOLINT
357
    std::string input_suffix         = ""; // string to suffix user inputs with                             // NOLINT
358
    std::string lookup_cache_static  = ""; // path of static ngram cache file for lookup decoding           // NOLINT
359
    std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding          // NOLINT
360
    std::string logits_file          = ""; // file for saving *all* logits                                  // NOLINT
361
362
    std::vector<std::string> in_files;   // all input files
363
    std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
364
    std::vector<llama_model_kv_override> kv_overrides;
365
    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
366
367
    bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
368
    std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
369
370
    std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
371
372
    int32_t verbosity                  = 0;
373
    int32_t control_vector_layer_start = -1; // layer range for control vector
374
    int32_t control_vector_layer_end   = -1; // layer range for control vector
375
    bool    offline                    = false;
376
377
    int32_t ppl_stride      = 0;     // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
378
    int32_t ppl_output_type = 0;     // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
379
                                     //                                       (which is more convenient to use for plotting)
380
                                     //
381
    bool   hellaswag        = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
382
    size_t hellaswag_tasks  = 400;   // number of tasks to use when computing the HellaSwag score
383
384
    bool   winogrande       = false; // compute Winogrande score over random tasks from datafile supplied in prompt
385
    size_t winogrande_tasks = 0;     // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
386
387
    bool   multiple_choice  = false;  // compute TruthfulQA score over random tasks from datafile supplied in prompt
388
    size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
389
390
    bool   kl_divergence    = false; // compute KL divergence
391
392
    bool usage             = false; // print usage
393
    bool completion        = false; // print source-able completion script
394
    bool use_color         = false; // use color to distinguish generations and inputs
395
    bool special           = false; // enable special token output
396
    bool interactive       = false; // interactive mode
397
    bool interactive_first = false; // wait for user input immediately
398
    bool prompt_cache_all  = false; // save user input and generations to prompt cache
399
    bool prompt_cache_ro   = false; // open the prompt cache read-only and do not update it
400
401
    bool escape            = true;  // escape "\n", "\r", "\t", "\'", "\"", and "\\"
402
    bool multiline_input   = false; // reverse the usage of `\`
403
    bool simple_io         = false; // improves compatibility with subprocesses and limited consoles
404
    bool cont_batching     = true;  // insert new sequences for decoding on-the-fly
405
    bool no_perf           = false; // disable performance metrics
406
    bool ctx_shift         = false; // context shift on infinite text generation
407
    bool swa_full          = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
408
    bool kv_unified        = false; // enable unified KV cache
409
410
    bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
411
    bool use_mmap          = true;  // use mmap for faster loads
412
    bool use_mlock         = false; // use mlock to keep model in memory
413
    bool verbose_prompt    = false; // print prompt tokens before generation
414
    bool display_prompt    = true;  // print prompt before generation
415
    bool no_kv_offload     = false; // disable KV offloading
416
    bool warmup            = true;  // warmup run
417
    bool check_tensors     = false; // validate tensor data
418
    bool no_op_offload     = false; // globally disable offload host tensor operations to device
419
    bool no_extra_bufts    = false; // disable extra buffer types (used for weight repacking)
420
    bool no_host           = false; // bypass host buffer allowing extra buffers to be used
421
422
    bool single_turn       = false; // single turn chat conversation
423
424
    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
425
    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
426
427
    common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
428
429
    // multimodal models (see tools/mtmd)
430
    struct common_params_model mmproj;
431
    bool mmproj_use_gpu = true;     // use GPU for multimodal model
432
    bool no_mmproj = false;         // explicitly disable multimodal model
433
    std::vector<std::string> image; // path to image file(s)
434
    int image_min_tokens = -1;
435
    int image_max_tokens = -1;
436
437
    // finetune
438
    struct lr_opt lr;
439
    enum ggml_opt_optimizer_type optimizer = GGML_OPT_OPTIMIZER_TYPE_ADAMW;
440
    float val_split = 0.05f; // fraction of the data used for the validation set
441
442
    // embedding
443
    bool embedding         = false; // get only sentence embedding
444
    int32_t embd_normalize = 2;     // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
445
    std::string embd_out   = "";    // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
446
    std::string embd_sep   = "\n";  // separator of embeddings
447
    std::string cls_sep    = "\t";  // separator of classification sequences
448
449
    // server params
450
    int32_t port              = 8080;         // server listens on this network port
451
    int32_t timeout_read      = 600;          // http read timeout in seconds
452
    int32_t timeout_write     = timeout_read; // http write timeout in seconds
453
    int32_t n_threads_http    = -1;           // number of threads to process HTTP requests (TODO: support threadpool)
454
    int32_t n_cache_reuse     = 0;            // min chunk size to reuse from the cache via KV shifting
455
    int32_t n_ctx_checkpoints = 8;            // max number of context checkpoints per slot
456
    int32_t cache_ram_mib     = 8192;         // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.
457
458
    std::string hostname      = "127.0.0.1";
459
    std::string public_path   = "";                                                                         // NOLINT
460
    std::string api_prefix    = "";                                                                         // NOLINT
461
    std::string chat_template = "";                                                                         // NOLINT
462
    bool use_jinja = false;                                                                                 // NOLINT
463
    bool enable_chat_template = true;
464
    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
465
    int reasoning_budget = -1;
466
    bool prefill_assistant = true;                                                                          // if true, any trailing assistant message will be prefilled into the response
467
468
    std::vector<std::string> api_keys;
469
470
    std::string ssl_file_key  = "";                                                                         // NOLINT
471
    std::string ssl_file_cert = "";                                                                         // NOLINT
472
473
    std::map<std::string, std::string> default_template_kwargs;
474
475
    // "advanced" endpoints are disabled by default for better security
476
    bool webui            = true;
477
    bool endpoint_slots   = true;
478
    bool endpoint_props   = false; // only control POST requests, not GET
479
    bool endpoint_metrics = false;
480
481
    bool log_json = false;
482
483
    std::string slot_save_path;
484
485
    float slot_prompt_similarity = 0.1f;
486
487
    // batched-bench params
488
    bool is_pp_shared   = false;
489
    bool is_tg_separate = false;
490
491
    std::vector<int32_t> n_pp;
492
    std::vector<int32_t> n_tg;
493
    std::vector<int32_t> n_pl;
494
495
    // retrieval params
496
    std::vector<std::string> context_files; // context files to embed
497
498
    int32_t chunk_size = 64; // chunk size for context embedding
499
500
    std::string chunk_separator = "\n"; // chunk separator for context embedding
501
502
    // passkey params
503
    int32_t n_junk = 250; // number of times to repeat the junk text
504
    int32_t i_pos  = -1;  // position of the passkey in the junk text
505
506
    // imatrix params
507
    int32_t n_out_freq  = 10; // output the imatrix every n_out_freq iterations
508
    int32_t n_save_freq =  0; // save the imatrix every n_save_freq iterations
509
    int32_t i_chunk     =  0; // start processing from this chunk
510
    int8_t  imat_dat    =  0; // whether the legacy imatrix.dat format should be output (gguf <= 0 < dat)
511
512
    bool process_output  = false; // collect data for the output tensor
513
    bool compute_ppl     = true;  // whether to compute perplexity
514
    bool show_statistics = false; // show imatrix statistics per tensor
515
    bool parse_special   = false; // whether to parse special tokens during imatrix tokenization
516
517
    // cvector-generator params
518
    int n_pca_batch = 100;
519
    int n_pca_iterations = 1000;
520
    dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
521
    std::string cvector_positive_file = "tools/cvector-generator/positive.txt";
522
    std::string cvector_negative_file = "tools/cvector-generator/negative.txt";
523
524
    bool spm_infill = false; // suffix/prefix/middle pattern for infill
525
526
    // batched-bench params
527
    bool batched_bench_output_jsonl = false;
528
529
    // common params
530
    std::string out_file; // output filename for all example programs
531
    // optional callback for model loading progress and cancellation:
532
    // called with a progress value between 0.0 and 1.0.
533
    // return false from callback to abort model loading or true to continue
534
    llama_progress_callback load_progress_callback = NULL;
535
    void *                  load_progress_callback_user_data = NULL;
536
537
0
    bool has_speculative() const {
538
0
        return !speculative.model.path.empty() || !speculative.model.hf_repo.empty();
539
0
    }
540
};
541
542
// call once at the start of a program if it uses libcommon
543
// initializes the logging system and prints info about the build
544
void common_init();
545
546
std::string common_params_get_system_info(const common_params & params);
547
548
bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]);
549
bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
550
void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
551
bool set_process_priority(enum ggml_sched_priority prio);
552
553
//
554
// String utils
555
//
556
557
#ifdef __GNUC__
558
#    if defined(__MINGW32__) && !defined(__clang__)
559
#        define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
560
#    else
561
#        define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
562
#    endif
563
#else
564
#    define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
565
#endif
566
567
LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
568
std::string string_format(const char * fmt, ...);
569
570
std::string string_strip(const std::string & str);
571
std::string string_get_sortable_timestamp();
572
573
std::string string_join(const std::vector<std::string> & values, const std::string & separator);
574
std::vector<std::string> string_split(const std::string & str, const std::string & delimiter);
575
std::string string_repeat(const std::string & str, size_t n);
576
577
void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
578
579
std::string regex_escape(const std::string & s);
580
581
// Split `str` on `delim` and convert every piece to T with operator>>.
//
// - An empty input yields an empty vector.
// - A piece that cannot be parsed (including an empty piece such as the middle
//   of "1,,3") yields a value-initialized T instead of an indeterminate one.
// - A trailing delimiter does not produce an extra element (std::getline semantics).
//
// std::string has its own specialization below, which keeps empty pieces verbatim.
template<class T>
static std::vector<T> string_split(const std::string & str, char delim) {
    static_assert(!std::is_same<T, std::string>::value, "Please use the specialized version for std::string");
    std::vector<T> values;
    std::istringstream str_stream(str);
    std::string token;
    while (std::getline(str_stream, token, delim)) {
        // Value-initialize: for an empty token the stream is at EOF before
        // extraction starts, so operator>> would leave `value` untouched.
        T value{};
        std::istringstream token_stream(token);
        token_stream >> value;
        values.push_back(value);
    }
    return values;
}

// std::string specialization: plain substring split, no parsing.
// Always returns at least one element; "" -> {""}, "a," -> {"a", ""}.
template<>
std::vector<std::string> string_split<std::string>(const std::string & input, char separator)
{
    std::vector<std::string> parts;
    size_t begin_pos = 0;
    size_t separator_pos = input.find(separator);
    while (separator_pos != std::string::npos) {
        // emplace the temporary directly so it is moved instead of copied
        parts.emplace_back(input.substr(begin_pos, separator_pos - begin_pos));
        begin_pos = separator_pos + 1;
        separator_pos = input.find(separator, begin_pos);
    }
    // remainder after the last separator (the whole input if there was none)
    parts.emplace_back(input.substr(begin_pos));
    return parts;
}
Unexecuted instantiation: fuzz_inference.cpp:std::__1::vector<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::allocator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > > > string_split<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, char)
Unexecuted instantiation: common.cpp:std::__1::vector<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::allocator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > > > string_split<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, char)
Unexecuted instantiation: sampling.cpp:std::__1::vector<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::allocator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > > > string_split<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, char)
611
612
// Returns true when `str` begins with `prefix`; an empty prefix always matches.
// Stand-in while we wait for C++20's std::string::starts_with.
static bool string_starts_with(const std::string & str, const std::string & prefix) {
    const std::string::size_type prefix_len = prefix.size();
    if (prefix_len > str.size()) {
        // a prefix longer than the string can never match
        return false;
    }
    return str.compare(0, prefix_len, prefix) == 0;
}
Unexecuted instantiation: fuzz_inference.cpp:string_starts_with(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&)
Unexecuted instantiation: common.cpp:string_starts_with(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&)
Unexecuted instantiation: sampling.cpp:string_starts_with(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&)
616
617
// While we wait for C++20's std::string::ends_with...
618
bool string_ends_with(const std::string_view & str, const std::string_view & suffix);
619
bool string_remove_suffix(std::string & str, const std::string_view & suffix);
620
size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop);
621
622
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
623
void string_process_escapes(std::string & input);
624
625
std::string string_from(bool value);
626
std::string string_from(const std::vector<int> & values);
627
std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
628
std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
629
630
//
631
// Filesystem utils
632
//
633
634
bool fs_validate_filename(const std::string & filename);
635
bool fs_create_directory_with_parents(const std::string & path);
636
637
std::string fs_get_cache_directory();
638
std::string fs_get_cache_file(const std::string & filename);
639
640
// Describes a single file entry; fs_list_files() returns a vector of these.
struct common_file_info {
641
    std::string path; // NOTE(review): presumably the full path of the entry - confirm against fs_list_files
642
    std::string name; // NOTE(review): presumably the file name without its directory - confirm
643
    size_t      size = 0; // in bytes
644
};
645
std::vector<common_file_info> fs_list_files(const std::string & path);
646
647
//
648
// Model utils
649
//
650
651
// note: defines object's lifetime
652
// Bundles the model, context and LoRA adapter handles returned by common_init_from_params().
// NOTE(review): the *_ptr members are presumably owning smart pointers declared in llama-cpp.h,
// which would explain the "defines object's lifetime" remark - confirm before relying on it.
struct common_init_result {
653
    llama_model_ptr   model;
654
    llama_context_ptr context;
655
656
    std::vector<llama_adapter_lora_ptr> lora; // one handle per adapter
657
};
658
659
struct common_init_result     common_init_from_params(common_params & params);
660
661
struct llama_model_params     common_model_params_to_llama  (      common_params & params);
662
struct llama_context_params   common_context_params_to_llama(const common_params & params);
663
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
664
665
// clear LoRA adapters from context, then apply new list of adapters
666
void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
667
668
std::string                   get_model_endpoint();
669
670
//
671
// Batch utils
672
//
673
674
void common_batch_clear(struct llama_batch & batch);
675
676
void common_batch_add(
677
                 struct llama_batch & batch,
678
                        llama_token   id,
679
                          llama_pos   pos,
680
    const std::vector<llama_seq_id> & seq_ids,
681
                               bool   logits);
682
683
//
684
// Token utils
685
//
686
687
// longest common prefix
688
size_t common_lcp(const llama_tokens & a, const llama_tokens & b);
689
690
// longest common subsequence
691
size_t common_lcs(const llama_tokens & a, const llama_tokens & b);
692
693
//
694
// Vocab utils
695
//
696
697
// tokenizes a string into a vector of tokens
698
// should work similar to Python's `tokenizer.encode`
699
std::vector<llama_token> common_tokenize(
700
  const struct llama_context * ctx,
701
           const std::string & text,
702
                        bool   add_special,
703
                        bool   parse_special = false);
704
705
std::vector<llama_token> common_tokenize(
706
    const struct llama_vocab * vocab,
707
           const std::string & text,
708
                        bool   add_special,
709
                        bool   parse_special = false);
710
711
// converts a token into a piece, optionally renders special/control tokens
712
// should work similar to Python's `tokenizer.id_to_piece`
713
std::string common_token_to_piece(
714
        const struct llama_context * ctx,
715
                       llama_token   token,
716
                       bool          special = true);
717
718
std::string common_token_to_piece(
719
          const struct llama_vocab * vocab,
720
                       llama_token   token,
721
                       bool          special = true);
722
723
// detokenizes a vector of tokens into a string
724
// should work similar to Python's `tokenizer.decode`
725
// optionally renders special/control tokens
726
std::string common_detokenize(
727
            const struct llama_context * ctx,
728
        const std::vector<llama_token> & tokens,
729
                                  bool   special = true);
730
731
std::string common_detokenize(
732
              const struct llama_vocab * vocab,
733
        const std::vector<llama_token> & tokens,
734
                                  bool   special = true);
735
736
//
737
// Embedding utils
738
//
739
740
// TODO: replace embd_norm with an enum
741
void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);
742
743
float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
744
745
//
746
// Control vector utils
747
//
748
749
// Combined control vector as returned by common_control_vector_load().
struct common_control_vector_data {
750
    int n_embd; // number of floats per layer in `data`; the loader reports -1 here on error
751
752
    // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
753
    std::vector<float> data; // empty when loading failed
754
};
755
756
// One input to common_control_vector_load(): a control vector source and its weight.
struct common_control_vector_load_info {
757
    float strength; // factor the loaded vector is scaled by before the vectors are added together
758
759
    std::string fname; // NOTE(review): presumably the path of the control vector file - confirm in the loader
760
};
761
762
// Load control vectors, scale each by strength, and add them together.
763
// On error, returns {-1, empty}
764
common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos);
765
766
//
767
// Split utils
768
//
769
770
// Key-name strings for split metadata.
// NOTE(review): presumably GGUF metadata keys describing a multi-file (split) model - confirm at the use sites.
// The unnamed namespace gives these constants internal linkage, so every translation unit
// that includes this header gets its own copy.
namespace {
771
772
const char * const LLM_KV_SPLIT_NO            = "split.no";
773
const char * const LLM_KV_SPLIT_COUNT         = "split.count";
774
const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
775
776
} // namespace
777
778
//
779
// MoE utils
780
//
781
782
// Regex fragment matching ".ffn_up_", ".ffn_down_" or ".ffn_gate_" followed by either "exps" or "chexps".
// NOTE(review): presumably applied to tensor names to select MoE expert FFN weights - confirm at the use sites.
const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_(ch|)exps";
783
784
0
static std::string llm_ffn_exps_block_regex(int idx) {
785
0
    return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);
786
0
}
Unexecuted instantiation: fuzz_inference.cpp:llm_ffn_exps_block_regex(int)
Unexecuted instantiation: common.cpp:llm_ffn_exps_block_regex(int)
Unexecuted instantiation: sampling.cpp:llm_ffn_exps_block_regex(int)
787
788
0
// Returns a tensor buffer-type override that pairs LLM_FFN_EXPS_REGEX with the
// buffer type reported by ggml_backend_cpu_buffer_type().
static llama_model_tensor_buft_override llm_ffn_exps_cpu_override() {
    llama_model_tensor_buft_override cpu_override = { LLM_FFN_EXPS_REGEX, ggml_backend_cpu_buffer_type() };
    return cpu_override;
}
Unexecuted instantiation: fuzz_inference.cpp:llm_ffn_exps_cpu_override()
Unexecuted instantiation: common.cpp:llm_ffn_exps_cpu_override()
Unexecuted instantiation: sampling.cpp:llm_ffn_exps_cpu_override()
791
792
//
793
// training utils
794
//
795
796
ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride);
797
798
// "adamw" or "sgd" (case insensitive)
799
enum ggml_opt_optimizer_type common_opt_get_optimizer(const char *);