/src/llama.cpp/src/llama-ext.h

Source
#pragma once

// this is a staging header for new llama.cpp API
// breaking changes and C++ are allowed. everything here should be considered WIP
// try as much as possible to not include this header in the rest of the codebase

#include "llama.h"

#include <cstdint>
#include <map>

// Reserve a new compute graph for the context `ctx`.
// The returned graph is valid until the next call to llama_graph_reserve.
// NOTE(review): n_tokens, n_seqs and n_outputs presumably describe the batch shape the
//               graph is reserved for - confirm against the implementation.
LLAMA_API struct ggml_cgraph * llama_graph_reserve(
        struct llama_context * ctx,
        uint32_t n_tokens,
        uint32_t n_seqs,
        uint32_t n_outputs);

// Get the default ggml_type for a given ftype.
LLAMA_API ggml_type llama_ftype_get_default_type(llama_ftype ftype);

// Quantization state. Only forward-declared in this header, so users of the header
// can hold it by pointer but cannot inspect its contents.
struct quantize_state_impl;

// Create a quantization state from a model and a set of quantization parameters.
// NOTE(review): presumably the returned state has to be released with llama_quant_free - confirm.
LLAMA_API quantize_state_impl * llama_quant_init(
        const llama_model * model,
        const llama_model_quantize_params * params);

// Free a quantization state.
// NOTE(review): presumably `qs` must come from llama_quant_init - confirm.
LLAMA_API void llama_quant_free(quantize_state_impl * qs);

// Descriptor for constructing a mock model for quantization testing.
// It is passed by pointer to llama_quant_model_from_metadata().
// The fields have no default member initializers: value-initialize the struct
// (e.g. `llama_quant_model_desc desc = {};`) or set every field before use.
// NOTE(review): the field names look like the usual model hyperparameters (embedding size,
//               feed-forward size, layer/head/expert counts, per-head K/V sizes) - confirm.
struct llama_quant_model_desc {
    const char * architecture; // NOTE(review): presumably a NUL-terminated architecture name - confirm
    uint32_t n_embd;
    uint32_t n_ff;
    uint32_t n_layer;
    uint32_t n_head;
    uint32_t n_head_kv;
    uint32_t n_expert;
    uint32_t n_embd_head_k;
    uint32_t n_embd_head_v;
};

// Create a mock model from a metadata descriptor (for testing).
// The returned model must be freed with llama_model_free().
LLAMA_API llama_model * llama_quant_model_from_metadata(const llama_quant_model_desc * desc);

// Returns true if this tensor should be quantized (based on name, dims, params).
LLAMA_API bool llama_quant_tensor_allows_quantization(
        const quantize_state_impl * qs,
        const ggml_tensor * tensor);

// Compute quantization type assignments for a list of tensors.
// All tensors should be quantizable (use llama_quant_tensor_allows_quantization to filter).
// result_types: caller-allocated array of n_tensors elements, filled with assigned types.
// NOTE(review): `tensors` is presumably an array of n_tensors tensor pointers, with
//               result_types[i] being the type assigned to tensors[i] - confirm.
LLAMA_API void llama_quant_compute_types(
        quantize_state_impl * qs,
        llama_ftype ftype,
        ggml_tensor ** tensors,
        ggml_type * result_types,
        size_t n_tensors);

//
// device memory querying
//

// "memory" as in physical memory for a buffer type, in bytes
struct llama_memory_breakdown_data {
    // per-category sizes; every category starts at zero
    size_t model   = 0; // share used by the model
    size_t context = 0; // share used by the context
    size_t compute = 0; // share used by temporary compute buffers

    // sum of all three categories
    size_t total() const {
        size_t sum = model;
        sum += context;
        sum += compute;
        return sum;
    }
};

// Memory information for one device, combined with a memory breakdown.
// All members are zero-initialized by default, so a default-constructed instance
// never carries indeterminate values (consistent with llama_memory_breakdown_data).
// NOTE(review): total/free are presumably the device's total and free memory in bytes - confirm.
struct llama_device_memory_data {
    int64_t total = 0;
    int64_t free  = 0;
    llama_memory_breakdown_data mb; // breakdown into model / context / compute
};

// TODO: convert to C-style data structure
// Maps a buffer type handle to the memory breakdown recorded for that buffer type.
using llama_memory_breakdown = std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data>;

// NOTE(review): presumably the number of experts / devices of the model - confirm.
LLAMA_API int32_t llama_model_n_expert (const struct llama_model * model);
LLAMA_API int32_t llama_model_n_devices(const struct llama_model * model);

// NOTE(review): `i` is presumably an index in [0, llama_model_n_devices(model)) - confirm.
LLAMA_API ggml_backend_dev_t llama_model_get_device(const struct llama_model * model, int i);

// Returns the memory breakdown for the context as a map (returned by value).
LLAMA_API llama_memory_breakdown llama_get_memory_breakdown(const struct llama_context * ctx);

// Set whether the context outputs nextn embeddings or not
// If masked == true,  output the embeddings only for the tokens with batch.logits != 0
// If masked == false, output the embeddings for all tokens in the batch regardless of batch.logits
LLAMA_API void llama_set_embeddings_nextn(struct llama_context * ctx, bool value, bool masked);

// Select which appended NextN block the DECODER_MTP graph runs (offset past
// the trunk: il = n_layer() + offset). Used by the speculative NextN driver to
// chain multiple trained NextN heads. Default 0 (first head).
LLAMA_API void llama_set_nextn_layer_offset(struct llama_context * ctx, int32_t offset);

// mirrors:
// LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
LLAMA_API float * llama_get_embeddings_nextn(struct llama_context * ctx);

// mirrors:
// LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
LLAMA_API float * llama_get_embeddings_nextn_ith(struct llama_context * ctx, int32_t i);

// Set whether the context outputs the input embeddings of a specific layer
// NOTE(review): `lid` is presumably the layer index - confirm.
LLAMA_API void llama_set_embeddings_layer_inp(struct llama_context * ctx, uint32_t lid, bool value);

// mirrors:
// LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
LLAMA_API float * llama_get_embeddings_layer_inp(struct llama_context * ctx, uint32_t lid);

// NOTE(review): undocumented - which context counts as the "other" one cannot be
//               determined from this header; confirm against the implementation.
LLAMA_API llama_context * llama_get_ctx_other(struct llama_context * ctx);

//
// model/context data extraction
//

// returns pointer to the target-model layer indices
// NOTE(review): the array presumably holds llama_model_target_layer_ids_n(model) entries - confirm.
LLAMA_API const int32_t * llama_model_target_layer_ids  (const struct llama_model * model);
// returns the number of extracted layers from target model
LLAMA_API uint32_t        llama_model_target_layer_ids_n(const struct llama_model * model);

Line	Count	Source
1		#pragma once
2
3		// this is a staging header for new llama.cpp API
4		// breaking changes and C++ are allowed. everything here should be considered WIP
5		// try as much as possible to not include this header in the rest of the codebase
6
7		#include "llama.h"
8
9		#include <cstdint>
10		#include <map>
11
12		// Reserve a new compute graph. It is valid until the next call to llama_graph_reserve.
13		LLAMA_API struct ggml_cgraph * llama_graph_reserve(
14		struct llama_context * ctx,
15		uint32_t n_tokens,
16		uint32_t n_seqs,
17		uint32_t n_outputs);
18
19		// Get the default ggml_type for a given ftype.
20		LLAMA_API ggml_type llama_ftype_get_default_type(llama_ftype ftype);
21
22		struct quantize_state_impl;
23
24		LLAMA_API quantize_state_impl * llama_quant_init(
25		const llama_model * model,
26		const llama_model_quantize_params * params);
27
28		LLAMA_API void llama_quant_free(quantize_state_impl * qs);
29
30		// Descriptor for constructing a mock model for quantization testing.
31		struct llama_quant_model_desc {
32		const char * architecture;
33		uint32_t n_embd;
34		uint32_t n_ff;
35		uint32_t n_layer;
36		uint32_t n_head;
37		uint32_t n_head_kv;
38		uint32_t n_expert;
39		uint32_t n_embd_head_k;
40		uint32_t n_embd_head_v;
41		};
42
43		// Create a mock model from a metadata descriptor (for testing).
44		// The returned model must be freed with llama_model_free().
45		LLAMA_API llama_model * llama_quant_model_from_metadata(const llama_quant_model_desc * desc);
46
47		// Returns true if this tensor should be quantized (based on name, dims, params).
48		LLAMA_API bool llama_quant_tensor_allows_quantization(
49		const quantize_state_impl * qs,
50		const ggml_tensor * tensor);
51
52		// Compute quantization type assignments for a list of tensors.
53		// All tensors should be quantizable (use llama_quant_tensor_allows_quantization to filter).
54		// result_types: caller-allocated array of n_tensors elements, filled with assigned types.
55		LLAMA_API void llama_quant_compute_types(
56		quantize_state_impl * qs,
57		llama_ftype ftype,
58		ggml_tensor ** tensors,
59		ggml_type * result_types,
60		size_t n_tensors);
61
62		//
63		// device memory querying
64		//
65
66		// "memory" as in physical memory for a buffer type, in bytes
67		struct llama_memory_breakdown_data {
68		size_t model = 0; // memory allocated for the model
69		size_t context = 0; // memory allocated for the context
70		size_t compute = 0; // memory allocated for temporary compute buffers
71
72	0	size_t total() const {
73	0	return model + context + compute;
74	0	}
75		};
76
77		struct llama_device_memory_data {
78		int64_t total;
79		int64_t free;
80		llama_memory_breakdown_data mb;
81		};
82
83		// TODO: convert to C-style data structure
84		using llama_memory_breakdown = std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data>;
85
86		LLAMA_API int32_t llama_model_n_expert (const struct llama_model * model);
87		LLAMA_API int32_t llama_model_n_devices(const struct llama_model * model);
88
89		LLAMA_API ggml_backend_dev_t llama_model_get_device(const struct llama_model * model, int i);
90
91		LLAMA_API llama_memory_breakdown llama_get_memory_breakdown(const struct llama_context * ctx);
92
93		// Set whether the context outputs nextn embeddings or not
94		// If masked == true, output the embeddings only for the tokens with batch.logits != 0
95		// If masked == false, output the embeddings for all tokens in the batch regardless of batch.logits
96		LLAMA_API void llama_set_embeddings_nextn(struct llama_context * ctx, bool value, bool masked);
97
98		// Select which appended NextN block the DECODER_MTP graph runs (offset past
99		// the trunk: il = n_layer() + offset). Used by the speculative NextN driver to
100		// chain multiple trained NextN heads. Default 0 (first head).
101		LLAMA_API void llama_set_nextn_layer_offset(struct llama_context * ctx, int32_t offset);
102
103		// mirrors:
104		// LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
105		LLAMA_API float * llama_get_embeddings_nextn(struct llama_context * ctx);
106
107		// LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
108		LLAMA_API float * llama_get_embeddings_nextn_ith(struct llama_context * ctx, int32_t i);
109
110		// Set whether the context outputs the input embeddings of a specific layer
111		LLAMA_API void llama_set_embeddings_layer_inp(struct llama_context * ctx, uint32_t lid, bool value);
112
113		// mirrors:
114		// LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
115		LLAMA_API float * llama_get_embeddings_layer_inp(struct llama_context * ctx, uint32_t lid);
116
117		LLAMA_API llama_context * llama_get_ctx_other(struct llama_context * ctx);
118
119		//
120		// model/context data extraction
121		//
122
123		// returns pointer to the target-model layer indices
124		LLAMA_API const int32_t * llama_model_target_layer_ids (const struct llama_model * model);
125		// returns the number of extracted layers from target model
126		LLAMA_API uint32_t llama_model_target_layer_ids_n(const struct llama_model * model);

Coverage Report

Created: 2026-06-22 06:47