/src/llama.cpp/src/llama-ext.h
Line | Count | Source |
1 | | #pragma once |
2 | | |
3 | | // this is a staging header for new llama.cpp APIs |
4 | | // breaking changes and C++ are allowed; everything here should be considered WIP |
5 | | // avoid including this header in the rest of the codebase as much as possible |
6 | | |
7 | | #include "llama.h" |
8 | | |
9 | | #include <cstdint> |
10 | | #include <map> |
11 | | |
12 | | // Reserve a new compute graph on ctx for the given n_tokens, n_seqs and n_outputs. The returned graph is only valid until the next call to llama_graph_reserve. |
13 | | LLAMA_API struct ggml_cgraph * llama_graph_reserve( |
14 | | struct llama_context * ctx, |
15 | | uint32_t n_tokens, |
16 | | uint32_t n_seqs, |
17 | | uint32_t n_outputs); |
18 | | |
19 | | // Returns the default ggml_type for the given llama_ftype. |
20 | | LLAMA_API ggml_type llama_ftype_get_default_type(llama_ftype ftype); |
21 | | |
22 | | struct quantize_state_impl; /* forward declaration only; the declarations in this header use the type solely through pointers */ |
23 | | |
24 | | LLAMA_API quantize_state_impl * llama_quant_init( /* takes a model and quantize params and returns a quantize_state_impl pointer; NOTE(review): presumably released with llama_quant_free - confirm */ |
25 | | const llama_model * model, |
26 | | const llama_model_quantize_params * params); |
27 | | |
28 | | LLAMA_API void llama_quant_free(quantize_state_impl * qs); /* NOTE(review): presumably the counterpart of llama_quant_init; whether qs may be null is not stated here - confirm */ |
29 | | |
30 | | // Descriptor for constructing a mock model for quantization testing; passed to llama_quant_model_from_metadata(). |
31 | | struct llama_quant_model_desc { |
32 | | const char * architecture; |
33 | | uint32_t n_embd; |
34 | | uint32_t n_ff; |
35 | | uint32_t n_layer; |
36 | | uint32_t n_head; |
37 | | uint32_t n_head_kv; |
38 | | uint32_t n_expert; |
39 | | uint32_t n_embd_head_k; |
40 | | uint32_t n_embd_head_v; |
41 | | }; |
42 | | |
43 | | // Create a mock model from a metadata descriptor (intended for testing). |
44 | | // The caller must free the returned model with llama_model_free(). |
45 | | LLAMA_API llama_model * llama_quant_model_from_metadata(const llama_quant_model_desc * desc); |
46 | | |
47 | | // Returns true if `tensor` should be quantized; the decision is based on its name, its dims, and the params. |
48 | | LLAMA_API bool llama_quant_tensor_allows_quantization( |
49 | | const quantize_state_impl * qs, |
50 | | const ggml_tensor * tensor); |
51 | | |
52 | | // Compute the quantization type assigned to each tensor in a list of tensors. |
53 | | // Every tensor passed in should be quantizable (filter with llama_quant_tensor_allows_quantization first). |
54 | | // result_types: caller-allocated array of n_tensors elements; it is filled with the assigned types. |
55 | | LLAMA_API void llama_quant_compute_types( |
56 | | quantize_state_impl * qs, |
57 | | llama_ftype ftype, |
58 | | ggml_tensor ** tensors, |
59 | | ggml_type * result_types, |
60 | | size_t n_tensors); |
61 | | |
62 | | // |
63 | | // device memory querying |
64 | | // |
65 | | |
66 | | // memory usage split by category, in bytes ("memory" as in physical memory for a buffer type) |
67 | | struct llama_memory_breakdown_data { |
68 | | size_t model = 0; // bytes allocated for the model |
69 | | size_t context = 0; // bytes allocated for the context |
70 | | size_t compute = 0; // bytes allocated for temporary compute buffers |
71 | | |
72 | 0 | size_t total() const {
73 | 0 | return model + context + compute;
74 | 0 | }
75 | | }; |
76 | | |
77 | | struct llama_device_memory_data { /* total/free counters plus a per-category breakdown (mb); NOTE(review): units are not stated here, presumably bytes - confirm */ |
78 | | int64_t total; |
79 | | int64_t free; |
80 | | llama_memory_breakdown_data mb; |
81 | | }; |
82 | | |
83 | | // Maps a buffer type to its llama_memory_breakdown_data. TODO: convert to a C-style data structure (std::map is C++ only) |
84 | | using llama_memory_breakdown = std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data>; |
85 | | |
86 | | LLAMA_API int32_t llama_model_n_expert (const struct llama_model * model); /* NOTE(review): presumably the model's expert count - confirm */ |
87 | | LLAMA_API int32_t llama_model_n_devices(const struct llama_model * model); /* NOTE(review): presumably the number of devices used by the model - confirm */ |
88 | | |
89 | | LLAMA_API ggml_backend_dev_t llama_model_get_device(const struct llama_model * model, int i); /* NOTE(review): presumably i must be in [0, llama_model_n_devices(model)) - confirm */ |
90 | | |
91 | | LLAMA_API llama_memory_breakdown llama_get_memory_breakdown(const struct llama_context * ctx); /* returns the llama_memory_breakdown map by value */ |
92 | | |
93 | | // Set whether the context outputs nextn embeddings |
94 | | // If masked == true, the embeddings are output only for the tokens with batch.logits != 0 |
95 | | // If masked == false, the embeddings are output for all tokens in the batch, regardless of batch.logits |
96 | | LLAMA_API void llama_set_embeddings_nextn(struct llama_context * ctx, bool value, bool masked); |
97 | | |
98 | | // Select which appended NextN block the DECODER_MTP graph runs. `offset` counts past |
99 | | // the trunk (il = n_layer() + offset). Used by the speculative NextN driver to |
100 | | // chain multiple trained NextN heads. The default is 0 (the first head). |
101 | | LLAMA_API void llama_set_nextn_layer_offset(struct llama_context * ctx, int32_t offset); |
102 | | |
103 | | // nextn variant; mirrors the signature of: |
104 | | // LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); |
105 | | LLAMA_API float * llama_get_embeddings_nextn(struct llama_context * ctx); |
106 | | |
107 | | // mirrors: LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i); |
108 | | LLAMA_API float * llama_get_embeddings_nextn_ith(struct llama_context * ctx, int32_t i); |
109 | | |
110 | | // Set whether the context outputs the input embeddings of a specific layer (selected by `lid`) |
111 | | LLAMA_API void llama_set_embeddings_layer_inp(struct llama_context * ctx, uint32_t lid, bool value); |
112 | | |
113 | | // mirrors the following, with an additional layer id (`lid`) parameter: |
114 | | // LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); |
115 | | LLAMA_API float * llama_get_embeddings_layer_inp(struct llama_context * ctx, uint32_t lid); |
116 | | |
117 | | LLAMA_API llama_context * llama_get_ctx_other(struct llama_context * ctx); /* NOTE(review): the relationship between ctx and the returned context is not documented here - confirm and document */ |
118 | | |
119 | | // |
120 | | // model/context data extraction |
121 | | // |
122 | | |
123 | | // returns a pointer to the target-model layer indices |
124 | | LLAMA_API const int32_t * llama_model_target_layer_ids (const struct llama_model * model); |
125 | | // returns the number of layers extracted from the target model |
126 | | LLAMA_API uint32_t llama_model_target_layer_ids_n(const struct llama_model * model); |