/src/llama.cpp/src/llama-model-loader.h

Source
#pragma once

#include "llama.h"

#include "llama-impl.h"
#include "llama-arch.h"
#include "llama-hparams.h"
#include "llama-mmap.h"

#include "ggml-cpp.h"

#include <climits>
#include <cstddef>
#include <cstdlib>
#include <cstring>
#include <map>
#include <stdexcept>
#include <unordered_map>

// maps a 32-bit integer key to a backend buffer handle
// NOTE(review): the key is presumably a file/mapping index - confirm against the users of llama_buf_map
using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;

// lists of buffer types used for each layer
// (each entry pairs a backend device handle with a buffer type handle)
using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;

// GGUF file format versions; each enumerator's value equals its version number (1..3)
enum llama_fver {
    GGUF_FILE_VERSION_V1 = 1,
    GGUF_FILE_VERSION_V2 = 2,
    GGUF_FILE_VERSION_V3 = 3,
};

// presumably returns a printable name for `version`; only declared here, defined elsewhere
// NOTE(review): the result for values outside llama_fver is not visible from this header
const char * llama_file_version_name(llama_fver version);

struct llama_model_loader {
    // Holds information on a model weight
    struct llama_tensor_weight {
        uint16_t  idx; // source file index
        size_t   offs; // tensor data offset in the original file

        ggml_tensor * tensor;

        llama_tensor_weight(const llama_file * file, uint16_t idx, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
            const int tensor_idx = gguf_find_tensor(gguf_ctx,  ggml_get_name(tensor));
            if (tensor_idx < 0) {
                throw std::runtime_error(format("tensor '%s' not found in the model", ggml_get_name(tensor)));
            }

            offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
            if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size()) {
                throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", ggml_get_name(tensor)));
            }
        }
    };

    // custom comparator to sort weights more nicely by layer
    struct weight_name_comparer {
        bool operator()(const std::string & a, const std::string & b) const {
            int a_layer = -1;
            int b_layer = -1;
            sscanf(a.c_str(), "blk.%d.", &a_layer);
            sscanf(b.c_str(), "blk.%d.", &b_layer);
            if (a_layer != b_layer) {
                return a_layer < b_layer;
            }
            return a < b;
        }
    };

    static const int TENSOR_NOT_REQUIRED    = 1 << 0;
    static const int TENSOR_DUPLICATED      = 1 << 1;
    static const int TENSOR_SKIP            = 1 << 2;
    static const int TENSOR_SKIP_IF_VIRTUAL = 1 << 3;

    int n_kv      = 0;
    int n_tensors = 0;
    int n_created = 0;

    uint64_t n_elements = 0;
    size_t   n_bytes    = 0;

    bool use_mmap = false;
    bool use_direct_io = false;
    bool check_tensors;
    bool no_alloc;

    llama_files files;
    llama_ftype ftype;
    llama_fver  fver;

    llama_mmaps mappings;

    std::map<std::string, llama_tensor_weight, weight_name_comparer> weights_map;
    std::unordered_map<std::string, llama_model_kv_override> kv_overrides;
    const llama_model_tensor_buft_override * tensor_buft_overrides;

    gguf_context_ptr metadata_ptr;
    struct gguf_context * metadata; // either metadata_ptr.get() or externally set
    llama_model_set_tensor_data_t set_tensor_data;
    void * set_tensor_data_ud;
    std::vector<ggml_context_ptr> contexts;

    std::string arch_name;
    LLM_KV      llm_kv    = LLM_KV(LLM_ARCH_UNKNOWN);

    size_t size_done = 0;
    size_t size_data = 0;
    std::vector<std::pair<size_t, size_t>> mmaps_used;

    // define a comparator for the buft -> ctx map to ensure that the order is well-defined:
    struct ggml_backend_buft_comparator {
        bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
            return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
        }
    };

    std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;

    // track tensors that had to be moved for debugging:
    size_t n_tensors_moved = 0;
    std::string first_tensor_moved_name;
    std::string first_tensor_moved_type_name;
    ggml_backend_buffer_type_t first_moved_from_buft = nullptr;
    ggml_backend_buffer_type_t first_moved_to_buft = nullptr;

    llama_model_loader(
        struct gguf_context * metadata,
        llama_model_set_tensor_data_t set_tensor_data,
        void * set_tensor_data_ud,
        const std::string & fname,
        std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
        FILE * file,
        bool use_mmap,
        bool use_direct_io,
        bool check_tensors,
        bool no_alloc,
        const llama_model_kv_override * param_overrides_p,
        const llama_model_tensor_buft_override * param_tensor_buft_overrides_p);

    template<typename T>
    typename std::enable_if<std::is_integral<T>::value, bool>::type
    get_arr_n(const std::string & key, T & result, bool required = true);

    template<typename T>
    typename std::enable_if<std::is_integral<T>::value, bool>::type
    get_arr_n(enum llm_kv kid, T & result, bool required = true);

    template<typename T>
    bool get_arr(const std::string & key, std::vector<T> & result, bool required = true);

    template<typename T, size_t N_MAX>
    bool get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required = true);

    template<typename T>
    bool get_arr(enum llm_kv kid, T & result, bool required = true);

    template<typename T>
    bool get_key(const std::string & key, T & result, bool required = true);

    template<typename T>
    bool get_key(enum llm_kv kid, T & result, bool required = true);

    template<typename T, size_t N_MAX>
    bool get_key_or_arr(const std::string & key, std::array<T, N_MAX> & result, uint32_t n, bool required = true);

    template<typename T>
    bool get_key_or_arr(enum llm_kv kid, T & result, uint32_t n, bool required = true);

    bool get_key_or_arr(enum llm_kv kid, uint32_t & result, bool required = true);

    std::string get_arch_name() const;

    enum llm_arch get_arch() const;

    const llama_tensor_weight * get_weight(const char * name) const;

    const llama_tensor_weight & require_weight(const char * name) const;

    struct ggml_tensor * get_tensor_meta(const char * name) const;

    struct ggml_tensor * require_tensor_meta(const std::string & name) const;

    const struct ggml_tensor * check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const;

    struct ggml_tensor * create_tensor(
        const llama_hparams & hparams, const buft_list_t * buft_list_cpu, const buft_list_t * buft_list_input, const buft_list_t * buft_list_output,
        const buft_list_t * buft_list_layer, const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags);

    struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list<int64_t> & ne, size_t offset, bool required = true);

    void done_getting_tensors(bool partial = false) const;

    void init_mappings(bool prefetch = true, llama_mlocks * mlock_mmaps = nullptr);

    void get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const;

    // for backwards compatibility, does not support ggml-backend
    void load_data_for(struct ggml_tensor * cur) const;

    // Returns false if cancelled by progress_callback
    bool load_all_data(
            struct ggml_context * ctx,
            llama_buf_map & bufs,
            llama_mlocks * lmlocks,
            llama_progress_callback progress_callback,
            void * progress_callback_user_data);

    std::string ftype_name() const;

    void print_info() const;
};

Coverage Report

Created: 2026-06-13 06:23

Line	Count	Source
1		#pragma once
2
3		#include "llama.h"
4
5		#include "llama-impl.h"
6		#include "llama-arch.h"
7		#include "llama-hparams.h"
8		#include "llama-mmap.h"
9
10		#include "ggml-cpp.h"
11
12		#include <cstddef>
13		#include <cstring>
14		#include <map>
15		#include <stdexcept>
16		#include <unordered_map>
17
18		using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;
19
20		// lists of buffer types used for each layer
21		using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;
22
23		enum llama_fver {
24		GGUF_FILE_VERSION_V1 = 1,
25		GGUF_FILE_VERSION_V2 = 2,
26		GGUF_FILE_VERSION_V3 = 3,
27		};
28
29		const char * llama_file_version_name(llama_fver version);
30
31		struct llama_model_loader {
32		// Holds information on a model weight
33		struct llama_tensor_weight {
34		uint16_t idx; // source file index
35		size_t offs; // tensor data offset in the original file
36
37		ggml_tensor * tensor;
38
39	0	llama_tensor_weight(const llama_file * file, uint16_t idx, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
40	0	const int tensor_idx = gguf_find_tensor(gguf_ctx, ggml_get_name(tensor));
41	0	if (tensor_idx < 0) {
42	0	throw std::runtime_error(format("tensor '%s' not found in the model", ggml_get_name(tensor)));
43	0	}
44
45	0	offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
46	0	if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size()) {
47	0	throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", ggml_get_name(tensor)));
48	0	}
49	0	}
50		};
51
52		// custom comparator to sort weights more nicely by layer
53		struct weight_name_comparer {
54	0	bool operator()(const std::string & a, const std::string & b) const {
55	0	int a_layer = -1;
56	0	int b_layer = -1;
57	0	sscanf(a.c_str(), "blk.%d.", &a_layer);
58	0	sscanf(b.c_str(), "blk.%d.", &b_layer);
59	0	if (a_layer != b_layer) {
60	0	return a_layer < b_layer;
61	0	}
62	0	return a < b;
63	0	}
64		};
65
66		static const int TENSOR_NOT_REQUIRED = 1 << 0;
67		static const int TENSOR_DUPLICATED = 1 << 1;
68		static const int TENSOR_SKIP = 1 << 2;
69		static const int TENSOR_SKIP_IF_VIRTUAL = 1 << 3;
70
71		int n_kv = 0;
72		int n_tensors = 0;
73		int n_created = 0;
74
75		uint64_t n_elements = 0;
76		size_t n_bytes = 0;
77
78		bool use_mmap = false;
79		bool use_direct_io = false;
80		bool check_tensors;
81		bool no_alloc;
82
83		llama_files files;
84		llama_ftype ftype;
85		llama_fver fver;
86
87		llama_mmaps mappings;
88
89		std::map<std::string, llama_tensor_weight, weight_name_comparer> weights_map;
90		std::unordered_map<std::string, llama_model_kv_override> kv_overrides;
91		const llama_model_tensor_buft_override * tensor_buft_overrides;
92
93		gguf_context_ptr metadata_ptr;
94		struct gguf_context * metadata; // either metadata_ptr.get() or externally set
95		llama_model_set_tensor_data_t set_tensor_data;
96		void * set_tensor_data_ud;
97		std::vector<ggml_context_ptr> contexts;
98
99		std::string arch_name;
100		LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
101
102		size_t size_done = 0;
103		size_t size_data = 0;
104		std::vector<std::pair<size_t, size_t>> mmaps_used;
105
106		// define a comparator for the buft -> ctx map to ensure that the order is well-defined:
107		struct ggml_backend_buft_comparator {
108	0	bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
109	0	return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
110	0	}
111		};
112
113		std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
114
115		// track tensors that had to be moved for debugging:
116		size_t n_tensors_moved = 0;
117		std::string first_tensor_moved_name;
118		std::string first_tensor_moved_type_name;
119		ggml_backend_buffer_type_t first_moved_from_buft = nullptr;
120		ggml_backend_buffer_type_t first_moved_to_buft = nullptr;
121
122		llama_model_loader(
123		struct gguf_context * metadata,
124		llama_model_set_tensor_data_t set_tensor_data,
125		void * set_tensor_data_ud,
126		const std::string & fname,
127		std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
128		FILE * file,
129		bool use_mmap,
130		bool use_direct_io,
131		bool check_tensors,
132		bool no_alloc,
133		const llama_model_kv_override * param_overrides_p,
134		const llama_model_tensor_buft_override * param_tensor_buft_overrides_p);
135
136		template<typename T>
137		typename std::enable_if<std::is_integral<T>::value, bool>::type
138		get_arr_n(const std::string & key, T & result, bool required = true);
139
140		template<typename T>
141		typename std::enable_if<std::is_integral<T>::value, bool>::type
142		get_arr_n(enum llm_kv kid, T & result, bool required = true);
143
144		template<typename T>
145		bool get_arr(const std::string & key, std::vector<T> & result, bool required = true);
146
147		template<typename T, size_t N_MAX>
148		bool get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required = true);
149
150		template<typename T>
151		bool get_arr(enum llm_kv kid, T & result, bool required = true);
152
153		template<typename T>
154		bool get_key(const std::string & key, T & result, bool required = true);
155
156		template<typename T>
157		bool get_key(enum llm_kv kid, T & result, bool required = true);
158
159		template<typename T, size_t N_MAX>
160		bool get_key_or_arr(const std::string & key, std::array<T, N_MAX> & result, uint32_t n, bool required = true);
161
162		template<typename T>
163		bool get_key_or_arr(enum llm_kv kid, T & result, uint32_t n, bool required = true);
164
165		bool get_key_or_arr(enum llm_kv kid, uint32_t & result, bool required = true);
166
167		std::string get_arch_name() const;
168
169		enum llm_arch get_arch() const;
170
171		const llama_tensor_weight * get_weight(const char * name) const;
172
173		const llama_tensor_weight & require_weight(const char * name) const;
174
175		struct ggml_tensor * get_tensor_meta(const char * name) const;
176
177		struct ggml_tensor * require_tensor_meta(const std::string & name) const;
178
179		const struct ggml_tensor * check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const;
180
181		struct ggml_tensor * create_tensor(
182		const llama_hparams & hparams, const buft_list_t * buft_list_cpu, const buft_list_t * buft_list_input, const buft_list_t * buft_list_output,
183		const buft_list_t * buft_list_layer, const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags);
184
185		struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list<int64_t> & ne, size_t offset, bool required = true);
186
187		void done_getting_tensors(bool partial = false) const;
188
189		void init_mappings(bool prefetch = true, llama_mlocks * mlock_mmaps = nullptr);
190
191		void get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const;
192
193		// for backwards compatibility, does not support ggml-backend
194		void load_data_for(struct ggml_tensor * cur) const;
195
196		// Returns false if cancelled by progress_callback
197		bool load_all_data(
198		struct ggml_context * ctx,
199		llama_buf_map & bufs,
200		llama_mlocks * lmlocks,
201		llama_progress_callback progress_callback,
202		void * progress_callback_user_data);
203
204		std::string ftype_name() const;
205
206		void print_info() const;
207		};