/src/llama.cpp/src/llama-model-loader.cpp
Line | Count | Source |
1 | | #include "llama-model-loader.h" |
2 | | |
3 | | #include "ggml-alloc.h" |
4 | | #include "ggml.h" |
5 | | #include "gguf.h" |
6 | | #include "llama-hparams.h" |
7 | | |
8 | | #include <algorithm> |
9 | | #include <array> |
10 | | #include <cinttypes> |
11 | | #include <cstdint> |
12 | | #include <cstring> |
13 | | #include <future> |
14 | | #include <regex> |
15 | | |
// Binary size units; each one is 1024 times the previous one.
static const size_t kiB = 1024;
static const size_t MiB = kiB*1024;
static const size_t GiB = MiB*1024;
19 | | |
20 | 0 | const char * llama_file_version_name(llama_fver version) { |
21 | 0 | switch (version) { |
22 | 0 | case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)"; |
23 | 0 | case GGUF_FILE_VERSION_V2: return "GGUF V2"; |
24 | 0 | case GGUF_FILE_VERSION_V3: return "GGUF V3 (latest)"; |
25 | 0 | } |
26 | | |
27 | 0 | return "unknown"; |
28 | 0 | } |
29 | | |
30 | 0 | static std::string llama_model_ftype_name(llama_ftype ftype) { |
31 | 0 | if (ftype & LLAMA_FTYPE_GUESSED) { |
32 | 0 | return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)"; |
33 | 0 | } |
34 | | |
35 | 0 | switch (ftype) { |
36 | 0 | case LLAMA_FTYPE_ALL_F32: return "all F32"; |
37 | 0 | case LLAMA_FTYPE_MOSTLY_F16: return "F16"; |
38 | 0 | case LLAMA_FTYPE_MOSTLY_BF16: return "BF16"; |
39 | 0 | case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0"; |
40 | 0 | case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1"; |
41 | 0 | case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0"; |
42 | 0 | case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1"; |
43 | 0 | case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0"; |
44 | 0 | case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: return "MXFP4 MoE"; |
45 | 0 | case LLAMA_FTYPE_MOSTLY_NVFP4: return "NVFP4"; |
46 | 0 | case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium"; |
47 | 0 | case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small"; |
48 | 0 | case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small"; |
49 | 0 | case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium"; |
50 | 0 | case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large"; |
51 | 0 | case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small"; |
52 | 0 | case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium"; |
53 | 0 | case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small"; |
54 | 0 | case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium"; |
55 | 0 | case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K"; |
56 | 0 | case LLAMA_FTYPE_MOSTLY_TQ1_0: return "TQ1_0 - 1.69 bpw ternary"; |
57 | 0 | case LLAMA_FTYPE_MOSTLY_TQ2_0: return "TQ2_0 - 2.06 bpw ternary"; |
58 | 0 | case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw"; |
59 | 0 | case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw"; |
60 | 0 | case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw"; |
61 | 0 | case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw"; |
62 | 0 | case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw"; |
63 | 0 | case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw"; |
64 | 0 | case LLAMA_FTYPE_MOSTLY_IQ1_S: return "IQ1_S - 1.5625 bpw"; |
65 | 0 | case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw"; |
66 | 0 | case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw"; |
67 | 0 | case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw"; |
68 | 0 | case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw"; |
69 | 0 | case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw"; |
70 | | |
71 | 0 | default: return "unknown, may not work"; |
72 | 0 | } |
73 | 0 | } |
74 | | |
75 | | // return a list of splits for a given path |
76 | | // for example, given "<name>-00002-of-00004.gguf", returns list of all 4 splits |
77 | 0 | static std::vector<std::string> llama_get_list_splits(const std::string & path, const int idx, const int n_split) { |
78 | 0 | std::vector<std::string> paths; |
79 | 0 | std::string split_prefix; |
80 | 0 | std::vector<char> buf(llama_path_max(), 0); |
81 | |
|
82 | 0 | { |
83 | 0 | int ret = llama_split_prefix(buf.data(), buf.size(), path.c_str(), idx, n_split); |
84 | 0 | if (!ret) { |
85 | 0 | throw std::runtime_error(format("invalid split file name: %s", path.c_str())); |
86 | 0 | } |
87 | 0 | split_prefix = std::string(buf.data(), ret); |
88 | 0 | } |
89 | | |
90 | 0 | if (split_prefix.empty()) { |
91 | 0 | throw std::runtime_error(format("invalid split file: %s", path.c_str())); |
92 | 0 | } |
93 | | |
94 | 0 | for (int idx = 0; idx < n_split; ++idx) { |
95 | 0 | int ret = llama_split_path(buf.data(), buf.size(), split_prefix.c_str(), idx, n_split); |
96 | 0 | paths.push_back(std::string(buf.data(), ret)); |
97 | 0 | } |
98 | |
|
99 | 0 | return paths; |
100 | 0 | } |
101 | | |
102 | | namespace GGUFMeta { |
103 | | template <typename T, gguf_type gt_, T (*gfun)(const gguf_context *, const int64_t)> |
104 | | struct GKV_Base_Type { |
105 | | static constexpr gguf_type gt = gt_; |
106 | | |
107 | 0 | static T getter(const gguf_context * ctx, const int kid) { |
108 | 0 | return gfun(ctx, kid); |
109 | 0 | } Unexecuted instantiation: GGUFMeta::GKV_Base_Type<bool, (gguf_type)7, &gguf_get_val_bool>::getter(gguf_context const*, int) Unexecuted instantiation: GGUFMeta::GKV_Base_Type<float, (gguf_type)6, &gguf_get_val_f32>::getter(gguf_context const*, int) Unexecuted instantiation: GGUFMeta::GKV_Base_Type<unsigned int, (gguf_type)4, &gguf_get_val_u32>::getter(gguf_context const*, int) Unexecuted instantiation: GGUFMeta::GKV_Base_Type<unsigned short, (gguf_type)2, &gguf_get_val_u16>::getter(gguf_context const*, int) Unexecuted instantiation: GGUFMeta::GKV_Base_Type<int, (gguf_type)5, &gguf_get_val_i32>::getter(gguf_context const*, int) |
110 | | }; |
111 | | |
112 | | template<typename T> struct GKV_Base; |
113 | | |
114 | | template<> struct GKV_Base<bool >: GKV_Base_Type<bool, GGUF_TYPE_BOOL, gguf_get_val_bool> {}; |
115 | | template<> struct GKV_Base<uint8_t >: GKV_Base_Type<uint8_t, GGUF_TYPE_UINT8, gguf_get_val_u8 > {}; |
116 | | template<> struct GKV_Base<uint16_t >: GKV_Base_Type<uint16_t, GGUF_TYPE_UINT16, gguf_get_val_u16 > {}; |
117 | | template<> struct GKV_Base<uint32_t >: GKV_Base_Type<uint32_t, GGUF_TYPE_UINT32, gguf_get_val_u32 > {}; |
118 | | template<> struct GKV_Base<uint64_t >: GKV_Base_Type<uint64_t, GGUF_TYPE_UINT64, gguf_get_val_u64 > {}; |
119 | | template<> struct GKV_Base<int8_t >: GKV_Base_Type<int8_t, GGUF_TYPE_INT8, gguf_get_val_i8 > {}; |
120 | | template<> struct GKV_Base<int16_t >: GKV_Base_Type<int16_t, GGUF_TYPE_INT16, gguf_get_val_i16 > {}; |
121 | | template<> struct GKV_Base<int32_t >: GKV_Base_Type<int32_t, GGUF_TYPE_INT32, gguf_get_val_i32 > {}; |
122 | | template<> struct GKV_Base<int64_t >: GKV_Base_Type<int64_t, GGUF_TYPE_INT64, gguf_get_val_i64 > {}; |
123 | | template<> struct GKV_Base<float >: GKV_Base_Type<float, GGUF_TYPE_FLOAT32, gguf_get_val_f32 > {}; |
124 | | template<> struct GKV_Base<double >: GKV_Base_Type<double, GGUF_TYPE_FLOAT64, gguf_get_val_f64 > {}; |
125 | | template<> struct GKV_Base<const char *>: GKV_Base_Type<const char *, GGUF_TYPE_STRING, gguf_get_val_str > {}; |
126 | | |
127 | | template<> struct GKV_Base<std::string> { |
128 | | static constexpr gguf_type gt = GGUF_TYPE_STRING; |
129 | | |
130 | 0 | static std::string getter(const gguf_context * ctx, const int kid) { |
131 | 0 | return gguf_get_val_str(ctx, kid); |
132 | 0 | } |
133 | | }; |
134 | | |
135 | | struct ArrayInfo { |
136 | | const gguf_type gt; |
137 | | const size_t length; |
138 | | const void * data; |
139 | | }; |
140 | | |
141 | | template<> struct GKV_Base<ArrayInfo> { |
142 | | public: |
143 | | static constexpr gguf_type gt = GGUF_TYPE_ARRAY; |
144 | 0 | static ArrayInfo getter(const gguf_context *ctx, const int k) { |
145 | 0 | const enum gguf_type arr_type = gguf_get_arr_type(ctx, k); |
146 | 0 | return ArrayInfo { |
147 | 0 | arr_type, |
148 | 0 | size_t(gguf_get_arr_n(ctx, k)), |
149 | 0 | arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx, k), |
150 | 0 | }; |
151 | 0 | } |
152 | | }; |
153 | | |
154 | | template<typename T> |
155 | | class GKV : public GKV_Base<T> { |
156 | | GKV() = delete; |
157 | | |
158 | | public: |
159 | 0 | static T get_kv(const gguf_context * ctx, const int k) { |
160 | 0 | const enum gguf_type kt = gguf_get_kv_type(ctx, k); |
161 | |
|
162 | 0 | if (kt != GKV::gt) { |
163 | 0 | throw std::runtime_error(format("key %s has wrong type %s but expected type %s", |
164 | 0 | gguf_get_key(ctx, k), gguf_type_name(kt), gguf_type_name(GKV::gt))); |
165 | 0 | } |
166 | 0 | return GKV::getter(ctx, k); |
167 | 0 | } Unexecuted instantiation: GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(gguf_context const*, int) Unexecuted instantiation: GGUFMeta::GKV<bool>::get_kv(gguf_context const*, int) Unexecuted instantiation: GGUFMeta::GKV<float>::get_kv(gguf_context const*, int) Unexecuted instantiation: GGUFMeta::GKV<unsigned int>::get_kv(gguf_context const*, int) Unexecuted instantiation: GGUFMeta::GKV<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >::get_kv(gguf_context const*, int) Unexecuted instantiation: GGUFMeta::GKV<unsigned short>::get_kv(gguf_context const*, int) Unexecuted instantiation: GGUFMeta::GKV<int>::get_kv(gguf_context const*, int) |
168 | | |
169 | 0 | static const char * override_type_to_str(const llama_model_kv_override_type ty) { |
170 | 0 | switch (ty) { |
171 | 0 | case LLAMA_KV_OVERRIDE_TYPE_BOOL: return "bool"; |
172 | 0 | case LLAMA_KV_OVERRIDE_TYPE_INT: return "int"; |
173 | 0 | case LLAMA_KV_OVERRIDE_TYPE_FLOAT: return "float"; |
174 | 0 | case LLAMA_KV_OVERRIDE_TYPE_STR: return "str"; |
175 | 0 | } |
176 | 0 | return "unknown"; |
177 | 0 | } Unexecuted instantiation: GGUFMeta::GKV<bool>::override_type_to_str(llama_model_kv_override_type) Unexecuted instantiation: GGUFMeta::GKV<float>::override_type_to_str(llama_model_kv_override_type) Unexecuted instantiation: GGUFMeta::GKV<unsigned int>::override_type_to_str(llama_model_kv_override_type) Unexecuted instantiation: GGUFMeta::GKV<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >::override_type_to_str(llama_model_kv_override_type) Unexecuted instantiation: GGUFMeta::GKV<unsigned short>::override_type_to_str(llama_model_kv_override_type) Unexecuted instantiation: GGUFMeta::GKV<int>::override_type_to_str(llama_model_kv_override_type) |
178 | | |
179 | 0 | static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override * ovrd) { |
180 | 0 | if (!ovrd) { return false; } |
181 | 0 | if (ovrd->tag == expected_type) { |
182 | 0 | LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ", |
183 | 0 | __func__, override_type_to_str(ovrd->tag), ovrd->key); |
184 | 0 | switch (ovrd->tag) { |
185 | 0 | case LLAMA_KV_OVERRIDE_TYPE_BOOL: { |
186 | 0 | LLAMA_LOG_INFO("%s\n", ovrd->val_bool ? "true" : "false"); |
187 | 0 | } break; |
188 | 0 | case LLAMA_KV_OVERRIDE_TYPE_INT: { |
189 | 0 | LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->val_i64); |
190 | 0 | } break; |
191 | 0 | case LLAMA_KV_OVERRIDE_TYPE_FLOAT: { |
192 | 0 | LLAMA_LOG_INFO("%.6f\n", ovrd->val_f64); |
193 | 0 | } break; |
194 | 0 | case LLAMA_KV_OVERRIDE_TYPE_STR: { |
195 | 0 | LLAMA_LOG_INFO("%s\n", ovrd->val_str); |
196 | 0 | } break; |
197 | 0 | default: |
198 | | // Shouldn't be possible to end up here, but just in case... |
199 | 0 | throw std::runtime_error( |
200 | 0 | format("Unsupported attempt to override %s type for metadata key %s\n", |
201 | 0 | override_type_to_str(ovrd->tag), ovrd->key)); |
202 | 0 | } |
203 | 0 | return true; |
204 | 0 | } |
205 | 0 | LLAMA_LOG_WARN("%s: Warning: Bad metadata override type for key '%s', expected %s but got %s\n", |
206 | 0 | __func__, ovrd->key, override_type_to_str(expected_type), override_type_to_str(ovrd->tag)); |
207 | 0 | return false; |
208 | 0 | } Unexecuted instantiation: GGUFMeta::GKV<bool>::validate_override(llama_model_kv_override_type, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<float>::validate_override(llama_model_kv_override_type, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<unsigned int>::validate_override(llama_model_kv_override_type, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >::validate_override(llama_model_kv_override_type, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<unsigned short>::validate_override(llama_model_kv_override_type, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<int>::validate_override(llama_model_kv_override_type, llama_model_kv_override const*) |
209 | | |
210 | | template<typename OT> |
211 | | static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type |
212 | 0 | try_override(OT & target, const struct llama_model_kv_override * ovrd) { |
213 | 0 | if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, ovrd)) { |
214 | 0 | target = ovrd->val_bool; |
215 | 0 | return true; |
216 | 0 | } |
217 | 0 | return false; |
218 | 0 | } |
219 | | |
220 | | template<typename OT> |
221 | | static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type |
222 | 0 | try_override(OT & target, const struct llama_model_kv_override * ovrd) { |
223 | 0 | if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, ovrd)) { |
224 | 0 | target = ovrd->val_i64; |
225 | 0 | return true; |
226 | 0 | } |
227 | 0 | return false; |
228 | 0 | } Unexecuted instantiation: _ZN8GGUFMeta3GKVIjE12try_overrideIjEENSt3__19enable_ifIXaantsr3std7is_sameIT_bEE5valuesr3std11is_integralIS5_EE5valueEbE4typeERS5_PK23llama_model_kv_override Unexecuted instantiation: _ZN8GGUFMeta3GKVItE12try_overrideItEENSt3__19enable_ifIXaantsr3std7is_sameIT_bEE5valuesr3std11is_integralIS5_EE5valueEbE4typeERS5_PK23llama_model_kv_override Unexecuted instantiation: _ZN8GGUFMeta3GKVIiE12try_overrideIiEENSt3__19enable_ifIXaantsr3std7is_sameIT_bEE5valuesr3std11is_integralIS5_EE5valueEbE4typeERS5_PK23llama_model_kv_override |
229 | | |
230 | | template<typename OT> |
231 | | static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type |
232 | 0 | try_override(T & target, const struct llama_model_kv_override * ovrd) { |
233 | 0 | if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, ovrd)) { |
234 | 0 | target = ovrd->val_f64; |
235 | 0 | return true; |
236 | 0 | } |
237 | 0 | return false; |
238 | 0 | } |
239 | | |
240 | | template<typename OT> |
241 | | static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type |
242 | 0 | try_override(T & target, const struct llama_model_kv_override * ovrd) { |
243 | 0 | if (validate_override(LLAMA_KV_OVERRIDE_TYPE_STR, ovrd)) { |
244 | 0 | target = ovrd->val_str; |
245 | 0 | return true; |
246 | 0 | } |
247 | 0 | return false; |
248 | 0 | } |
249 | | |
250 | 0 | static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) { |
251 | 0 | if (try_override<T>(target, ovrd)) { |
252 | 0 | return true; |
253 | 0 | } |
254 | 0 | if (k < 0) { return false; } |
255 | 0 | target = get_kv(ctx, k); |
256 | 0 | return true; |
257 | 0 | } Unexecuted instantiation: GGUFMeta::GKV<bool>::set(gguf_context const*, int, bool&, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<float>::set(gguf_context const*, int, float&, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<unsigned int>::set(gguf_context const*, int, unsigned int&, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >::set(gguf_context const*, int, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >&, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<unsigned short>::set(gguf_context const*, int, unsigned short&, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<int>::set(gguf_context const*, int, int&, llama_model_kv_override const*) |
258 | | |
259 | 0 | static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override * ovrd = nullptr) { |
260 | 0 | return set(ctx, gguf_find_key(ctx, key), target, ovrd); |
261 | 0 | } Unexecuted instantiation: GGUFMeta::GKV<bool>::set(gguf_context const*, char const*, bool&, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<float>::set(gguf_context const*, char const*, float&, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<unsigned int>::set(gguf_context const*, char const*, unsigned int&, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >::set(gguf_context const*, char const*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >&, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<unsigned short>::set(gguf_context const*, char const*, unsigned short&, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<int>::set(gguf_context const*, char const*, int&, llama_model_kv_override const*) |
262 | | |
263 | 0 | static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override * ovrd = nullptr) { |
264 | 0 | return set(ctx, key.c_str(), target, ovrd); |
265 | 0 | } Unexecuted instantiation: GGUFMeta::GKV<bool>::set(gguf_context const*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, bool&, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<float>::set(gguf_context const*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, float&, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<unsigned int>::set(gguf_context const*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, unsigned int&, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >::set(gguf_context const*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >&, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<unsigned short>::set(gguf_context const*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, unsigned short&, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<int>::set(gguf_context const*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, int&, llama_model_kv_override const*) |
266 | | }; |
267 | | } |
268 | | |
269 | | template<typename T> |
270 | | typename std::enable_if<std::is_integral<T>::value, bool>::type |
271 | 0 | llama_model_loader::get_arr_n(const std::string & key, T & result, bool required) { |
272 | 0 | const int kid = gguf_find_key(metadata, key.c_str()); |
273 | |
|
274 | 0 | if (kid < 0) { |
275 | 0 | if (required) { |
276 | 0 | throw std::runtime_error(format("key not found in model: %s", key.c_str())); |
277 | 0 | } |
278 | 0 | return false; |
279 | 0 | } |
280 | | |
281 | 0 | struct GGUFMeta::ArrayInfo arr_info = |
282 | 0 | GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(metadata, kid); |
283 | | |
284 | |
|
285 | 0 | result = arr_info.length; |
286 | 0 | return true; |
287 | 0 | } |
288 | | |
289 | | template<typename T> |
290 | | typename std::enable_if<std::is_integral<T>::value, bool>::type |
291 | 0 | llama_model_loader::get_arr_n(enum llm_kv kid, T & result, bool required) { |
292 | 0 | return get_arr_n(llm_kv(kid), result, required); |
293 | 0 | } |
294 | | |
295 | | template bool llama_model_loader::get_arr_n(enum llm_kv kid, uint32_t & result, bool required); |
296 | | |
297 | | template<typename T> |
298 | 0 | bool llama_model_loader::get_arr(const std::string & key, std::vector<T> & result, bool required) { |
299 | 0 | const gguf_context * ctx = metadata; |
300 | 0 | const int kid = gguf_find_key(ctx, key.c_str()); |
301 | |
|
302 | 0 | if (kid < 0 || gguf_get_kv_type(ctx, kid) != GGUF_TYPE_ARRAY) { |
303 | 0 | if (required) { |
304 | 0 | throw std::runtime_error(format("array key not found in model: %s", key.c_str())); |
305 | 0 | } |
306 | 0 | return false; |
307 | 0 | } |
308 | | |
309 | 0 | struct GGUFMeta::ArrayInfo arr_info = |
310 | 0 | GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx, kid); |
311 | |
|
312 | 0 | switch (arr_info.gt) { |
313 | 0 | case GGUF_TYPE_UINT32: |
314 | 0 | case GGUF_TYPE_INT32: GGML_ASSERT((std::is_same<T, int32_t>::value) || |
315 | 0 | (std::is_same<T, uint32_t>::value)); break; |
316 | 0 | case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break; |
317 | 0 | case GGUF_TYPE_STRING: GGML_ASSERT((std::is_same<T, std::string>::value)); break; |
318 | 0 | default: |
319 | 0 | throw std::runtime_error(format("%s is not a string/float32/uint32/int32 array", key.c_str())); |
320 | 0 | } |
321 | | |
322 | 0 | if constexpr (std::is_same<T, std::string>::value) { |
323 | 0 | const size_t n_items = gguf_get_arr_n(ctx, kid); |
324 | 0 | result.clear(); |
325 | |
|
326 | 0 | for (size_t i = 0; i < n_items; i++) { |
327 | 0 | const T value = gguf_get_arr_str(ctx, kid, i); |
328 | 0 | result.emplace_back(value); |
329 | 0 | } |
330 | | } else { |
331 | | result.resize(arr_info.length); |
332 | | result.assign((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length); |
333 | | } |
334 | |
|
335 | 0 | return true; |
336 | 0 | } |
337 | | |
338 | | template<typename T, size_t N_MAX> |
339 | 0 | bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) { |
340 | 0 | const gguf_context * ctx = metadata; |
341 | 0 | const int kid = gguf_find_key(ctx, key.c_str()); |
342 | |
|
343 | 0 | if (kid < 0 || gguf_get_kv_type(ctx, kid) != GGUF_TYPE_ARRAY) { |
344 | 0 | if (required) { |
345 | 0 | throw std::runtime_error(format("array key not found in model: %s", key.c_str())); |
346 | 0 | } |
347 | 0 | return false; |
348 | 0 | } |
349 | | |
350 | 0 | struct GGUFMeta::ArrayInfo arr_info = |
351 | 0 | GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx, kid); |
352 | |
|
353 | 0 | switch (arr_info.gt) { |
354 | 0 | case GGUF_TYPE_BOOL: |
355 | 0 | case GGUF_TYPE_UINT32: |
356 | 0 | case GGUF_TYPE_INT32: GGML_ASSERT((std::is_same<T, int32_t>::value) || |
357 | 0 | (std::is_same<T, uint32_t>::value)); break; |
358 | 0 | case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break; |
359 | 0 | case GGUF_TYPE_STRING: GGML_ASSERT((std::is_same<T, std::string>::value)); break; |
360 | 0 | default: |
361 | 0 | throw std::runtime_error(format("%s is not a string/float32/uint32/int32 array", key.c_str())); |
362 | 0 | } |
363 | | |
364 | 0 | if (arr_info.length > N_MAX) { |
365 | 0 | throw std::runtime_error(format("array length %u for key %s exceeds max %u", (uint32_t) arr_info.length, key.c_str(), (uint32_t) N_MAX)); |
366 | 0 | } |
367 | | |
368 | | if constexpr (std::is_same<T, std::string>::value) { |
369 | | const size_t n_items = gguf_get_arr_n(ctx, kid); |
370 | | |
371 | | for (size_t i = 0; i < n_items; i++) { |
372 | | const T value = gguf_get_arr_str(ctx, kid, i); |
373 | | result[i] = value; |
374 | | } |
375 | 0 | } else { |
376 | 0 | if (arr_info.gt == GGUF_TYPE_BOOL) { |
377 | 0 | std::transform((const bool *)arr_info.data, (const bool *)arr_info.data + arr_info.length, result.begin(), [](bool x) { |
378 | 0 | return static_cast<T>(x); |
379 | 0 | }); Unexecuted instantiation: llama_model_loader::get_arr<int, 4ul>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::array<int, 4ul>&, bool)::{lambda(bool)#1}::operator()(bool) constUnexecuted instantiation: llama_model_loader::get_arr<unsigned int, 512ul>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::array<unsigned int, 512ul>&, bool)::{lambda(bool)#1}::operator()(bool) constUnexecuted instantiation: llama_model_loader::get_arr<float, 512ul>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::array<float, 512ul>&, bool)::{lambda(bool)#1}::operator()(bool) const |
380 | 0 | } else { |
381 | 0 | std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin()); |
382 | 0 | } |
383 | 0 | } |
384 | |
|
385 | 0 | return true; |
386 | 0 | } Unexecuted instantiation: bool llama_model_loader::get_arr<int, 4ul>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::array<int, 4ul>&, bool) Unexecuted instantiation: bool llama_model_loader::get_arr<unsigned int, 512ul>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::array<unsigned int, 512ul>&, bool) Unexecuted instantiation: bool llama_model_loader::get_arr<float, 512ul>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::array<float, 512ul>&, bool) |
387 | | |
388 | | template<typename T> |
389 | 0 | bool llama_model_loader::get_arr(enum llm_kv kid, T & result, bool required) { |
390 | 0 | return get_arr(llm_kv(kid), result, required); |
391 | 0 | } |
392 | | |
393 | | template bool llama_model_loader::get_arr<std::vector<std::string>>(enum llm_kv kid, std::vector<std::string> & result, bool required); |
394 | | |
395 | | template<typename T> |
396 | 0 | bool llama_model_loader::get_key(const std::string & key, T & result, bool required) { |
397 | 0 | auto it = kv_overrides.find(key); |
398 | |
|
399 | 0 | const struct llama_model_kv_override * override = |
400 | 0 | it != kv_overrides.end() ? &it->second : nullptr; |
401 | |
|
402 | 0 | const bool found = GGUFMeta::GKV<T>::set(metadata, key, result, override); |
403 | |
|
404 | 0 | if (required && !found) { |
405 | 0 | throw std::runtime_error(format("key not found in model: %s", key.c_str())); |
406 | 0 | } |
407 | | |
408 | 0 | return found; |
409 | 0 | } Unexecuted instantiation: bool llama_model_loader::get_key<bool>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, bool&, bool) Unexecuted instantiation: bool llama_model_loader::get_key<float>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, float&, bool) Unexecuted instantiation: bool llama_model_loader::get_key<unsigned int>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, unsigned int&, bool) Unexecuted instantiation: bool llama_model_loader::get_key<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >&, bool) Unexecuted instantiation: bool llama_model_loader::get_key<unsigned short>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, unsigned short&, bool) Unexecuted instantiation: bool llama_model_loader::get_key<int>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, int&, bool) |
410 | | |
411 | | template<typename T> |
412 | 0 | bool llama_model_loader::get_key(enum llm_kv kid, T & result, bool required) { |
413 | 0 | return get_key(llm_kv(kid), result, required); |
414 | 0 | } Unexecuted instantiation: bool llama_model_loader::get_key<bool>(llm_kv, bool&, bool) Unexecuted instantiation: bool llama_model_loader::get_key<float>(llm_kv, float&, bool) Unexecuted instantiation: bool llama_model_loader::get_key<unsigned int>(llm_kv, unsigned int&, bool) Unexecuted instantiation: bool llama_model_loader::get_key<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >(llm_kv, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >&, bool) |
415 | | |
416 | | template bool llama_model_loader::get_key<bool> (enum llm_kv kid, bool & result, bool required); |
417 | | template bool llama_model_loader::get_key<float> (enum llm_kv kid, float & result, bool required); |
418 | | template bool llama_model_loader::get_key<uint32_t> (enum llm_kv kid, uint32_t & result, bool required); |
419 | | template bool llama_model_loader::get_key<std::string>(enum llm_kv kid, std::string & result, bool required); |
420 | | |
421 | | template<> |
422 | 0 | bool llama_model_loader::get_key(enum llm_kv kid, enum llama_pooling_type & result, bool required) { |
423 | 0 | uint32_t tmp; |
424 | 0 | const bool found = get_key(kid, tmp, required); |
425 | 0 | if (found) { |
426 | 0 | result = (enum llama_pooling_type) tmp; |
427 | 0 | } else { |
428 | 0 | result = LLAMA_POOLING_TYPE_UNSPECIFIED; |
429 | 0 | } |
430 | 0 | return found; |
431 | 0 | } |
432 | | |
433 | | // get array of n <= N_MAX elements, or a single element repeated n times |
434 | | template<typename T, size_t N_MAX> |
435 | 0 | bool llama_model_loader::get_key_or_arr(const std::string & key, std::array<T, N_MAX> & result, uint32_t n, bool required) { |
436 | 0 | const int kid = gguf_find_key(metadata, key.c_str()); |
437 | |
|
438 | 0 | if (kid < 0) { |
439 | 0 | if (required) { |
440 | 0 | throw std::runtime_error(format("key not found in model: %s", key.c_str())); |
441 | 0 | } |
442 | 0 | return false; |
443 | 0 | } |
444 | | |
445 | 0 | if (n > N_MAX) { |
446 | 0 | throw std::runtime_error(format("n > N_MAX: %u > %u for key %s", (uint32_t) n, (uint32_t) N_MAX, key.c_str())); |
447 | 0 | } |
448 | | |
449 | 0 | if (gguf_get_kv_type(metadata, kid) == GGUF_TYPE_ARRAY) { |
450 | 0 | struct GGUFMeta::ArrayInfo arr_info = |
451 | 0 | GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(metadata, kid); |
452 | |
|
453 | 0 | if (n != arr_info.length) { |
454 | 0 | throw std::runtime_error(format("key %s has wrong array length; expected %u, got %u", key.c_str(), n, (uint32_t) arr_info.length)); |
455 | 0 | } |
456 | | |
457 | 0 | return get_arr(key, result, required); |
458 | 0 | } |
459 | | |
460 | 0 | T value; |
461 | |
|
462 | 0 | bool ok = get_key(key, value, required); |
463 | 0 | if (!ok) { |
464 | 0 | return false; |
465 | 0 | } |
466 | | |
467 | 0 | for (uint32_t i = 0; i < n; i++) { |
468 | 0 | result[i] = value; |
469 | 0 | } |
470 | |
|
471 | 0 | return true; |
472 | 0 | } Unexecuted instantiation: bool llama_model_loader::get_key_or_arr<int, 4ul>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::array<int, 4ul>&, unsigned int, bool) Unexecuted instantiation: bool llama_model_loader::get_key_or_arr<unsigned int, 512ul>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::array<unsigned int, 512ul>&, unsigned int, bool) Unexecuted instantiation: bool llama_model_loader::get_key_or_arr<float, 512ul>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::array<float, 512ul>&, unsigned int, bool) |
473 | | |
474 | | template<typename T> |
475 | 0 | bool llama_model_loader::get_key_or_arr(enum llm_kv kid, T & result, uint32_t n, bool required) { |
476 | 0 | return get_key_or_arr(llm_kv(kid), result, n, required); |
477 | 0 | } Unexecuted instantiation: bool llama_model_loader::get_key_or_arr<std::__1::array<int, 4ul> >(llm_kv, std::__1::array<int, 4ul>&, unsigned int, bool) Unexecuted instantiation: bool llama_model_loader::get_key_or_arr<std::__1::array<unsigned int, 512ul> >(llm_kv, std::__1::array<unsigned int, 512ul>&, unsigned int, bool) Unexecuted instantiation: bool llama_model_loader::get_key_or_arr<std::__1::array<float, 512ul> >(llm_kv, std::__1::array<float, 512ul>&, unsigned int, bool) |
478 | | |
479 | 0 | bool llama_model_loader::get_key_or_arr(enum llm_kv kid, uint32_t & result, bool required) { |
480 | 0 | const std::string key = llm_kv(kid); |
481 | |
|
482 | 0 | const int id = gguf_find_key(metadata, key.c_str()); |
483 | |
|
484 | 0 | if (id < 0) { |
485 | 0 | if (required) { |
486 | 0 | throw std::runtime_error(format("key not found in model: %s", key.c_str())); |
487 | 0 | } |
488 | 0 | return false; |
489 | 0 | } |
490 | | |
491 | | // throw and error if type is an array |
492 | 0 | if (gguf_get_kv_type(metadata, id) == GGUF_TYPE_ARRAY) { |
493 | 0 | if (required) { |
494 | 0 | throw std::runtime_error(format("expected scalar, found array for key: %s", key.c_str())); |
495 | 0 | } |
496 | 0 | return false; |
497 | 0 | } |
498 | | |
499 | 0 | return get_key(key, result, required); |
500 | 0 | } |
501 | | |
502 | | // explicit instantiations of get_key_or_arr(enum llm_kv, T &, uint32_t, bool) for the std::array types listed below. TODO: this is not very clever - figure out something better |
503 | | template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required); |
504 | | template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required); |
505 | | template bool llama_model_loader::get_key_or_arr<std::array<float, 512>>(enum llm_kv kid, std::array<float, 512> & result, uint32_t n, bool required); |
506 | | |
507 | | |
508 | | llama_model_loader::llama_model_loader( |
509 | | struct gguf_context * meta, |
510 | | llama_model_set_tensor_data_t set_tensor_data, |
511 | | void * set_tensor_data_ud, |
512 | | const std::string & fname, |
513 | | std::vector<std::string> & splits, |
514 | | bool use_mmap, |
515 | | bool use_direct_io, |
516 | | bool check_tensors, |
517 | | bool no_alloc, |
518 | | const llama_model_kv_override * param_overrides_p, |
519 | | const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) |
520 | 0 | : metadata(meta), set_tensor_data(set_tensor_data), set_tensor_data_ud(set_tensor_data_ud) { |
521 | 0 | int trace = 0; |
522 | 0 | if (getenv("LLAMA_TRACE")) { |
523 | 0 | trace = atoi(getenv("LLAMA_TRACE")); |
524 | 0 | } |
525 | |
|
526 | 0 | if (param_overrides_p != nullptr) { |
527 | 0 | for (const struct llama_model_kv_override * p = param_overrides_p; p->key[0] != 0; p++) { |
528 | 0 | kv_overrides.insert({std::string(p->key), *p}); |
529 | 0 | } |
530 | 0 | } |
531 | |
|
532 | 0 | tensor_buft_overrides = param_tensor_buft_overrides_p; |
533 | |
|
534 | 0 | if (!fname.empty()) { |
535 | | // Load the main GGUF |
536 | 0 | struct ggml_context * ctx = NULL; |
537 | 0 | struct gguf_init_params params = { |
538 | 0 | /*.no_alloc = */ true, |
539 | 0 | /*.ctx = */ &ctx, |
540 | 0 | }; |
541 | |
|
542 | 0 | metadata_ptr.reset(gguf_init_from_file(fname.c_str(), params)); |
543 | 0 | metadata = metadata_ptr.get(); |
544 | 0 | if (metadata == nullptr) { |
545 | 0 | throw std::runtime_error(format("%s: failed to load model from %s", __func__, fname.c_str())); |
546 | 0 | } |
547 | | |
548 | 0 | get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false); |
549 | 0 | llm_kv = LLM_KV(llm_arch_from_string(arch_name)); |
550 | |
|
551 | 0 | files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io)); |
552 | 0 | contexts.emplace_back(ctx); |
553 | |
|
554 | 0 | if (use_mmap && use_direct_io) { |
555 | 0 | if (files.back()->has_direct_io()) { |
556 | 0 | LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__); |
557 | 0 | use_mmap = false; |
558 | 0 | } else { |
559 | 0 | LLAMA_LOG_WARN("%s: direct I/O is not available, using mmap\n", __func__); |
560 | 0 | use_direct_io = false; |
561 | | |
562 | | // reopen file using std::fopen for mmap |
563 | 0 | files.pop_back(); |
564 | 0 | files.emplace_back(new llama_file(fname.c_str(), "rb", false)); |
565 | 0 | } |
566 | 0 | } |
567 | | |
568 | | // Save tensors data offset of the main file. |
569 | | // For subsidiary files, `meta` tensor data offset must not be used, |
570 | | // so we build a unified tensors index for weights. |
571 | 0 | for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { |
572 | 0 | std::string tensor_name = std::string(cur->name); |
573 | | // make sure there is no duplicated tensor names |
574 | 0 | if (weights_map.find(tensor_name) != weights_map.end()) { |
575 | 0 | throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur))); |
576 | 0 | } |
577 | 0 | n_elements += ggml_nelements(cur); |
578 | 0 | n_bytes += ggml_nbytes(cur); |
579 | 0 | weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, metadata, cur)); |
580 | 0 | } |
581 | 0 | uint16_t n_split = 0; |
582 | 0 | get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false); |
583 | | |
584 | | // Load additional GGML contexts |
585 | 0 | if (n_split > 1) { |
586 | | // make sure the main file is loaded first |
587 | 0 | uint16_t idx = 0; |
588 | 0 | const std::string kv_split_no = llm_kv(LLM_KV_SPLIT_NO); |
589 | 0 | get_key(kv_split_no, idx); |
590 | 0 | if (idx != 0) { |
591 | 0 | throw std::runtime_error(format("illegal split file idx: %d (file: %s), model must be loaded with the first split", idx, fname.c_str())); |
592 | 0 | } |
593 | | |
594 | | // generate list of splits if needed |
595 | 0 | if (splits.empty()) { |
596 | 0 | splits = llama_get_list_splits(fname, idx, n_split); |
597 | 0 | } |
598 | | |
599 | | // in case user give a custom list of splits, check if it matches the expected number |
600 | 0 | if (n_split != (uint16_t)splits.size()) { |
601 | 0 | throw std::runtime_error(format("invalid split count, given: %zu splits, but expected %d", splits.size(), n_split)); |
602 | 0 | } |
603 | | |
604 | 0 | if (trace > 0) { |
605 | 0 | LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split); |
606 | 0 | } |
607 | | |
608 | | // load other splits |
609 | 0 | for (idx = 1; idx < n_split; idx++) { |
610 | 0 | const char * fname_split = splits[idx].c_str(); |
611 | |
|
612 | 0 | struct gguf_init_params split_params = { |
613 | 0 | /*.no_alloc = */ true, |
614 | 0 | /*.ctx = */ &ctx, |
615 | 0 | }; |
616 | 0 | gguf_context_ptr ctx_gguf { gguf_init_from_file(fname_split, split_params) }; |
617 | 0 | if (!ctx_gguf) { |
618 | 0 | throw std::runtime_error(format("%s: failed to load GGUF split from %s", __func__, fname_split)); |
619 | 0 | } |
620 | | |
621 | | // check idx |
622 | 0 | { |
623 | 0 | const int kid = gguf_find_key(ctx_gguf.get(), kv_split_no.c_str()); |
624 | 0 | if (kid < 0) { |
625 | 0 | throw std::runtime_error(format("missing key %s in GGUF split %s", kv_split_no.c_str(), fname_split)); |
626 | 0 | } |
627 | 0 | int idx_gguf = gguf_get_val_u16(ctx_gguf.get(), kid); |
628 | 0 | if (idx_gguf != idx) { |
629 | 0 | throw std::runtime_error(format("invalid split file idx: %d (file: %s), expected %d", idx_gguf, fname_split, idx)); |
630 | 0 | } |
631 | 0 | } |
632 | | |
633 | 0 | files.emplace_back(new llama_file(fname_split, "rb", use_direct_io)); |
634 | 0 | contexts.emplace_back(ctx); |
635 | | |
636 | | // Save tensors data offset info of the shard. |
637 | 0 | for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { |
638 | 0 | std::string tensor_name = std::string(cur->name); |
639 | | // make sure there is no duplicated tensor names |
640 | 0 | if (weights_map.find(tensor_name) != weights_map.end()) { |
641 | 0 | throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur))); |
642 | 0 | } |
643 | 0 | n_elements += ggml_nelements(cur); |
644 | 0 | n_bytes += ggml_nbytes(cur); |
645 | 0 | weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), idx, ctx_gguf.get(), cur)); |
646 | 0 | } |
647 | 0 | } |
648 | | |
649 | 0 | get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors); |
650 | | |
651 | | // sanity check |
652 | 0 | { |
653 | 0 | const int n_tensors_loaded = (int) weights_map.size(); |
654 | 0 | if (n_tensors != n_tensors_loaded) { |
655 | 0 | throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded)); |
656 | 0 | } |
657 | 0 | } |
658 | | |
659 | 0 | LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1); |
660 | 0 | } |
661 | 0 | } else { |
662 | 0 | get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false); |
663 | 0 | llm_kv = LLM_KV(llm_arch_from_string(arch_name)); |
664 | 0 | } |
665 | | |
666 | 0 | n_kv = gguf_get_n_kv(metadata); |
667 | 0 | n_tensors = weights_map.size(); |
668 | |
|
669 | 0 | fver = (enum llama_fver) gguf_get_version(metadata); |
670 | |
|
671 | 0 | LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n", |
672 | 0 | __func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver)); |
673 | | |
674 | | // determine file type based on the number of tensors for each quantization and print meta data |
675 | | // TODO: make optional |
676 | 0 | { |
677 | 0 | std::map<enum ggml_type, uint32_t> n_type; |
678 | |
|
679 | 0 | uint32_t n_type_max = 0; |
680 | 0 | enum ggml_type type_max = GGML_TYPE_F32; |
681 | |
|
682 | 0 | for (const auto & it : weights_map) { |
683 | 0 | const llama_tensor_weight & w = it.second; |
684 | 0 | const ggml_tensor * tensor = w.tensor; |
685 | |
|
686 | 0 | enum ggml_type type = tensor->type; |
687 | |
|
688 | 0 | n_type[type]++; |
689 | |
|
690 | 0 | if (n_type_max < n_type[type]) { |
691 | 0 | n_type_max = n_type[type]; |
692 | 0 | type_max = type; |
693 | 0 | } |
694 | |
|
695 | 0 | if (trace > 0) { |
696 | 0 | const uint16_t sid = w.idx; |
697 | 0 | LLAMA_LOG_INFO("%s: - tensor split %2d: %32s %-8s [ %s ] %8.2f MiB\n", __func__, |
698 | 0 | sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str(), |
699 | 0 | ggml_nbytes(tensor)/1024.0f/1024.0f); |
700 | 0 | } |
701 | 0 | } |
702 | |
|
703 | 0 | switch (type_max) { |
704 | 0 | case GGML_TYPE_F32: ftype = LLAMA_FTYPE_ALL_F32; break; |
705 | 0 | case GGML_TYPE_F16: ftype = LLAMA_FTYPE_MOSTLY_F16; break; |
706 | 0 | case GGML_TYPE_BF16: ftype = LLAMA_FTYPE_MOSTLY_BF16; break; |
707 | 0 | case GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break; |
708 | 0 | case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break; |
709 | 0 | case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break; |
710 | 0 | case GGML_TYPE_Q5_1: ftype = LLAMA_FTYPE_MOSTLY_Q5_1; break; |
711 | 0 | case GGML_TYPE_Q8_0: ftype = LLAMA_FTYPE_MOSTLY_Q8_0; break; |
712 | 0 | case GGML_TYPE_Q2_K: ftype = LLAMA_FTYPE_MOSTLY_Q2_K; break; |
713 | 0 | case GGML_TYPE_Q3_K: ftype = LLAMA_FTYPE_MOSTLY_Q3_K_M; break; |
714 | 0 | case GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break; |
715 | 0 | case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break; |
716 | 0 | case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break; |
717 | 0 | case GGML_TYPE_TQ1_0: ftype = LLAMA_FTYPE_MOSTLY_TQ1_0; break; |
718 | 0 | case GGML_TYPE_TQ2_0: ftype = LLAMA_FTYPE_MOSTLY_TQ2_0; break; |
719 | 0 | case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break; |
720 | 0 | case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break; |
721 | 0 | case GGML_TYPE_IQ2_S: ftype = LLAMA_FTYPE_MOSTLY_IQ2_S; break; |
722 | 0 | case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break; |
723 | 0 | case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break; |
724 | 0 | case GGML_TYPE_IQ1_M: ftype = LLAMA_FTYPE_MOSTLY_IQ1_M; break; |
725 | 0 | case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break; |
726 | 0 | case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break; |
727 | 0 | case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break; |
728 | 0 | case GGML_TYPE_NVFP4: ftype = LLAMA_FTYPE_MOSTLY_NVFP4; break; |
729 | 0 | default: |
730 | 0 | { |
731 | 0 | LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max)); |
732 | 0 | ftype = LLAMA_FTYPE_ALL_F32; |
733 | 0 | } break; |
734 | 0 | } |
735 | | |
736 | | // this is a way to mark that we have "guessed" the file type |
737 | 0 | ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED); |
738 | |
|
739 | 0 | { |
740 | 0 | uint32_t ftype_val = 0; |
741 | 0 | if (get_key(LLM_KV_GENERAL_FILE_TYPE, ftype_val, false)) { |
742 | 0 | ftype = (llama_ftype) ftype_val; |
743 | 0 | } |
744 | 0 | } |
745 | |
|
746 | 0 | LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__); |
747 | |
|
748 | 0 | for (int i = 0; i < n_kv; i++) { |
749 | 0 | const char * name = gguf_get_key(metadata, i); |
750 | 0 | const enum gguf_type type = gguf_get_kv_type(metadata, i); |
751 | 0 | const std::string type_name = |
752 | 0 | type == GGUF_TYPE_ARRAY |
753 | 0 | ? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(metadata, i)), gguf_get_arr_n(metadata, i)) |
754 | 0 | : gguf_type_name(type); |
755 | |
|
756 | 0 | std::string value = gguf_kv_to_str(metadata, i); |
757 | 0 | const size_t MAX_VALUE_LEN = 40; |
758 | 0 | if (value.size() > MAX_VALUE_LEN) { |
759 | 0 | value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str()); |
760 | 0 | } |
761 | 0 | replace_all(value, "\n", "\\n"); |
762 | |
|
763 | 0 | LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str()); |
764 | 0 | } |
765 | | |
766 | | // print type counts |
767 | 0 | for (auto & kv : n_type) { |
768 | 0 | if (kv.second == 0) { |
769 | 0 | continue; |
770 | 0 | } |
771 | | |
772 | 0 | LLAMA_LOG_INFO("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second); |
773 | 0 | } |
774 | 0 | } |
775 | | |
776 | 0 | if (!llama_mmap::SUPPORTED) { |
777 | 0 | LLAMA_LOG_WARN("%s: mmap is not supported on this platform\n", __func__); |
778 | 0 | use_mmap = false; |
779 | 0 | } |
780 | |
|
781 | 0 | this->use_mmap = use_mmap; |
782 | 0 | this->use_direct_io = use_direct_io; |
783 | 0 | this->check_tensors = check_tensors; |
784 | 0 | this->no_alloc = no_alloc; |
785 | 0 | } |
786 | | |
787 | 0 | std::string llama_model_loader::get_arch_name() const { |
788 | 0 | return arch_name; |
789 | 0 | } |
790 | | |
791 | 0 | enum llm_arch llama_model_loader::get_arch() const { |
792 | 0 | return llm_kv.arch; |
793 | 0 | } |
794 | | |
795 | 0 | const llama_model_loader::llama_tensor_weight * llama_model_loader::get_weight(const char * name) const { |
796 | 0 | auto pos = weights_map.find(name); |
797 | 0 | if (pos != weights_map.end()) { |
798 | 0 | return &pos->second; |
799 | 0 | } |
800 | | |
801 | 0 | return nullptr; |
802 | 0 | } |
803 | | |
804 | 0 | const llama_model_loader::llama_tensor_weight & llama_model_loader::require_weight(const char * name) const { |
805 | 0 | const llama_tensor_weight * weight = get_weight(name); |
806 | 0 | if (!weight) { |
807 | 0 | throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name)); |
808 | 0 | } |
809 | 0 | return *weight; |
810 | 0 | } |
811 | | |
812 | 0 | struct ggml_tensor * llama_model_loader::get_tensor_meta(const char * name) const { |
813 | 0 | const auto * weight = get_weight(name); |
814 | 0 | if (!weight) { |
815 | 0 | return nullptr; |
816 | 0 | } |
817 | 0 | return weight->tensor; |
818 | 0 | } |
819 | | |
820 | 0 | struct ggml_tensor * llama_model_loader::require_tensor_meta(const std::string & name) const { |
821 | 0 | struct ggml_tensor * tensor = get_tensor_meta(name.c_str()); |
822 | 0 | if (!tensor) { |
823 | 0 | throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str())); |
824 | 0 | } |
825 | 0 | return tensor; |
826 | 0 | } |
827 | | |
828 | 0 | const struct ggml_tensor * llama_model_loader::check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const { |
829 | 0 | const struct ggml_tensor * cur = get_tensor_meta(name.c_str()); |
830 | |
|
831 | 0 | if (cur == NULL) { |
832 | 0 | if (!required) { |
833 | 0 | return NULL; |
834 | 0 | } |
835 | 0 | throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str())); |
836 | 0 | } |
837 | | |
838 | 0 | { |
839 | 0 | bool is_ok = true; |
840 | 0 | for (size_t i = 0; i < GGML_MAX_DIMS; ++i) { |
841 | 0 | if ((i < ne.size() && ne[i] != cur->ne[i]) || (i >= ne.size() && cur->ne[i] != 1)) { |
842 | 0 | is_ok = false; |
843 | 0 | break; |
844 | 0 | } |
845 | 0 | } |
846 | 0 | if (!is_ok) { |
847 | 0 | throw std::runtime_error( |
848 | 0 | format("%s: tensor '%s' has wrong shape; expected %s, got %s", |
849 | 0 | __func__, name.c_str(), |
850 | 0 | llama_format_tensor_shape(ne).c_str(), |
851 | 0 | llama_format_tensor_shape(cur).c_str())); |
852 | 0 | } |
853 | 0 | } |
854 | | |
855 | 0 | return cur; |
856 | 0 | } |
857 | | |
858 | | // checks if the weight tensor can be used with the specified buffer type and device |
859 | 0 | static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) { |
860 | 0 | GGML_ASSERT(w != nullptr); |
861 | |
|
862 | 0 | if (op == GGML_OP_NONE) { |
863 | 0 | return true; |
864 | 0 | } |
865 | | |
866 | 0 | ggml_init_params params = { |
867 | 0 | /*.mem_size =*/ ggml_tensor_overhead()*8, |
868 | 0 | /*.mem_buffer =*/ NULL, |
869 | 0 | /*.no_alloc =*/ true, |
870 | 0 | }; |
871 | 0 | ggml_context_ptr ctx_ptr { ggml_init(params) }; |
872 | 0 | if (!ctx_ptr) { |
873 | 0 | throw std::runtime_error(format("failed to create ggml context")); |
874 | 0 | } |
875 | 0 | ggml_context * ctx = ctx_ptr.get(); |
876 | |
|
877 | 0 | ggml_tensor * op_tensor = nullptr; |
878 | |
|
879 | 0 | switch (op) { |
880 | 0 | case GGML_OP_GET_ROWS: |
881 | 0 | { |
882 | 0 | ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512); |
883 | 0 | op_tensor = ggml_get_rows(ctx, w, b); |
884 | 0 | } break; |
885 | 0 | case GGML_OP_MUL_MAT: |
886 | 0 | { |
887 | 0 | ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]); |
888 | 0 | op_tensor = ggml_mul_mat(ctx, w, b); |
889 | 0 | } break; |
890 | 0 | case GGML_OP_MUL_MAT_ID: |
891 | 0 | { |
892 | 0 | const int n_expert_used = hparams.n_expert_used; |
893 | 0 | GGML_ASSERT(n_expert_used > 0); |
894 | 0 | ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512); |
895 | 0 | ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512); |
896 | 0 | op_tensor = ggml_mul_mat_id(ctx, w, b, ids); |
897 | 0 | } break; |
898 | 0 | case GGML_OP_ADD: |
899 | 0 | { |
900 | 0 | ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]); |
901 | 0 | op_tensor = ggml_add(ctx, a, w); |
902 | 0 | } break; |
903 | 0 | case GGML_OP_ADD_ID: |
904 | 0 | { |
905 | 0 | const int n_expert_used = hparams.n_expert_used; |
906 | 0 | GGML_ASSERT(n_expert_used > 0); |
907 | 0 | ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512); |
908 | 0 | ggml_tensor * c = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512); |
909 | 0 | op_tensor = ggml_add_id(ctx, a, w, c); |
910 | 0 | } break; |
911 | 0 | case GGML_OP_MUL: |
912 | 0 | { |
913 | 0 | ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]); |
914 | 0 | op_tensor = ggml_mul(ctx, a, w); |
915 | 0 | } break; |
916 | 0 | case GGML_OP_DIV: |
917 | 0 | { |
918 | 0 | ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, w->ne[0]); |
919 | 0 | op_tensor = ggml_div(ctx, a, w); |
920 | 0 | } break; |
921 | 0 | case GGML_OP_ROPE: |
922 | 0 | { |
923 | 0 | const int n_embd_head = hparams.n_embd_head_v(); |
924 | 0 | const int n_head = hparams.n_head(); |
925 | 0 | ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, 512); |
926 | 0 | ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512); |
927 | 0 | op_tensor = ggml_rope_ext( |
928 | 0 | ctx, a, b, w, |
929 | 0 | 0, 0, 0, 0, 0, |
930 | 0 | 0, 0, 0, 0 |
931 | 0 | ); |
932 | |
|
933 | 0 | } break; |
934 | 0 | case GGML_OP_SSM_CONV: |
935 | 0 | { |
936 | 0 | const int64_t n_seq_tokens = 512; |
937 | 0 | const int64_t n_seqs = 3; |
938 | 0 | ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0] - 1 + n_seq_tokens, w->ne[1], n_seqs); |
939 | 0 | op_tensor = ggml_ssm_conv(ctx, conv_x, w); |
940 | 0 | } break; |
941 | 0 | case GGML_OP_SSM_SCAN: |
942 | 0 | { |
943 | | // w is ssm_a, which is used to distinguish Mamba-1 and Mamba-2 |
944 | 0 | const int64_t d_state = w->ne[0] == 1 ? hparams.ssm_d_state : w->ne[0]; |
945 | 0 | const int64_t n_head = w->ne[1]; |
946 | 0 | const int64_t head_dim = hparams.ssm_d_inner / n_head; |
947 | 0 | const int64_t n_group = hparams.ssm_n_group ? hparams.ssm_n_group : 1; |
948 | 0 | const int64_t n_seq_tokens = 512; |
949 | 0 | const int64_t n_seqs = 3; |
950 | 0 | ggml_tensor * s = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, head_dim, n_head, n_seqs); |
951 | 0 | ggml_tensor * x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_dim, n_head, n_seq_tokens, n_seqs); |
952 | 0 | ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_head, n_seq_tokens, n_seqs); |
953 | 0 | ggml_tensor * B = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs); |
954 | 0 | ggml_tensor * C = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs); |
955 | 0 | ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_seqs); |
956 | 0 | op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C, ids); |
957 | 0 | } break; |
958 | 0 | case GGML_OP_RWKV_WKV6: |
959 | 0 | { |
960 | | // FIXME |
961 | 0 | const int64_t S = 123; |
962 | 0 | const int64_t H = 123; |
963 | 0 | const int64_t n_tokens = 123; |
964 | 0 | const int64_t n_seqs = 123; |
965 | 0 | ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens); |
966 | 0 | ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens); |
967 | 0 | ggml_tensor * r = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens); |
968 | 0 | ggml_tensor * tf = w; |
969 | 0 | ggml_tensor * td = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens); |
970 | 0 | ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H); |
971 | 0 | op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state); |
972 | 0 | } break; |
973 | 0 | case GGML_OP_IM2COL: |
974 | 0 | { |
975 | 0 | const int n_embd_inp = hparams.n_embd_inp(); |
976 | 0 | ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd_inp, w->ne[1], 1, 1); |
977 | 0 | op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16); |
978 | 0 | } break; |
979 | 0 | case GGML_OP_SCALE: |
980 | 0 | { |
981 | 0 | op_tensor = ggml_scale(ctx, w, 1.0f); |
982 | 0 | } break; |
983 | 0 | default: |
984 | 0 | GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name); |
985 | 0 | } |
986 | | |
987 | | // create a temporary dummy buffer for the weight so that supports_op can check the buffer type |
988 | 0 | GGML_ASSERT(w->buffer == nullptr); |
989 | 0 | w->buffer = ggml_backend_buft_alloc_buffer(buft, 0); |
990 | 0 | bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor); |
991 | 0 | ggml_backend_buffer_free(w->buffer); |
992 | 0 | w->buffer = nullptr; |
993 | |
|
994 | 0 | return op_supported; |
995 | 0 | } |
996 | | |
997 | | // find the first buffer type in the list that can use the tensor |
998 | 0 | static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hparams, ggml_tensor * tensor, ggml_op op, const buft_list_t * buft_list) { |
999 | 0 | GGML_ASSERT(!buft_list->empty()); |
1000 | 0 | for (const auto & cur : *buft_list) { |
1001 | 0 | ggml_backend_dev_t cur_dev = cur.first; |
1002 | 0 | ggml_backend_buffer_type_t cur_buft = cur.second; |
1003 | 0 | if (weight_buft_supported(hparams, tensor, op, cur_buft, cur_dev)) { |
1004 | 0 | return cur_buft; |
1005 | 0 | } |
1006 | 0 | } |
1007 | | |
1008 | 0 | return nullptr; |
1009 | 0 | } |
1010 | | |
1011 | | struct ggml_tensor * llama_model_loader::create_tensor( |
1012 | | const llama_hparams & hparams, const buft_list_t * buft_list_cpu, const buft_list_t * buft_list_input, const buft_list_t * buft_list_output, |
1013 | 0 | const buft_list_t * buft_list_layer, const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) { |
1014 | 0 | auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * { |
1015 | 0 | auto it = ctx_map.find(buft); |
1016 | 0 | if (it == ctx_map.end()) { |
1017 | | // one ggml context per buffer type |
1018 | 0 | int max_n_tensors = n_tensors; |
1019 | 0 | max_n_tensors += 1; // duplicated output tensor |
1020 | 0 | max_n_tensors += hparams.n_layer*2; // duplicated rope freq tensors |
1021 | 0 | if (files.empty()) { |
1022 | 0 | max_n_tensors += hparams.n_layer*256; // this should be well above what any model actually uses |
1023 | 0 | } |
1024 | 0 | const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors; |
1025 | |
|
1026 | 0 | ggml_init_params params = { |
1027 | 0 | /*.mem_size =*/ ctx_size, |
1028 | 0 | /*.mem_buffer =*/ NULL, |
1029 | 0 | /*.no_alloc =*/ true, |
1030 | 0 | }; |
1031 | |
|
1032 | 0 | ggml_context * ctx = ggml_init(params); |
1033 | 0 | if (!ctx) { |
1034 | 0 | throw std::runtime_error(format("failed to create ggml context")); |
1035 | 0 | } |
1036 | | |
1037 | 0 | ctx_map.emplace(buft, ctx); |
1038 | |
|
1039 | 0 | return ctx; |
1040 | 0 | } |
1041 | 0 | return it->second.get(); |
1042 | 0 | }; |
1043 | |
|
1044 | 0 | auto buft_for_tensor = [&](ggml_tensor * t_meta) -> ggml_backend_buffer_type_t { |
1045 | 0 | if (!t_meta) { |
1046 | 0 | if (flags & TENSOR_NOT_REQUIRED) { |
1047 | 0 | return nullptr; |
1048 | 0 | } |
1049 | 0 | throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str())); |
1050 | 0 | } |
1051 | | |
1052 | | // some models use the token embedding tensor as the output, but since these are used in different layers and with different ops |
1053 | | // the tensor is duplicated |
1054 | | // to handle this, we check if the tensor is duplicated, and if so, we assume that it is being loaded as the output tensor |
1055 | 0 | llm_tensor tn_tensor = tn.tensor; |
1056 | 0 | if (tn.tensor == LLM_TENSOR_TOKEN_EMBD && (flags & TENSOR_DUPLICATED)) { |
1057 | 0 | tn_tensor = LLM_TENSOR_OUTPUT; |
1058 | 0 | } |
1059 | |
|
1060 | 0 | llm_tensor_info info; |
1061 | 0 | try { |
1062 | 0 | info = llm_tensor_info_for(tn_tensor); |
1063 | 0 | } catch (const std::out_of_range & e) { |
1064 | 0 | throw std::runtime_error(format("missing tensor info mapping for %s", tn.str().c_str())); |
1065 | 0 | } |
1066 | | |
1067 | | // skip unused tensors |
1068 | 0 | if (info.op == GGML_OP_NONE || (flags & TENSOR_SKIP)) { |
1069 | 0 | const size_t nbytes = ggml_nbytes(t_meta); |
1070 | 0 | LLAMA_LOG_WARN("model has unused tensor %s (size = %zu bytes) -- ignoring\n", tn.str().c_str(), nbytes); |
1071 | |
|
1072 | 0 | size_data -= nbytes; |
1073 | 0 | n_created++; |
1074 | |
|
1075 | 0 | return nullptr; |
1076 | 0 | } |
1077 | | |
1078 | | // tensors with "bias" suffix are always used with GGML_OP_ADD or GGML_OP_ADD_ID |
1079 | 0 | ggml_op op; |
1080 | 0 | bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0; |
1081 | 0 | if (bias) { |
1082 | 0 | if (info.op == GGML_OP_MUL_MAT_ID) { |
1083 | 0 | op = GGML_OP_ADD_ID; |
1084 | 0 | } else { |
1085 | 0 | op = GGML_OP_ADD; |
1086 | 0 | } |
1087 | 0 | } else { |
1088 | 0 | op = info.op; |
1089 | 0 | } |
1090 | | |
1091 | | // sanity checks |
1092 | 0 | if (info.layer == LLM_TENSOR_LAYER_INPUT || info.layer == LLM_TENSOR_LAYER_OUTPUT) { |
1093 | 0 | if (tn.bid != -1) { |
1094 | 0 | GGML_ABORT("input/output layer tensor %s used with a layer number", tn.str().c_str()); |
1095 | 0 | } |
1096 | 0 | } else { |
1097 | 0 | if (tn.bid == -1) { |
1098 | 0 | GGML_ABORT("repeating layer tensor %s used without a layer number", tn.str().c_str()); |
1099 | 0 | } |
1100 | 0 | } |
1101 | | |
1102 | | // select the buffer type for this tensor |
1103 | 0 | const buft_list_t * buft_list; |
1104 | 0 | switch (info.layer) { |
1105 | 0 | case LLM_TENSOR_LAYER_INPUT: |
1106 | 0 | buft_list = buft_list_input; |
1107 | 0 | break; |
1108 | 0 | case LLM_TENSOR_LAYER_OUTPUT: |
1109 | 0 | buft_list = buft_list_output; |
1110 | 0 | break; |
1111 | 0 | case LLM_TENSOR_LAYER_REPEATING: |
1112 | 0 | GGML_ASSERT(buft_list_layer != nullptr); |
1113 | 0 | buft_list = buft_list_layer; |
1114 | 0 | break; |
1115 | 0 | default: |
1116 | 0 | GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str()); |
1117 | 0 | } |
1118 | | |
1119 | 0 | ggml_backend_buffer_type_t buft = nullptr; |
1120 | | |
1121 | | // check overrides |
1122 | 0 | if (tensor_buft_overrides) { |
1123 | 0 | std::string tensor_name = tn.str(); |
1124 | 0 | for (const auto * overrides = tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) { |
1125 | 0 | std::regex pattern(overrides->pattern); |
1126 | 0 | if (std::regex_search(tensor_name, pattern)) { |
1127 | 0 | if (overrides->buft == ggml_backend_cpu_buffer_type()) { |
1128 | | // when overriding to a CPU buffer, consider the extra buffer types |
1129 | 0 | buft = select_weight_buft(hparams, t_meta, op, buft_list_cpu); |
1130 | 0 | } else { |
1131 | 0 | buft = overrides->buft; |
1132 | 0 | } |
1133 | |
|
1134 | 0 | LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n", |
1135 | 0 | tensor_name.c_str(), |
1136 | 0 | ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type), |
1137 | 0 | ggml_backend_buft_name(buft)); |
1138 | 0 | break; |
1139 | 0 | } |
1140 | 0 | } |
1141 | 0 | } |
1142 | |
|
1143 | 0 | if (!buft) { |
1144 | 0 | buft = select_weight_buft(hparams, t_meta, op, buft_list); |
1145 | 0 | if (!buft) { |
1146 | 0 | throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str())); |
1147 | 0 | } |
1148 | 0 | } |
1149 | | |
1150 | | // avoid using a host buffer when using mmap |
1151 | 0 | auto * buft_dev = ggml_backend_buft_get_device(buft); |
1152 | 0 | if (use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) { |
1153 | 0 | auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); |
1154 | 0 | if (!cpu_dev) { |
1155 | 0 | throw std::runtime_error("no CPU backend found"); |
1156 | 0 | } |
1157 | 0 | buft = ggml_backend_dev_buffer_type(cpu_dev); |
1158 | 0 | } |
1159 | | |
1160 | 0 | if (buft != buft_list->front().second) { |
1161 | 0 | if (n_tensors_moved == 0) { |
1162 | 0 | first_tensor_moved_name = t_meta->name; |
1163 | 0 | first_tensor_moved_type_name = ggml_type_name(t_meta->type); |
1164 | 0 | first_moved_from_buft = buft_list->front().second; |
1165 | 0 | first_moved_to_buft = buft; |
1166 | 0 | } |
1167 | 0 | n_tensors_moved++; |
1168 | 0 | } |
1169 | |
|
1170 | 0 | return buft; |
1171 | 0 | }; |
1172 | |
|
1173 | 0 | if (files.empty()) { |
1174 | 0 | if (flags & TENSOR_SKIP_IF_VIRTUAL) { |
1175 | 0 | return nullptr; |
1176 | 0 | } |
1177 | 0 | ggml_type type = GGML_TYPE_F32; |
1178 | 0 | const int64_t tid = gguf_find_tensor(metadata, tn.str().c_str()); |
1179 | 0 | if (tid != -1) { |
1180 | 0 | type = gguf_get_tensor_type(metadata, tid); |
1181 | 0 | } |
1182 | | |
1183 | | // for tensors that are not required some of the dimensions can be invalid: |
1184 | 0 | if (flags & TENSOR_NOT_REQUIRED) { |
1185 | 0 | for (size_t dim = 0; dim < ne.size(); dim++) { |
1186 | 0 | if (ne.begin()[dim] <= 0) { |
1187 | 0 | return nullptr; |
1188 | 0 | } |
1189 | 0 | } |
1190 | 0 | } |
1191 | | |
1192 | 0 | ggml_tensor t_meta; |
1193 | 0 | memset(&t_meta, 0, sizeof(ggml_tensor)); |
1194 | 0 | t_meta.type = type; |
1195 | 0 | for (size_t dim = 0; dim < GGML_MAX_DIMS; dim++) { |
1196 | 0 | t_meta.ne[dim] = dim < ne.size() ? ne.begin()[dim] : 1; |
1197 | 0 | GGML_ASSERT(t_meta.ne[dim] >= 1); |
1198 | 0 | t_meta.nb[dim] = dim == 0 ? ggml_type_size(type) : t_meta.ne[dim-1]*t_meta.nb[dim-1]; |
1199 | 0 | GGML_ASSERT(t_meta.nb[dim] >= 1); |
1200 | 0 | } |
1201 | 0 | ggml_set_name(&t_meta, tn.str().c_str()); |
1202 | |
|
1203 | 0 | ggml_backend_buffer_type_t buft = buft_for_tensor(&t_meta); |
1204 | 0 | GGML_ASSERT(buft != nullptr); |
1205 | 0 | ggml_context * ctx = ctx_for_buft(buft); |
1206 | 0 | ggml_tensor * ret = ggml_dup_tensor(ctx, &t_meta); |
1207 | 0 | ggml_set_name(ret, tn.str().c_str()); |
1208 | 0 | return ret; |
1209 | 0 | } |
1210 | | |
1211 | 0 | ggml_tensor * t_meta = get_tensor_meta(tn.str().c_str()); |
1212 | 0 | ggml_backend_buffer_type_t buft = buft_for_tensor(t_meta); |
1213 | 0 | if (buft == nullptr) { |
1214 | 0 | return nullptr; // return type is ggml_tensor * |
1215 | 0 | } |
1216 | 0 | ggml_context * ctx = ctx_for_buft(buft); |
1217 | | |
1218 | | // if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one |
1219 | 0 | if (flags & TENSOR_DUPLICATED) { |
1220 | 0 | ggml_tensor * t = ggml_get_tensor(ctx, tn.str().c_str()); |
1221 | 0 | if (t) { |
1222 | 0 | return t; |
1223 | 0 | } |
1224 | 0 | } |
1225 | | |
1226 | 0 | LLAMA_LOG_DEBUG("%s: loading tensor %s\n", __func__, tn.str().c_str()); |
1227 | 0 | const struct ggml_tensor * cur = check_tensor_dims(tn.str(), ne, !(flags & TENSOR_NOT_REQUIRED)); |
1228 | |
|
1229 | 0 | if (cur == NULL) { |
1230 | 0 | return NULL; |
1231 | 0 | } |
1232 | | |
1233 | 0 | const bool duplicated = flags & TENSOR_DUPLICATED; |
1234 | |
|
1235 | 0 | struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur); |
1236 | 0 | ggml_set_name(tensor, ggml_get_name(cur)); |
1237 | |
|
1238 | 0 | if (duplicated) { |
1239 | 0 | size_data += ggml_nbytes(cur); |
1240 | 0 | } else { |
1241 | 0 | n_created++; |
1242 | 0 | } |
1243 | |
|
1244 | 0 | return tensor; |
1245 | 0 | } |
1246 | | |
1247 | 0 | struct ggml_tensor * llama_model_loader::create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list<int64_t> & ne, size_t offset, bool required) { |
1248 | 0 | const struct ggml_tensor * cur = check_tensor_dims(name, ne, required); |
1249 | |
|
1250 | 0 | if (cur == NULL) { |
1251 | 0 | return NULL; |
1252 | 0 | } |
1253 | | |
1254 | 0 | if (cur->type != base->type) { |
1255 | 0 | throw std::runtime_error(format("%s: tensor '%s' has wrong type; expected %s, got %s", __func__, name.c_str(), ggml_type_name(base->type), ggml_type_name(cur->type))); |
1256 | 0 | } |
1257 | | |
1258 | 0 | std::array<int64_t, GGML_MAX_DIMS> dims; |
1259 | 0 | for (size_t i = 0; i < GGML_MAX_DIMS; ++i) { |
1260 | 0 | dims[i] = i < ne.size() ? ne.begin()[i] : 1; |
1261 | 0 | } |
1262 | |
|
1263 | 0 | struct ggml_tensor * tensor = ggml_view_4d(ctx, base, |
1264 | 0 | dims[0], dims[1], dims[2], dims[3], |
1265 | 0 | cur->nb[1], cur->nb[2], cur->nb[3], |
1266 | 0 | offset); |
1267 | |
|
1268 | 0 | ggml_set_name(tensor, name.c_str()); |
1269 | |
|
1270 | 0 | n_created++; |
1271 | |
|
1272 | 0 | return tensor; |
1273 | 0 | } |
1274 | | |
1275 | 0 | void llama_model_loader::done_getting_tensors() const { |
1276 | 0 | if (n_created != n_tensors) { |
1277 | 0 | throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created)); |
1278 | 0 | } |
1279 | 0 | if (n_tensors_moved > 0) { |
1280 | 0 | LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %zu others) cannot be used with preferred buffer type %s, using %s instead\n", |
1281 | 0 | __func__, first_tensor_moved_name.c_str(), first_tensor_moved_type_name.c_str(), n_tensors_moved - 1, |
1282 | 0 | ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft)); |
1283 | 0 | } |
1284 | 0 | } |
1285 | | |
1286 | 0 | void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps) { |
1287 | 0 | if (use_mmap) { |
1288 | 0 | mappings.reserve(files.size()); |
1289 | 0 | mmaps_used.reserve(files.size()); |
1290 | 0 | for (const auto & file : files) { |
1291 | 0 | bool is_numa = false; |
1292 | |
|
1293 | 0 | auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); |
1294 | 0 | if (dev) { |
1295 | 0 | auto * reg = ggml_backend_dev_backend_reg(dev); |
1296 | 0 | auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa"); |
1297 | 0 | if (is_numa_fn) { |
1298 | 0 | is_numa = is_numa_fn(); |
1299 | 0 | } |
1300 | 0 | } |
1301 | |
|
1302 | 0 | std::unique_ptr<llama_mmap> mapping = std::make_unique<llama_mmap>(file.get(), prefetch ? -1 : 0, is_numa); |
1303 | 0 | mmaps_used.emplace_back(mapping->size(), 0); |
1304 | 0 | if (mlock_mmaps) { |
1305 | 0 | std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock()); |
1306 | 0 | mlock_mmap->init(mapping->addr()); |
1307 | 0 | mlock_mmaps->emplace_back(std::move(mlock_mmap)); |
1308 | 0 | } |
1309 | 0 | mappings.emplace_back(std::move(mapping)); |
1310 | 0 | } |
1311 | 0 | } |
1312 | | |
1313 | | // compute the total size of all tensors for progress reporting |
1314 | 0 | for (const auto & it : weights_map) { |
1315 | 0 | size_data += ggml_nbytes(it.second.tensor); |
1316 | 0 | } |
1317 | 0 | } |
1318 | | |
1319 | 0 | void llama_model_loader::get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const { |
1320 | 0 | GGML_ASSERT(!mappings.empty()); |
1321 | 0 | const auto & mapping = mappings.at(idx); |
1322 | |
|
1323 | 0 | *first = mapping->size(); |
1324 | 0 | *last = 0; |
1325 | 0 | *addr = mapping->addr(); |
1326 | 0 | for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) { |
1327 | 0 | const auto * weight = get_weight(ggml_get_name(tensor)); |
1328 | 0 | if (!weight || weight->idx != idx) { |
1329 | 0 | continue; |
1330 | 0 | } |
1331 | 0 | *first = std::min(*first, weight->offs); |
1332 | 0 | *last = std::max(*last, weight->offs + ggml_nbytes(tensor)); |
1333 | 0 | } |
1334 | 0 | } |
1335 | | |
1336 | 0 | void llama_model_loader::load_data_for(struct ggml_tensor * cur) const { |
1337 | 0 | const auto & w = require_weight(ggml_get_name(cur)); |
1338 | |
|
1339 | 0 | if (use_mmap) { |
1340 | 0 | const auto & mapping = mappings.at(w.idx); |
1341 | 0 | if (cur->data == nullptr) { |
1342 | 0 | cur->data = (uint8_t *)mapping->addr() + w.offs; |
1343 | 0 | } else { |
1344 | 0 | memcpy(cur->data, (uint8_t *)mapping->addr() + w.offs, ggml_nbytes(cur)); |
1345 | 0 | } |
1346 | 0 | } else { |
1347 | 0 | GGML_ASSERT(cur->data != nullptr); |
1348 | 0 | GGML_ASSERT(w.idx < files.size()); |
1349 | 0 | const auto & file = files.at(w.idx); |
1350 | 0 | file->seek(w.offs, SEEK_SET); |
1351 | 0 | file->read_raw(cur->data, ggml_nbytes(cur)); |
1352 | 0 | } |
1353 | |
|
1354 | 0 | if (check_tensors && !ggml_validate_row_data(cur->type, cur->data, ggml_nbytes(cur))) { |
1355 | 0 | throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur))); |
1356 | 0 | } |
1357 | 0 | } |
1358 | | |
1359 | | bool llama_model_loader::load_all_data( |
1360 | | struct ggml_context * ctx, |
1361 | | llama_buf_map & bufs, |
1362 | | llama_mlocks * lmlocks, |
1363 | | llama_progress_callback progress_callback, |
1364 | 0 | void * progress_callback_user_data) { |
1365 | 0 | if (files.empty()) { |
1366 | 0 | for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { |
1367 | 0 | set_tensor_data(t, set_tensor_data_ud); |
1368 | 0 | } |
1369 | 0 | return true; |
1370 | 0 | } |
1371 | 0 | GGML_ASSERT(size_data != 0 && "call init_mappings() first"); |
1372 | |
|
1373 | 0 | std::vector<no_init<uint8_t>> read_buf; |
1374 | 0 | std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result; |
1375 | | |
1376 | | // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives. |
1377 | | // NVMe raid configurations might require more / larger buffers. |
1378 | 0 | constexpr size_t n_buffers = 4; |
1379 | |
|
1380 | 0 | size_t alignment = 1; |
1381 | 0 | for (const auto & file : files) { |
1382 | 0 | alignment = std::max(file->read_alignment(), alignment); |
1383 | 0 | } |
1384 | | |
1385 | | // Buffer size: balance between memory usage and I/O efficiency |
1386 | | // 64MB works well for NVMe drives |
1387 | 0 | const size_t buffer_size = alignment != 1 ? 64 * 1024 * 1024 + 2 * alignment : 1 * 1024 * 1024; |
1388 | |
|
1389 | 0 | std::vector<ggml_backend_buffer_t> host_buffers; |
1390 | 0 | std::vector<ggml_backend_event_t> events; |
1391 | 0 | std::vector<void *> host_ptrs; |
1392 | 0 | size_t buffer_idx = 0; // buffer to use for async loads |
1393 | 0 | ggml_backend_t upload_backend = [&](const char * func) -> ggml_backend_t { |
1394 | 0 | if (use_mmap || check_tensors) { |
1395 | 0 | return nullptr; |
1396 | 0 | } |
1397 | | // When not using mmaped io use async uploads from pinned memory to GPU memory. |
1398 | | // First determine if the backend supports the necessary features for async uploads. |
1399 | 0 | auto * buf = bufs.count(0) ? bufs.at(0) : nullptr; |
1400 | 0 | if (!buf) { |
1401 | 0 | LLAMA_LOG_DEBUG("%s: no buffer found for async uploads\n", func); |
1402 | 0 | return nullptr; |
1403 | 0 | } |
1404 | | |
1405 | 0 | auto * buft = ggml_backend_buffer_get_type(buf); |
1406 | 0 | auto * dev = ggml_backend_buft_get_device(buft); |
1407 | 0 | if (!dev) { |
1408 | 0 | LLAMA_LOG_DEBUG("%s: no device found for buffer type %s for async uploads\n", func, |
1409 | 0 | ggml_backend_buft_name(buft)); |
1410 | 0 | return nullptr; |
1411 | 0 | } |
1412 | | |
1413 | 0 | if (buft != ggml_backend_dev_buffer_type(dev)) { |
1414 | 0 | LLAMA_LOG_DEBUG("%s: buffer type %s is not the default buffer type for device %s for async uploads\n", func, |
1415 | 0 | ggml_backend_buft_name(buft), ggml_backend_dev_name(dev)); |
1416 | 0 | return nullptr; |
1417 | 0 | } |
1418 | | |
1419 | 0 | ggml_backend_dev_props props; |
1420 | 0 | ggml_backend_dev_get_props(dev, &props); |
1421 | 0 | if (!props.caps.async || !props.caps.host_buffer || !props.caps.events) { |
1422 | 0 | LLAMA_LOG_DEBUG("%s: device %s does not support async, host buffers or events\n", func, |
1423 | 0 | ggml_backend_dev_name(dev)); |
1424 | 0 | return nullptr; |
1425 | 0 | } |
1426 | | |
1427 | 0 | auto * host_buft = ggml_backend_dev_host_buffer_type(dev); |
1428 | 0 | if (!host_buft) { |
1429 | 0 | LLAMA_LOG_DEBUG("%s: no host buffer type found for device %s\n", func, |
1430 | 0 | ggml_backend_dev_name(dev)); |
1431 | 0 | return nullptr; |
1432 | 0 | } |
1433 | | |
1434 | | // If the backend is supported, create pinned memory buffers and events for synchronisation. |
1435 | 0 | for (size_t idx = 0; idx < n_buffers; ++idx) { |
1436 | 0 | auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size); |
1437 | |
|
1438 | 0 | if (!buf) { |
1439 | 0 | LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func, |
1440 | 0 | ggml_backend_dev_name(dev)); |
1441 | 0 | return nullptr; |
1442 | 0 | } |
1443 | | |
1444 | 0 | host_buffers.emplace_back(buf); |
1445 | 0 | host_ptrs.emplace_back(ggml_backend_buffer_get_base(buf)); |
1446 | |
|
1447 | 0 | auto * event = ggml_backend_event_new(dev); |
1448 | 0 | if (!event) { |
1449 | 0 | LLAMA_LOG_DEBUG("%s: failed to create event for async uploads for device %s\n", func, |
1450 | 0 | ggml_backend_dev_name(dev)); |
1451 | 0 | return nullptr; |
1452 | 0 | } |
1453 | | |
1454 | 0 | events.emplace_back(event); |
1455 | 0 | } |
1456 | | |
1457 | 0 | ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); |
1458 | 0 | if (!backend) { |
1459 | 0 | LLAMA_LOG_DEBUG("%s: failed to initialize backend for device %s for async uploads\n", func, |
1460 | 0 | ggml_backend_dev_name(dev)); |
1461 | 0 | return nullptr; |
1462 | 0 | } |
1463 | | |
1464 | 0 | return backend; |
1465 | 0 | }(__func__); |
1466 | |
|
1467 | 0 | if (upload_backend) { |
1468 | 0 | LLAMA_LOG_DEBUG("%s: using async uploads for device %s, buffer type %s, backend %s\n", __func__, |
1469 | 0 | ggml_backend_dev_name(ggml_backend_get_device(upload_backend)), |
1470 | 0 | ggml_backend_buft_name(ggml_backend_buffer_get_type(bufs.at(0))), |
1471 | 0 | ggml_backend_name(upload_backend)); |
1472 | 0 | } |
1473 | |
|
1474 | 0 | for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) { |
1475 | 0 | const auto * weight = get_weight(ggml_get_name(cur)); |
1476 | 0 | if (weight == nullptr) { |
1477 | | // this can happen with split experts models |
1478 | 0 | continue; |
1479 | 0 | } |
1480 | | |
1481 | 0 | if (progress_callback) { |
1482 | 0 | if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) { |
1483 | 0 | return false; |
1484 | 0 | } |
1485 | 0 | } |
1486 | | |
1487 | 0 | size_t n_size = ggml_nbytes(cur); |
1488 | |
|
1489 | 0 | if (use_mmap) { |
1490 | 0 | const auto & mapping = mappings.at(weight->idx); |
1491 | 0 | ggml_backend_buffer_t buf_mmap = nullptr; |
1492 | 0 | if (bufs.count(weight->idx)) { |
1493 | 0 | buf_mmap = bufs.at(weight->idx); |
1494 | 0 | } |
1495 | 0 | uint8_t * data = (uint8_t *) mapping->addr() + weight->offs; |
1496 | |
|
1497 | 0 | if (check_tensors) { |
1498 | 0 | validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] { |
1499 | 0 | return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size)); |
1500 | 0 | })); |
1501 | 0 | } |
1502 | |
|
1503 | 0 | GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated |
1504 | 0 | if (buf_mmap && cur->data == nullptr) { |
1505 | 0 | ggml_backend_tensor_alloc(buf_mmap, cur, data); |
1506 | 0 | if (lmlocks) { |
1507 | 0 | const auto & lmlock = lmlocks->at(weight->idx); |
1508 | 0 | lmlock->grow_to(weight->offs + n_size); |
1509 | 0 | } |
1510 | |
|
1511 | 0 | auto & mmap_used = mmaps_used[weight->idx]; |
1512 | 0 | mmap_used.first = std::min(mmap_used.first, weight->offs); |
1513 | 0 | mmap_used.second = std::max(mmap_used.second, weight->offs + n_size); |
1514 | 0 | } else { |
1515 | 0 | ggml_backend_tensor_set(cur, data, 0, n_size); |
1516 | 0 | } |
1517 | 0 | } else { |
1518 | 0 | const auto & file = files.at(weight->idx); |
1519 | |
|
1520 | 0 | if (ggml_backend_buffer_is_host(cur->buffer)) { |
1521 | 0 | file->seek(weight->offs, SEEK_SET); |
1522 | 0 | file->read_raw(cur->data, n_size); |
1523 | 0 | if (check_tensors) { |
1524 | 0 | validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] { |
1525 | 0 | return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size)); |
1526 | 0 | })); |
1527 | 0 | } |
1528 | 0 | } else { |
1529 | | // If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU. |
1530 | 0 | if (upload_backend) { |
1531 | 0 | size_t offset = weight->offs; |
1532 | 0 | alignment = file->read_alignment(); |
1533 | 0 | size_t aligned_offset = offset & ~(alignment - 1); |
1534 | 0 | size_t offset_from_alignment = offset - aligned_offset; |
1535 | 0 | file->seek(aligned_offset, SEEK_SET); |
1536 | | |
1537 | | // Calculate aligned read boundaries |
1538 | 0 | size_t read_start = aligned_offset; |
1539 | 0 | size_t read_end = (offset + n_size + alignment - 1) & ~(alignment - 1); |
1540 | |
|
1541 | 0 | size_t bytes_read = 0; |
1542 | 0 | size_t data_read = 0; // Actual tensor data copied (excluding padding) |
1543 | |
|
1544 | 0 | while (bytes_read < read_end - read_start) { |
1545 | 0 | size_t read_size = std::min<size_t>(buffer_size, read_end - read_start - bytes_read); |
1546 | | |
1547 | | // Align the destination pointer within the pinned buffer |
1548 | 0 | uintptr_t ptr_dest_aligned = (reinterpret_cast<uintptr_t>(host_ptrs[buffer_idx]) + alignment - 1) & ~(alignment - 1); |
1549 | | |
1550 | | // Wait for previous upload to complete before reusing buffer |
1551 | 0 | ggml_backend_event_synchronize(events[buffer_idx]); |
1552 | | |
1553 | | // Read aligned chunk from file |
1554 | 0 | file->read_raw_unsafe(reinterpret_cast<void *>(ptr_dest_aligned), read_size); |
1555 | | |
1556 | | // Calculate actual data portion (excluding alignment padding) |
1557 | 0 | uintptr_t ptr_data = ptr_dest_aligned; |
1558 | 0 | size_t data_to_copy = read_size; |
1559 | | |
1560 | | // Skip alignment padding at start of first chunk |
1561 | 0 | if (bytes_read == 0) { |
1562 | 0 | ptr_data += offset_from_alignment; |
1563 | 0 | data_to_copy -= offset_from_alignment; |
1564 | 0 | } |
1565 | | |
1566 | | // Trim alignment padding at end of last chunk |
1567 | 0 | if (aligned_offset + bytes_read + read_size > offset + n_size) { |
1568 | 0 | data_to_copy -= (read_end - (offset + n_size)); |
1569 | 0 | } |
1570 | | |
1571 | | // Async upload actual data to GPU |
1572 | 0 | ggml_backend_tensor_set_async(upload_backend, cur, |
1573 | 0 | reinterpret_cast<void *>(ptr_data), data_read, data_to_copy); |
1574 | 0 | ggml_backend_event_record(events[buffer_idx], upload_backend); |
1575 | |
|
1576 | 0 | data_read += data_to_copy; |
1577 | 0 | bytes_read += read_size; |
1578 | |
|
1579 | 0 | ++buffer_idx; |
1580 | 0 | buffer_idx %= n_buffers; |
1581 | 0 | } |
1582 | 0 | } else { |
1583 | 0 | read_buf.resize(n_size); |
1584 | 0 | file->seek(weight->offs, SEEK_SET); |
1585 | 0 | file->read_raw(read_buf.data(), n_size); |
1586 | 0 | ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size); |
1587 | 0 | if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) { |
1588 | 0 | throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur))); |
1589 | 0 | } |
1590 | 0 | } |
1591 | 0 | } |
1592 | 0 | } |
1593 | | |
1594 | 0 | size_done += n_size; |
1595 | 0 | } |
1596 | | |
1597 | | // free temporary resources used for async uploads |
1598 | 0 | for (auto * event : events) { |
1599 | 0 | ggml_backend_event_synchronize(event); |
1600 | 0 | ggml_backend_event_free(event); |
1601 | 0 | } |
1602 | 0 | for (auto * buf : host_buffers) { |
1603 | 0 | ggml_backend_buffer_free(buf); |
1604 | 0 | } |
1605 | 0 | ggml_backend_free(upload_backend); |
1606 | | |
1607 | | // check validation results |
1608 | 0 | bool validation_failed = false; |
1609 | 0 | for (auto & future : validation_result) { |
1610 | 0 | auto result = future.get(); |
1611 | 0 | if (!result.second) { |
1612 | 0 | LLAMA_LOG_ERROR("%s: tensor '%s' has invalid data\n", __func__, ggml_get_name(result.first)); |
1613 | 0 | validation_failed = true; |
1614 | 0 | } |
1615 | 0 | } |
1616 | 0 | if (validation_failed) { |
1617 | 0 | throw std::runtime_error("found tensors with invalid data"); |
1618 | 0 | } |
1619 | | |
1620 | | // check if this is the last call and do final cleanup |
1621 | 0 | if (size_done >= size_data) { |
1622 | | // unmap offloaded tensors and metadata |
1623 | 0 | if (use_mmap) { |
1624 | 0 | for (uint32_t idx = 0; idx < mappings.size(); idx++) { |
1625 | 0 | const auto & mmap_used = mmaps_used.at(idx); |
1626 | 0 | auto & mapping = mappings.at(idx); |
1627 | 0 | mapping->unmap_fragment(0, mmap_used.first); |
1628 | 0 | if (mmap_used.second != 0) { |
1629 | 0 | mapping->unmap_fragment(mmap_used.second, mapping->size()); |
1630 | 0 | } |
1631 | 0 | } |
1632 | 0 | } |
1633 | 0 | if (progress_callback) { |
1634 | | // Even though the model is done loading, we still honor |
1635 | | // cancellation since we need to free allocations. |
1636 | 0 | return progress_callback(1.0f, progress_callback_user_data); |
1637 | 0 | } |
1638 | 0 | } |
1639 | | |
1640 | 0 | return true; |
1641 | 0 | } |
1642 | | |
1643 | 0 | std::string llama_model_loader::ftype_name() const { |
1644 | 0 | return llama_model_ftype_name(ftype); |
1645 | 0 | } |
1646 | | |
1647 | 0 | void llama_model_loader::print_info() const { |
1648 | 0 | LLAMA_LOG_INFO("%s: file format = %s\n", __func__, llama_file_version_name(fver)); |
1649 | 0 | LLAMA_LOG_INFO("%s: file type = %s\n", __func__, llama_model_ftype_name(ftype).c_str()); |
1650 | 0 | if (n_bytes < GiB) { |
1651 | 0 | LLAMA_LOG_INFO("%s: file size = %.2f MiB (%.2f BPW) \n", __func__, n_bytes/1024.0/1024.0, n_bytes*8.0/n_elements); |
1652 | 0 | } else { |
1653 | 0 | LLAMA_LOG_INFO("%s: file size = %.2f GiB (%.2f BPW) \n", __func__, n_bytes/1024.0/1024.0/1024.0, n_bytes*8.0/n_elements); |
1654 | 0 | } |
1655 | 0 | } |