/src/llama.cpp/src/llama-model-loader.cpp
Line | Count | Source |
1 | | #include "llama-model-loader.h" |
2 | | |
3 | | #include "ggml-alloc.h" |
4 | | #include "ggml.h" |
5 | | #include "gguf.h" |
6 | | #include "llama-hparams.h" |
7 | | |
8 | | #include <algorithm> |
9 | | #include <array> |
10 | | #include <cinttypes> |
11 | | #include <cstdint> |
12 | | #include <cstring> |
13 | | #include <future> |
14 | | #include <regex> |
15 | | |
// binary size units, expressed in bytes
static constexpr size_t kiB = 1024;
static constexpr size_t MiB = 1024*kiB;
static constexpr size_t GiB = 1024*MiB;
19 | | |
20 | 0 | const char * llama_file_version_name(llama_fver version) { |
21 | 0 | switch (version) { |
22 | 0 | case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)"; |
23 | 0 | case GGUF_FILE_VERSION_V2: return "GGUF V2"; |
24 | 0 | case GGUF_FILE_VERSION_V3: return "GGUF V3 (latest)"; |
25 | 0 | } |
26 | | |
27 | 0 | return "unknown"; |
28 | 0 | } |
29 | | |
30 | 0 | static std::string llama_model_ftype_name(llama_ftype ftype) { |
31 | 0 | if (ftype & LLAMA_FTYPE_GUESSED) { |
32 | 0 | return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)"; |
33 | 0 | } |
34 | | |
35 | 0 | switch (ftype) { |
36 | 0 | case LLAMA_FTYPE_ALL_F32: return "all F32"; |
37 | 0 | case LLAMA_FTYPE_MOSTLY_F16: return "F16"; |
38 | 0 | case LLAMA_FTYPE_MOSTLY_BF16: return "BF16"; |
39 | 0 | case LLAMA_FTYPE_MOSTLY_Q1_0: return "Q1_0"; |
40 | 0 | case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0"; |
41 | 0 | case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1"; |
42 | 0 | case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0"; |
43 | 0 | case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1"; |
44 | 0 | case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0"; |
45 | 0 | case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: return "MXFP4 MoE"; |
46 | 0 | case LLAMA_FTYPE_MOSTLY_NVFP4: return "NVFP4"; |
47 | 0 | case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium"; |
48 | 0 | case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small"; |
49 | 0 | case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small"; |
50 | 0 | case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium"; |
51 | 0 | case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large"; |
52 | 0 | case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small"; |
53 | 0 | case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium"; |
54 | 0 | case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small"; |
55 | 0 | case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium"; |
56 | 0 | case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K"; |
57 | 0 | case LLAMA_FTYPE_MOSTLY_TQ1_0: return "TQ1_0 - 1.69 bpw ternary"; |
58 | 0 | case LLAMA_FTYPE_MOSTLY_TQ2_0: return "TQ2_0 - 2.06 bpw ternary"; |
59 | 0 | case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw"; |
60 | 0 | case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw"; |
61 | 0 | case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw"; |
62 | 0 | case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw"; |
63 | 0 | case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw"; |
64 | 0 | case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw"; |
65 | 0 | case LLAMA_FTYPE_MOSTLY_IQ1_S: return "IQ1_S - 1.5625 bpw"; |
66 | 0 | case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw"; |
67 | 0 | case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw"; |
68 | 0 | case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw"; |
69 | 0 | case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw"; |
70 | 0 | case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw"; |
71 | | |
72 | 0 | default: return "unknown, may not work"; |
73 | 0 | } |
74 | 0 | } |
75 | | |
76 | | // return a list of splits for a given path |
77 | | // for example, given "<name>-00002-of-00004.gguf", returns list of all 4 splits |
78 | 0 | static std::vector<std::string> llama_get_list_splits(const std::string & path, const int idx, const int n_split) { |
79 | 0 | std::vector<std::string> paths; |
80 | 0 | std::string split_prefix; |
81 | 0 | std::vector<char> buf(llama_path_max(), 0); |
82 | |
|
83 | 0 | { |
84 | 0 | int ret = llama_split_prefix(buf.data(), buf.size(), path.c_str(), idx, n_split); |
85 | 0 | if (!ret) { |
86 | 0 | throw std::runtime_error(format("invalid split file name: %s", path.c_str())); |
87 | 0 | } |
88 | 0 | split_prefix = std::string(buf.data(), ret); |
89 | 0 | } |
90 | | |
91 | 0 | if (split_prefix.empty()) { |
92 | 0 | throw std::runtime_error(format("invalid split file: %s", path.c_str())); |
93 | 0 | } |
94 | | |
95 | 0 | for (int idx = 0; idx < n_split; ++idx) { |
96 | 0 | int ret = llama_split_path(buf.data(), buf.size(), split_prefix.c_str(), idx, n_split); |
97 | 0 | paths.push_back(std::string(buf.data(), ret)); |
98 | 0 | } |
99 | |
|
100 | 0 | return paths; |
101 | 0 | } |
102 | | |
103 | | namespace GGUFMeta { |
104 | | template <typename T, gguf_type gt_, T (*gfun)(const gguf_context *, const int64_t)> |
105 | | struct GKV_Base_Type { |
106 | | static constexpr gguf_type gt = gt_; |
107 | | |
108 | 0 | static T getter(const gguf_context * ctx, const int kid) { |
109 | 0 | return gfun(ctx, kid); |
110 | 0 | } Unexecuted instantiation: GGUFMeta::GKV_Base_Type<bool, (gguf_type)7, &gguf_get_val_bool>::getter(gguf_context const*, int) Unexecuted instantiation: GGUFMeta::GKV_Base_Type<float, (gguf_type)6, &gguf_get_val_f32>::getter(gguf_context const*, int) Unexecuted instantiation: GGUFMeta::GKV_Base_Type<unsigned int, (gguf_type)4, &gguf_get_val_u32>::getter(gguf_context const*, int) Unexecuted instantiation: GGUFMeta::GKV_Base_Type<unsigned short, (gguf_type)2, &gguf_get_val_u16>::getter(gguf_context const*, int) Unexecuted instantiation: GGUFMeta::GKV_Base_Type<int, (gguf_type)5, &gguf_get_val_i32>::getter(gguf_context const*, int) |
111 | | }; |
112 | | |
113 | | template<typename T> struct GKV_Base; |
114 | | |
115 | | template<> struct GKV_Base<bool >: GKV_Base_Type<bool, GGUF_TYPE_BOOL, gguf_get_val_bool> {}; |
116 | | template<> struct GKV_Base<uint8_t >: GKV_Base_Type<uint8_t, GGUF_TYPE_UINT8, gguf_get_val_u8 > {}; |
117 | | template<> struct GKV_Base<uint16_t >: GKV_Base_Type<uint16_t, GGUF_TYPE_UINT16, gguf_get_val_u16 > {}; |
118 | | template<> struct GKV_Base<uint32_t >: GKV_Base_Type<uint32_t, GGUF_TYPE_UINT32, gguf_get_val_u32 > {}; |
119 | | template<> struct GKV_Base<uint64_t >: GKV_Base_Type<uint64_t, GGUF_TYPE_UINT64, gguf_get_val_u64 > {}; |
120 | | template<> struct GKV_Base<int8_t >: GKV_Base_Type<int8_t, GGUF_TYPE_INT8, gguf_get_val_i8 > {}; |
121 | | template<> struct GKV_Base<int16_t >: GKV_Base_Type<int16_t, GGUF_TYPE_INT16, gguf_get_val_i16 > {}; |
122 | | template<> struct GKV_Base<int32_t >: GKV_Base_Type<int32_t, GGUF_TYPE_INT32, gguf_get_val_i32 > {}; |
123 | | template<> struct GKV_Base<int64_t >: GKV_Base_Type<int64_t, GGUF_TYPE_INT64, gguf_get_val_i64 > {}; |
124 | | template<> struct GKV_Base<float >: GKV_Base_Type<float, GGUF_TYPE_FLOAT32, gguf_get_val_f32 > {}; |
125 | | template<> struct GKV_Base<double >: GKV_Base_Type<double, GGUF_TYPE_FLOAT64, gguf_get_val_f64 > {}; |
126 | | template<> struct GKV_Base<const char *>: GKV_Base_Type<const char *, GGUF_TYPE_STRING, gguf_get_val_str > {}; |
127 | | |
128 | | template<> struct GKV_Base<std::string> { |
129 | | static constexpr gguf_type gt = GGUF_TYPE_STRING; |
130 | | |
131 | 0 | static std::string getter(const gguf_context * ctx, const int kid) { |
132 | 0 | return gguf_get_val_str(ctx, kid); |
133 | 0 | } |
134 | | }; |
135 | | |
136 | | struct ArrayInfo { |
137 | | const gguf_type gt; |
138 | | const size_t length; |
139 | | const void * data; |
140 | | }; |
141 | | |
142 | | template<> struct GKV_Base<ArrayInfo> { |
143 | | public: |
144 | | static constexpr gguf_type gt = GGUF_TYPE_ARRAY; |
145 | 0 | static ArrayInfo getter(const gguf_context *ctx, const int k) { |
146 | 0 | const enum gguf_type arr_type = gguf_get_arr_type(ctx, k); |
147 | 0 | return ArrayInfo { |
148 | 0 | arr_type, |
149 | 0 | size_t(gguf_get_arr_n(ctx, k)), |
150 | 0 | arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx, k), |
151 | 0 | }; |
152 | 0 | } |
153 | | }; |
154 | | |
155 | | template<typename T> |
156 | | class GKV : public GKV_Base<T> { |
157 | | GKV() = delete; |
158 | | |
159 | | public: |
160 | 0 | static T get_kv(const gguf_context * ctx, const int k) { |
161 | 0 | const enum gguf_type kt = gguf_get_kv_type(ctx, k); |
162 | |
|
163 | 0 | if (kt != GKV::gt) { |
164 | 0 | throw std::runtime_error(format("key %s has wrong type %s but expected type %s", |
165 | 0 | gguf_get_key(ctx, k), gguf_type_name(kt), gguf_type_name(GKV::gt))); |
166 | 0 | } |
167 | 0 | return GKV::getter(ctx, k); |
168 | 0 | } Unexecuted instantiation: GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(gguf_context const*, int) Unexecuted instantiation: GGUFMeta::GKV<bool>::get_kv(gguf_context const*, int) Unexecuted instantiation: GGUFMeta::GKV<float>::get_kv(gguf_context const*, int) Unexecuted instantiation: GGUFMeta::GKV<unsigned int>::get_kv(gguf_context const*, int) Unexecuted instantiation: GGUFMeta::GKV<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >::get_kv(gguf_context const*, int) Unexecuted instantiation: GGUFMeta::GKV<unsigned short>::get_kv(gguf_context const*, int) Unexecuted instantiation: GGUFMeta::GKV<int>::get_kv(gguf_context const*, int) |
169 | | |
170 | 0 | static const char * override_type_to_str(const llama_model_kv_override_type ty) { |
171 | 0 | switch (ty) { |
172 | 0 | case LLAMA_KV_OVERRIDE_TYPE_BOOL: return "bool"; |
173 | 0 | case LLAMA_KV_OVERRIDE_TYPE_INT: return "int"; |
174 | 0 | case LLAMA_KV_OVERRIDE_TYPE_FLOAT: return "float"; |
175 | 0 | case LLAMA_KV_OVERRIDE_TYPE_STR: return "str"; |
176 | 0 | } |
177 | 0 | return "unknown"; |
178 | 0 | } Unexecuted instantiation: GGUFMeta::GKV<bool>::override_type_to_str(llama_model_kv_override_type) Unexecuted instantiation: GGUFMeta::GKV<float>::override_type_to_str(llama_model_kv_override_type) Unexecuted instantiation: GGUFMeta::GKV<unsigned int>::override_type_to_str(llama_model_kv_override_type) Unexecuted instantiation: GGUFMeta::GKV<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >::override_type_to_str(llama_model_kv_override_type) Unexecuted instantiation: GGUFMeta::GKV<unsigned short>::override_type_to_str(llama_model_kv_override_type) Unexecuted instantiation: GGUFMeta::GKV<int>::override_type_to_str(llama_model_kv_override_type) |
179 | | |
180 | 0 | static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override * ovrd) { |
181 | 0 | if (!ovrd) { return false; } |
182 | 0 | if (ovrd->tag == expected_type) { |
183 | 0 | LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ", |
184 | 0 | __func__, override_type_to_str(ovrd->tag), ovrd->key); |
185 | 0 | switch (ovrd->tag) { |
186 | 0 | case LLAMA_KV_OVERRIDE_TYPE_BOOL: { |
187 | 0 | LLAMA_LOG_INFO("%s\n", ovrd->val_bool ? "true" : "false"); |
188 | 0 | } break; |
189 | 0 | case LLAMA_KV_OVERRIDE_TYPE_INT: { |
190 | 0 | LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->val_i64); |
191 | 0 | } break; |
192 | 0 | case LLAMA_KV_OVERRIDE_TYPE_FLOAT: { |
193 | 0 | LLAMA_LOG_INFO("%.6f\n", ovrd->val_f64); |
194 | 0 | } break; |
195 | 0 | case LLAMA_KV_OVERRIDE_TYPE_STR: { |
196 | 0 | LLAMA_LOG_INFO("%s\n", ovrd->val_str); |
197 | 0 | } break; |
198 | 0 | default: |
199 | | // Shouldn't be possible to end up here, but just in case... |
200 | 0 | throw std::runtime_error( |
201 | 0 | format("Unsupported attempt to override %s type for metadata key %s\n", |
202 | 0 | override_type_to_str(ovrd->tag), ovrd->key)); |
203 | 0 | } |
204 | 0 | return true; |
205 | 0 | } |
206 | 0 | LLAMA_LOG_WARN("%s: Warning: Bad metadata override type for key '%s', expected %s but got %s\n", |
207 | 0 | __func__, ovrd->key, override_type_to_str(expected_type), override_type_to_str(ovrd->tag)); |
208 | 0 | return false; |
209 | 0 | } Unexecuted instantiation: GGUFMeta::GKV<bool>::validate_override(llama_model_kv_override_type, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<float>::validate_override(llama_model_kv_override_type, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<unsigned int>::validate_override(llama_model_kv_override_type, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >::validate_override(llama_model_kv_override_type, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<unsigned short>::validate_override(llama_model_kv_override_type, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<int>::validate_override(llama_model_kv_override_type, llama_model_kv_override const*) |
210 | | |
211 | | template<typename OT> |
212 | | static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type |
213 | 0 | try_override(OT & target, const struct llama_model_kv_override * ovrd) { |
214 | 0 | if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, ovrd)) { |
215 | 0 | target = ovrd->val_bool; |
216 | 0 | return true; |
217 | 0 | } |
218 | 0 | return false; |
219 | 0 | } |
220 | | |
221 | | template<typename OT> |
222 | | static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type |
223 | 0 | try_override(OT & target, const struct llama_model_kv_override * ovrd) { |
224 | 0 | if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, ovrd)) { |
225 | 0 | target = ovrd->val_i64; |
226 | 0 | return true; |
227 | 0 | } |
228 | 0 | return false; |
229 | 0 | } Unexecuted instantiation: _ZN8GGUFMeta3GKVIjE12try_overrideIjEENSt3__19enable_ifIXaantsr3std7is_sameIT_bEE5valuesr3std11is_integralIS5_EE5valueEbE4typeERS5_PK23llama_model_kv_override Unexecuted instantiation: _ZN8GGUFMeta3GKVItE12try_overrideItEENSt3__19enable_ifIXaantsr3std7is_sameIT_bEE5valuesr3std11is_integralIS5_EE5valueEbE4typeERS5_PK23llama_model_kv_override Unexecuted instantiation: _ZN8GGUFMeta3GKVIiE12try_overrideIiEENSt3__19enable_ifIXaantsr3std7is_sameIT_bEE5valuesr3std11is_integralIS5_EE5valueEbE4typeERS5_PK23llama_model_kv_override |
230 | | |
231 | | template<typename OT> |
232 | | static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type |
233 | 0 | try_override(T & target, const struct llama_model_kv_override * ovrd) { |
234 | 0 | if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, ovrd)) { |
235 | 0 | target = ovrd->val_f64; |
236 | 0 | return true; |
237 | 0 | } |
238 | 0 | return false; |
239 | 0 | } |
240 | | |
241 | | template<typename OT> |
242 | | static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type |
243 | 0 | try_override(T & target, const struct llama_model_kv_override * ovrd) { |
244 | 0 | if (validate_override(LLAMA_KV_OVERRIDE_TYPE_STR, ovrd)) { |
245 | 0 | target = ovrd->val_str; |
246 | 0 | return true; |
247 | 0 | } |
248 | 0 | return false; |
249 | 0 | } |
250 | | |
251 | 0 | static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) { |
252 | 0 | if (try_override<T>(target, ovrd)) { |
253 | 0 | return true; |
254 | 0 | } |
255 | 0 | if (k < 0) { return false; } |
256 | 0 | target = get_kv(ctx, k); |
257 | 0 | return true; |
258 | 0 | } Unexecuted instantiation: GGUFMeta::GKV<bool>::set(gguf_context const*, int, bool&, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<float>::set(gguf_context const*, int, float&, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<unsigned int>::set(gguf_context const*, int, unsigned int&, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >::set(gguf_context const*, int, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >&, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<unsigned short>::set(gguf_context const*, int, unsigned short&, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<int>::set(gguf_context const*, int, int&, llama_model_kv_override const*) |
259 | | |
260 | 0 | static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override * ovrd = nullptr) { |
261 | 0 | return set(ctx, gguf_find_key(ctx, key), target, ovrd); |
262 | 0 | } Unexecuted instantiation: GGUFMeta::GKV<bool>::set(gguf_context const*, char const*, bool&, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<float>::set(gguf_context const*, char const*, float&, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<unsigned int>::set(gguf_context const*, char const*, unsigned int&, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >::set(gguf_context const*, char const*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >&, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<unsigned short>::set(gguf_context const*, char const*, unsigned short&, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<int>::set(gguf_context const*, char const*, int&, llama_model_kv_override const*) |
263 | | |
264 | 0 | static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override * ovrd = nullptr) { |
265 | 0 | return set(ctx, key.c_str(), target, ovrd); |
266 | 0 | } Unexecuted instantiation: GGUFMeta::GKV<bool>::set(gguf_context const*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, bool&, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<float>::set(gguf_context const*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, float&, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<unsigned int>::set(gguf_context const*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, unsigned int&, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >::set(gguf_context const*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >&, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<unsigned short>::set(gguf_context const*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, unsigned short&, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<int>::set(gguf_context const*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, int&, llama_model_kv_override const*) |
267 | | }; |
268 | | } |
269 | | |
270 | | template<typename T> |
271 | | typename std::enable_if<std::is_integral<T>::value, bool>::type |
272 | 0 | llama_model_loader::get_arr_n(const std::string & key, T & result, bool required) { |
273 | 0 | const int kid = gguf_find_key(metadata, key.c_str()); |
274 | |
|
275 | 0 | if (kid < 0) { |
276 | 0 | if (required) { |
277 | 0 | throw std::runtime_error(format("key not found in model: %s", key.c_str())); |
278 | 0 | } |
279 | 0 | return false; |
280 | 0 | } |
281 | | |
282 | 0 | struct GGUFMeta::ArrayInfo arr_info = |
283 | 0 | GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(metadata, kid); |
284 | | |
285 | |
|
286 | 0 | result = arr_info.length; |
287 | 0 | return true; |
288 | 0 | } |
289 | | |
290 | | template<typename T> |
291 | | typename std::enable_if<std::is_integral<T>::value, bool>::type |
292 | 0 | llama_model_loader::get_arr_n(enum llm_kv kid, T & result, bool required) { |
293 | 0 | return get_arr_n(llm_kv(kid), result, required); |
294 | 0 | } |
295 | | |
296 | | template bool llama_model_loader::get_arr_n(enum llm_kv kid, uint32_t & result, bool required); |
297 | | |
298 | | template<typename T> |
299 | 0 | bool llama_model_loader::get_arr(const std::string & key, std::vector<T> & result, bool required) { |
300 | 0 | const gguf_context * ctx = metadata; |
301 | 0 | const int kid = gguf_find_key(ctx, key.c_str()); |
302 | |
|
303 | 0 | if (kid < 0 || gguf_get_kv_type(ctx, kid) != GGUF_TYPE_ARRAY) { |
304 | 0 | if (required) { |
305 | 0 | throw std::runtime_error(format("array key not found in model: %s", key.c_str())); |
306 | 0 | } |
307 | 0 | return false; |
308 | 0 | } |
309 | | |
310 | 0 | struct GGUFMeta::ArrayInfo arr_info = |
311 | 0 | GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx, kid); |
312 | |
|
313 | 0 | switch (arr_info.gt) { |
314 | 0 | case GGUF_TYPE_UINT32: |
315 | 0 | case GGUF_TYPE_INT32: GGML_ASSERT((std::is_same<T, int32_t>::value) || |
316 | 0 | (std::is_same<T, uint32_t>::value)); break; |
317 | 0 | case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break; |
318 | 0 | case GGUF_TYPE_STRING: GGML_ASSERT((std::is_same<T, std::string>::value)); break; |
319 | 0 | default: |
320 | 0 | throw std::runtime_error(format("%s is not a string/float32/uint32/int32 array", key.c_str())); |
321 | 0 | } |
322 | | |
323 | 0 | if constexpr (std::is_same<T, std::string>::value) { |
324 | 0 | const size_t n_items = gguf_get_arr_n(ctx, kid); |
325 | 0 | result.clear(); |
326 | |
|
327 | 0 | for (size_t i = 0; i < n_items; i++) { |
328 | 0 | const T value = gguf_get_arr_str(ctx, kid, i); |
329 | 0 | result.emplace_back(value); |
330 | 0 | } |
331 | | } else { |
332 | | result.resize(arr_info.length); |
333 | | result.assign((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length); |
334 | | } |
335 | |
|
336 | 0 | return true; |
337 | 0 | } |
338 | | |
339 | | template<typename T, size_t N_MAX> |
340 | 0 | bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) { |
341 | 0 | const gguf_context * ctx = metadata; |
342 | 0 | const int kid = gguf_find_key(ctx, key.c_str()); |
343 | |
|
344 | 0 | if (kid < 0 || gguf_get_kv_type(ctx, kid) != GGUF_TYPE_ARRAY) { |
345 | 0 | if (required) { |
346 | 0 | throw std::runtime_error(format("array key not found in model: %s", key.c_str())); |
347 | 0 | } |
348 | 0 | return false; |
349 | 0 | } |
350 | | |
351 | 0 | struct GGUFMeta::ArrayInfo arr_info = |
352 | 0 | GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx, kid); |
353 | |
|
354 | 0 | switch (arr_info.gt) { |
355 | 0 | case GGUF_TYPE_BOOL: |
356 | 0 | case GGUF_TYPE_UINT32: |
357 | 0 | case GGUF_TYPE_INT32: GGML_ASSERT((std::is_same<T, int32_t>::value) || |
358 | 0 | (std::is_same<T, uint32_t>::value)); break; |
359 | 0 | case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break; |
360 | 0 | case GGUF_TYPE_STRING: GGML_ASSERT((std::is_same<T, std::string>::value)); break; |
361 | 0 | default: |
362 | 0 | throw std::runtime_error(format("%s is not a string/float32/uint32/int32 array", key.c_str())); |
363 | 0 | } |
364 | | |
365 | 0 | if (arr_info.length > N_MAX) { |
366 | 0 | throw std::runtime_error(format("array length %u for key %s exceeds max %u", (uint32_t) arr_info.length, key.c_str(), (uint32_t) N_MAX)); |
367 | 0 | } |
368 | | |
369 | | if constexpr (std::is_same<T, std::string>::value) { |
370 | | const size_t n_items = gguf_get_arr_n(ctx, kid); |
371 | | |
372 | | for (size_t i = 0; i < n_items; i++) { |
373 | | const T value = gguf_get_arr_str(ctx, kid, i); |
374 | | result[i] = value; |
375 | | } |
376 | 0 | } else { |
377 | 0 | if (arr_info.gt == GGUF_TYPE_BOOL) { |
378 | 0 | const int8_t * values = (const int8_t *) arr_info.data; |
379 | 0 | std::transform(values, values + arr_info.length, result.begin(), [](int8_t x) { |
380 | 0 | return static_cast<T>(x != 0); |
381 | 0 | }); Unexecuted instantiation: llama_model_loader::get_arr<int, 4ul>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::array<int, 4ul>&, bool)::{lambda(signed char)#1}::operator()(signed char) constUnexecuted instantiation: llama_model_loader::get_arr<unsigned int, 512ul>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::array<unsigned int, 512ul>&, bool)::{lambda(signed char)#1}::operator()(signed char) constUnexecuted instantiation: llama_model_loader::get_arr<float, 512ul>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::array<float, 512ul>&, bool)::{lambda(signed char)#1}::operator()(signed char) const |
382 | 0 | } else { |
383 | 0 | std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin()); |
384 | 0 | } |
385 | 0 | } |
386 | |
|
387 | 0 | return true; |
388 | 0 | } Unexecuted instantiation: bool llama_model_loader::get_arr<int, 4ul>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::array<int, 4ul>&, bool) Unexecuted instantiation: bool llama_model_loader::get_arr<unsigned int, 512ul>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::array<unsigned int, 512ul>&, bool) Unexecuted instantiation: bool llama_model_loader::get_arr<float, 512ul>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::array<float, 512ul>&, bool) |
389 | | |
390 | | template<typename T> |
391 | 0 | bool llama_model_loader::get_arr(enum llm_kv kid, T & result, bool required) { |
392 | 0 | return get_arr(llm_kv(kid), result, required); |
393 | 0 | } |
394 | | |
395 | | template bool llama_model_loader::get_arr<std::vector<std::string>>(enum llm_kv kid, std::vector<std::string> & result, bool required); |
396 | | |
397 | | template<typename T> |
398 | 0 | bool llama_model_loader::get_key(const std::string & key, T & result, bool required) { |
399 | 0 | auto it = kv_overrides.find(key); |
400 | |
|
401 | 0 | const struct llama_model_kv_override * override = |
402 | 0 | it != kv_overrides.end() ? &it->second : nullptr; |
403 | |
|
404 | 0 | const bool found = GGUFMeta::GKV<T>::set(metadata, key, result, override); |
405 | |
|
406 | 0 | if (required && !found) { |
407 | 0 | throw std::runtime_error(format("key not found in model: %s", key.c_str())); |
408 | 0 | } |
409 | | |
410 | 0 | return found; |
411 | 0 | } Unexecuted instantiation: bool llama_model_loader::get_key<bool>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, bool&, bool) Unexecuted instantiation: bool llama_model_loader::get_key<float>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, float&, bool) Unexecuted instantiation: bool llama_model_loader::get_key<unsigned int>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, unsigned int&, bool) Unexecuted instantiation: bool llama_model_loader::get_key<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >&, bool) Unexecuted instantiation: bool llama_model_loader::get_key<unsigned short>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, unsigned short&, bool) Unexecuted instantiation: bool llama_model_loader::get_key<int>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, int&, bool) |
412 | | |
413 | | template<typename T> |
414 | 0 | bool llama_model_loader::get_key(enum llm_kv kid, T & result, bool required) { |
415 | 0 | return get_key(llm_kv(kid), result, required); |
416 | 0 | } Unexecuted instantiation: bool llama_model_loader::get_key<bool>(llm_kv, bool&, bool) Unexecuted instantiation: bool llama_model_loader::get_key<float>(llm_kv, float&, bool) Unexecuted instantiation: bool llama_model_loader::get_key<unsigned int>(llm_kv, unsigned int&, bool) Unexecuted instantiation: bool llama_model_loader::get_key<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >(llm_kv, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >&, bool) |
417 | | |
418 | | template bool llama_model_loader::get_key<bool> (enum llm_kv kid, bool & result, bool required); |
419 | | template bool llama_model_loader::get_key<float> (enum llm_kv kid, float & result, bool required); |
420 | | template bool llama_model_loader::get_key<uint32_t> (enum llm_kv kid, uint32_t & result, bool required); |
421 | | template bool llama_model_loader::get_key<std::string>(enum llm_kv kid, std::string & result, bool required); |
422 | | |
423 | | template<> |
424 | 0 | bool llama_model_loader::get_key(enum llm_kv kid, enum llama_pooling_type & result, bool required) { |
425 | 0 | uint32_t tmp; |
426 | 0 | const bool found = get_key(kid, tmp, required); |
427 | 0 | if (found) { |
428 | 0 | result = (enum llama_pooling_type) tmp; |
429 | 0 | } else { |
430 | 0 | result = LLAMA_POOLING_TYPE_UNSPECIFIED; |
431 | 0 | } |
432 | 0 | return found; |
433 | 0 | } |
434 | | |
435 | | // get array of n <= N_MAX elements, or a single element repeated n times |
436 | | template<typename T, size_t N_MAX> |
437 | 0 | bool llama_model_loader::get_key_or_arr(const std::string & key, std::array<T, N_MAX> & result, uint32_t n, bool required) { |
438 | 0 | const int kid = gguf_find_key(metadata, key.c_str()); |
439 | |
|
440 | 0 | if (kid < 0) { |
441 | 0 | if (required) { |
442 | 0 | throw std::runtime_error(format("key not found in model: %s", key.c_str())); |
443 | 0 | } |
444 | 0 | return false; |
445 | 0 | } |
446 | | |
447 | 0 | if (n > N_MAX) { |
448 | 0 | throw std::runtime_error(format("n > N_MAX: %u > %u for key %s", (uint32_t) n, (uint32_t) N_MAX, key.c_str())); |
449 | 0 | } |
450 | | |
451 | 0 | if (gguf_get_kv_type(metadata, kid) == GGUF_TYPE_ARRAY) { |
452 | 0 | struct GGUFMeta::ArrayInfo arr_info = |
453 | 0 | GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(metadata, kid); |
454 | |
|
455 | 0 | if (n != arr_info.length) { |
456 | 0 | throw std::runtime_error(format("key %s has wrong array length; expected %u, got %u", key.c_str(), n, (uint32_t) arr_info.length)); |
457 | 0 | } |
458 | | |
459 | 0 | return get_arr(key, result, required); |
460 | 0 | } |
461 | | |
462 | 0 | T value; |
463 | |
|
464 | 0 | bool ok = get_key(key, value, required); |
465 | 0 | if (!ok) { |
466 | 0 | return false; |
467 | 0 | } |
468 | | |
469 | 0 | for (uint32_t i = 0; i < n; i++) { |
470 | 0 | result[i] = value; |
471 | 0 | } |
472 | |
|
473 | 0 | return true; |
474 | 0 | } Unexecuted instantiation: bool llama_model_loader::get_key_or_arr<int, 4ul>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::array<int, 4ul>&, unsigned int, bool) Unexecuted instantiation: bool llama_model_loader::get_key_or_arr<unsigned int, 512ul>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::array<unsigned int, 512ul>&, unsigned int, bool) Unexecuted instantiation: bool llama_model_loader::get_key_or_arr<float, 512ul>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::array<float, 512ul>&, unsigned int, bool) |
475 | | |
476 | | template<typename T> |
477 | 0 | bool llama_model_loader::get_key_or_arr(enum llm_kv kid, T & result, uint32_t n, bool required) { |
478 | 0 | return get_key_or_arr(llm_kv(kid), result, n, required); |
479 | 0 | } Unexecuted instantiation: bool llama_model_loader::get_key_or_arr<std::__1::array<int, 4ul> >(llm_kv, std::__1::array<int, 4ul>&, unsigned int, bool) Unexecuted instantiation: bool llama_model_loader::get_key_or_arr<std::__1::array<unsigned int, 512ul> >(llm_kv, std::__1::array<unsigned int, 512ul>&, unsigned int, bool) Unexecuted instantiation: bool llama_model_loader::get_key_or_arr<std::__1::array<float, 512ul> >(llm_kv, std::__1::array<float, 512ul>&, unsigned int, bool) |
480 | | |
481 | 0 | bool llama_model_loader::get_key_or_arr(enum llm_kv kid, uint32_t & result, bool required) { |
482 | 0 | const std::string key = llm_kv(kid); |
483 | |
|
484 | 0 | const int id = gguf_find_key(metadata, key.c_str()); |
485 | |
|
486 | 0 | if (id < 0) { |
487 | 0 | if (required) { |
488 | 0 | throw std::runtime_error(format("key not found in model: %s", key.c_str())); |
489 | 0 | } |
490 | 0 | return false; |
491 | 0 | } |
492 | | |
493 | | // throw and error if type is an array |
494 | 0 | if (gguf_get_kv_type(metadata, id) == GGUF_TYPE_ARRAY) { |
495 | 0 | if (required) { |
496 | 0 | throw std::runtime_error(format("expected scalar, found array for key: %s", key.c_str())); |
497 | 0 | } |
498 | 0 | return false; |
499 | 0 | } |
500 | | |
501 | 0 | return get_key(key, result, required); |
502 | 0 | } |
503 | | |
504 | | // TODO: this is not very clever - figure out something better |
505 | | template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required); |
506 | | template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required); |
507 | | template bool llama_model_loader::get_key_or_arr<std::array<float, 512>>(enum llm_kv kid, std::array<float, 512> & result, uint32_t n, bool required); |
508 | | |
509 | | |
510 | | llama_model_loader::llama_model_loader( |
511 | | struct gguf_context * meta, |
512 | | llama_model_set_tensor_data_t set_tensor_data, |
513 | | void * set_tensor_data_ud, |
514 | | const std::string & fname, |
515 | | std::vector<std::string> & splits, |
516 | | FILE * file, |
517 | | bool use_mmap, |
518 | | bool use_direct_io, |
519 | | bool check_tensors, |
520 | | bool no_alloc, |
521 | | const llama_model_kv_override * param_overrides_p, |
522 | | const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) |
523 | 0 | : metadata(meta), set_tensor_data(set_tensor_data), set_tensor_data_ud(set_tensor_data_ud) { |
524 | 0 | int trace = 0; |
525 | 0 | if (getenv("LLAMA_TRACE")) { |
526 | 0 | trace = atoi(getenv("LLAMA_TRACE")); |
527 | 0 | } |
528 | |
|
529 | 0 | if (param_overrides_p != nullptr) { |
530 | 0 | for (const struct llama_model_kv_override * p = param_overrides_p; p->key[0] != 0; p++) { |
531 | 0 | kv_overrides.insert({std::string(p->key), *p}); |
532 | 0 | } |
533 | 0 | } |
534 | |
|
535 | 0 | tensor_buft_overrides = param_tensor_buft_overrides_p; |
536 | |
|
537 | 0 | if (!fname.empty()) { |
538 | | // Load the main GGUF |
539 | 0 | struct ggml_context * ctx = NULL; |
540 | 0 | struct gguf_init_params params = { |
541 | 0 | /*.no_alloc = */ true, |
542 | 0 | /*.ctx = */ &ctx, |
543 | 0 | }; |
544 | |
|
545 | 0 | metadata_ptr.reset(gguf_init_from_file(fname.c_str(), params)); |
546 | 0 | metadata = metadata_ptr.get(); |
547 | 0 | if (metadata == nullptr) { |
548 | 0 | throw std::runtime_error(format("%s: failed to load model from %s", __func__, fname.c_str())); |
549 | 0 | } |
550 | | |
551 | 0 | get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false); |
552 | 0 | llm_kv = LLM_KV(llm_arch_from_string(arch_name)); |
553 | |
|
554 | 0 | files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io)); |
555 | 0 | contexts.emplace_back(ctx); |
556 | |
|
557 | 0 | if (use_mmap && use_direct_io) { |
558 | 0 | if (files.back()->has_direct_io()) { |
559 | 0 | LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__); |
560 | 0 | use_mmap = false; |
561 | 0 | } else { |
562 | 0 | LLAMA_LOG_WARN("%s: direct I/O is not available, using mmap\n", __func__); |
563 | 0 | use_direct_io = false; |
564 | | |
565 | | // reopen file using std::fopen for mmap |
566 | 0 | files.pop_back(); |
567 | 0 | files.emplace_back(new llama_file(fname.c_str(), "rb", false)); |
568 | 0 | } |
569 | 0 | } |
570 | | |
571 | | // Save tensors data offset of the main file. |
572 | | // For subsidiary files, `meta` tensor data offset must not be used, |
573 | | // so we build a unified tensors index for weights. |
574 | 0 | for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { |
575 | 0 | std::string tensor_name = std::string(cur->name); |
576 | | // make sure there is no duplicated tensor names |
577 | 0 | if (weights_map.find(tensor_name) != weights_map.end()) { |
578 | 0 | throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur))); |
579 | 0 | } |
580 | 0 | n_elements += ggml_nelements(cur); |
581 | 0 | n_bytes += ggml_nbytes(cur); |
582 | 0 | weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, metadata, cur)); |
583 | 0 | } |
584 | 0 | uint16_t n_split = 0; |
585 | 0 | get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false); |
586 | | |
587 | | // Load additional GGML contexts |
588 | 0 | if (n_split > 1) { |
589 | | // make sure the main file is loaded first |
590 | 0 | uint16_t idx = 0; |
591 | 0 | const std::string kv_split_no = llm_kv(LLM_KV_SPLIT_NO); |
592 | 0 | get_key(kv_split_no, idx); |
593 | 0 | if (idx != 0) { |
594 | 0 | throw std::runtime_error(format("illegal split file idx: %d (file: %s), model must be loaded with the first split", idx, fname.c_str())); |
595 | 0 | } |
596 | | |
597 | | // generate list of splits if needed |
598 | 0 | if (splits.empty()) { |
599 | 0 | splits = llama_get_list_splits(fname, idx, n_split); |
600 | 0 | } |
601 | | |
602 | | // in case user give a custom list of splits, check if it matches the expected number |
603 | 0 | if (n_split != (uint16_t)splits.size()) { |
604 | 0 | throw std::runtime_error(format("invalid split count, given: %zu splits, but expected %d", splits.size(), n_split)); |
605 | 0 | } |
606 | | |
607 | 0 | if (trace > 0) { |
608 | 0 | LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split); |
609 | 0 | } |
610 | | |
611 | | // load other splits |
612 | 0 | for (idx = 1; idx < n_split; idx++) { |
613 | 0 | const char * fname_split = splits[idx].c_str(); |
614 | |
|
615 | 0 | struct gguf_init_params split_params = { |
616 | 0 | /*.no_alloc = */ true, |
617 | 0 | /*.ctx = */ &ctx, |
618 | 0 | }; |
619 | 0 | gguf_context_ptr ctx_gguf { gguf_init_from_file(fname_split, split_params) }; |
620 | 0 | if (!ctx_gguf) { |
621 | 0 | throw std::runtime_error(format("%s: failed to load GGUF split from %s", __func__, fname_split)); |
622 | 0 | } |
623 | | |
624 | | // check idx |
625 | 0 | { |
626 | 0 | const int kid = gguf_find_key(ctx_gguf.get(), kv_split_no.c_str()); |
627 | 0 | if (kid < 0) { |
628 | 0 | throw std::runtime_error(format("missing key %s in GGUF split %s", kv_split_no.c_str(), fname_split)); |
629 | 0 | } |
630 | 0 | int idx_gguf = gguf_get_val_u16(ctx_gguf.get(), kid); |
631 | 0 | if (idx_gguf != idx) { |
632 | 0 | throw std::runtime_error(format("invalid split file idx: %d (file: %s), expected %d", idx_gguf, fname_split, idx)); |
633 | 0 | } |
634 | 0 | } |
635 | | |
636 | 0 | files.emplace_back(new llama_file(fname_split, "rb", use_direct_io)); |
637 | 0 | contexts.emplace_back(ctx); |
638 | | |
639 | | // Save tensors data offset info of the shard. |
640 | 0 | for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { |
641 | 0 | std::string tensor_name = std::string(cur->name); |
642 | | // make sure there is no duplicated tensor names |
643 | 0 | if (weights_map.find(tensor_name) != weights_map.end()) { |
644 | 0 | throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur))); |
645 | 0 | } |
646 | 0 | n_elements += ggml_nelements(cur); |
647 | 0 | n_bytes += ggml_nbytes(cur); |
648 | 0 | weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), idx, ctx_gguf.get(), cur)); |
649 | 0 | } |
650 | 0 | } |
651 | | |
652 | 0 | get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors); |
653 | | |
654 | | // sanity check |
655 | 0 | { |
656 | 0 | const int n_tensors_loaded = (int) weights_map.size(); |
657 | 0 | if (n_tensors != n_tensors_loaded) { |
658 | 0 | throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded)); |
659 | 0 | } |
660 | 0 | } |
661 | | |
662 | 0 | LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1); |
663 | 0 | } |
664 | 0 | } else if (file != nullptr) { |
665 | 0 | struct ggml_context * ctx = NULL; |
666 | 0 | struct gguf_init_params params = { |
667 | 0 | /*.no_alloc = */ true, |
668 | 0 | /*.ctx = */ &ctx, |
669 | 0 | }; |
670 | |
|
671 | 0 | metadata_ptr.reset(gguf_init_from_file_ptr(file, params)); |
672 | 0 | metadata = metadata_ptr.get(); |
673 | 0 | if (metadata == nullptr) { |
674 | 0 | throw std::runtime_error(format("%s: failed to load model from file pointer", __func__)); |
675 | 0 | } |
676 | | |
677 | 0 | get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false); |
678 | 0 | llm_kv = LLM_KV(llm_arch_from_string(arch_name)); |
679 | |
|
680 | 0 | files.emplace_back(new llama_file(file)); |
681 | 0 | contexts.emplace_back(ctx); |
682 | | |
683 | | // Save tensors data offset info of the main file. |
684 | 0 | for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { |
685 | 0 | std::string tensor_name = std::string(cur->name); |
686 | | // make sure there is no duplicated tensor names |
687 | 0 | if (weights_map.find(tensor_name) != weights_map.end()) { |
688 | 0 | throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur))); |
689 | 0 | } |
690 | 0 | n_elements += ggml_nelements(cur); |
691 | 0 | n_bytes += ggml_nbytes(cur); |
692 | 0 | weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, metadata, cur)); |
693 | 0 | } |
694 | 0 | } else { |
695 | 0 | get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false); |
696 | 0 | llm_kv = LLM_KV(llm_arch_from_string(arch_name)); |
697 | 0 | } |
698 | | |
699 | 0 | n_kv = gguf_get_n_kv(metadata); |
700 | 0 | n_tensors = weights_map.size(); |
701 | |
|
702 | 0 | fver = (enum llama_fver) gguf_get_version(metadata); |
703 | |
|
704 | 0 | LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n", |
705 | 0 | __func__, n_kv, n_tensors, fname.empty() ? "(file*)" : fname.c_str(), llama_file_version_name(fver)); |
706 | | |
707 | | // determine file type based on the number of tensors for each quantization and print meta data |
708 | | // TODO: make optional |
709 | 0 | { |
710 | 0 | std::map<enum ggml_type, uint32_t> n_type; |
711 | |
|
712 | 0 | uint32_t n_type_max = 0; |
713 | 0 | enum ggml_type type_max = GGML_TYPE_F32; |
714 | |
|
715 | 0 | for (const auto & it : weights_map) { |
716 | 0 | const llama_tensor_weight & w = it.second; |
717 | 0 | const ggml_tensor * tensor = w.tensor; |
718 | |
|
719 | 0 | enum ggml_type type = tensor->type; |
720 | |
|
721 | 0 | n_type[type]++; |
722 | |
|
723 | 0 | if (n_type_max < n_type[type]) { |
724 | 0 | n_type_max = n_type[type]; |
725 | 0 | type_max = type; |
726 | 0 | } |
727 | |
|
728 | 0 | if (trace > 0) { |
729 | 0 | const uint16_t sid = w.idx; |
730 | 0 | LLAMA_LOG_INFO("%s: - tensor split %2d: %32s %-8s [ %s ] %8.2f MiB\n", __func__, |
731 | 0 | sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str(), |
732 | 0 | ggml_nbytes(tensor)/1024.0f/1024.0f); |
733 | 0 | } |
734 | 0 | } |
735 | |
|
736 | 0 | switch (type_max) { |
737 | 0 | case GGML_TYPE_F32: ftype = LLAMA_FTYPE_ALL_F32; break; |
738 | 0 | case GGML_TYPE_F16: ftype = LLAMA_FTYPE_MOSTLY_F16; break; |
739 | 0 | case GGML_TYPE_BF16: ftype = LLAMA_FTYPE_MOSTLY_BF16; break; |
740 | 0 | case GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break; |
741 | 0 | case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break; |
742 | 0 | case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break; |
743 | 0 | case GGML_TYPE_Q5_1: ftype = LLAMA_FTYPE_MOSTLY_Q5_1; break; |
744 | 0 | case GGML_TYPE_Q8_0: ftype = LLAMA_FTYPE_MOSTLY_Q8_0; break; |
745 | 0 | case GGML_TYPE_Q2_K: ftype = LLAMA_FTYPE_MOSTLY_Q2_K; break; |
746 | 0 | case GGML_TYPE_Q3_K: ftype = LLAMA_FTYPE_MOSTLY_Q3_K_M; break; |
747 | 0 | case GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break; |
748 | 0 | case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break; |
749 | 0 | case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break; |
750 | 0 | case GGML_TYPE_TQ1_0: ftype = LLAMA_FTYPE_MOSTLY_TQ1_0; break; |
751 | 0 | case GGML_TYPE_TQ2_0: ftype = LLAMA_FTYPE_MOSTLY_TQ2_0; break; |
752 | 0 | case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break; |
753 | 0 | case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break; |
754 | 0 | case GGML_TYPE_IQ2_S: ftype = LLAMA_FTYPE_MOSTLY_IQ2_S; break; |
755 | 0 | case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break; |
756 | 0 | case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break; |
757 | 0 | case GGML_TYPE_IQ1_M: ftype = LLAMA_FTYPE_MOSTLY_IQ1_M; break; |
758 | 0 | case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break; |
759 | 0 | case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break; |
760 | 0 | case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break; |
761 | 0 | case GGML_TYPE_NVFP4: ftype = LLAMA_FTYPE_MOSTLY_NVFP4; break; |
762 | 0 | case GGML_TYPE_Q1_0: ftype = LLAMA_FTYPE_MOSTLY_Q1_0; break; |
763 | 0 | default: |
764 | 0 | { |
765 | 0 | LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max)); |
766 | 0 | ftype = LLAMA_FTYPE_ALL_F32; |
767 | 0 | } break; |
768 | 0 | } |
769 | | |
770 | | // this is a way to mark that we have "guessed" the file type |
771 | 0 | ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED); |
772 | |
|
773 | 0 | { |
774 | 0 | uint32_t ftype_val = 0; |
775 | 0 | if (get_key(LLM_KV_GENERAL_FILE_TYPE, ftype_val, false)) { |
776 | 0 | ftype = (llama_ftype) ftype_val; |
777 | 0 | } |
778 | 0 | } |
779 | |
|
780 | 0 | LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__); |
781 | |
|
782 | 0 | for (int i = 0; i < n_kv; i++) { |
783 | 0 | const char * name = gguf_get_key(metadata, i); |
784 | 0 | const enum gguf_type type = gguf_get_kv_type(metadata, i); |
785 | 0 | const std::string type_name = |
786 | 0 | type == GGUF_TYPE_ARRAY |
787 | 0 | ? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(metadata, i)), gguf_get_arr_n(metadata, i)) |
788 | 0 | : gguf_type_name(type); |
789 | |
|
790 | 0 | std::string value = gguf_kv_to_str(metadata, i); |
791 | 0 | const size_t MAX_VALUE_LEN = 40; |
792 | 0 | if (value.size() > MAX_VALUE_LEN) { |
793 | 0 | value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str()); |
794 | 0 | } |
795 | 0 | replace_all(value, "\n", "\\n"); |
796 | |
|
797 | 0 | LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str()); |
798 | 0 | } |
799 | | |
800 | | // print type counts |
801 | 0 | for (auto & kv : n_type) { |
802 | 0 | if (kv.second == 0) { |
803 | 0 | continue; |
804 | 0 | } |
805 | | |
806 | 0 | LLAMA_LOG_INFO("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second); |
807 | 0 | } |
808 | 0 | } |
809 | | |
810 | 0 | if (!llama_mmap::SUPPORTED) { |
811 | 0 | LLAMA_LOG_WARN("%s: mmap is not supported on this platform\n", __func__); |
812 | 0 | use_mmap = false; |
813 | 0 | } |
814 | |
|
815 | 0 | this->use_mmap = use_mmap; |
816 | 0 | this->use_direct_io = use_direct_io; |
817 | 0 | this->check_tensors = check_tensors; |
818 | 0 | this->no_alloc = no_alloc; |
819 | 0 | } |
820 | | |
821 | 0 | std::string llama_model_loader::get_arch_name() const { |
822 | 0 | return arch_name; |
823 | 0 | } |
824 | | |
825 | 0 | enum llm_arch llama_model_loader::get_arch() const { |
826 | 0 | return llm_kv.arch; |
827 | 0 | } |
828 | | |
829 | 0 | const llama_model_loader::llama_tensor_weight * llama_model_loader::get_weight(const char * name) const { |
830 | 0 | auto pos = weights_map.find(name); |
831 | 0 | if (pos != weights_map.end()) { |
832 | 0 | return &pos->second; |
833 | 0 | } |
834 | | |
835 | 0 | return nullptr; |
836 | 0 | } |
837 | | |
838 | 0 | const llama_model_loader::llama_tensor_weight & llama_model_loader::require_weight(const char * name) const { |
839 | 0 | const llama_tensor_weight * weight = get_weight(name); |
840 | 0 | if (!weight) { |
841 | 0 | throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name)); |
842 | 0 | } |
843 | 0 | return *weight; |
844 | 0 | } |
845 | | |
846 | 0 | struct ggml_tensor * llama_model_loader::get_tensor_meta(const char * name) const { |
847 | 0 | const auto * weight = get_weight(name); |
848 | 0 | if (!weight) { |
849 | 0 | return nullptr; |
850 | 0 | } |
851 | 0 | return weight->tensor; |
852 | 0 | } |
853 | | |
854 | 0 | struct ggml_tensor * llama_model_loader::require_tensor_meta(const std::string & name) const { |
855 | 0 | struct ggml_tensor * tensor = get_tensor_meta(name.c_str()); |
856 | 0 | if (!tensor) { |
857 | 0 | throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str())); |
858 | 0 | } |
859 | 0 | return tensor; |
860 | 0 | } |
861 | | |
862 | 0 | const struct ggml_tensor * llama_model_loader::check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const { |
863 | 0 | const struct ggml_tensor * cur = get_tensor_meta(name.c_str()); |
864 | |
|
865 | 0 | if (cur == NULL) { |
866 | 0 | if (!required) { |
867 | 0 | return NULL; |
868 | 0 | } |
869 | 0 | throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str())); |
870 | 0 | } |
871 | | |
872 | 0 | { |
873 | 0 | bool is_ok = true; |
874 | 0 | for (size_t i = 0; i < GGML_MAX_DIMS; ++i) { |
875 | 0 | if ((i < ne.size() && ne[i] != cur->ne[i]) || (i >= ne.size() && cur->ne[i] != 1)) { |
876 | 0 | is_ok = false; |
877 | 0 | break; |
878 | 0 | } |
879 | 0 | } |
880 | 0 | if (!is_ok) { |
881 | 0 | throw std::runtime_error( |
882 | 0 | format("%s: tensor '%s' has wrong shape; expected %s, got %s", |
883 | 0 | __func__, name.c_str(), |
884 | 0 | llama_format_tensor_shape(ne).c_str(), |
885 | 0 | llama_format_tensor_shape(cur).c_str())); |
886 | 0 | } |
887 | 0 | } |
888 | | |
889 | 0 | return cur; |
890 | 0 | } |
891 | | |
892 | | // checks if the weight tensor can be used with the specified buffer type and device |
893 | 0 | static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) { |
894 | 0 | GGML_ASSERT(w != nullptr); |
895 | |
|
896 | 0 | if (op == GGML_OP_NONE) { |
897 | 0 | return true; |
898 | 0 | } |
899 | | |
900 | 0 | ggml_init_params params = { |
901 | 0 | /*.mem_size =*/ ggml_tensor_overhead()*8, |
902 | 0 | /*.mem_buffer =*/ NULL, |
903 | 0 | /*.no_alloc =*/ true, |
904 | 0 | }; |
905 | 0 | ggml_context_ptr ctx_ptr { ggml_init(params) }; |
906 | 0 | if (!ctx_ptr) { |
907 | 0 | throw std::runtime_error(format("failed to create ggml context")); |
908 | 0 | } |
909 | 0 | ggml_context * ctx = ctx_ptr.get(); |
910 | |
|
911 | 0 | ggml_tensor * op_tensor = nullptr; |
912 | |
|
913 | 0 | switch (op) { |
914 | 0 | case GGML_OP_GET_ROWS: |
915 | 0 | { |
916 | 0 | ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512); |
917 | 0 | op_tensor = ggml_get_rows(ctx, w, b); |
918 | 0 | } break; |
919 | 0 | case GGML_OP_MUL_MAT: |
920 | 0 | { |
921 | 0 | ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]); |
922 | 0 | op_tensor = ggml_mul_mat(ctx, w, b); |
923 | 0 | } break; |
924 | 0 | case GGML_OP_MUL_MAT_ID: |
925 | 0 | { |
926 | 0 | const int n_expert_used = hparams.n_expert_used; |
927 | 0 | GGML_ASSERT(n_expert_used > 0); |
928 | 0 | ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512); |
929 | 0 | ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512); |
930 | 0 | op_tensor = ggml_mul_mat_id(ctx, w, b, ids); |
931 | 0 | } break; |
932 | 0 | case GGML_OP_ADD: |
933 | 0 | { |
934 | 0 | ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]); |
935 | 0 | op_tensor = ggml_add(ctx, a, w); |
936 | 0 | } break; |
937 | 0 | case GGML_OP_ADD_ID: |
938 | 0 | { |
939 | 0 | const int n_expert_used = hparams.n_expert_used; |
940 | 0 | GGML_ASSERT(n_expert_used > 0); |
941 | 0 | ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512); |
942 | 0 | ggml_tensor * c = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512); |
943 | 0 | op_tensor = ggml_add_id(ctx, a, w, c); |
944 | 0 | } break; |
945 | 0 | case GGML_OP_MUL: |
946 | 0 | { |
947 | 0 | ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]); |
948 | 0 | op_tensor = ggml_mul(ctx, a, w); |
949 | 0 | } break; |
950 | 0 | case GGML_OP_DIV: |
951 | 0 | { |
952 | 0 | ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, w->ne[0]); |
953 | 0 | op_tensor = ggml_div(ctx, a, w); |
954 | 0 | } break; |
955 | 0 | case GGML_OP_ROPE: |
956 | 0 | { |
957 | 0 | const int n_embd_head = hparams.n_embd_head_v(); |
958 | 0 | const int n_head = hparams.n_head(); |
959 | 0 | ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, 512); |
960 | 0 | ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512); |
961 | 0 | op_tensor = ggml_rope_ext( |
962 | 0 | ctx, a, b, w, |
963 | 0 | 0, 0, 0, 0, 0, |
964 | 0 | 0, 0, 0, 0 |
965 | 0 | ); |
966 | |
|
967 | 0 | } break; |
968 | 0 | case GGML_OP_SSM_CONV: |
969 | 0 | { |
970 | 0 | const int64_t n_seq_tokens = 512; |
971 | 0 | const int64_t n_seqs = 3; |
972 | 0 | ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0] - 1 + n_seq_tokens, w->ne[1], n_seqs); |
973 | 0 | op_tensor = ggml_ssm_conv(ctx, conv_x, w); |
974 | 0 | } break; |
975 | 0 | case GGML_OP_SSM_SCAN: |
976 | 0 | { |
977 | | // w is ssm_a, which is used to distinguish Mamba-1 and Mamba-2 |
978 | 0 | const int64_t d_state = w->ne[0] == 1 ? hparams.ssm_d_state : w->ne[0]; |
979 | 0 | const int64_t n_head = w->ne[1]; |
980 | 0 | const int64_t head_dim = hparams.ssm_d_inner / n_head; |
981 | 0 | const int64_t n_group = hparams.ssm_n_group ? hparams.ssm_n_group : 1; |
982 | 0 | const int64_t n_seq_tokens = 512; |
983 | 0 | const int64_t n_seqs = 3; |
984 | 0 | ggml_tensor * s = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, head_dim, n_head, n_seqs); |
985 | 0 | ggml_tensor * x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_dim, n_head, n_seq_tokens, n_seqs); |
986 | 0 | ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_head, n_seq_tokens, n_seqs); |
987 | 0 | ggml_tensor * B = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs); |
988 | 0 | ggml_tensor * C = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs); |
989 | 0 | ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_seqs); |
990 | 0 | op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C, ids); |
991 | 0 | } break; |
992 | 0 | case GGML_OP_RWKV_WKV6: |
993 | 0 | { |
994 | | // FIXME |
995 | 0 | const int64_t S = 123; |
996 | 0 | const int64_t H = 123; |
997 | 0 | const int64_t n_tokens = 123; |
998 | 0 | const int64_t n_seqs = 123; |
999 | 0 | ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens); |
1000 | 0 | ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens); |
1001 | 0 | ggml_tensor * r = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens); |
1002 | 0 | ggml_tensor * tf = w; |
1003 | 0 | ggml_tensor * td = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens); |
1004 | 0 | ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H); |
1005 | 0 | op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state); |
1006 | 0 | } break; |
1007 | 0 | case GGML_OP_IM2COL: |
1008 | 0 | { |
1009 | 0 | const int n_embd_inp = hparams.n_embd_inp(); |
1010 | 0 | ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd_inp, w->ne[1], 1, 1); |
1011 | 0 | op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16); |
1012 | 0 | } break; |
1013 | 0 | case GGML_OP_SCALE: |
1014 | 0 | { |
1015 | 0 | op_tensor = ggml_scale(ctx, w, 1.0f); |
1016 | 0 | } break; |
1017 | 0 | default: |
1018 | 0 | GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name); |
1019 | 0 | } |
1020 | | |
1021 | | // create a temporary dummy buffer for the weight so that supports_op can check the buffer type |
1022 | 0 | GGML_ASSERT(w->buffer == nullptr); |
1023 | 0 | w->buffer = ggml_backend_buft_alloc_buffer(buft, 0); |
1024 | 0 | bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor); |
1025 | 0 | ggml_backend_buffer_free(w->buffer); |
1026 | 0 | w->buffer = nullptr; |
1027 | |
|
1028 | 0 | return op_supported; |
1029 | 0 | } |
1030 | | |
1031 | | // find the first buffer type in the list that can use the tensor |
1032 | 0 | static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hparams, ggml_tensor * tensor, ggml_op op, const buft_list_t * buft_list) { |
1033 | 0 | GGML_ASSERT(!buft_list->empty()); |
1034 | 0 | for (const auto & cur : *buft_list) { |
1035 | 0 | ggml_backend_dev_t cur_dev = cur.first; |
1036 | 0 | ggml_backend_buffer_type_t cur_buft = cur.second; |
1037 | 0 | if (weight_buft_supported(hparams, tensor, op, cur_buft, cur_dev)) { |
1038 | 0 | return cur_buft; |
1039 | 0 | } |
1040 | 0 | } |
1041 | | |
1042 | 0 | return nullptr; |
1043 | 0 | } |
1044 | | |
1045 | | struct ggml_tensor * llama_model_loader::create_tensor( |
1046 | | const llama_hparams & hparams, const buft_list_t * buft_list_cpu, const buft_list_t * buft_list_input, const buft_list_t * buft_list_output, |
1047 | 0 | const buft_list_t * buft_list_layer, const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) { |
1048 | 0 | auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * { |
1049 | 0 | auto it = ctx_map.find(buft); |
1050 | 0 | if (it == ctx_map.end()) { |
1051 | | // one ggml context per buffer type |
1052 | 0 | int max_n_tensors = n_tensors; |
1053 | 0 | max_n_tensors += 1; // duplicated output tensor |
1054 | 0 | max_n_tensors += hparams.n_layer*2; // duplicated rope freq tensors |
1055 | 0 | if (files.empty()) { |
1056 | 0 | max_n_tensors += hparams.n_layer*256; // this should be well above what any model actually uses |
1057 | 0 | } |
1058 | 0 | const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors; |
1059 | |
|
1060 | 0 | ggml_init_params params = { |
1061 | 0 | /*.mem_size =*/ ctx_size, |
1062 | 0 | /*.mem_buffer =*/ NULL, |
1063 | 0 | /*.no_alloc =*/ true, |
1064 | 0 | }; |
1065 | |
|
1066 | 0 | ggml_context * ctx = ggml_init(params); |
1067 | 0 | if (!ctx) { |
1068 | 0 | throw std::runtime_error(format("failed to create ggml context")); |
1069 | 0 | } |
1070 | | |
1071 | 0 | ctx_map.emplace(buft, ctx); |
1072 | |
|
1073 | 0 | return ctx; |
1074 | 0 | } |
1075 | 0 | return it->second.get(); |
1076 | 0 | }; |
1077 | |
|
1078 | 0 | auto buft_for_tensor = [&](ggml_tensor * t_meta) -> ggml_backend_buffer_type_t { |
1079 | 0 | if (!t_meta) { |
1080 | 0 | if (flags & TENSOR_NOT_REQUIRED) { |
1081 | 0 | return nullptr; |
1082 | 0 | } |
1083 | 0 | throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str())); |
1084 | 0 | } |
1085 | | |
1086 | | // some models use the token embedding tensor as the output, but since these are used in different layers and with different ops |
1087 | | // the tensor is duplicated |
1088 | | // to handle this, we check if the tensor is duplicated, and if so, we assume that it is being loaded as the output tensor |
1089 | 0 | llm_tensor tn_tensor = tn.tensor; |
1090 | 0 | if (tn.tensor == LLM_TENSOR_TOKEN_EMBD && (flags & TENSOR_DUPLICATED)) { |
1091 | 0 | tn_tensor = LLM_TENSOR_OUTPUT; |
1092 | 0 | } |
1093 | |
|
1094 | 0 | llm_tensor_info info; |
1095 | 0 | try { |
1096 | 0 | info = llm_tensor_info_for(tn_tensor); |
1097 | 0 | } catch (const std::out_of_range & e) { |
1098 | 0 | throw std::runtime_error(format("missing tensor info mapping for %s", tn.str().c_str())); |
1099 | 0 | } |
1100 | | |
1101 | | // skip unused tensors |
1102 | 0 | if (info.op == GGML_OP_NONE || (flags & TENSOR_SKIP)) { |
1103 | 0 | const size_t nbytes = ggml_nbytes(t_meta); |
1104 | 0 | LLAMA_LOG_WARN("model has unused tensor %s (size = %zu bytes) -- ignoring\n", tn.str().c_str(), nbytes); |
1105 | |
|
1106 | 0 | size_data -= nbytes; |
1107 | 0 | n_created++; |
1108 | |
|
1109 | 0 | return nullptr; |
1110 | 0 | } |
1111 | | |
1112 | | // tensors with "bias" suffix are always used with GGML_OP_ADD or GGML_OP_ADD_ID |
1113 | 0 | ggml_op op; |
1114 | 0 | bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0; |
1115 | 0 | if (bias) { |
1116 | 0 | if (info.op == GGML_OP_MUL_MAT_ID) { |
1117 | 0 | op = GGML_OP_ADD_ID; |
1118 | 0 | } else { |
1119 | 0 | op = GGML_OP_ADD; |
1120 | 0 | } |
1121 | 0 | } else { |
1122 | 0 | op = info.op; |
1123 | 0 | } |
1124 | | |
1125 | | // sanity checks |
1126 | 0 | if (info.layer == LLM_TENSOR_LAYER_INPUT || info.layer == LLM_TENSOR_LAYER_OUTPUT) { |
1127 | 0 | if (tn.bid != -1) { |
1128 | 0 | GGML_ABORT("input/output layer tensor %s used with a layer number", tn.str().c_str()); |
1129 | 0 | } |
1130 | 0 | } else { |
1131 | 0 | if (tn.bid == -1) { |
1132 | 0 | GGML_ABORT("repeating layer tensor %s used without a layer number", tn.str().c_str()); |
1133 | 0 | } |
1134 | 0 | } |
1135 | | |
1136 | | // select the buffer type for this tensor |
1137 | 0 | const buft_list_t * buft_list; |
1138 | 0 | switch (info.layer) { |
1139 | 0 | case LLM_TENSOR_LAYER_INPUT: |
1140 | 0 | buft_list = buft_list_input; |
1141 | 0 | break; |
1142 | 0 | case LLM_TENSOR_LAYER_OUTPUT: |
1143 | 0 | buft_list = buft_list_output; |
1144 | 0 | break; |
1145 | 0 | case LLM_TENSOR_LAYER_REPEATING: |
1146 | 0 | GGML_ASSERT(buft_list_layer != nullptr); |
1147 | 0 | buft_list = buft_list_layer; |
1148 | 0 | break; |
1149 | 0 | default: |
1150 | 0 | GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str()); |
1151 | 0 | } |
1152 | | |
1153 | 0 | ggml_backend_buffer_type_t buft = nullptr; |
1154 | | |
1155 | | // check overrides |
1156 | 0 | if (tensor_buft_overrides) { |
1157 | 0 | std::string tensor_name = tn.str(); |
1158 | 0 | for (const auto * overrides = tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) { |
1159 | 0 | std::regex pattern(overrides->pattern); |
1160 | 0 | if (std::regex_search(tensor_name, pattern)) { |
1161 | 0 | if (overrides->buft == ggml_backend_cpu_buffer_type()) { |
1162 | | // when overriding to a CPU buffer, consider the extra buffer types |
1163 | 0 | buft = select_weight_buft(hparams, t_meta, op, buft_list_cpu); |
1164 | 0 | if (use_mmap) { |
1165 | 0 | static std::once_flag once; |
1166 | 0 | std::call_once(once, [] { |
1167 | 0 | LLAMA_LOG_WARN("llama_model_loader: tensor overrides to CPU are used with mmap enabled - consider using --no-mmap for better performance\n"); |
1168 | 0 | }); |
1169 | 0 | } |
1170 | 0 | } else { |
1171 | 0 | buft = overrides->buft; |
1172 | 0 | } |
1173 | |
|
1174 | 0 | LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n", |
1175 | 0 | tensor_name.c_str(), |
1176 | 0 | ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type), |
1177 | 0 | ggml_backend_buft_name(buft)); |
1178 | 0 | break; |
1179 | 0 | } |
1180 | 0 | } |
1181 | 0 | } |
1182 | |
|
1183 | 0 | if (!buft) { |
1184 | 0 | buft = select_weight_buft(hparams, t_meta, op, buft_list); |
1185 | 0 | if (!buft) { |
1186 | 0 | throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str())); |
1187 | 0 | } |
1188 | 0 | } |
1189 | | |
1190 | | // avoid using a host buffer when using mmap |
1191 | 0 | auto * buft_dev = ggml_backend_buft_get_device(buft); |
1192 | 0 | if (use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) { |
1193 | 0 | auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); |
1194 | 0 | if (!cpu_dev) { |
1195 | 0 | throw std::runtime_error("no CPU backend found"); |
1196 | 0 | } |
1197 | 0 | buft = ggml_backend_dev_buffer_type(cpu_dev); |
1198 | 0 | } |
1199 | | |
1200 | 0 | if (buft != buft_list->front().second) { |
1201 | 0 | if (n_tensors_moved == 0) { |
1202 | 0 | first_tensor_moved_name = t_meta->name; |
1203 | 0 | first_tensor_moved_type_name = ggml_type_name(t_meta->type); |
1204 | 0 | first_moved_from_buft = buft_list->front().second; |
1205 | 0 | first_moved_to_buft = buft; |
1206 | 0 | } |
1207 | 0 | n_tensors_moved++; |
1208 | 0 | } |
1209 | |
|
1210 | 0 | return buft; |
1211 | 0 | }; |
1212 | |
|
1213 | 0 | if (files.empty()) { |
1214 | 0 | if (flags & TENSOR_SKIP_IF_VIRTUAL) { |
1215 | 0 | return nullptr; |
1216 | 0 | } |
1217 | 0 | ggml_type type = GGML_TYPE_F32; |
1218 | 0 | const int64_t tid = gguf_find_tensor(metadata, tn.str().c_str()); |
1219 | 0 | if (tid != -1) { |
1220 | 0 | type = gguf_get_tensor_type(metadata, tid); |
1221 | 0 | } |
1222 | | |
1223 | | // for tensors that are not required some of the dimensions can be invalid: |
1224 | 0 | if (flags & TENSOR_NOT_REQUIRED) { |
1225 | 0 | for (size_t dim = 0; dim < ne.size(); dim++) { |
1226 | 0 | if (ne.begin()[dim] <= 0) { |
1227 | 0 | return nullptr; |
1228 | 0 | } |
1229 | 0 | } |
1230 | 0 | } |
1231 | | |
1232 | 0 | ggml_tensor t_meta; |
1233 | 0 | memset(&t_meta, 0, sizeof(ggml_tensor)); |
1234 | 0 | t_meta.type = type; |
1235 | 0 | for (size_t dim = 0; dim < GGML_MAX_DIMS; dim++) { |
1236 | 0 | t_meta.ne[dim] = dim < ne.size() ? ne.begin()[dim] : 1; |
1237 | 0 | GGML_ASSERT(t_meta.ne[dim] >= 1); |
1238 | 0 | t_meta.nb[dim] = dim == 0 ? ggml_type_size(type) : t_meta.ne[dim-1]*t_meta.nb[dim-1]; |
1239 | 0 | GGML_ASSERT(t_meta.nb[dim] >= 1); |
1240 | 0 | } |
1241 | 0 | ggml_set_name(&t_meta, tn.str().c_str()); |
1242 | |
|
1243 | 0 | ggml_backend_buffer_type_t buft = buft_for_tensor(&t_meta); |
1244 | 0 | GGML_ASSERT(buft != nullptr); |
1245 | 0 | ggml_context * ctx = ctx_for_buft(buft); |
1246 | 0 | ggml_tensor * ret = ggml_dup_tensor(ctx, &t_meta); |
1247 | 0 | ggml_set_name(ret, tn.str().c_str()); |
1248 | 0 | return ret; |
1249 | 0 | } |
1250 | | |
1251 | 0 | ggml_tensor * t_meta = get_tensor_meta(tn.str().c_str()); |
1252 | 0 | ggml_backend_buffer_type_t buft = buft_for_tensor(t_meta); |
1253 | 0 | if (buft == nullptr) { |
1254 | 0 | return nullptr; // return type is ggml_tensor * |
1255 | 0 | } |
1256 | 0 | ggml_context * ctx = ctx_for_buft(buft); |
1257 | | |
1258 | | // if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one |
1259 | 0 | if (flags & TENSOR_DUPLICATED) { |
1260 | 0 | ggml_tensor * t = ggml_get_tensor(ctx, tn.str().c_str()); |
1261 | 0 | if (t) { |
1262 | 0 | return t; |
1263 | 0 | } |
1264 | 0 | } |
1265 | | |
1266 | 0 | LLAMA_LOG_DEBUG("%s: loading tensor %s\n", __func__, tn.str().c_str()); |
1267 | 0 | const struct ggml_tensor * cur = check_tensor_dims(tn.str(), ne, !(flags & TENSOR_NOT_REQUIRED)); |
1268 | |
|
1269 | 0 | if (cur == NULL) { |
1270 | 0 | return NULL; |
1271 | 0 | } |
1272 | | |
1273 | 0 | const bool duplicated = flags & TENSOR_DUPLICATED; |
1274 | |
|
1275 | 0 | struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur); |
1276 | 0 | ggml_set_name(tensor, ggml_get_name(cur)); |
1277 | |
|
1278 | 0 | if (duplicated) { |
1279 | 0 | size_data += ggml_nbytes(cur); |
1280 | 0 | } else { |
1281 | 0 | n_created++; |
1282 | 0 | } |
1283 | |
|
1284 | 0 | return tensor; |
1285 | 0 | } |
1286 | | |
1287 | 0 | struct ggml_tensor * llama_model_loader::create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list<int64_t> & ne, size_t offset, bool required) { |
1288 | 0 | const struct ggml_tensor * cur = check_tensor_dims(name, ne, required); |
1289 | |
|
1290 | 0 | if (cur == NULL) { |
1291 | 0 | return NULL; |
1292 | 0 | } |
1293 | | |
1294 | 0 | if (cur->type != base->type) { |
1295 | 0 | throw std::runtime_error(format("%s: tensor '%s' has wrong type; expected %s, got %s", __func__, name.c_str(), ggml_type_name(base->type), ggml_type_name(cur->type))); |
1296 | 0 | } |
1297 | | |
1298 | 0 | std::array<int64_t, GGML_MAX_DIMS> dims; |
1299 | 0 | for (size_t i = 0; i < GGML_MAX_DIMS; ++i) { |
1300 | 0 | dims[i] = i < ne.size() ? ne.begin()[i] : 1; |
1301 | 0 | } |
1302 | |
|
1303 | 0 | struct ggml_tensor * tensor = ggml_view_4d(ctx, base, |
1304 | 0 | dims[0], dims[1], dims[2], dims[3], |
1305 | 0 | cur->nb[1], cur->nb[2], cur->nb[3], |
1306 | 0 | offset); |
1307 | |
|
1308 | 0 | ggml_set_name(tensor, name.c_str()); |
1309 | |
|
1310 | 0 | n_created++; |
1311 | |
|
1312 | 0 | return tensor; |
1313 | 0 | } |
1314 | | |
1315 | 0 | void llama_model_loader::done_getting_tensors() const { |
1316 | 0 | if (n_created != n_tensors) { |
1317 | 0 | throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created)); |
1318 | 0 | } |
1319 | 0 | if (n_tensors_moved > 0) { |
1320 | 0 | LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %zu others) cannot be used with preferred buffer type %s, using %s instead\n", |
1321 | 0 | __func__, first_tensor_moved_name.c_str(), first_tensor_moved_type_name.c_str(), n_tensors_moved - 1, |
1322 | 0 | ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft)); |
1323 | 0 | } |
1324 | 0 | } |
1325 | | |
1326 | 0 | void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps) { |
1327 | 0 | if (use_mmap) { |
1328 | 0 | mappings.reserve(files.size()); |
1329 | 0 | mmaps_used.reserve(files.size()); |
1330 | 0 | for (const auto & file : files) { |
1331 | 0 | bool is_numa = false; |
1332 | |
|
1333 | 0 | auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); |
1334 | 0 | if (dev) { |
1335 | 0 | auto * reg = ggml_backend_dev_backend_reg(dev); |
1336 | 0 | auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa"); |
1337 | 0 | if (is_numa_fn) { |
1338 | 0 | is_numa = is_numa_fn(); |
1339 | 0 | } |
1340 | 0 | } |
1341 | |
|
1342 | 0 | std::unique_ptr<llama_mmap> mapping = std::make_unique<llama_mmap>(file.get(), prefetch ? -1 : 0, is_numa); |
1343 | 0 | mmaps_used.emplace_back(mapping->size(), 0); |
1344 | 0 | if (mlock_mmaps) { |
1345 | 0 | std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock()); |
1346 | 0 | mlock_mmap->init(mapping->addr()); |
1347 | 0 | mlock_mmaps->emplace_back(std::move(mlock_mmap)); |
1348 | 0 | } |
1349 | 0 | mappings.emplace_back(std::move(mapping)); |
1350 | 0 | } |
1351 | 0 | } |
1352 | | |
1353 | | // compute the total size of all tensors for progress reporting |
1354 | 0 | for (const auto & it : weights_map) { |
1355 | 0 | size_data += ggml_nbytes(it.second.tensor); |
1356 | 0 | } |
1357 | 0 | } |
1358 | | |
1359 | 0 | void llama_model_loader::get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const { |
1360 | 0 | GGML_ASSERT(!mappings.empty()); |
1361 | 0 | const auto & mapping = mappings.at(idx); |
1362 | |
|
1363 | 0 | *first = mapping->size(); |
1364 | 0 | *last = 0; |
1365 | 0 | *addr = mapping->addr(); |
1366 | 0 | for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) { |
1367 | 0 | const auto * weight = get_weight(ggml_get_name(tensor)); |
1368 | 0 | if (!weight || weight->idx != idx) { |
1369 | 0 | continue; |
1370 | 0 | } |
1371 | 0 | *first = std::min(*first, weight->offs); |
1372 | 0 | *last = std::max(*last, weight->offs + ggml_nbytes(tensor)); |
1373 | 0 | } |
1374 | 0 | } |
1375 | | |
1376 | 0 | void llama_model_loader::load_data_for(struct ggml_tensor * cur) const { |
1377 | 0 | const auto & w = require_weight(ggml_get_name(cur)); |
1378 | |
|
1379 | 0 | if (use_mmap) { |
1380 | 0 | const auto & mapping = mappings.at(w.idx); |
1381 | 0 | if (cur->data == nullptr) { |
1382 | 0 | cur->data = (uint8_t *)mapping->addr() + w.offs; |
1383 | 0 | } else { |
1384 | 0 | memcpy(cur->data, (uint8_t *)mapping->addr() + w.offs, ggml_nbytes(cur)); |
1385 | 0 | } |
1386 | 0 | } else { |
1387 | 0 | GGML_ASSERT(cur->data != nullptr); |
1388 | 0 | GGML_ASSERT(w.idx < files.size()); |
1389 | 0 | const auto & file = files.at(w.idx); |
1390 | 0 | file->seek(w.offs, SEEK_SET); |
1391 | 0 | file->read_raw(cur->data, ggml_nbytes(cur)); |
1392 | 0 | } |
1393 | |
|
1394 | 0 | if (check_tensors && !ggml_validate_row_data(cur->type, cur->data, ggml_nbytes(cur))) { |
1395 | 0 | throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur))); |
1396 | 0 | } |
1397 | 0 | } |
1398 | | |
1399 | | bool llama_model_loader::load_all_data( |
1400 | | struct ggml_context * ctx, |
1401 | | llama_buf_map & bufs, |
1402 | | llama_mlocks * lmlocks, |
1403 | | llama_progress_callback progress_callback, |
1404 | 0 | void * progress_callback_user_data) { |
1405 | 0 | if (files.empty()) { |
1406 | 0 | for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { |
1407 | 0 | set_tensor_data(t, set_tensor_data_ud); |
1408 | 0 | } |
1409 | 0 | return true; |
1410 | 0 | } |
1411 | 0 | GGML_ASSERT(size_data != 0 && "call init_mappings() first"); |
1412 | |
|
1413 | 0 | std::vector<no_init<uint8_t>> read_buf; |
1414 | 0 | std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result; |
1415 | | |
1416 | | // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives. |
1417 | | // NVMe raid configurations might require more / larger buffers. |
1418 | 0 | constexpr size_t n_buffers = 4; |
1419 | |
|
1420 | 0 | size_t alignment = 1; |
1421 | 0 | for (const auto & file : files) { |
1422 | 0 | alignment = std::max(file->read_alignment(), alignment); |
1423 | 0 | } |
1424 | | |
1425 | | // Buffer size: balance between memory usage and I/O efficiency |
1426 | | // 64MB works well for NVMe drives |
1427 | 0 | const size_t buffer_size = alignment != 1 ? 64 * 1024 * 1024 + 2 * alignment : 1 * 1024 * 1024; |
1428 | |
|
1429 | 0 | std::vector<ggml_backend_buffer_t> host_buffers; |
1430 | 0 | std::vector<ggml_backend_event_t> events; |
1431 | 0 | std::vector<void *> host_ptrs; |
1432 | 0 | size_t buffer_idx = 0; // buffer to use for async loads |
1433 | 0 | ggml_backend_t upload_backend = [&](const char * func) -> ggml_backend_t { |
1434 | 0 | if (use_mmap || check_tensors) { |
1435 | 0 | return nullptr; |
1436 | 0 | } |
1437 | | // When not using mmaped io use async uploads from pinned memory to GPU memory. |
1438 | | // First determine if the backend supports the necessary features for async uploads. |
1439 | 0 | auto * buf = bufs.count(0) ? bufs.at(0) : nullptr; |
1440 | 0 | if (!buf) { |
1441 | 0 | LLAMA_LOG_DEBUG("%s: no buffer found for async uploads\n", func); |
1442 | 0 | return nullptr; |
1443 | 0 | } |
1444 | | |
1445 | 0 | auto * buft = ggml_backend_buffer_get_type(buf); |
1446 | 0 | auto * dev = ggml_backend_buft_get_device(buft); |
1447 | 0 | if (!dev) { |
1448 | 0 | LLAMA_LOG_DEBUG("%s: no device found for buffer type %s for async uploads\n", func, |
1449 | 0 | ggml_backend_buft_name(buft)); |
1450 | 0 | return nullptr; |
1451 | 0 | } |
1452 | | |
1453 | 0 | if (buft != ggml_backend_dev_buffer_type(dev)) { |
1454 | 0 | LLAMA_LOG_DEBUG("%s: buffer type %s is not the default buffer type for device %s for async uploads\n", func, |
1455 | 0 | ggml_backend_buft_name(buft), ggml_backend_dev_name(dev)); |
1456 | 0 | return nullptr; |
1457 | 0 | } |
1458 | | |
1459 | 0 | ggml_backend_dev_props props; |
1460 | 0 | ggml_backend_dev_get_props(dev, &props); |
1461 | 0 | if (!props.caps.async || !props.caps.host_buffer || !props.caps.events) { |
1462 | 0 | LLAMA_LOG_DEBUG("%s: device %s does not support async, host buffers or events\n", func, |
1463 | 0 | ggml_backend_dev_name(dev)); |
1464 | 0 | return nullptr; |
1465 | 0 | } |
1466 | | |
1467 | 0 | auto * host_buft = ggml_backend_dev_host_buffer_type(dev); |
1468 | 0 | if (!host_buft) { |
1469 | 0 | LLAMA_LOG_DEBUG("%s: no host buffer type found for device %s\n", func, |
1470 | 0 | ggml_backend_dev_name(dev)); |
1471 | 0 | return nullptr; |
1472 | 0 | } |
1473 | | |
1474 | | // If the backend is supported, create pinned memory buffers and events for synchronisation. |
1475 | 0 | for (size_t idx = 0; idx < n_buffers; ++idx) { |
1476 | 0 | auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size); |
1477 | |
|
1478 | 0 | if (!buf) { |
1479 | 0 | LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func, |
1480 | 0 | ggml_backend_dev_name(dev)); |
1481 | 0 | return nullptr; |
1482 | 0 | } |
1483 | | |
1484 | 0 | host_buffers.emplace_back(buf); |
1485 | 0 | host_ptrs.emplace_back(ggml_backend_buffer_get_base(buf)); |
1486 | |
|
1487 | 0 | auto * event = ggml_backend_event_new(dev); |
1488 | 0 | if (!event) { |
1489 | 0 | LLAMA_LOG_DEBUG("%s: failed to create event for async uploads for device %s\n", func, |
1490 | 0 | ggml_backend_dev_name(dev)); |
1491 | 0 | return nullptr; |
1492 | 0 | } |
1493 | | |
1494 | 0 | events.emplace_back(event); |
1495 | 0 | } |
1496 | | |
1497 | 0 | ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); |
1498 | 0 | if (!backend) { |
1499 | 0 | LLAMA_LOG_DEBUG("%s: failed to initialize backend for device %s for async uploads\n", func, |
1500 | 0 | ggml_backend_dev_name(dev)); |
1501 | 0 | return nullptr; |
1502 | 0 | } |
1503 | | |
1504 | 0 | return backend; |
1505 | 0 | }(__func__); |
1506 | |
|
1507 | 0 | if (upload_backend) { |
1508 | 0 | LLAMA_LOG_DEBUG("%s: using async uploads for device %s, buffer type %s, backend %s\n", __func__, |
1509 | 0 | ggml_backend_dev_name(ggml_backend_get_device(upload_backend)), |
1510 | 0 | ggml_backend_buft_name(ggml_backend_buffer_get_type(bufs.at(0))), |
1511 | 0 | ggml_backend_name(upload_backend)); |
1512 | 0 | } |
1513 | |
|
1514 | 0 | for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) { |
1515 | 0 | const auto * weight = get_weight(ggml_get_name(cur)); |
1516 | 0 | if (weight == nullptr) { |
1517 | | // this can happen with split experts models |
1518 | 0 | continue; |
1519 | 0 | } |
1520 | | |
1521 | 0 | if (progress_callback) { |
1522 | 0 | if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) { |
1523 | 0 | return false; |
1524 | 0 | } |
1525 | 0 | } |
1526 | | |
1527 | 0 | size_t n_size = ggml_nbytes(cur); |
1528 | |
|
1529 | 0 | if (use_mmap) { |
1530 | 0 | const auto & mapping = mappings.at(weight->idx); |
1531 | 0 | ggml_backend_buffer_t buf_mmap = nullptr; |
1532 | 0 | if (bufs.count(weight->idx)) { |
1533 | 0 | buf_mmap = bufs.at(weight->idx); |
1534 | 0 | } |
1535 | 0 | uint8_t * data = (uint8_t *) mapping->addr() + weight->offs; |
1536 | |
|
1537 | 0 | if (check_tensors) { |
1538 | 0 | validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] { |
1539 | 0 | return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size)); |
1540 | 0 | })); |
1541 | 0 | } |
1542 | |
|
1543 | 0 | GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated |
1544 | 0 | if (buf_mmap && cur->data == nullptr) { |
1545 | 0 | ggml_backend_tensor_alloc(buf_mmap, cur, data); |
1546 | 0 | if (lmlocks) { |
1547 | 0 | const auto & lmlock = lmlocks->at(weight->idx); |
1548 | 0 | lmlock->grow_to(weight->offs + n_size); |
1549 | 0 | } |
1550 | |
|
1551 | 0 | auto & mmap_used = mmaps_used[weight->idx]; |
1552 | 0 | mmap_used.first = std::min(mmap_used.first, weight->offs); |
1553 | 0 | mmap_used.second = std::max(mmap_used.second, weight->offs + n_size); |
1554 | 0 | } else { |
1555 | 0 | ggml_backend_tensor_set(cur, data, 0, n_size); |
1556 | 0 | } |
1557 | 0 | } else { |
1558 | 0 | const auto & file = files.at(weight->idx); |
1559 | |
|
1560 | 0 | if (ggml_backend_buffer_is_host(cur->buffer)) { |
1561 | 0 | file->seek(weight->offs, SEEK_SET); |
1562 | 0 | file->read_raw(cur->data, n_size); |
1563 | 0 | if (check_tensors) { |
1564 | 0 | validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] { |
1565 | 0 | return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size)); |
1566 | 0 | })); |
1567 | 0 | } |
1568 | 0 | } else { |
1569 | | // If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU. |
1570 | 0 | if (upload_backend) { |
1571 | 0 | size_t offset = weight->offs; |
1572 | 0 | alignment = file->read_alignment(); |
1573 | 0 | size_t aligned_offset = offset & ~(alignment - 1); |
1574 | 0 | size_t offset_from_alignment = offset - aligned_offset; |
1575 | 0 | file->seek(aligned_offset, SEEK_SET); |
1576 | | |
1577 | | // Calculate aligned read boundaries |
1578 | 0 | size_t read_start = aligned_offset; |
1579 | 0 | size_t read_end = (offset + n_size + alignment - 1) & ~(alignment - 1); |
1580 | |
|
1581 | 0 | size_t bytes_read = 0; |
1582 | 0 | size_t data_read = 0; // Actual tensor data copied (excluding padding) |
1583 | |
|
1584 | 0 | while (bytes_read < read_end - read_start) { |
1585 | 0 | size_t read_size = std::min<size_t>(buffer_size, read_end - read_start - bytes_read); |
1586 | | |
1587 | | // Align the destination pointer within the pinned buffer |
1588 | 0 | uintptr_t ptr_dest_aligned = (reinterpret_cast<uintptr_t>(host_ptrs[buffer_idx]) + alignment - 1) & ~(alignment - 1); |
1589 | | |
1590 | | // Wait for previous upload to complete before reusing buffer |
1591 | 0 | ggml_backend_event_synchronize(events[buffer_idx]); |
1592 | | |
1593 | | // Read aligned chunk from file |
1594 | 0 | file->read_raw_unsafe(reinterpret_cast<void *>(ptr_dest_aligned), read_size); |
1595 | | |
1596 | | // Calculate actual data portion (excluding alignment padding) |
1597 | 0 | uintptr_t ptr_data = ptr_dest_aligned; |
1598 | 0 | size_t data_to_copy = read_size; |
1599 | | |
1600 | | // Skip alignment padding at start of first chunk |
1601 | 0 | if (bytes_read == 0) { |
1602 | 0 | ptr_data += offset_from_alignment; |
1603 | 0 | data_to_copy -= offset_from_alignment; |
1604 | 0 | } |
1605 | | |
1606 | | // Trim alignment padding at end of last chunk |
1607 | 0 | if (aligned_offset + bytes_read + read_size > offset + n_size) { |
1608 | 0 | data_to_copy -= (read_end - (offset + n_size)); |
1609 | 0 | } |
1610 | | |
1611 | | // Async upload actual data to GPU |
1612 | 0 | ggml_backend_tensor_set_async(upload_backend, cur, |
1613 | 0 | reinterpret_cast<void *>(ptr_data), data_read, data_to_copy); |
1614 | 0 | ggml_backend_event_record(events[buffer_idx], upload_backend); |
1615 | |
|
1616 | 0 | data_read += data_to_copy; |
1617 | 0 | bytes_read += read_size; |
1618 | |
|
1619 | 0 | ++buffer_idx; |
1620 | 0 | buffer_idx %= n_buffers; |
1621 | 0 | } |
1622 | 0 | } else { |
1623 | 0 | read_buf.resize(n_size); |
1624 | 0 | file->seek(weight->offs, SEEK_SET); |
1625 | 0 | file->read_raw(read_buf.data(), n_size); |
1626 | 0 | ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size); |
1627 | 0 | if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) { |
1628 | 0 | throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur))); |
1629 | 0 | } |
1630 | 0 | } |
1631 | 0 | } |
1632 | 0 | } |
1633 | | |
1634 | 0 | size_done += n_size; |
1635 | 0 | } |
1636 | | |
1637 | | // free temporary resources used for async uploads |
1638 | 0 | for (auto * event : events) { |
1639 | 0 | ggml_backend_event_synchronize(event); |
1640 | 0 | ggml_backend_event_free(event); |
1641 | 0 | } |
1642 | 0 | for (auto * buf : host_buffers) { |
1643 | 0 | ggml_backend_buffer_free(buf); |
1644 | 0 | } |
1645 | 0 | ggml_backend_free(upload_backend); |
1646 | | |
1647 | | // check validation results |
1648 | 0 | bool validation_failed = false; |
1649 | 0 | for (auto & future : validation_result) { |
1650 | 0 | auto result = future.get(); |
1651 | 0 | if (!result.second) { |
1652 | 0 | LLAMA_LOG_ERROR("%s: tensor '%s' has invalid data\n", __func__, ggml_get_name(result.first)); |
1653 | 0 | validation_failed = true; |
1654 | 0 | } |
1655 | 0 | } |
1656 | 0 | if (validation_failed) { |
1657 | 0 | throw std::runtime_error("found tensors with invalid data"); |
1658 | 0 | } |
1659 | | |
1660 | | // check if this is the last call and do final cleanup |
1661 | 0 | if (size_done >= size_data) { |
1662 | | // unmap offloaded tensors and metadata |
1663 | 0 | if (use_mmap) { |
1664 | 0 | for (uint32_t idx = 0; idx < mappings.size(); idx++) { |
1665 | 0 | const auto & mmap_used = mmaps_used.at(idx); |
1666 | 0 | auto & mapping = mappings.at(idx); |
1667 | 0 | mapping->unmap_fragment(0, mmap_used.first); |
1668 | 0 | if (mmap_used.second != 0) { |
1669 | 0 | mapping->unmap_fragment(mmap_used.second, mapping->size()); |
1670 | 0 | } |
1671 | 0 | } |
1672 | 0 | } |
1673 | 0 | if (progress_callback) { |
1674 | | // Even though the model is done loading, we still honor |
1675 | | // cancellation since we need to free allocations. |
1676 | 0 | return progress_callback(1.0f, progress_callback_user_data); |
1677 | 0 | } |
1678 | 0 | } |
1679 | | |
1680 | 0 | return true; |
1681 | 0 | } |
1682 | | |
1683 | 0 | std::string llama_model_loader::ftype_name() const { |
1684 | 0 | return llama_model_ftype_name(ftype); |
1685 | 0 | } |
1686 | | |
1687 | 0 | void llama_model_loader::print_info() const { |
1688 | 0 | LLAMA_LOG_INFO("%s: file format = %s\n", __func__, llama_file_version_name(fver)); |
1689 | 0 | LLAMA_LOG_INFO("%s: file type = %s\n", __func__, llama_model_ftype_name(ftype).c_str()); |
1690 | 0 | if (n_bytes < GiB) { |
1691 | 0 | LLAMA_LOG_INFO("%s: file size = %.2f MiB (%.2f BPW) \n", __func__, n_bytes/1024.0/1024.0, n_bytes*8.0/n_elements); |
1692 | 0 | } else { |
1693 | 0 | LLAMA_LOG_INFO("%s: file size = %.2f GiB (%.2f BPW) \n", __func__, n_bytes/1024.0/1024.0/1024.0, n_bytes*8.0/n_elements); |
1694 | 0 | } |
1695 | 0 | } |