/src/llama.cpp/src/llama-model-loader.cpp
Line | Count | Source |
1 | | #include "llama-model-loader.h" |
2 | | |
3 | | #include "ggml.h" |
4 | | |
5 | | #include <array> |
6 | | #include <cinttypes> |
7 | | #include <cstring> |
8 | | #include <future> |
9 | | |
10 | | static const size_t kiB = 1024; |
11 | | static const size_t MiB = 1024*kiB; |
12 | | static const size_t GiB = 1024*MiB; |
13 | | |
14 | 0 | const char * llama_file_version_name(llama_fver version) { |
15 | 0 | switch (version) { |
16 | 0 | case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)"; |
17 | 0 | case GGUF_FILE_VERSION_V2: return "GGUF V2"; |
18 | 0 | case GGUF_FILE_VERSION_V3: return "GGUF V3 (latest)"; |
19 | 0 | } |
20 | | |
21 | 0 | return "unknown"; |
22 | 0 | } |
23 | | |
24 | 0 | static std::string llama_model_ftype_name(llama_ftype ftype) { |
25 | 0 | if (ftype & LLAMA_FTYPE_GUESSED) { |
26 | 0 | return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)"; |
27 | 0 | } |
28 | | |
29 | 0 | switch (ftype) { |
30 | 0 | case LLAMA_FTYPE_ALL_F32: return "all F32"; |
31 | 0 | case LLAMA_FTYPE_MOSTLY_F16: return "F16"; |
32 | 0 | case LLAMA_FTYPE_MOSTLY_BF16: return "BF16"; |
33 | 0 | case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0"; |
34 | 0 | case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1"; |
35 | 0 | case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0"; |
36 | 0 | case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1"; |
37 | 0 | case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0"; |
38 | 0 | case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: return "MXFP4 MoE"; |
39 | 0 | case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium"; |
40 | 0 | case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small"; |
41 | 0 | case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small"; |
42 | 0 | case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium"; |
43 | 0 | case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large"; |
44 | 0 | case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small"; |
45 | 0 | case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium"; |
46 | 0 | case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small"; |
47 | 0 | case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium"; |
48 | 0 | case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K"; |
49 | 0 | case LLAMA_FTYPE_MOSTLY_TQ1_0: return "TQ1_0 - 1.69 bpw ternary"; |
50 | 0 | case LLAMA_FTYPE_MOSTLY_TQ2_0: return "TQ2_0 - 2.06 bpw ternary"; |
51 | 0 | case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw"; |
52 | 0 | case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw"; |
53 | 0 | case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw"; |
54 | 0 | case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw"; |
55 | 0 | case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw"; |
56 | 0 | case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw"; |
57 | 0 | case LLAMA_FTYPE_MOSTLY_IQ1_S: return "IQ1_S - 1.5625 bpw"; |
58 | 0 | case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw"; |
59 | 0 | case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw"; |
60 | 0 | case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw"; |
61 | 0 | case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw"; |
62 | 0 | case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw"; |
63 | | |
64 | 0 | default: return "unknown, may not work"; |
65 | 0 | } |
66 | 0 | } |
67 | | |
68 | | // return a list of splits for a given path |
69 | | // for example, given "<name>-00002-of-00004.gguf", returns a list of all 4 splits |
70 | 0 | static std::vector<std::string> llama_get_list_splits(const std::string & path, const int idx, const int n_split) { |
71 | 0 | std::vector<std::string> paths; |
72 | 0 | std::string split_prefix; |
73 | 0 | std::vector<char> buf(llama_path_max(), 0); |
74 | |
75 | 0 | { |
76 | 0 | int ret = llama_split_prefix(buf.data(), buf.size(), path.c_str(), idx, n_split); |
77 | 0 | if (!ret) { |
78 | 0 | throw std::runtime_error(format("invalid split file name: %s", path.c_str())); |
79 | 0 | } |
80 | 0 | split_prefix = std::string(buf.data(), ret); |
81 | 0 | } |
82 | | |
83 | 0 | if (split_prefix.empty()) { |
84 | 0 | throw std::runtime_error(format("invalid split file: %s", path.c_str())); |
85 | 0 | } |
86 | | |
87 | 0 | for (int idx = 0; idx < n_split; ++idx) { |
88 | 0 | int ret = llama_split_path(buf.data(), buf.size(), split_prefix.c_str(), idx, n_split); |
89 | 0 | paths.push_back(std::string(buf.data(), ret)); |
90 | 0 | } |
91 | |
92 | 0 | return paths; |
93 | 0 | } |
94 | | |
95 | | namespace GGUFMeta { |
96 | | template <typename T, gguf_type gt_, T (*gfun)(const gguf_context *, const int64_t)> |
97 | | struct GKV_Base_Type { |
98 | | static constexpr gguf_type gt = gt_; |
99 | | |
100 | 0 | static T getter(const gguf_context * ctx, const int kid) { |
101 | 0 | return gfun(ctx, kid); |
102 | 0 | } Unexecuted instantiation: GGUFMeta::GKV_Base_Type<bool, (gguf_type)7, &gguf_get_val_bool>::getter(gguf_context const*, int) Unexecuted instantiation: GGUFMeta::GKV_Base_Type<float, (gguf_type)6, &gguf_get_val_f32>::getter(gguf_context const*, int) Unexecuted instantiation: GGUFMeta::GKV_Base_Type<unsigned int, (gguf_type)4, &gguf_get_val_u32>::getter(gguf_context const*, int) Unexecuted instantiation: GGUFMeta::GKV_Base_Type<unsigned short, (gguf_type)2, &gguf_get_val_u16>::getter(gguf_context const*, int) Unexecuted instantiation: GGUFMeta::GKV_Base_Type<int, (gguf_type)5, &gguf_get_val_i32>::getter(gguf_context const*, int) |
103 | | }; |
104 | | |
105 | | template<typename T> struct GKV_Base; |
106 | | |
107 | | template<> struct GKV_Base<bool >: GKV_Base_Type<bool, GGUF_TYPE_BOOL, gguf_get_val_bool> {}; |
108 | | template<> struct GKV_Base<uint8_t >: GKV_Base_Type<uint8_t, GGUF_TYPE_UINT8, gguf_get_val_u8 > {}; |
109 | | template<> struct GKV_Base<uint16_t >: GKV_Base_Type<uint16_t, GGUF_TYPE_UINT16, gguf_get_val_u16 > {}; |
110 | | template<> struct GKV_Base<uint32_t >: GKV_Base_Type<uint32_t, GGUF_TYPE_UINT32, gguf_get_val_u32 > {}; |
111 | | template<> struct GKV_Base<uint64_t >: GKV_Base_Type<uint64_t, GGUF_TYPE_UINT64, gguf_get_val_u64 > {}; |
112 | | template<> struct GKV_Base<int8_t >: GKV_Base_Type<int8_t, GGUF_TYPE_INT8, gguf_get_val_i8 > {}; |
113 | | template<> struct GKV_Base<int16_t >: GKV_Base_Type<int16_t, GGUF_TYPE_INT16, gguf_get_val_i16 > {}; |
114 | | template<> struct GKV_Base<int32_t >: GKV_Base_Type<int32_t, GGUF_TYPE_INT32, gguf_get_val_i32 > {}; |
115 | | template<> struct GKV_Base<int64_t >: GKV_Base_Type<int64_t, GGUF_TYPE_INT64, gguf_get_val_i64 > {}; |
116 | | template<> struct GKV_Base<float >: GKV_Base_Type<float, GGUF_TYPE_FLOAT32, gguf_get_val_f32 > {}; |
117 | | template<> struct GKV_Base<double >: GKV_Base_Type<double, GGUF_TYPE_FLOAT64, gguf_get_val_f64 > {}; |
118 | | template<> struct GKV_Base<const char *>: GKV_Base_Type<const char *, GGUF_TYPE_STRING, gguf_get_val_str > {}; |
119 | | |
120 | | template<> struct GKV_Base<std::string> { |
121 | | static constexpr gguf_type gt = GGUF_TYPE_STRING; |
122 | | |
123 | 0 | static std::string getter(const gguf_context * ctx, const int kid) { |
124 | 0 | return gguf_get_val_str(ctx, kid); |
125 | 0 | } |
126 | | }; |
127 | | |
128 | | struct ArrayInfo { |
129 | | const gguf_type gt; |
130 | | const size_t length; |
131 | | const void * data; |
132 | | }; |
133 | | |
134 | | template<> struct GKV_Base<ArrayInfo> { |
135 | | public: |
136 | | static constexpr gguf_type gt = GGUF_TYPE_ARRAY; |
137 | 0 | static ArrayInfo getter(const gguf_context *ctx, const int k) { |
138 | 0 | const enum gguf_type arr_type = gguf_get_arr_type(ctx, k); |
139 | 0 | return ArrayInfo { |
140 | 0 | arr_type, |
141 | 0 | size_t(gguf_get_arr_n(ctx, k)), |
142 | 0 | arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx, k), |
143 | 0 | }; |
144 | 0 | } |
145 | | }; |
146 | | |
147 | | template<typename T> |
148 | | class GKV : public GKV_Base<T> { |
149 | | GKV() = delete; |
150 | | |
151 | | public: |
152 | 0 | static T get_kv(const gguf_context * ctx, const int k) { |
153 | 0 | const enum gguf_type kt = gguf_get_kv_type(ctx, k); |
154 | |
155 | 0 | if (kt != GKV::gt) { |
156 | 0 | throw std::runtime_error(format("key %s has wrong type %s but expected type %s", |
157 | 0 | gguf_get_key(ctx, k), gguf_type_name(kt), gguf_type_name(GKV::gt))); |
158 | 0 | } |
159 | 0 | return GKV::getter(ctx, k); |
160 | 0 | } Unexecuted instantiation: GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(gguf_context const*, int) Unexecuted instantiation: GGUFMeta::GKV<bool>::get_kv(gguf_context const*, int) Unexecuted instantiation: GGUFMeta::GKV<float>::get_kv(gguf_context const*, int) Unexecuted instantiation: GGUFMeta::GKV<unsigned int>::get_kv(gguf_context const*, int) Unexecuted instantiation: GGUFMeta::GKV<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >::get_kv(gguf_context const*, int) Unexecuted instantiation: GGUFMeta::GKV<unsigned short>::get_kv(gguf_context const*, int) Unexecuted instantiation: GGUFMeta::GKV<int>::get_kv(gguf_context const*, int) |
161 | | |
162 | 0 | static const char * override_type_to_str(const llama_model_kv_override_type ty) { |
163 | 0 | switch (ty) { |
164 | 0 | case LLAMA_KV_OVERRIDE_TYPE_BOOL: return "bool"; |
165 | 0 | case LLAMA_KV_OVERRIDE_TYPE_INT: return "int"; |
166 | 0 | case LLAMA_KV_OVERRIDE_TYPE_FLOAT: return "float"; |
167 | 0 | case LLAMA_KV_OVERRIDE_TYPE_STR: return "str"; |
168 | 0 | } |
169 | 0 | return "unknown"; |
170 | 0 | } Unexecuted instantiation: GGUFMeta::GKV<bool>::override_type_to_str(llama_model_kv_override_type) Unexecuted instantiation: GGUFMeta::GKV<float>::override_type_to_str(llama_model_kv_override_type) Unexecuted instantiation: GGUFMeta::GKV<unsigned int>::override_type_to_str(llama_model_kv_override_type) Unexecuted instantiation: GGUFMeta::GKV<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >::override_type_to_str(llama_model_kv_override_type) Unexecuted instantiation: GGUFMeta::GKV<unsigned short>::override_type_to_str(llama_model_kv_override_type) Unexecuted instantiation: GGUFMeta::GKV<int>::override_type_to_str(llama_model_kv_override_type) |
171 | | |
172 | 0 | static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override * ovrd) { |
173 | 0 | if (!ovrd) { return false; } |
174 | 0 | if (ovrd->tag == expected_type) { |
175 | 0 | LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ", |
176 | 0 | __func__, override_type_to_str(ovrd->tag), ovrd->key); |
177 | 0 | switch (ovrd->tag) { |
178 | 0 | case LLAMA_KV_OVERRIDE_TYPE_BOOL: { |
179 | 0 | LLAMA_LOG_INFO("%s\n", ovrd->val_bool ? "true" : "false"); |
180 | 0 | } break; |
181 | 0 | case LLAMA_KV_OVERRIDE_TYPE_INT: { |
182 | 0 | LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->val_i64); |
183 | 0 | } break; |
184 | 0 | case LLAMA_KV_OVERRIDE_TYPE_FLOAT: { |
185 | 0 | LLAMA_LOG_INFO("%.6f\n", ovrd->val_f64); |
186 | 0 | } break; |
187 | 0 | case LLAMA_KV_OVERRIDE_TYPE_STR: { |
188 | 0 | LLAMA_LOG_INFO("%s\n", ovrd->val_str); |
189 | 0 | } break; |
190 | 0 | default: |
191 | | // Shouldn't be possible to end up here, but just in case... |
192 | 0 | throw std::runtime_error( |
193 | 0 | format("Unsupported attempt to override %s type for metadata key %s\n", |
194 | 0 | override_type_to_str(ovrd->tag), ovrd->key)); |
195 | 0 | } |
196 | 0 | return true; |
197 | 0 | } |
198 | 0 | LLAMA_LOG_WARN("%s: Warning: Bad metadata override type for key '%s', expected %s but got %s\n", |
199 | 0 | __func__, ovrd->key, override_type_to_str(expected_type), override_type_to_str(ovrd->tag)); |
200 | 0 | return false; |
201 | 0 | } Unexecuted instantiation: GGUFMeta::GKV<bool>::validate_override(llama_model_kv_override_type, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<float>::validate_override(llama_model_kv_override_type, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<unsigned int>::validate_override(llama_model_kv_override_type, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >::validate_override(llama_model_kv_override_type, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<unsigned short>::validate_override(llama_model_kv_override_type, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<int>::validate_override(llama_model_kv_override_type, llama_model_kv_override const*) |
202 | | |
203 | | template<typename OT> |
204 | | static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type |
205 | 0 | try_override(OT & target, const struct llama_model_kv_override * ovrd) { |
206 | 0 | if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, ovrd)) { |
207 | 0 | target = ovrd->val_bool; |
208 | 0 | return true; |
209 | 0 | } |
210 | 0 | return false; |
211 | 0 | } |
212 | | |
213 | | template<typename OT> |
214 | | static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type |
215 | 0 | try_override(OT & target, const struct llama_model_kv_override * ovrd) { |
216 | 0 | if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, ovrd)) { |
217 | 0 | target = ovrd->val_i64; |
218 | 0 | return true; |
219 | 0 | } |
220 | 0 | return false; |
221 | 0 | } Unexecuted instantiation: _ZN8GGUFMeta3GKVIjE12try_overrideIjEENSt3__19enable_ifIXaantsr3std7is_sameIT_bEE5valuesr3std11is_integralIS5_EE5valueEbE4typeERS5_PK23llama_model_kv_override Unexecuted instantiation: _ZN8GGUFMeta3GKVItE12try_overrideItEENSt3__19enable_ifIXaantsr3std7is_sameIT_bEE5valuesr3std11is_integralIS5_EE5valueEbE4typeERS5_PK23llama_model_kv_override Unexecuted instantiation: _ZN8GGUFMeta3GKVIiE12try_overrideIiEENSt3__19enable_ifIXaantsr3std7is_sameIT_bEE5valuesr3std11is_integralIS5_EE5valueEbE4typeERS5_PK23llama_model_kv_override |
222 | | |
223 | | template<typename OT> |
224 | | static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type |
225 | 0 | try_override(T & target, const struct llama_model_kv_override * ovrd) { |
226 | 0 | if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, ovrd)) { |
227 | 0 | target = ovrd->val_f64; |
228 | 0 | return true; |
229 | 0 | } |
230 | 0 | return false; |
231 | 0 | } |
232 | | |
233 | | template<typename OT> |
234 | | static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type |
235 | 0 | try_override(T & target, const struct llama_model_kv_override * ovrd) { |
236 | 0 | if (validate_override(LLAMA_KV_OVERRIDE_TYPE_STR, ovrd)) { |
237 | 0 | target = ovrd->val_str; |
238 | 0 | return true; |
239 | 0 | } |
240 | 0 | return false; |
241 | 0 | } |
242 | | |
243 | 0 | static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) { |
244 | 0 | if (try_override<T>(target, ovrd)) { |
245 | 0 | return true; |
246 | 0 | } |
247 | 0 | if (k < 0) { return false; } |
248 | 0 | target = get_kv(ctx, k); |
249 | 0 | return true; |
250 | 0 | } Unexecuted instantiation: GGUFMeta::GKV<bool>::set(gguf_context const*, int, bool&, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<float>::set(gguf_context const*, int, float&, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<unsigned int>::set(gguf_context const*, int, unsigned int&, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >::set(gguf_context const*, int, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >&, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<unsigned short>::set(gguf_context const*, int, unsigned short&, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<int>::set(gguf_context const*, int, int&, llama_model_kv_override const*) |
251 | | |
252 | 0 | static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override * ovrd = nullptr) { |
253 | 0 | return set(ctx, gguf_find_key(ctx, key), target, ovrd); |
254 | 0 | } Unexecuted instantiation: GGUFMeta::GKV<bool>::set(gguf_context const*, char const*, bool&, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<float>::set(gguf_context const*, char const*, float&, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<unsigned int>::set(gguf_context const*, char const*, unsigned int&, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >::set(gguf_context const*, char const*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >&, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<unsigned short>::set(gguf_context const*, char const*, unsigned short&, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<int>::set(gguf_context const*, char const*, int&, llama_model_kv_override const*) |
255 | | |
256 | 0 | static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override * ovrd = nullptr) { |
257 | 0 | return set(ctx, key.c_str(), target, ovrd); |
258 | 0 | } Unexecuted instantiation: GGUFMeta::GKV<bool>::set(gguf_context const*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, bool&, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<float>::set(gguf_context const*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, float&, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<unsigned int>::set(gguf_context const*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, unsigned int&, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >::set(gguf_context const*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >&, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<unsigned short>::set(gguf_context const*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, unsigned short&, llama_model_kv_override const*) Unexecuted instantiation: GGUFMeta::GKV<int>::set(gguf_context const*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, int&, llama_model_kv_override const*) |
259 | | }; |
260 | | } |
261 | | |
262 | | template<typename T> |
263 | | typename std::enable_if<std::is_integral<T>::value, bool>::type |
264 | 0 | llama_model_loader::get_arr_n(const std::string & key, T & result, bool required) { |
265 | 0 | const int kid = gguf_find_key(meta.get(), key.c_str()); |
266 | |
267 | 0 | if (kid < 0) { |
268 | 0 | if (required) { |
269 | 0 | throw std::runtime_error(format("key not found in model: %s", key.c_str())); |
270 | 0 | } |
271 | 0 | return false; |
272 | 0 | } |
273 | | |
274 | 0 | struct GGUFMeta::ArrayInfo arr_info = |
275 | 0 | GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid); |
276 | | |
277 | |
278 | 0 | result = arr_info.length; |
279 | 0 | return true; |
280 | 0 | } |
281 | | |
282 | | template<typename T> |
283 | | typename std::enable_if<std::is_integral<T>::value, bool>::type |
284 | 0 | llama_model_loader::get_arr_n(enum llm_kv kid, T & result, bool required) { |
285 | 0 | return get_arr_n(llm_kv(kid), result, required); |
286 | 0 | } |
287 | | |
288 | | template bool llama_model_loader::get_arr_n(enum llm_kv kid, uint32_t & result, bool required); |
289 | | |
290 | | template<typename T> |
291 | 0 | bool llama_model_loader::get_arr(const std::string & key, std::vector<T> & result, bool required) { |
292 | 0 | const gguf_context * ctx = meta.get(); |
293 | 0 | const int kid = gguf_find_key(ctx, key.c_str()); |
294 | |
295 | 0 | if (kid < 0 || gguf_get_kv_type(ctx, kid) != GGUF_TYPE_ARRAY) { |
296 | 0 | if (required) { |
297 | 0 | throw std::runtime_error(format("array key not found in model: %s", key.c_str())); |
298 | 0 | } |
299 | 0 | return false; |
300 | 0 | } |
301 | | |
302 | 0 | struct GGUFMeta::ArrayInfo arr_info = |
303 | 0 | GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx, kid); |
304 | |
305 | 0 | switch (arr_info.gt) { |
306 | 0 | case GGUF_TYPE_UINT32: |
307 | 0 | case GGUF_TYPE_INT32: GGML_ASSERT((std::is_same<T, int32_t>::value) || |
308 | 0 | (std::is_same<T, uint32_t>::value)); break; |
309 | 0 | case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break; |
310 | 0 | case GGUF_TYPE_STRING: GGML_ASSERT((std::is_same<T, std::string>::value)); break; |
311 | 0 | default: |
312 | 0 | throw std::runtime_error(format("%s is not a string/float32/uint32/int32 array", key.c_str())); |
313 | 0 | } |
314 | | |
315 | 0 | if constexpr (std::is_same<T, std::string>::value) { |
316 | 0 | const size_t n_items = gguf_get_arr_n(ctx, kid); |
317 | 0 | result.clear(); |
318 | |
319 | 0 | for (size_t i = 0; i < n_items; i++) { |
320 | 0 | const T value = gguf_get_arr_str(ctx, kid, i); |
321 | 0 | result.emplace_back(value); |
322 | 0 | } |
323 | | } else { |
324 | | result.resize(arr_info.length); |
325 | | result.assign((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length); |
326 | | } |
327 | |
328 | 0 | return true; |
329 | 0 | } |
330 | | |
331 | | template<typename T, size_t N_MAX> |
332 | 0 | bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) { |
333 | 0 | const gguf_context * ctx = meta.get(); |
334 | 0 | const int kid = gguf_find_key(ctx, key.c_str()); |
335 | |
336 | 0 | if (kid < 0 || gguf_get_kv_type(ctx, kid) != GGUF_TYPE_ARRAY) { |
337 | 0 | if (required) { |
338 | 0 | throw std::runtime_error(format("array key not found in model: %s", key.c_str())); |
339 | 0 | } |
340 | 0 | return false; |
341 | 0 | } |
342 | | |
343 | 0 | struct GGUFMeta::ArrayInfo arr_info = |
344 | 0 | GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx, kid); |
345 | |
346 | 0 | switch (arr_info.gt) { |
347 | 0 | case GGUF_TYPE_UINT32: |
348 | 0 | case GGUF_TYPE_INT32: GGML_ASSERT((std::is_same<T, int32_t>::value) || |
349 | 0 | (std::is_same<T, uint32_t>::value)); break; |
350 | 0 | case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break; |
351 | 0 | case GGUF_TYPE_STRING: GGML_ASSERT((std::is_same<T, std::string>::value)); break; |
352 | 0 | default: |
353 | 0 | throw std::runtime_error(format("%s is not a string/float32/uint32/int32 array", key.c_str())); |
354 | 0 | } |
355 | | |
356 | 0 | if (arr_info.length > N_MAX) { |
357 | 0 | throw std::runtime_error(format("array length %u for key %s exceeds max %u", (uint32_t) arr_info.length, key.c_str(), (uint32_t) N_MAX)); |
358 | 0 | } |
359 | | |
360 | | if constexpr (std::is_same<T, std::string>::value) { |
361 | | const size_t n_items = gguf_get_arr_n(ctx, kid); |
362 | | |
363 | | for (size_t i = 0; i < n_items; i++) { |
364 | | const T value = gguf_get_arr_str(ctx, kid, i); |
365 | | result[i] = value; |
366 | | } |
367 | 0 | } else { |
368 | 0 | std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin()); |
369 | 0 | } |
370 | |
371 | 0 | return true; |
372 | 0 | } Unexecuted instantiation: bool llama_model_loader::get_arr<int, 4ul>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::array<int, 4ul>&, bool) Unexecuted instantiation: bool llama_model_loader::get_arr<unsigned int, 512ul>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::array<unsigned int, 512ul>&, bool) Unexecuted instantiation: bool llama_model_loader::get_arr<float, 512ul>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::array<float, 512ul>&, bool) |
373 | | |
374 | | template<typename T> |
375 | 0 | bool llama_model_loader::get_arr(enum llm_kv kid, T & result, bool required) { |
376 | 0 | return get_arr(llm_kv(kid), result, required); |
377 | 0 | } |
378 | | |
379 | | template bool llama_model_loader::get_arr<std::vector<std::string>>(enum llm_kv kid, std::vector<std::string> & result, bool required); |
380 | | |
381 | | template<typename T> |
382 | 0 | bool llama_model_loader::get_key(const std::string & key, T & result, bool required) { |
383 | 0 | auto it = kv_overrides.find(key); |
384 | |
385 | 0 | const struct llama_model_kv_override * override = |
386 | 0 | it != kv_overrides.end() ? &it->second : nullptr; |
387 | |
388 | 0 | const bool found = GGUFMeta::GKV<T>::set(meta.get(), key, result, override); |
389 | |
390 | 0 | if (required && !found) { |
391 | 0 | throw std::runtime_error(format("key not found in model: %s", key.c_str())); |
392 | 0 | } |
393 | | |
394 | 0 | return found; |
395 | 0 | } Unexecuted instantiation: bool llama_model_loader::get_key<bool>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, bool&, bool) Unexecuted instantiation: bool llama_model_loader::get_key<float>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, float&, bool) Unexecuted instantiation: bool llama_model_loader::get_key<unsigned int>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, unsigned int&, bool) Unexecuted instantiation: bool llama_model_loader::get_key<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >&, bool) Unexecuted instantiation: bool llama_model_loader::get_key<unsigned short>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, unsigned short&, bool) Unexecuted instantiation: bool llama_model_loader::get_key<int>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, int&, bool) |
396 | | |
397 | | template<typename T> |
398 | 0 | bool llama_model_loader::get_key(enum llm_kv kid, T & result, bool required) { |
399 | 0 | return get_key(llm_kv(kid), result, required); |
400 | 0 | } Unexecuted instantiation: bool llama_model_loader::get_key<bool>(llm_kv, bool&, bool) Unexecuted instantiation: bool llama_model_loader::get_key<float>(llm_kv, float&, bool) Unexecuted instantiation: bool llama_model_loader::get_key<unsigned int>(llm_kv, unsigned int&, bool) Unexecuted instantiation: bool llama_model_loader::get_key<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >(llm_kv, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >&, bool) |
401 | | |
402 | | template bool llama_model_loader::get_key<bool> (enum llm_kv kid, bool & result, bool required); |
403 | | template bool llama_model_loader::get_key<float> (enum llm_kv kid, float & result, bool required); |
404 | | template bool llama_model_loader::get_key<uint32_t> (enum llm_kv kid, uint32_t & result, bool required); |
405 | | template bool llama_model_loader::get_key<std::string>(enum llm_kv kid, std::string & result, bool required); |
406 | | |
407 | | template<> |
408 | 0 | bool llama_model_loader::get_key(enum llm_kv kid, enum llama_pooling_type & result, bool required) { |
409 | 0 | uint32_t tmp; |
410 | 0 | const bool found = get_key(kid, tmp, required); |
411 | 0 | if (found) { |
412 | 0 | result = (enum llama_pooling_type) tmp; |
413 | 0 | } else { |
414 | 0 | result = LLAMA_POOLING_TYPE_UNSPECIFIED; |
415 | 0 | } |
416 | 0 | return found; |
417 | 0 | } |
418 | | |
419 | | // get array of n <= N_MAX elements, or a single element repeated n times |
420 | | template<typename T, size_t N_MAX> |
421 | 0 | bool llama_model_loader::get_key_or_arr(const std::string & key, std::array<T, N_MAX> & result, uint32_t n, bool required) { |
422 | 0 | const int kid = gguf_find_key(meta.get(), key.c_str()); |
423 | |
424 | 0 | if (kid < 0) { |
425 | 0 | if (required) { |
426 | 0 | throw std::runtime_error(format("key not found in model: %s", key.c_str())); |
427 | 0 | } |
428 | 0 | return false; |
429 | 0 | } |
430 | | |
431 | 0 | if (n > N_MAX) { |
432 | 0 | throw std::runtime_error(format("n > N_MAX: %u > %u for key %s", (uint32_t) n, (uint32_t) N_MAX, key.c_str())); |
433 | 0 | } |
434 | | |
435 | 0 | if (gguf_get_kv_type(meta.get(), kid) == GGUF_TYPE_ARRAY) { |
436 | 0 | struct GGUFMeta::ArrayInfo arr_info = |
437 | 0 | GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid); |
438 | |
439 | 0 | if (n != arr_info.length) { |
440 | 0 | throw std::runtime_error(format("key %s has wrong array length; expected %u, got %u", key.c_str(), n, (uint32_t) arr_info.length)); |
441 | 0 | } |
442 | | |
443 | 0 | return get_arr(key, result, required); |
444 | 0 | } |
445 | | |
446 | 0 | T value; |
447 | |
448 | 0 | bool ok = get_key(key, value, required); |
449 | 0 | if (!ok) { |
450 | 0 | return false; |
451 | 0 | } |
452 | | |
453 | 0 | for (uint32_t i = 0; i < n; i++) { |
454 | 0 | result[i] = value; |
455 | 0 | } |
456 | |
457 | 0 | return true; |
458 | 0 | } Unexecuted instantiation: bool llama_model_loader::get_key_or_arr<int, 4ul>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::array<int, 4ul>&, unsigned int, bool) Unexecuted instantiation: bool llama_model_loader::get_key_or_arr<unsigned int, 512ul>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::array<unsigned int, 512ul>&, unsigned int, bool) Unexecuted instantiation: bool llama_model_loader::get_key_or_arr<float, 512ul>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::array<float, 512ul>&, unsigned int, bool) |
459 | | |
460 | | template<typename T> |
461 | 0 | bool llama_model_loader::get_key_or_arr(enum llm_kv kid, T & result, uint32_t n, bool required) { |
462 | 0 | return get_key_or_arr(llm_kv(kid), result, n, required); |
463 | 0 | } Unexecuted instantiation: bool llama_model_loader::get_key_or_arr<std::__1::array<int, 4ul> >(llm_kv, std::__1::array<int, 4ul>&, unsigned int, bool) Unexecuted instantiation: bool llama_model_loader::get_key_or_arr<std::__1::array<unsigned int, 512ul> >(llm_kv, std::__1::array<unsigned int, 512ul>&, unsigned int, bool) Unexecuted instantiation: bool llama_model_loader::get_key_or_arr<std::__1::array<float, 512ul> >(llm_kv, std::__1::array<float, 512ul>&, unsigned int, bool) |
464 | | |
465 | 0 | bool llama_model_loader::get_key_or_arr(enum llm_kv kid, uint32_t & result, bool required) { |
466 | 0 | const std::string key = llm_kv(kid); |
467 | |
468 | 0 | const int id = gguf_find_key(meta.get(), key.c_str()); |
469 | |
470 | 0 | if (id < 0) { |
471 | 0 | if (required) { |
472 | 0 | throw std::runtime_error(format("key not found in model: %s", key.c_str())); |
473 | 0 | } |
474 | 0 | return false; |
475 | 0 | } |
476 | | |
477 | | // throw an error if the type is an array |
478 | 0 | if (gguf_get_kv_type(meta.get(), id) == GGUF_TYPE_ARRAY) { |
479 | 0 | if (required) { |
480 | 0 | throw std::runtime_error(format("expected scalar, found array for key: %s", key.c_str())); |
481 | 0 | } |
482 | 0 | return false; |
483 | 0 | } |
484 | | |
485 | 0 | return get_key(key, result, required); |
486 | 0 | } |
487 | | |
488 | | // TODO: this is not very clever - figure out something better |
489 | | template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required); |
490 | | template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required); |
491 | | template bool llama_model_loader::get_key_or_arr<std::array<float, 512>>(enum llm_kv kid, std::array<float, 512> & result, uint32_t n, bool required); |
492 | | |
493 | | |
494 | | llama_model_loader::llama_model_loader( |
495 | | const std::string & fname, |
496 | | std::vector<std::string> & splits, |
497 | | bool use_mmap, |
498 | | bool check_tensors, |
499 | | bool no_alloc, |
500 | | const llama_model_kv_override * param_overrides_p, |
501 | 0 | const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) { |
502 | 0 | int trace = 0; |
503 | 0 | if (getenv("LLAMA_TRACE")) { |
504 | 0 | trace = atoi(getenv("LLAMA_TRACE")); |
505 | 0 | } |
506 | |
507 | 0 | if (param_overrides_p != nullptr) { |
508 | 0 | for (const struct llama_model_kv_override * p = param_overrides_p; p->key[0] != 0; p++) { |
509 | 0 | kv_overrides.insert({std::string(p->key), *p}); |
510 | 0 | } |
511 | 0 | } |
512 | |
513 | 0 | tensor_buft_overrides = param_tensor_buft_overrides_p; |
514 | | |
515 | | // Load the main GGUF |
516 | 0 | struct ggml_context * ctx = NULL; |
517 | 0 | struct gguf_init_params params = { |
518 | 0 | /*.no_alloc = */ true, |
519 | 0 | /*.ctx = */ &ctx, |
520 | 0 | }; |
521 | |
522 | 0 | meta.reset(gguf_init_from_file(fname.c_str(), params)); |
523 | 0 | if (!meta) { |
524 | 0 | throw std::runtime_error(format("%s: failed to load model from %s", __func__, fname.c_str())); |
525 | 0 | } |
526 | | |
527 | 0 | get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false); |
528 | 0 | llm_kv = LLM_KV(llm_arch_from_string(arch_name)); |
529 | |
530 | 0 | files.emplace_back(new llama_file(fname.c_str(), "rb", !use_mmap)); |
531 | 0 | contexts.emplace_back(ctx); |
532 | | |
533 | | // Save the tensor data offsets of the main file. |
534 | | // For subsidiary files, `meta` tensor data offset must not be used, |
535 | | // so we build a unified tensors index for weights. |
536 | 0 | for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { |
537 | 0 | std::string tensor_name = std::string(cur->name); |
538 | | // make sure there are no duplicated tensor names |
539 | 0 | if (weights_map.find(tensor_name) != weights_map.end()) { |
540 | 0 | throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur))); |
541 | 0 | } |
542 | 0 | n_elements += ggml_nelements(cur); |
543 | 0 | n_bytes += ggml_nbytes(cur); |
544 | 0 | weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, meta.get(), cur)); |
545 | 0 | } |
546 | 0 | uint16_t n_split = 0; |
547 | 0 | get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false); |
548 | | |
549 | | // Load additional GGML contexts |
550 | 0 | if (n_split > 1) { |
551 | | // make sure the main file is loaded first |
552 | 0 | uint16_t idx = 0; |
553 | 0 | const std::string kv_split_no = llm_kv(LLM_KV_SPLIT_NO); |
554 | 0 | get_key(kv_split_no, idx); |
555 | 0 | if (idx != 0) { |
556 | 0 | throw std::runtime_error(format("illegal split file idx: %d (file: %s), model must be loaded with the first split", idx, fname.c_str())); |
557 | 0 | } |
558 | | |
559 | | // generate list of splits if needed |
560 | 0 | if (splits.empty()) { |
561 | 0 | splits = llama_get_list_splits(fname, idx, n_split); |
562 | 0 | } |
563 | | |
564 | | // in case the user gives a custom list of splits, check if it matches the expected number |
565 | 0 | if (n_split != (uint16_t)splits.size()) { |
566 | 0 | throw std::runtime_error(format("invalid split count, given: %zu splits, but expected %d", splits.size(), n_split)); |
567 | 0 | } |
568 | | |
569 | 0 | if (trace > 0) { |
570 | 0 | LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split); |
571 | 0 | } |
572 | | |
573 | | // load other splits |
574 | 0 | for (idx = 1; idx < n_split; idx++) { |
575 | 0 | const char * fname_split = splits[idx].c_str(); |
576 | |
577 | 0 | struct gguf_init_params split_params = { |
578 | 0 | /*.no_alloc = */ true, |
579 | 0 | /*.ctx = */ &ctx, |
580 | 0 | }; |
581 | 0 | gguf_context_ptr ctx_gguf { gguf_init_from_file(fname_split, split_params) }; |
582 | 0 | if (!ctx_gguf) { |
583 | 0 | throw std::runtime_error(format("%s: failed to load GGUF split from %s", __func__, fname_split)); |
584 | 0 | } |
585 | | |
586 | | // check idx |
587 | 0 | { |
588 | 0 | const int kid = gguf_find_key(ctx_gguf.get(), kv_split_no.c_str()); |
589 | 0 | if (kid < 0) { |
590 | 0 | throw std::runtime_error(format("missing key %s in GGUF split %s", kv_split_no.c_str(), fname_split)); |
591 | 0 | } |
592 | 0 | int idx_gguf = gguf_get_val_u16(ctx_gguf.get(), kid); |
593 | 0 | if (idx_gguf != idx) { |
594 | 0 | throw std::runtime_error(format("invalid split file idx: %d (file: %s), expected %d", idx_gguf, fname_split, idx)); |
595 | 0 | } |
596 | 0 | } |
597 | | |
598 | 0 | files.emplace_back(new llama_file(fname_split, "rb", !use_mmap)); |
599 | 0 | contexts.emplace_back(ctx); |
600 | | |
601 | | // Save the tensor data offset info of the shard. |
602 | 0 | for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { |
603 | 0 | std::string tensor_name = std::string(cur->name); |
604 | | // make sure there are no duplicated tensor names |
605 | 0 | if (weights_map.find(tensor_name) != weights_map.end()) { |
606 | 0 | throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur))); |
607 | 0 | } |
608 | 0 | n_elements += ggml_nelements(cur); |
609 | 0 | n_bytes += ggml_nbytes(cur); |
610 | 0 | weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), idx, ctx_gguf.get(), cur)); |
611 | 0 | } |
612 | 0 | } |
613 | | |
614 | 0 | get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors); |
615 | | |
616 | | // sanity check |
617 | 0 | { |
618 | 0 | const int n_tensors_loaded = (int) weights_map.size(); |
619 | 0 | if (n_tensors != n_tensors_loaded) { |
620 | 0 | throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded)); |
621 | 0 | } |
622 | 0 | } |
623 | | |
624 | 0 | LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1); |
625 | 0 | } |
626 | | |
627 | 0 | n_kv = gguf_get_n_kv(meta.get()); |
628 | 0 | n_tensors = weights_map.size(); |
629 | |
630 | 0 | fver = (enum llama_fver) gguf_get_version(meta.get()); |
631 | |
632 | 0 | LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n", |
633 | 0 | __func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver)); |
634 | | |
635 | | // determine file type based on the number of tensors for each quantization and print meta data |
636 | | // TODO: make optional |
637 | 0 | { |
638 | 0 | std::map<enum ggml_type, uint32_t> n_type; |
639 | |
640 | 0 | uint32_t n_type_max = 0; |
641 | 0 | enum ggml_type type_max = GGML_TYPE_F32; |
642 | |
643 | 0 | for (const auto & it : weights_map) { |
644 | 0 | const llama_tensor_weight & w = it.second; |
645 | 0 | const ggml_tensor * tensor = w.tensor; |
646 | |
647 | 0 | enum ggml_type type = tensor->type; |
648 | |
649 | 0 | n_type[type]++; |
650 | |
651 | 0 | if (n_type_max < n_type[type]) { |
652 | 0 | n_type_max = n_type[type]; |
653 | 0 | type_max = type; |
654 | 0 | } |
655 | |
656 | 0 | if (trace > 0) { |
657 | 0 | const uint16_t sid = w.idx; |
658 | 0 | LLAMA_LOG_INFO("%s: - tensor split %2d: %32s %-8s [ %s ] %8.2f MiB\n", __func__, |
659 | 0 | sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str(), |
660 | 0 | ggml_nbytes(tensor)/1024.0f/1024.0f); |
661 | 0 | } |
662 | 0 | } |
663 | |
664 | 0 | switch (type_max) { |
665 | 0 | case GGML_TYPE_F32: ftype = LLAMA_FTYPE_ALL_F32; break; |
666 | 0 | case GGML_TYPE_F16: ftype = LLAMA_FTYPE_MOSTLY_F16; break; |
667 | 0 | case GGML_TYPE_BF16: ftype = LLAMA_FTYPE_MOSTLY_BF16; break; |
668 | 0 | case GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break; |
669 | 0 | case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break; |
670 | 0 | case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break; |
671 | 0 | case GGML_TYPE_Q5_1: ftype = LLAMA_FTYPE_MOSTLY_Q5_1; break; |
672 | 0 | case GGML_TYPE_Q8_0: ftype = LLAMA_FTYPE_MOSTLY_Q8_0; break; |
673 | 0 | case GGML_TYPE_Q2_K: ftype = LLAMA_FTYPE_MOSTLY_Q2_K; break; |
674 | 0 | case GGML_TYPE_Q3_K: ftype = LLAMA_FTYPE_MOSTLY_Q3_K_M; break; |
675 | 0 | case GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break; |
676 | 0 | case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break; |
677 | 0 | case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break; |
678 | 0 | case GGML_TYPE_TQ1_0: ftype = LLAMA_FTYPE_MOSTLY_TQ1_0; break; |
679 | 0 | case GGML_TYPE_TQ2_0: ftype = LLAMA_FTYPE_MOSTLY_TQ2_0; break; |
680 | 0 | case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break; |
681 | 0 | case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break; |
682 | 0 | case GGML_TYPE_IQ2_S: ftype = LLAMA_FTYPE_MOSTLY_IQ2_S; break; |
683 | 0 | case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break; |
684 | 0 | case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break; |
685 | 0 | case GGML_TYPE_IQ1_M: ftype = LLAMA_FTYPE_MOSTLY_IQ1_M; break; |
686 | 0 | case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break; |
687 | 0 | case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break; |
688 | 0 | case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break; |
689 | 0 | default: |
690 | 0 | { |
691 | 0 | LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max)); |
692 | 0 | ftype = LLAMA_FTYPE_ALL_F32; |
693 | 0 | } break; |
694 | 0 | } |
695 | | |
696 | | // this is a way to mark that we have "guessed" the file type |
697 | 0 | ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED); |
698 | |
699 | 0 | { |
700 | 0 | uint32_t ftype_val = 0; |
701 | 0 | if (get_key(LLM_KV_GENERAL_FILE_TYPE, ftype_val, false)) { |
702 | 0 | ftype = (llama_ftype) ftype_val; |
703 | 0 | } |
704 | 0 | } |
705 | |
706 | 0 | LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__); |
707 | |
708 | 0 | for (int i = 0; i < n_kv; i++) { |
709 | 0 | const char * name = gguf_get_key(meta.get(), i); |
710 | 0 | const enum gguf_type type = gguf_get_kv_type(meta.get(), i); |
711 | 0 | const std::string type_name = |
712 | 0 | type == GGUF_TYPE_ARRAY |
713 | 0 | ? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta.get(), i)), gguf_get_arr_n(meta.get(), i)) |
714 | 0 | : gguf_type_name(type); |
715 | |
716 | 0 | std::string value = gguf_kv_to_str(meta.get(), i); |
717 | 0 | const size_t MAX_VALUE_LEN = 40; |
718 | 0 | if (value.size() > MAX_VALUE_LEN) { |
719 | 0 | value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str()); |
720 | 0 | } |
721 | 0 | replace_all(value, "\n", "\\n"); |
722 | |
723 | 0 | LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str()); |
724 | 0 | } |
725 | | |
726 | | // print type counts |
727 | 0 | for (auto & kv : n_type) { |
728 | 0 | if (kv.second == 0) { |
729 | 0 | continue; |
730 | 0 | } |
731 | | |
732 | 0 | LLAMA_LOG_INFO("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second); |
733 | 0 | } |
734 | 0 | } |
735 | | |
736 | 0 | if (!llama_mmap::SUPPORTED) { |
737 | 0 | LLAMA_LOG_WARN("%s: mmap is not supported on this platform\n", __func__); |
738 | 0 | use_mmap = false; |
739 | 0 | } |
740 | |
741 | 0 | this->use_mmap = use_mmap; |
742 | 0 | this->check_tensors = check_tensors; |
743 | 0 | this->no_alloc = no_alloc; |
744 | 0 | } |
745 | | |
746 | 0 | std::string llama_model_loader::get_arch_name() const { |
747 | 0 | return arch_name; |
748 | 0 | } |
749 | | |
750 | 0 | enum llm_arch llama_model_loader::get_arch() const { |
751 | 0 | return llm_kv.arch; |
752 | 0 | } |
753 | | |
754 | 0 | const llama_model_loader::llama_tensor_weight * llama_model_loader::get_weight(const char * name) const { |
755 | 0 | auto pos = weights_map.find(name); |
756 | 0 | if (pos != weights_map.end()) { |
757 | 0 | return &pos->second; |
758 | 0 | } |
759 | | |
760 | 0 | return nullptr; |
761 | 0 | } |
762 | | |
763 | 0 | const llama_model_loader::llama_tensor_weight & llama_model_loader::require_weight(const char * name) const { |
764 | 0 | const llama_tensor_weight * weight = get_weight(name); |
765 | 0 | if (!weight) { |
766 | 0 | throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name)); |
767 | 0 | } |
768 | 0 | return *weight; |
769 | 0 | } |
770 | | |
771 | 0 | struct ggml_tensor * llama_model_loader::get_tensor_meta(const char * name) const { |
772 | 0 | const auto * weight = get_weight(name); |
773 | 0 | if (!weight) { |
774 | 0 | return nullptr; |
775 | 0 | } |
776 | 0 | return weight->tensor; |
777 | 0 | } |
778 | | |
779 | 0 | struct ggml_tensor * llama_model_loader::require_tensor_meta(const std::string & name) const { |
780 | 0 | struct ggml_tensor * tensor = get_tensor_meta(name.c_str()); |
781 | 0 | if (!tensor) { |
782 | 0 | throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str())); |
783 | 0 | } |
784 | 0 | return tensor; |
785 | 0 | } |
786 | | |
787 | 0 | const struct ggml_tensor * llama_model_loader::check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const { |
788 | 0 | const struct ggml_tensor * cur = get_tensor_meta(name.c_str()); |
789 | |
790 | 0 | if (cur == NULL) { |
791 | 0 | if (!required) { |
792 | 0 | return NULL; |
793 | 0 | } |
794 | 0 | throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str())); |
795 | 0 | } |
796 | | |
797 | 0 | { |
798 | 0 | bool is_ok = true; |
799 | 0 | for (size_t i = 0; i < GGML_MAX_DIMS; ++i) { |
800 | 0 | if ((i < ne.size() && ne[i] != cur->ne[i]) || (i >= ne.size() && cur->ne[i] != 1)) { |
801 | 0 | is_ok = false; |
802 | 0 | break; |
803 | 0 | } |
804 | 0 | } |
805 | 0 | if (!is_ok) { |
806 | 0 | throw std::runtime_error( |
807 | 0 | format("%s: tensor '%s' has wrong shape; expected %s, got %s", |
808 | 0 | __func__, name.c_str(), |
809 | 0 | llama_format_tensor_shape(ne).c_str(), |
810 | 0 | llama_format_tensor_shape(cur).c_str())); |
811 | 0 | } |
812 | 0 | } |
813 | | |
814 | 0 | return cur; |
815 | 0 | } |
816 | | |
817 | 0 | struct ggml_tensor * llama_model_loader::create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags) { |
818 | 0 | LLAMA_LOG_DEBUG("%s: loading tensor %s\n", __func__, name.c_str()); |
819 | 0 | const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED)); |
820 | |
821 | 0 | if (cur == NULL) { |
822 | 0 | return NULL; |
823 | 0 | } |
824 | | |
825 | 0 | bool duplicated = flags & TENSOR_DUPLICATED; |
826 | |
827 | 0 | struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur); |
828 | 0 | ggml_set_name(tensor, ggml_get_name(cur)); |
829 | |
830 | 0 | if (duplicated) { |
831 | 0 | size_data += ggml_nbytes(cur); |
832 | 0 | } else { |
833 | 0 | n_created++; |
834 | 0 | } |
835 | |
836 | 0 | return tensor; |
837 | |
838 | 0 | } |
839 | | |
840 | 0 | struct ggml_tensor * llama_model_loader::create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list<int64_t> & ne, size_t offset, bool required) { |
841 | 0 | const struct ggml_tensor * cur = check_tensor_dims(name, ne, required); |
842 | |
843 | 0 | if (cur == NULL) { |
844 | 0 | return NULL; |
845 | 0 | } |
846 | | |
847 | 0 | if (cur->type != base->type) { |
848 | 0 | throw std::runtime_error(format("%s: tensor '%s' has wrong type; expected %s, got %s", __func__, name.c_str(), ggml_type_name(base->type), ggml_type_name(cur->type))); |
849 | 0 | } |
850 | | |
851 | 0 | std::array<int64_t, GGML_MAX_DIMS> dims; |
852 | 0 | for (size_t i = 0; i < GGML_MAX_DIMS; ++i) { |
853 | 0 | dims[i] = i < ne.size() ? ne.begin()[i] : 1; |
854 | 0 | } |
855 | |
856 | 0 | struct ggml_tensor * tensor = ggml_view_4d(ctx, base, |
857 | 0 | dims[0], dims[1], dims[2], dims[3], |
858 | 0 | cur->nb[1], cur->nb[2], cur->nb[3], |
859 | 0 | offset); |
860 | |
861 | 0 | ggml_set_name(tensor, name.c_str()); |
862 | |
863 | 0 | n_created++; |
864 | |
865 | 0 | return tensor; |
866 | 0 | } |
867 | | |
868 | 0 | void llama_model_loader::done_getting_tensors() const { |
869 | 0 | if (n_created != n_tensors) { |
870 | 0 | throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created)); |
871 | 0 | } |
872 | 0 | } |
873 | | |
874 | 0 | void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps) { |
875 | 0 | if (use_mmap) { |
876 | 0 | mappings.reserve(files.size()); |
877 | 0 | mmaps_used.reserve(files.size()); |
878 | 0 | for (const auto & file : files) { |
879 | 0 | bool is_numa = false; |
880 | |
881 | 0 | auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); |
882 | 0 | if (dev) { |
883 | 0 | auto * reg = ggml_backend_dev_backend_reg(dev); |
884 | 0 | auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa"); |
885 | 0 | if (is_numa_fn) { |
886 | 0 | is_numa = is_numa_fn(); |
887 | 0 | } |
888 | 0 | } |
889 | |
890 | 0 | std::unique_ptr<llama_mmap> mapping = std::make_unique<llama_mmap>(file.get(), prefetch ? -1 : 0, is_numa); |
891 | 0 | mmaps_used.emplace_back(mapping->size(), 0); |
892 | 0 | if (mlock_mmaps) { |
893 | 0 | std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock()); |
894 | 0 | mlock_mmap->init(mapping->addr()); |
895 | 0 | mlock_mmaps->emplace_back(std::move(mlock_mmap)); |
896 | 0 | } |
897 | 0 | mappings.emplace_back(std::move(mapping)); |
898 | 0 | } |
899 | 0 | } |
900 | | |
901 | | // compute the total size of all tensors for progress reporting |
902 | 0 | for (const auto & it : weights_map) { |
903 | 0 | size_data += ggml_nbytes(it.second.tensor); |
904 | 0 | } |
905 | 0 | } |
906 | | |
907 | 0 | void llama_model_loader::get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const { |
908 | 0 | GGML_ASSERT(!mappings.empty()); |
909 | 0 | const auto & mapping = mappings.at(idx); |
910 | |
911 | 0 | *first = mapping->size(); |
912 | 0 | *last = 0; |
913 | 0 | *addr = mapping->addr(); |
914 | 0 | for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) { |
915 | 0 | const auto * weight = get_weight(ggml_get_name(tensor)); |
916 | 0 | if (!weight || weight->idx != idx) { |
917 | 0 | continue; |
918 | 0 | } |
919 | 0 | *first = std::min(*first, weight->offs); |
920 | 0 | *last = std::max(*last, weight->offs + ggml_nbytes(tensor)); |
921 | 0 | } |
922 | 0 | } |
923 | | |
924 | 0 | void llama_model_loader::load_data_for(struct ggml_tensor * cur) const { |
925 | 0 | const auto & w = require_weight(ggml_get_name(cur)); |
926 | |
927 | 0 | if (use_mmap) { |
928 | 0 | const auto & mapping = mappings.at(w.idx); |
929 | 0 | if (cur->data == nullptr) { |
930 | 0 | cur->data = (uint8_t *)mapping->addr() + w.offs; |
931 | 0 | } else { |
932 | 0 | memcpy(cur->data, (uint8_t *)mapping->addr() + w.offs, ggml_nbytes(cur)); |
933 | 0 | } |
934 | 0 | } else { |
935 | 0 | GGML_ASSERT(cur->data != nullptr); |
936 | 0 | GGML_ASSERT(w.idx < files.size()); |
937 | 0 | const auto & file = files.at(w.idx); |
938 | 0 | file->seek(w.offs, SEEK_SET); |
939 | 0 | file->read_raw(cur->data, ggml_nbytes(cur)); |
940 | 0 | } |
941 | |
942 | 0 | if (check_tensors && !ggml_validate_row_data(cur->type, cur->data, ggml_nbytes(cur))) { |
943 | 0 | throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur))); |
944 | 0 | } |
945 | 0 | } |
946 | | |
947 | | bool llama_model_loader::load_all_data( |
948 | | struct ggml_context * ctx, |
949 | | llama_buf_map & bufs, |
950 | | llama_mlocks * lmlocks, |
951 | | llama_progress_callback progress_callback, |
952 | 0 | void * progress_callback_user_data) { |
953 | 0 | GGML_ASSERT(size_data != 0 && "call init_mappings() first"); |
954 | |
955 | 0 | std::vector<no_init<uint8_t>> read_buf; |
956 | 0 | std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result; |
957 | | |
958 | | // 4 staging buffers for async uploads; 1 MB each seems to be a good default for single NVMe drives. |
959 | | // NVMe raid configurations might require more / larger buffers. |
960 | 0 | constexpr size_t n_buffers = 4; |
961 | |
962 | 0 | size_t alignment = 1; |
963 | 0 | for (const auto & file : files) { |
964 | 0 | alignment = std::max(file->read_alignment(), alignment); |
965 | 0 | } |
966 | | |
967 | | // Buffer size: balance between memory usage and I/O efficiency |
968 | | // 64MB works well for NVMe drives |
969 | 0 | const size_t buffer_size = alignment != 1 ? 64 * 1024 * 1024 + 2 * alignment : 1 * 1024 * 1024; |
970 | |
971 | 0 | std::vector<ggml_backend_buffer_t> host_buffers; |
972 | 0 | std::vector<ggml_backend_event_t> events; |
973 | 0 | std::vector<void *> host_ptrs; |
974 | 0 | size_t buffer_idx = 0; // buffer to use for async loads |
975 | 0 | ggml_backend_t upload_backend = [&](const char * func) -> ggml_backend_t { |
976 | 0 | if (use_mmap || check_tensors) { |
977 | 0 | return nullptr; |
978 | 0 | } |
979 | | // When not using mmapped I/O, use async uploads from pinned memory to GPU memory. |
980 | | // First determine if the backend supports the necessary features for async uploads. |
981 | 0 | auto * buf = bufs.count(0) ? bufs.at(0) : nullptr; |
982 | 0 | if (!buf) { |
983 | 0 | LLAMA_LOG_DEBUG("%s: no buffer found for async uploads\n", func); |
984 | 0 | return nullptr; |
985 | 0 | } |
986 | | |
987 | 0 | auto * buft = ggml_backend_buffer_get_type(buf); |
988 | 0 | auto * dev = ggml_backend_buft_get_device(buft); |
989 | 0 | if (!dev) { |
990 | 0 | LLAMA_LOG_DEBUG("%s: no device found for buffer type %s for async uploads\n", func, |
991 | 0 | ggml_backend_buft_name(buft)); |
992 | 0 | return nullptr; |
993 | 0 | } |
994 | | |
995 | 0 | if (buft != ggml_backend_dev_buffer_type(dev)) { |
996 | 0 | LLAMA_LOG_DEBUG("%s: buffer type %s is not the default buffer type for device %s for async uploads\n", func, |
997 | 0 | ggml_backend_buft_name(buft), ggml_backend_dev_name(dev)); |
998 | 0 | return nullptr; |
999 | 0 | } |
1000 | | |
1001 | 0 | ggml_backend_dev_props props; |
1002 | 0 | ggml_backend_dev_get_props(dev, &props); |
1003 | 0 | if (!props.caps.async || !props.caps.host_buffer || !props.caps.events) { |
1004 | 0 | LLAMA_LOG_DEBUG("%s: device %s does not support async, host buffers or events\n", func, |
1005 | 0 | ggml_backend_dev_name(dev)); |
1006 | 0 | return nullptr; |
1007 | 0 | } |
1008 | | |
1009 | 0 | auto * host_buft = ggml_backend_dev_host_buffer_type(dev); |
1010 | 0 | if (!host_buft) { |
1011 | 0 | LLAMA_LOG_DEBUG("%s: no host buffer type found for device %s\n", func, |
1012 | 0 | ggml_backend_dev_name(dev)); |
1013 | 0 | return nullptr; |
1014 | 0 | } |
1015 | | |
1016 | | // If the backend is supported, create pinned memory buffers and events for synchronisation. |
1017 | 0 | for (size_t idx = 0; idx < n_buffers; ++idx) { |
1018 | 0 | auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size); |
1019 | |
1020 | 0 | if (!buf) { |
1021 | 0 | LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func, |
1022 | 0 | ggml_backend_dev_name(dev)); |
1023 | 0 | return nullptr; |
1024 | 0 | } |
1025 | | |
1026 | 0 | host_buffers.emplace_back(buf); |
1027 | 0 | host_ptrs.emplace_back(ggml_backend_buffer_get_base(buf)); |
1028 | |
1029 | 0 | auto * event = ggml_backend_event_new(dev); |
1030 | 0 | if (!event) { |
1031 | 0 | LLAMA_LOG_DEBUG("%s: failed to create event for async uploads for device %s\n", func, |
1032 | 0 | ggml_backend_dev_name(dev)); |
1033 | 0 | return nullptr; |
1034 | 0 | } |
1035 | | |
1036 | 0 | events.emplace_back(event); |
1037 | 0 | } |
1038 | | |
1039 | 0 | ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); |
1040 | 0 | if (!backend) { |
1041 | 0 | LLAMA_LOG_DEBUG("%s: failed to initialize backend for device %s for async uploads\n", func, |
1042 | 0 | ggml_backend_dev_name(dev)); |
1043 | 0 | return nullptr; |
1044 | 0 | } |
1045 | | |
1046 | 0 | return backend; |
1047 | 0 | }(__func__); |
1048 | |
1049 | 0 | if (upload_backend) { |
1050 | 0 | LLAMA_LOG_DEBUG("%s: using async uploads for device %s, buffer type %s, backend %s\n", __func__, |
1051 | 0 | ggml_backend_dev_name(ggml_backend_get_device(upload_backend)), |
1052 | 0 | ggml_backend_buft_name(ggml_backend_buffer_get_type(bufs.at(0))), |
1053 | 0 | ggml_backend_name(upload_backend)); |
1054 | 0 | } |
 1055 | |
1056 | 0 | for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) { |
1057 | 0 | const auto * weight = get_weight(ggml_get_name(cur)); |
1058 | 0 | if (weight == nullptr) { |
1059 | | // this can happen with split experts models |
1060 | 0 | continue; |
1061 | 0 | } |
1062 | | |
1063 | 0 | if (progress_callback) { |
1064 | 0 | if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) { |
1065 | 0 | return false; |
1066 | 0 | } |
1067 | 0 | } |
1068 | | |
1069 | 0 | size_t n_size = ggml_nbytes(cur); |
 1070 | |
1071 | 0 | if (use_mmap) { |
1072 | 0 | const auto & mapping = mappings.at(weight->idx); |
1073 | 0 | ggml_backend_buffer_t buf_mmap = nullptr; |
1074 | 0 | if (bufs.count(weight->idx)) { |
1075 | 0 | buf_mmap = bufs.at(weight->idx); |
1076 | 0 | } |
1077 | 0 | uint8_t * data = (uint8_t *) mapping->addr() + weight->offs; |
 1078 | |
1079 | 0 | if (check_tensors) { |
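 | |                 // validate the tensor's data on a background thread; the futures are collected and checked after the load loop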
1080 | 0 | validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] { |
1081 | 0 | return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size)); |
1082 | 0 | })); |
1083 | 0 | } |
 1084 | |
1085 | 0 | GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated |
1086 | 0 | if (buf_mmap && cur->data == nullptr) { |
1087 | 0 | ggml_backend_tensor_alloc(buf_mmap, cur, data); |
1088 | 0 | if (lmlocks) { |
1089 | 0 | const auto & lmlock = lmlocks->at(weight->idx); |
1090 | 0 | lmlock->grow_to(weight->offs + n_size); |
1091 | 0 | } |
 1092 | |
1093 | 0 | auto & mmap_used = mmaps_used[weight->idx]; |
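 | |                 // track the byte range of the mapping actually used by tensors, so unused head/tail fragments can be unmapped during final cleanup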
1094 | 0 | mmap_used.first = std::min(mmap_used.first, weight->offs); |
1095 | 0 | mmap_used.second = std::max(mmap_used.second, weight->offs + n_size); |
1096 | 0 | } else { |
1097 | 0 | ggml_backend_tensor_set(cur, data, 0, n_size); |
1098 | 0 | } |
1099 | 0 | } else { |
1100 | 0 | const auto & file = files.at(weight->idx); |
 1101 | |
1102 | 0 | if (ggml_backend_buffer_is_host(cur->buffer)) { |
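 | |                 // host (CPU-addressable) buffer: read the tensor data straight into place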
1103 | 0 | file->read_raw_at(cur->data, n_size, weight->offs); |
1104 | 0 | if (check_tensors) { |
1105 | 0 | validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] { |
1106 | 0 | return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size)); |
1107 | 0 | })); |
1108 | 0 | } |
1109 | 0 | } else { |
 1110 | |                 // If upload_backend is valid, load the tensor in chunks into pinned memory and upload the buffers asynchronously to the GPU.
1111 | 0 | if (upload_backend) { |
1112 | 0 | size_t offset = weight->offs; |
1113 | 0 | alignment = file->read_alignment(); |
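 | |                     // read_alignment() reports the file's required read alignment; the raw reads below are issued on aligned offsets and sizes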
1114 | 0 | size_t aligned_offset = offset & ~(alignment - 1); |
1115 | 0 | size_t offset_from_alignment = offset - aligned_offset; |
1116 | 0 | file->seek(aligned_offset, SEEK_SET); |
1117 | | |
1118 | | // Calculate aligned read boundaries |
1119 | 0 | size_t read_start = aligned_offset; |
1120 | 0 | size_t read_end = (offset + n_size + alignment - 1) & ~(alignment - 1); |
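 | |                     // alignment is assumed to be a power of two: masking with ~(alignment - 1) rounds down, adding (alignment - 1) first rounds up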
 1121 | |
1122 | 0 | size_t bytes_read = 0; |
1123 | 0 | size_t data_read = 0; // Actual tensor data copied (excluding padding) |
 1124 | |
1125 | 0 | while (bytes_read < read_end - read_start) { |
1126 | 0 | size_t read_size = std::min<size_t>(buffer_size, read_end - read_start - bytes_read); |
1127 | | |
1128 | | // Align the destination pointer within the pinned buffer |
1129 | 0 | uintptr_t ptr_dest_aligned = (reinterpret_cast<uintptr_t>(host_ptrs[buffer_idx]) + alignment - 1) & ~(alignment - 1); |
1130 | | |
1131 | | // Wait for previous upload to complete before reusing buffer |
1132 | 0 | ggml_backend_event_synchronize(events[buffer_idx]); |
1133 | | |
1134 | | // Read aligned chunk from file |
1135 | 0 | file->read_raw(reinterpret_cast<void *>(ptr_dest_aligned), read_size); |
1136 | | |
1137 | | // Calculate actual data portion (excluding alignment padding) |
1138 | 0 | uintptr_t ptr_data = ptr_dest_aligned; |
1139 | 0 | size_t data_to_copy = read_size; |
1140 | | |
1141 | | // Skip alignment padding at start of first chunk |
1142 | 0 | if (bytes_read == 0) { |
1143 | 0 | ptr_data += offset_from_alignment; |
1144 | 0 | data_to_copy -= offset_from_alignment; |
1145 | 0 | } |
1146 | | |
1147 | | // Trim alignment padding at end of last chunk |
1148 | 0 | if (aligned_offset + bytes_read + read_size > offset + n_size) { |
1149 | 0 | data_to_copy -= (read_end - (offset + n_size)); |
1150 | 0 | } |
1151 | | |
1152 | | // Async upload actual data to GPU |
1153 | 0 | ggml_backend_tensor_set_async(upload_backend, cur, |
1154 | 0 | reinterpret_cast<void *>(ptr_data), data_read, data_to_copy); |
1155 | 0 | ggml_backend_event_record(events[buffer_idx], upload_backend); |
 1156 | |
1157 | 0 | data_read += data_to_copy; |
1158 | 0 | bytes_read += read_size; |
 1159 | |
1160 | 0 | ++buffer_idx; |
1161 | 0 | buffer_idx %= n_buffers; |
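 | |                         // round-robin through the pinned staging buffers; the event sync at the top of the loop ensures a buffer is only reused after its upload has completed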
1162 | 0 | } |
1163 | 0 | } else { |
1164 | 0 | read_buf.resize(n_size); |
1165 | 0 | file->read_raw_at(read_buf.data(), n_size, weight->offs); |
1166 | 0 | ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size); |
1167 | 0 | if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) { |
1168 | 0 | throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur))); |
1169 | 0 | } |
1170 | 0 | } |
1171 | 0 | } |
1172 | 0 | } |
1173 | | |
1174 | 0 | size_done += n_size; |
1175 | 0 | } |
1176 | | |
1177 | | // free temporary resources used for async uploads |
1178 | 0 | for (auto * event : events) { |
1179 | 0 | ggml_backend_event_synchronize(event); |
1180 | 0 | ggml_backend_event_free(event); |
1181 | 0 | } |
1182 | 0 | for (auto * buf : host_buffers) { |
1183 | 0 | ggml_backend_buffer_free(buf); |
1184 | 0 | } |
1185 | 0 | ggml_backend_free(upload_backend); |
1186 | | |
1187 | | // check validation results |
1188 | 0 | bool validation_failed = false; |
1189 | 0 | for (auto & future : validation_result) { |
1190 | 0 | auto result = future.get(); |
1191 | 0 | if (!result.second) { |
1192 | 0 | LLAMA_LOG_ERROR("%s: tensor '%s' has invalid data\n", __func__, ggml_get_name(result.first)); |
1193 | 0 | validation_failed = true; |
1194 | 0 | } |
1195 | 0 | } |
1196 | 0 | if (validation_failed) { |
1197 | 0 | throw std::runtime_error("found tensors with invalid data"); |
1198 | 0 | } |
1199 | | |
1200 | | // check if this is the last call and do final cleanup |
1201 | 0 | if (size_done >= size_data) { |
1202 | | // unmap offloaded tensors and metadata |
1203 | 0 | if (use_mmap) { |
1204 | 0 | for (uint32_t idx = 0; idx < mappings.size(); idx++) { |
1205 | 0 | const auto & mmap_used = mmaps_used.at(idx); |
1206 | 0 | auto & mapping = mappings.at(idx); |
1207 | 0 | mapping->unmap_fragment(0, mmap_used.first); |
1208 | 0 | if (mmap_used.second != 0) { |
1209 | 0 | mapping->unmap_fragment(mmap_used.second, mapping->size()); |
1210 | 0 | } |
1211 | 0 | } |
1212 | 0 | } |
1213 | 0 | if (progress_callback) { |
1214 | | // Even though the model is done loading, we still honor |
1215 | | // cancellation since we need to free allocations. |
1216 | 0 | return progress_callback(1.0f, progress_callback_user_data); |
1217 | 0 | } |
1218 | 0 | } |
1219 | | |
1220 | 0 | return true; |
1221 | 0 | } |
1222 | | |
1223 | 0 | std::string llama_model_loader::ftype_name() const { |
1224 | 0 | return llama_model_ftype_name(ftype); |
1225 | 0 | } |
1226 | | |
1227 | 0 | void llama_model_loader::print_info() const { |
1228 | 0 | LLAMA_LOG_INFO("%s: file format = %s\n", __func__, llama_file_version_name(fver)); |
1229 | 0 | LLAMA_LOG_INFO("%s: file type = %s\n", __func__, llama_model_ftype_name(ftype).c_str()); |
1230 | 0 | if (n_bytes < GiB) { |
1231 | 0 | LLAMA_LOG_INFO("%s: file size = %.2f MiB (%.2f BPW) \n", __func__, n_bytes/1024.0/1024.0, n_bytes*8.0/n_elements); |
1232 | 0 | } else { |
1233 | 0 | LLAMA_LOG_INFO("%s: file size = %.2f GiB (%.2f BPW) \n", __func__, n_bytes/1024.0/1024.0/1024.0, n_bytes*8.0/n_elements); |
1234 | 0 | } |
1235 | 0 | } |