/src/llama.cpp/src/llama-model-loader.cpp
Line | Count | Source |
1 | | #include "llama-model-loader.h" |
2 | | |
3 | | #include "ggml.h" |
4 | | |
5 | | #include <algorithm> |
6 | | #include <array> |
7 | | #include <cinttypes> |
8 | | #include <cstring> |
9 | | #include <future> |
10 | | |
11 | | static const size_t kiB = 1024; |
12 | | static const size_t MiB = 1024*kiB; |
13 | | static const size_t GiB = 1024*MiB; |
14 | | |
15 | 180 | const char * llama_file_version_name(llama_fver version) { |
16 | 180 | switch (version) { |
17 | 0 | case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)"; |
18 | 10 | case GGUF_FILE_VERSION_V2: return "GGUF V2"; |
19 | 170 | case GGUF_FILE_VERSION_V3: return "GGUF V3 (latest)"; |
20 | 180 | } |
21 | | |
22 | 0 | return "unknown"; |
23 | 180 | } |
24 | | |
25 | 180 | static std::string llama_model_ftype_name(llama_ftype ftype) { |
26 | 180 | if (ftype & LLAMA_FTYPE_GUESSED) { |
27 | 90 | return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)"; |
28 | 90 | } |
29 | | |
30 | 90 | switch (ftype) { |
31 | 89 | case LLAMA_FTYPE_ALL_F32: return "all F32"; |
32 | 1 | case LLAMA_FTYPE_MOSTLY_F16: return "F16"; |
33 | 0 | case LLAMA_FTYPE_MOSTLY_BF16: return "BF16"; |
34 | 0 | case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0"; |
35 | 0 | case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1"; |
36 | 0 | case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0"; |
37 | 0 | case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1"; |
38 | 0 | case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0"; |
39 | 0 | case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: return "MXFP4 MoE"; |
40 | 0 | case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium"; |
41 | 0 | case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small"; |
42 | 0 | case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small"; |
43 | 0 | case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium"; |
44 | 0 | case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large"; |
45 | 0 | case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small"; |
46 | 0 | case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium"; |
47 | 0 | case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small"; |
48 | 0 | case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium"; |
49 | 0 | case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K"; |
50 | 0 | case LLAMA_FTYPE_MOSTLY_TQ1_0: return "TQ1_0 - 1.69 bpw ternary"; |
51 | 0 | case LLAMA_FTYPE_MOSTLY_TQ2_0: return "TQ2_0 - 2.06 bpw ternary"; |
52 | 0 | case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw"; |
53 | 0 | case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw"; |
54 | 0 | case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw"; |
55 | 0 | case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw"; |
56 | 0 | case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw"; |
57 | 0 | case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw"; |
58 | 0 | case LLAMA_FTYPE_MOSTLY_IQ1_S: return "IQ1_S - 1.5625 bpw"; |
59 | 0 | case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw"; |
60 | 0 | case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw"; |
61 | 0 | case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw"; |
62 | 0 | case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw"; |
63 | 0 | case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw"; |
64 | | |
65 | 0 | default: return "unknown, may not work"; |
66 | 90 | } |
67 | 90 | } |
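As a quick illustration of the GUESSED handling above (a sketch, not part of the coverage data): the flag bit is masked off before the switch and the " (guessed)" suffix is appended afterwards.

    // illustrative only: the GUESSED bit composes with a base ftype
    llama_ftype ft = (llama_ftype) (LLAMA_FTYPE_MOSTLY_F16 | LLAMA_FTYPE_GUESSED);
    // llama_model_ftype_name(ft) would yield "F16 (guessed)"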
68 | | |
69 | | // return a list of splits for a given path |
70 | | // for example, given "<name>-00002-of-00004.gguf", returns list of all 4 splits |
71 | 0 | static std::vector<std::string> llama_get_list_splits(const std::string & path, const int idx, const int n_split) { |
72 | 0 | std::vector<std::string> paths; |
73 | 0 | std::string split_prefix; |
74 | 0 | std::vector<char> buf(llama_path_max(), 0); |
75 | |
76 | 0 | { |
77 | 0 | int ret = llama_split_prefix(buf.data(), buf.size(), path.c_str(), idx, n_split); |
78 | 0 | if (!ret) { |
79 | 0 | throw std::runtime_error(format("invalid split file name: %s", path.c_str())); |
80 | 0 | } |
81 | 0 | split_prefix = std::string(buf.data(), ret); |
82 | 0 | } |
83 | | |
84 | 0 | if (split_prefix.empty()) { |
85 | 0 | throw std::runtime_error(format("invalid split file: %s", path.c_str())); |
86 | 0 | } |
87 | | |
88 | 0 | for (int idx = 0; idx < n_split; ++idx) { |
89 | 0 | int ret = llama_split_path(buf.data(), buf.size(), split_prefix.c_str(), idx, n_split); |
90 | 0 | paths.push_back(std::string(buf.data(), ret)); |
91 | 0 | } |
92 | |
93 | 0 | return paths; |
94 | 0 | } |
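A minimal sketch of the split-name round trip used above, assuming the 0-based split index convention of llama_split_prefix()/llama_split_path(); the file name is illustrative:

    char prefix[256];
    char path[256];
    // "model-00002-of-00004.gguf" with idx = 1, n_split = 4 -> prefix "model"
    if (llama_split_prefix(prefix, sizeof(prefix), "model-00002-of-00004.gguf", 1, 4)) {
        // rebuild the first split: "model-00001-of-00004.gguf"
        llama_split_path(path, sizeof(path), prefix, 0, 4);
    }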
95 | | |
96 | | namespace GGUFMeta { |
97 | | template <typename T, gguf_type gt_, T (*gfun)(const gguf_context *, const int64_t)> |
98 | | struct GKV_Base_Type { |
99 | | static constexpr gguf_type gt = gt_; |
100 | | |
101 | 0 | static T getter(const gguf_context * ctx, const int kid) { |
102 | 0 | return gfun(ctx, kid); |
103 | 0 | } |
104 | | }; |
105 | | |
106 | | template<typename T> struct GKV_Base; |
107 | | |
108 | | template<> struct GKV_Base<bool >: GKV_Base_Type<bool, GGUF_TYPE_BOOL, gguf_get_val_bool> {}; |
109 | | template<> struct GKV_Base<uint8_t >: GKV_Base_Type<uint8_t, GGUF_TYPE_UINT8, gguf_get_val_u8 > {}; |
110 | | template<> struct GKV_Base<uint16_t >: GKV_Base_Type<uint16_t, GGUF_TYPE_UINT16, gguf_get_val_u16 > {}; |
111 | | template<> struct GKV_Base<uint32_t >: GKV_Base_Type<uint32_t, GGUF_TYPE_UINT32, gguf_get_val_u32 > {}; |
112 | | template<> struct GKV_Base<uint64_t >: GKV_Base_Type<uint64_t, GGUF_TYPE_UINT64, gguf_get_val_u64 > {}; |
113 | | template<> struct GKV_Base<int8_t >: GKV_Base_Type<int8_t, GGUF_TYPE_INT8, gguf_get_val_i8 > {}; |
114 | | template<> struct GKV_Base<int16_t >: GKV_Base_Type<int16_t, GGUF_TYPE_INT16, gguf_get_val_i16 > {}; |
115 | | template<> struct GKV_Base<int32_t >: GKV_Base_Type<int32_t, GGUF_TYPE_INT32, gguf_get_val_i32 > {}; |
116 | | template<> struct GKV_Base<int64_t >: GKV_Base_Type<int64_t, GGUF_TYPE_INT64, gguf_get_val_i64 > {}; |
117 | | template<> struct GKV_Base<float >: GKV_Base_Type<float, GGUF_TYPE_FLOAT32, gguf_get_val_f32 > {}; |
118 | | template<> struct GKV_Base<double >: GKV_Base_Type<double, GGUF_TYPE_FLOAT64, gguf_get_val_f64 > {}; |
119 | | template<> struct GKV_Base<const char *>: GKV_Base_Type<const char *, GGUF_TYPE_STRING, gguf_get_val_str > {}; |
120 | | |
121 | | template<> struct GKV_Base<std::string> { |
122 | | static constexpr gguf_type gt = GGUF_TYPE_STRING; |
123 | | |
124 | 7 | static std::string getter(const gguf_context * ctx, const int kid) { |
125 | 7 | return gguf_get_val_str(ctx, kid); |
126 | 7 | } |
127 | | }; |
128 | | |
129 | | struct ArrayInfo { |
130 | | const gguf_type gt; |
131 | | const size_t length; |
132 | | const void * data; |
133 | | }; |
134 | | |
135 | | template<> struct GKV_Base<ArrayInfo> { |
136 | | public: |
137 | | static constexpr gguf_type gt = GGUF_TYPE_ARRAY; |
138 | 0 | static ArrayInfo getter(const gguf_context *ctx, const int k) { |
139 | 0 | const enum gguf_type arr_type = gguf_get_arr_type(ctx, k); |
140 | 0 | return ArrayInfo { |
141 | 0 | arr_type, |
142 | 0 | size_t(gguf_get_arr_n(ctx, k)), |
143 | 0 | arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx, k), |
144 | 0 | }; |
145 | 0 | } |
146 | | }; |
147 | | |
148 | | template<typename T> |
149 | | class GKV : public GKV_Base<T> { |
150 | | GKV() = delete; |
151 | | |
152 | | public: |
153 | 7 | static T get_kv(const gguf_context * ctx, const int k) { |
154 | 7 | const enum gguf_type kt = gguf_get_kv_type(ctx, k); |
155 | | |
156 | 7 | if (kt != GKV::gt) { |
157 | 0 | throw std::runtime_error(format("key %s has wrong type %s but expected type %s", |
158 | 0 | gguf_get_key(ctx, k), gguf_type_name(kt), gguf_type_name(GKV::gt))); |
159 | 0 | } |
160 | 7 | return GKV::getter(ctx, k); |
161 | 7 | } |
162 | | |
163 | 0 | static const char * override_type_to_str(const llama_model_kv_override_type ty) { |
164 | 0 | switch (ty) { |
165 | 0 | case LLAMA_KV_OVERRIDE_TYPE_BOOL: return "bool"; |
166 | 0 | case LLAMA_KV_OVERRIDE_TYPE_INT: return "int"; |
167 | 0 | case LLAMA_KV_OVERRIDE_TYPE_FLOAT: return "float"; |
168 | 0 | case LLAMA_KV_OVERRIDE_TYPE_STR: return "str"; |
169 | 0 | } |
170 | 0 | return "unknown"; |
171 | 0 | } |
172 | | |
173 | 410 | static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override * ovrd) { |
174 | 410 | if (!ovrd) { return false; } |
175 | 0 | if (ovrd->tag == expected_type) { |
176 | 0 | LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ", |
177 | 0 | __func__, override_type_to_str(ovrd->tag), ovrd->key); |
178 | 0 | switch (ovrd->tag) { |
179 | 0 | case LLAMA_KV_OVERRIDE_TYPE_BOOL: { |
180 | 0 | LLAMA_LOG_INFO("%s\n", ovrd->val_bool ? "true" : "false"); |
181 | 0 | } break; |
182 | 0 | case LLAMA_KV_OVERRIDE_TYPE_INT: { |
183 | 0 | LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->val_i64); |
184 | 0 | } break; |
185 | 0 | case LLAMA_KV_OVERRIDE_TYPE_FLOAT: { |
186 | 0 | LLAMA_LOG_INFO("%.6f\n", ovrd->val_f64); |
187 | 0 | } break; |
188 | 0 | case LLAMA_KV_OVERRIDE_TYPE_STR: { |
189 | 0 | LLAMA_LOG_INFO("%s\n", ovrd->val_str); |
190 | 0 | } break; |
191 | 0 | default: |
192 | | // Shouldn't be possible to end up here, but just in case... |
193 | 0 | throw std::runtime_error( |
194 | 0 | format("Unsupported attempt to override %s type for metadata key %s\n", |
195 | 0 | override_type_to_str(ovrd->tag), ovrd->key)); |
196 | 0 | } |
197 | 0 | return true; |
198 | 0 | } |
199 | 0 | LLAMA_LOG_WARN("%s: Warning: Bad metadata override type for key '%s', expected %s but got %s\n", |
200 | 0 | __func__, ovrd->key, override_type_to_str(expected_type), override_type_to_str(ovrd->tag)); |
201 | 0 | return false; |
202 | 0 | } |
203 | | |
204 | | template<typename OT> |
205 | | static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type |
206 | 0 | try_override(OT & target, const struct llama_model_kv_override * ovrd) { |
207 | 0 | if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, ovrd)) { |
208 | 0 | target = ovrd->val_bool; |
209 | 0 | return true; |
210 | 0 | } |
211 | 0 | return false; |
212 | 0 | } |
213 | | |
214 | | template<typename OT> |
215 | | static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type |
216 | 185 | try_override(OT & target, const struct llama_model_kv_override * ovrd) { |
217 | 185 | if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, ovrd)) { |
218 | 0 | target = ovrd->val_i64; |
219 | 0 | return true; |
220 | 0 | } |
221 | 185 | return false; |
222 | 185 | } |
223 | | |
224 | | template<typename OT> |
225 | | static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type |
226 | 0 | try_override(T & target, const struct llama_model_kv_override * ovrd) { |
227 | 0 | if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, ovrd)) { |
228 | 0 | target = ovrd->val_f64; |
229 | 0 | return true; |
230 | 0 | } |
231 | 0 | return false; |
232 | 0 | } |
233 | | |
234 | | template<typename OT> |
235 | | static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type |
236 | 225 | try_override(T & target, const struct llama_model_kv_override * ovrd) { |
237 | 225 | if (validate_override(LLAMA_KV_OVERRIDE_TYPE_STR, ovrd)) { |
238 | 0 | target = ovrd->val_str; |
239 | 0 | return true; |
240 | 0 | } |
241 | 225 | return false; |
242 | 225 | } |
243 | | |
244 | 410 | static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) { |
245 | 410 | if (try_override<T>(target, ovrd)) { |
246 | 0 | return true; |
247 | 0 | } |
248 | 410 | if (k < 0) { return false; } |
249 | 7 | target = get_kv(ctx, k); |
250 | 7 | return true; |
251 | 410 | } |
252 | | |
253 | 410 | static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override * ovrd = nullptr) { |
254 | 410 | return set(ctx, gguf_find_key(ctx, key), target, ovrd); |
255 | 410 | } |
256 | | |
257 | 410 | static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override * ovrd = nullptr) { |
258 | 410 | return set(ctx, key.c_str(), target, ovrd); |
259 | 410 | } |
260 | | }; |
261 | | } |
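For orientation, a hypothetical call into the helpers above: GKV&lt;T&gt;::set() first consults the override, then falls back to the GGUF key (the context variable and key are illustrative):

    // given: const gguf_context * meta_ctx (e.g. from gguf_init_from_file)
    std::string model_name;
    // reads "general.name" from the metadata unless a matching override supplies the value
    bool found = GGUFMeta::GKV<std::string>::set(meta_ctx, "general.name", model_name, /*ovrd=*/nullptr);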
262 | | |
263 | | template<typename T> |
264 | | typename std::enable_if<std::is_integral<T>::value, bool>::type |
265 | 0 | llama_model_loader::get_arr_n(const std::string & key, T & result, bool required) { |
266 | 0 | const int kid = gguf_find_key(meta.get(), key.c_str()); |
267 | |
268 | 0 | if (kid < 0) { |
269 | 0 | if (required) { |
270 | 0 | throw std::runtime_error(format("key not found in model: %s", key.c_str())); |
271 | 0 | } |
272 | 0 | return false; |
273 | 0 | } |
274 | | |
275 | 0 | struct GGUFMeta::ArrayInfo arr_info = |
276 | 0 | GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid); |
277 | | |
278 | |
279 | 0 | result = arr_info.length; |
280 | 0 | return true; |
281 | 0 | } |
282 | | |
283 | | template<typename T> |
284 | | typename std::enable_if<std::is_integral<T>::value, bool>::type |
285 | 0 | llama_model_loader::get_arr_n(enum llm_kv kid, T & result, bool required) { |
286 | 0 | return get_arr_n(llm_kv(kid), result, required); |
287 | 0 | } |
288 | | |
289 | | template bool llama_model_loader::get_arr_n(enum llm_kv kid, uint32_t & result, bool required); |
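A hedged usage sketch for get_arr_n(); the loader instance and key are illustrative. It only reports the array length, not the contents:

    // given: llama_model_loader & ml
    uint32_t n_tokens = 0;
    // reports only the array length; returns false (no throw) because required == false
    ml.get_arr_n("tokenizer.ggml.tokens", n_tokens, /*required=*/false);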
290 | | |
291 | | template<typename T> |
292 | 0 | bool llama_model_loader::get_arr(const std::string & key, std::vector<T> & result, bool required) { |
293 | 0 | const gguf_context * ctx = meta.get(); |
294 | 0 | const int kid = gguf_find_key(ctx, key.c_str()); |
295 | |
296 | 0 | if (kid < 0 || gguf_get_kv_type(ctx, kid) != GGUF_TYPE_ARRAY) { |
297 | 0 | if (required) { |
298 | 0 | throw std::runtime_error(format("array key not found in model: %s", key.c_str())); |
299 | 0 | } |
300 | 0 | return false; |
301 | 0 | } |
302 | | |
303 | 0 | struct GGUFMeta::ArrayInfo arr_info = |
304 | 0 | GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx, kid); |
305 | |
306 | 0 | switch (arr_info.gt) { |
307 | 0 | case GGUF_TYPE_UINT32: |
308 | 0 | case GGUF_TYPE_INT32: GGML_ASSERT((std::is_same<T, int32_t>::value) || |
309 | 0 | (std::is_same<T, uint32_t>::value)); break; |
310 | 0 | case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break; |
311 | 0 | case GGUF_TYPE_STRING: GGML_ASSERT((std::is_same<T, std::string>::value)); break; |
312 | 0 | default: |
313 | 0 | throw std::runtime_error(format("%s is not a string/float32/uint32/int32 array", key.c_str())); |
314 | 0 | } |
315 | | |
316 | 0 | if constexpr (std::is_same<T, std::string>::value) { |
317 | 0 | const size_t n_items = gguf_get_arr_n(ctx, kid); |
318 | 0 | result.clear(); |
319 | |
320 | 0 | for (size_t i = 0; i < n_items; i++) { |
321 | 0 | const T value = gguf_get_arr_str(ctx, kid, i); |
322 | 0 | result.emplace_back(value); |
323 | 0 | } |
324 | | } else { |
325 | | result.resize(arr_info.length); |
326 | | result.assign((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length); |
327 | | } |
328 | |
329 | 0 | return true; |
330 | 0 | } |
331 | | |
332 | | template<typename T, size_t N_MAX> |
333 | 0 | bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) { |
334 | 0 | const gguf_context * ctx = meta.get(); |
335 | 0 | const int kid = gguf_find_key(ctx, key.c_str()); |
336 | |
337 | 0 | if (kid < 0 || gguf_get_kv_type(ctx, kid) != GGUF_TYPE_ARRAY) { |
338 | 0 | if (required) { |
339 | 0 | throw std::runtime_error(format("array key not found in model: %s", key.c_str())); |
340 | 0 | } |
341 | 0 | return false; |
342 | 0 | } |
343 | | |
344 | 0 | struct GGUFMeta::ArrayInfo arr_info = |
345 | 0 | GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx, kid); |
346 | |
347 | 0 | switch (arr_info.gt) { |
348 | 0 | case GGUF_TYPE_BOOL: |
349 | 0 | case GGUF_TYPE_UINT32: |
350 | 0 | case GGUF_TYPE_INT32: GGML_ASSERT((std::is_same<T, int32_t>::value) || |
351 | 0 | (std::is_same<T, uint32_t>::value)); break; |
352 | 0 | case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break; |
353 | 0 | case GGUF_TYPE_STRING: GGML_ASSERT((std::is_same<T, std::string>::value)); break; |
354 | 0 | default: |
355 | 0 | throw std::runtime_error(format("%s is not a string/float32/uint32/int32 array", key.c_str())); |
356 | 0 | } |
357 | | |
358 | 0 | if (arr_info.length > N_MAX) { |
359 | 0 | throw std::runtime_error(format("array length %u for key %s exceeds max %u", (uint32_t) arr_info.length, key.c_str(), (uint32_t) N_MAX)); |
360 | 0 | } |
361 | | |
362 | | if constexpr (std::is_same<T, std::string>::value) { |
363 | | const size_t n_items = gguf_get_arr_n(ctx, kid); |
364 | | |
365 | | for (size_t i = 0; i < n_items; i++) { |
366 | | const T value = gguf_get_arr_str(ctx, kid, i); |
367 | | result[i] = value; |
368 | | } |
369 | 0 | } else { |
370 | 0 | if (arr_info.gt == GGUF_TYPE_BOOL) { |
371 | 0 | std::transform((const bool *)arr_info.data, (const bool *)arr_info.data + arr_info.length, result.begin(), [](bool x) { |
372 | 0 | return static_cast<T>(x); |
373 | 0 | }); |
374 | 0 | } else { |
375 | 0 | std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin()); |
376 | 0 | } |
377 | 0 | } |
378 | |
379 | 0 | return true; |
380 | 0 | } |
381 | | |
382 | | template<typename T> |
383 | 0 | bool llama_model_loader::get_arr(enum llm_kv kid, T & result, bool required) { |
384 | 0 | return get_arr(llm_kv(kid), result, required); |
385 | 0 | } |
386 | | |
387 | | template bool llama_model_loader::get_arr<std::vector<std::string>>(enum llm_kv kid, std::vector<std::string> & result, bool required); |
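Correspondingly, get_arr() copies the whole array out; a sketch assuming a loader instance `ml` and an illustrative key:

    // given: llama_model_loader & ml
    std::vector<std::string> merges;
    // string arrays are copied element by element, other element types are bulk-assigned
    ml.get_arr("tokenizer.ggml.merges", merges, /*required=*/false);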
388 | | |
389 | | template<typename T> |
390 | 410 | bool llama_model_loader::get_key(const std::string & key, T & result, bool required) { |
391 | 410 | auto it = kv_overrides.find(key); |
392 | | |
393 | 410 | const struct llama_model_kv_override * override = |
394 | 410 | it != kv_overrides.end() ? &it->second : nullptr; |
395 | | |
396 | 410 | const bool found = GGUFMeta::GKV<T>::set(meta.get(), key, result, override); |
397 | | |
398 | 410 | if (required && !found) { |
399 | 5 | throw std::runtime_error(format("key not found in model: %s", key.c_str())); |
400 | 5 | } |
401 | | |
402 | 405 | return found; |
403 | 410 | } |
404 | | |
405 | | template<typename T> |
406 | 100 | bool llama_model_loader::get_key(enum llm_kv kid, T & result, bool required) { |
407 | 100 | return get_key(llm_kv(kid), result, required); |
408 | 100 | } |
409 | | |
410 | | template bool llama_model_loader::get_key<bool> (enum llm_kv kid, bool & result, bool required); |
411 | | template bool llama_model_loader::get_key<float> (enum llm_kv kid, float & result, bool required); |
412 | | template bool llama_model_loader::get_key<uint32_t> (enum llm_kv kid, uint32_t & result, bool required); |
413 | | template bool llama_model_loader::get_key<std::string>(enum llm_kv kid, std::string & result, bool required); |
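A small sketch of the typical get_key() call pattern (loader instance assumed; key constants from llm_kv): required keys throw, optional ones simply return false.

    // given: llama_model_loader & ml
    uint32_t n_ctx_train = 0;
    ml.get_key(LLM_KV_CONTEXT_LENGTH, n_ctx_train);                     // required: throws if absent
    std::string general_name;
    ml.get_key(LLM_KV_GENERAL_NAME, general_name, /*required=*/false);  // optional: returns false if absent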
414 | | |
415 | | template<> |
416 | 0 | bool llama_model_loader::get_key(enum llm_kv kid, enum llama_pooling_type & result, bool required) { |
417 | 0 | uint32_t tmp; |
418 | 0 | const bool found = get_key(kid, tmp, required); |
419 | 0 | if (found) { |
420 | 0 | result = (enum llama_pooling_type) tmp; |
421 | 0 | } else { |
422 | 0 | result = LLAMA_POOLING_TYPE_UNSPECIFIED; |
423 | 0 | } |
424 | 0 | return found; |
425 | 0 | } |
426 | | |
427 | | // get array of n <= N_MAX elements, or a single element repeated n times |
428 | | template<typename T, size_t N_MAX> |
429 | 0 | bool llama_model_loader::get_key_or_arr(const std::string & key, std::array<T, N_MAX> & result, uint32_t n, bool required) { |
430 | 0 | const int kid = gguf_find_key(meta.get(), key.c_str()); |
431 | |
432 | 0 | if (kid < 0) { |
433 | 0 | if (required) { |
434 | 0 | throw std::runtime_error(format("key not found in model: %s", key.c_str())); |
435 | 0 | } |
436 | 0 | return false; |
437 | 0 | } |
438 | | |
439 | 0 | if (n > N_MAX) { |
440 | 0 | throw std::runtime_error(format("n > N_MAX: %u > %u for key %s", (uint32_t) n, (uint32_t) N_MAX, key.c_str())); |
441 | 0 | } |
442 | | |
443 | 0 | if (gguf_get_kv_type(meta.get(), kid) == GGUF_TYPE_ARRAY) { |
444 | 0 | struct GGUFMeta::ArrayInfo arr_info = |
445 | 0 | GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid); |
446 | |
447 | 0 | if (n != arr_info.length) { |
448 | 0 | throw std::runtime_error(format("key %s has wrong array length; expected %u, got %u", key.c_str(), n, (uint32_t) arr_info.length)); |
449 | 0 | } |
450 | | |
451 | 0 | return get_arr(key, result, required); |
452 | 0 | } |
453 | | |
454 | 0 | T value; |
455 | |
456 | 0 | bool ok = get_key(key, value, required); |
457 | 0 | if (!ok) { |
458 | 0 | return false; |
459 | 0 | } |
460 | | |
461 | 0 | for (uint32_t i = 0; i < n; i++) { |
462 | 0 | result[i] = value; |
463 | 0 | } |
464 | |
465 | 0 | return true; |
466 | 0 | } |
467 | | |
468 | | template<typename T> |
469 | 0 | bool llama_model_loader::get_key_or_arr(enum llm_kv kid, T & result, uint32_t n, bool required) { |
470 | 0 | return get_key_or_arr(llm_kv(kid), result, n, required); |
471 | 0 | } |
472 | | |
473 | 0 | bool llama_model_loader::get_key_or_arr(enum llm_kv kid, uint32_t & result, bool required) { |
474 | 0 | const std::string key = llm_kv(kid); |
475 | |
476 | 0 | const int id = gguf_find_key(meta.get(), key.c_str()); |
477 | |
478 | 0 | if (id < 0) { |
479 | 0 | if (required) { |
480 | 0 | throw std::runtime_error(format("key not found in model: %s", key.c_str())); |
481 | 0 | } |
482 | 0 | return false; |
483 | 0 | } |
484 | | |
485 | | // throw an error if the type is an array |
486 | 0 | if (gguf_get_kv_type(meta.get(), id) == GGUF_TYPE_ARRAY) { |
487 | 0 | if (required) { |
488 | 0 | throw std::runtime_error(format("expected scalar, found array for key: %s", key.c_str())); |
489 | 0 | } |
490 | 0 | return false; |
491 | 0 | } |
492 | | |
493 | 0 | return get_key(key, result, required); |
494 | 0 | } |
495 | | |
496 | | // TODO: this is not very clever - figure out something better |
497 | | template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required); |
498 | | template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required); |
499 | | template bool llama_model_loader::get_key_or_arr<std::array<float, 512>>(enum llm_kv kid, std::array<float, 512> & result, uint32_t n, bool required); |
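The per-layer hyperparameter pattern that motivates get_key_or_arr(), as a hedged sketch (the array size follows the instantiations above; the loader instance and n_layer are assumptions):

    // given: llama_model_loader & ml, uint32_t n_layer
    std::array<uint32_t, 512> n_head_arr = {};
    // accepts either a scalar (repeated n_layer times) or an array of exactly n_layer values
    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, n_head_arr, n_layer, /*required=*/true);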
500 | | |
501 | | |
502 | | llama_model_loader::llama_model_loader( |
503 | | const std::string & fname, |
504 | | std::vector<std::string> & splits, |
505 | | bool use_mmap, |
506 | | bool use_direct_io, |
507 | | bool check_tensors, |
508 | | bool no_alloc, |
509 | | const llama_model_kv_override * param_overrides_p, |
510 | 423 | const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) { |
511 | 423 | int trace = 0; |
512 | 423 | if (getenv("LLAMA_TRACE")) { |
513 | 0 | trace = atoi(getenv("LLAMA_TRACE")); |
514 | 0 | } |
515 | | |
516 | 423 | if (param_overrides_p != nullptr) { |
517 | 0 | for (const struct llama_model_kv_override * p = param_overrides_p; p->key[0] != 0; p++) { |
518 | 0 | kv_overrides.insert({std::string(p->key), *p}); |
519 | 0 | } |
520 | 0 | } |
521 | | |
522 | 423 | tensor_buft_overrides = param_tensor_buft_overrides_p; |
523 | | |
524 | | // Load the main GGUF |
525 | 423 | struct ggml_context * ctx = NULL; |
526 | 423 | struct gguf_init_params params = { |
527 | 423 | /*.no_alloc = */ true, |
528 | 423 | /*.ctx = */ &ctx, |
529 | 423 | }; |
530 | | |
531 | 423 | meta.reset(gguf_init_from_file(fname.c_str(), params)); |
532 | 423 | if (!meta) { |
533 | 195 | throw std::runtime_error(format("%s: failed to load model from %s", __func__, fname.c_str())); |
534 | 195 | } |
535 | | |
536 | 228 | get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false); |
537 | 228 | llm_kv = LLM_KV(llm_arch_from_string(arch_name)); |
538 | | |
539 | 228 | files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io)); |
540 | 228 | contexts.emplace_back(ctx); |
541 | | |
542 | 228 | use_direct_io = use_direct_io && files.back()->has_direct_io(); |
543 | | |
544 | | // Disable mmap in case Direct I/O is enabled and available |
545 | 228 | if (use_direct_io && use_mmap) { |
546 | 0 | use_mmap = false; |
547 | 0 | LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__); |
548 | 0 | } |
549 | | |
550 | | // Save tensors data offset of the main file. |
551 | | // For subsidiary files, `meta` tensor data offset must not be used, |
552 | | // so we build a unified tensors index for weights. |
553 | 1.16k | for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { |
554 | 936 | std::string tensor_name = std::string(cur->name); |
555 | | // make sure there is no duplicated tensor names |
556 | 936 | if (weights_map.find(tensor_name) != weights_map.end()) { |
557 | 0 | throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur))); |
558 | 0 | } |
559 | 936 | n_elements += ggml_nelements(cur); |
560 | 936 | n_bytes += ggml_nbytes(cur); |
561 | 936 | weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, meta.get(), cur)); |
562 | 936 | } |
563 | 228 | uint16_t n_split = 0; |
564 | 228 | get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false); |
565 | | |
566 | | // Load additional GGML contexts |
567 | 228 | if (n_split > 1) { |
568 | | // make sure the main file is loaded first |
569 | 0 | uint16_t idx = 0; |
570 | 0 | const std::string kv_split_no = llm_kv(LLM_KV_SPLIT_NO); |
571 | 0 | get_key(kv_split_no, idx); |
572 | 0 | if (idx != 0) { |
573 | 0 | throw std::runtime_error(format("illegal split file idx: %d (file: %s), model must be loaded with the first split", idx, fname.c_str())); |
574 | 0 | } |
575 | | |
576 | | // generate list of splits if needed |
577 | 0 | if (splits.empty()) { |
578 | 0 | splits = llama_get_list_splits(fname, idx, n_split); |
579 | 0 | } |
580 | | |
581 | | // in case the user gives a custom list of splits, check if it matches the expected number |
582 | 0 | if (n_split != (uint16_t)splits.size()) { |
583 | 0 | throw std::runtime_error(format("invalid split count, given: %zu splits, but expected %d", splits.size(), n_split)); |
584 | 0 | } |
585 | | |
586 | 0 | if (trace > 0) { |
587 | 0 | LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split); |
588 | 0 | } |
589 | | |
590 | | // load other splits |
591 | 0 | for (idx = 1; idx < n_split; idx++) { |
592 | 0 | const char * fname_split = splits[idx].c_str(); |
593 | |
594 | 0 | struct gguf_init_params split_params = { |
595 | 0 | /*.no_alloc = */ true, |
596 | 0 | /*.ctx = */ &ctx, |
597 | 0 | }; |
598 | 0 | gguf_context_ptr ctx_gguf { gguf_init_from_file(fname_split, split_params) }; |
599 | 0 | if (!ctx_gguf) { |
600 | 0 | throw std::runtime_error(format("%s: failed to load GGUF split from %s", __func__, fname_split)); |
601 | 0 | } |
602 | | |
603 | | // check idx |
604 | 0 | { |
605 | 0 | const int kid = gguf_find_key(ctx_gguf.get(), kv_split_no.c_str()); |
606 | 0 | if (kid < 0) { |
607 | 0 | throw std::runtime_error(format("missing key %s in GGUF split %s", kv_split_no.c_str(), fname_split)); |
608 | 0 | } |
609 | 0 | int idx_gguf = gguf_get_val_u16(ctx_gguf.get(), kid); |
610 | 0 | if (idx_gguf != idx) { |
611 | 0 | throw std::runtime_error(format("invalid split file idx: %d (file: %s), expected %d", idx_gguf, fname_split, idx)); |
612 | 0 | } |
613 | 0 | } |
614 | | |
615 | 0 | files.emplace_back(new llama_file(fname_split, "rb", use_direct_io)); |
616 | 0 | contexts.emplace_back(ctx); |
617 | | |
618 | | // Save tensors data offset info of the shard. |
619 | 0 | for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { |
620 | 0 | std::string tensor_name = std::string(cur->name); |
621 | | // make sure there is no duplicated tensor names |
622 | 0 | if (weights_map.find(tensor_name) != weights_map.end()) { |
623 | 0 | throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur))); |
624 | 0 | } |
625 | 0 | n_elements += ggml_nelements(cur); |
626 | 0 | n_bytes += ggml_nbytes(cur); |
627 | 0 | weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), idx, ctx_gguf.get(), cur)); |
628 | 0 | } |
629 | 0 | } |
630 | | |
631 | 0 | get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors); |
632 | | |
633 | | // sanity check |
634 | 0 | { |
635 | 0 | const int n_tensors_loaded = (int) weights_map.size(); |
636 | 0 | if (n_tensors != n_tensors_loaded) { |
637 | 0 | throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded)); |
638 | 0 | } |
639 | 0 | } |
640 | | |
641 | 0 | LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1); |
642 | 0 | } |
643 | | |
644 | 228 | n_kv = gguf_get_n_kv(meta.get()); |
645 | 228 | n_tensors = weights_map.size(); |
646 | | |
647 | 228 | fver = (enum llama_fver) gguf_get_version(meta.get()); |
648 | | |
649 | 228 | LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n", |
650 | 228 | __func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver)); |
651 | | |
652 | | // determine file type based on the number of tensors for each quantization and print meta data |
653 | | // TODO: make optional |
654 | 228 | { |
655 | 228 | std::map<enum ggml_type, uint32_t> n_type; |
656 | | |
657 | 228 | uint32_t n_type_max = 0; |
658 | 228 | enum ggml_type type_max = GGML_TYPE_F32; |
659 | | |
660 | 334 | for (const auto & it : weights_map) { |
661 | 334 | const llama_tensor_weight & w = it.second; |
662 | 334 | const ggml_tensor * tensor = w.tensor; |
663 | | |
664 | 334 | enum ggml_type type = tensor->type; |
665 | | |
666 | 334 | n_type[type]++; |
667 | | |
668 | 334 | if (n_type_max < n_type[type]) { |
669 | 263 | n_type_max = n_type[type]; |
670 | 263 | type_max = type; |
671 | 263 | } |
672 | | |
673 | 334 | if (trace > 0) { |
674 | 0 | const uint16_t sid = w.idx; |
675 | 0 | LLAMA_LOG_INFO("%s: - tensor split %2d: %32s %-8s [ %s ] %8.2f MiB\n", __func__, |
676 | 0 | sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str(), |
677 | 0 | ggml_nbytes(tensor)/1024.0f/1024.0f); |
678 | 0 | } |
679 | 334 | } |
680 | | |
681 | 228 | switch (type_max) { |
682 | 84 | case GGML_TYPE_F32: ftype = LLAMA_FTYPE_ALL_F32; break; |
683 | 1 | case GGML_TYPE_F16: ftype = LLAMA_FTYPE_MOSTLY_F16; break; |
684 | 0 | case GGML_TYPE_BF16: ftype = LLAMA_FTYPE_MOSTLY_BF16; break; |
685 | 0 | case GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break; |
686 | 0 | case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break; |
687 | 0 | case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break; |
688 | 0 | case GGML_TYPE_Q5_1: ftype = LLAMA_FTYPE_MOSTLY_Q5_1; break; |
689 | 0 | case GGML_TYPE_Q8_0: ftype = LLAMA_FTYPE_MOSTLY_Q8_0; break; |
690 | 0 | case GGML_TYPE_Q2_K: ftype = LLAMA_FTYPE_MOSTLY_Q2_K; break; |
691 | 0 | case GGML_TYPE_Q3_K: ftype = LLAMA_FTYPE_MOSTLY_Q3_K_M; break; |
692 | 0 | case GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break; |
693 | 0 | case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break; |
694 | 0 | case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break; |
695 | 0 | case GGML_TYPE_TQ1_0: ftype = LLAMA_FTYPE_MOSTLY_TQ1_0; break; |
696 | 0 | case GGML_TYPE_TQ2_0: ftype = LLAMA_FTYPE_MOSTLY_TQ2_0; break; |
697 | 0 | case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break; |
698 | 0 | case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break; |
699 | 0 | case GGML_TYPE_IQ2_S: ftype = LLAMA_FTYPE_MOSTLY_IQ2_S; break; |
700 | 0 | case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break; |
701 | 0 | case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break; |
702 | 0 | case GGML_TYPE_IQ1_M: ftype = LLAMA_FTYPE_MOSTLY_IQ1_M; break; |
703 | 0 | case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break; |
704 | 0 | case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break; |
705 | 0 | case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break; |
706 | 5 | default: |
707 | 5 | { |
708 | 5 | LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max)); |
709 | 5 | ftype = LLAMA_FTYPE_ALL_F32; |
710 | 5 | } break; |
711 | 228 | } |
712 | | |
713 | | // this is a way to mark that we have "guessed" the file type |
714 | 90 | ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED); |
715 | | |
716 | 90 | { |
717 | 90 | uint32_t ftype_val = 0; |
718 | 90 | if (get_key(LLM_KV_GENERAL_FILE_TYPE, ftype_val, false)) { |
719 | 0 | ftype = (llama_ftype) ftype_val; |
720 | 0 | } |
721 | 90 | } |
722 | | |
723 | 90 | LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__); |
724 | | |
725 | 1.18k | for (int i = 0; i < n_kv; i++) { |
726 | 1.09k | const char * name = gguf_get_key(meta.get(), i); |
727 | 1.09k | const enum gguf_type type = gguf_get_kv_type(meta.get(), i); |
728 | 1.09k | const std::string type_name = |
729 | 1.09k | type == GGUF_TYPE_ARRAY |
730 | 1.09k | ? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta.get(), i)), gguf_get_arr_n(meta.get(), i)) |
731 | 1.09k | : gguf_type_name(type); |
732 | | |
733 | 1.09k | std::string value = gguf_kv_to_str(meta.get(), i); |
734 | 1.09k | const size_t MAX_VALUE_LEN = 40; |
735 | 1.09k | if (value.size() > MAX_VALUE_LEN) { |
736 | 34 | value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str()); |
737 | 34 | } |
738 | 1.09k | replace_all(value, "\n", "\\n"); |
739 | | |
740 | 1.09k | LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str()); |
741 | 1.09k | } |
742 | | |
743 | | // print type counts |
744 | 125 | for (auto & kv : n_type) { |
745 | 125 | if (kv.second == 0) { |
746 | 0 | continue; |
747 | 0 | } |
748 | | |
749 | 125 | LLAMA_LOG_INFO("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second); |
750 | 125 | } |
751 | 90 | } |
752 | | |
753 | 90 | if (!llama_mmap::SUPPORTED) { |
754 | 0 | LLAMA_LOG_WARN("%s: mmap is not supported on this platform\n", __func__); |
755 | 0 | use_mmap = false; |
756 | 0 | } |
757 | | |
758 | 90 | this->use_mmap = use_mmap; |
759 | 90 | this->use_direct_io = use_direct_io; |
760 | 90 | this->check_tensors = check_tensors; |
761 | 90 | this->no_alloc = no_alloc; |
762 | 90 | } |
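Putting the constructor parameters together, a minimal metadata-only load might look like this (a sketch based solely on the signature above; the file name is illustrative and handling of the thrown std::runtime_error is omitted):

    std::vector<std::string> splits; // left empty: generated automatically for multi-part models
    llama_model_loader ml(
        /*fname         =*/ "model-00001-of-00002.gguf",
        /*splits        =*/ splits,
        /*use_mmap      =*/ true,
        /*use_direct_io =*/ false,
        /*check_tensors =*/ false,
        /*no_alloc      =*/ true,
        /*param_overrides_p             =*/ nullptr,
        /*param_tensor_buft_overrides_p =*/ nullptr);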
763 | | |
764 | 85 | std::string llama_model_loader::get_arch_name() const { |
765 | 85 | return arch_name; |
766 | 85 | } |
767 | | |
768 | 95 | enum llm_arch llama_model_loader::get_arch() const { |
769 | 95 | return llm_kv.arch; |
770 | 95 | } |
771 | | |
772 | 0 | const llama_model_loader::llama_tensor_weight * llama_model_loader::get_weight(const char * name) const { |
773 | 0 | auto pos = weights_map.find(name); |
774 | 0 | if (pos != weights_map.end()) { |
775 | 0 | return &pos->second; |
776 | 0 | } |
777 | | |
778 | 0 | return nullptr; |
779 | 0 | } |
780 | | |
781 | 0 | const llama_model_loader::llama_tensor_weight & llama_model_loader::require_weight(const char * name) const { |
782 | 0 | const llama_tensor_weight * weight = get_weight(name); |
783 | 0 | if (!weight) { |
784 | 0 | throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name)); |
785 | 0 | } |
786 | 0 | return *weight; |
787 | 0 | } |
788 | | |
789 | 0 | struct ggml_tensor * llama_model_loader::get_tensor_meta(const char * name) const { |
790 | 0 | const auto * weight = get_weight(name); |
791 | 0 | if (!weight) { |
792 | 0 | return nullptr; |
793 | 0 | } |
794 | 0 | return weight->tensor; |
795 | 0 | } |
796 | | |
797 | 0 | struct ggml_tensor * llama_model_loader::require_tensor_meta(const std::string & name) const { |
798 | 0 | struct ggml_tensor * tensor = get_tensor_meta(name.c_str()); |
799 | 0 | if (!tensor) { |
800 | 0 | throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str())); |
801 | 0 | } |
802 | 0 | return tensor; |
803 | 0 | } |
804 | | |
805 | 0 | const struct ggml_tensor * llama_model_loader::check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const { |
806 | 0 | const struct ggml_tensor * cur = get_tensor_meta(name.c_str()); |
807 | |
808 | 0 | if (cur == NULL) { |
809 | 0 | if (!required) { |
810 | 0 | return NULL; |
811 | 0 | } |
812 | 0 | throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str())); |
813 | 0 | } |
814 | | |
815 | 0 | { |
816 | 0 | bool is_ok = true; |
817 | 0 | for (size_t i = 0; i < GGML_MAX_DIMS; ++i) { |
818 | 0 | if ((i < ne.size() && ne[i] != cur->ne[i]) || (i >= ne.size() && cur->ne[i] != 1)) { |
819 | 0 | is_ok = false; |
820 | 0 | break; |
821 | 0 | } |
822 | 0 | } |
823 | 0 | if (!is_ok) { |
824 | 0 | throw std::runtime_error( |
825 | 0 | format("%s: tensor '%s' has wrong shape; expected %s, got %s", |
826 | 0 | __func__, name.c_str(), |
827 | 0 | llama_format_tensor_shape(ne).c_str(), |
828 | 0 | llama_format_tensor_shape(cur).c_str())); |
829 | 0 | } |
830 | 0 | } |
831 | | |
832 | 0 | return cur; |
833 | 0 | } |
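
The shape check above requires every requested dimension to match and every dimension beyond the requested ones to be 1. For example, for a tensor stored with ne = {4096, 32, 1, 1}, requesting {4096, 32} or {4096, 32, 1, 1} succeeds, while requesting {4096} alone throws because dimension 1 is 32 rather than 1.
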
834 | | |
835 | 0 | struct ggml_tensor * llama_model_loader::create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags) { |
836 | 0 | LLAMA_LOG_DEBUG("%s: loading tensor %s\n", __func__, name.c_str()); |
837 | 0 | const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED)); |
838 | |
839 | 0 | if (cur == NULL) { |
840 | 0 | return NULL; |
841 | 0 | } |
842 | | |
843 | 0 | bool duplicated = flags & TENSOR_DUPLICATED; |
844 | |
845 | 0 | struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur); |
846 | 0 | ggml_set_name(tensor, ggml_get_name(cur)); |
847 | |
848 | 0 | if (duplicated) { |
849 | 0 | size_data += ggml_nbytes(cur); |
850 | 0 | } else { |
851 | 0 | n_created++; |
852 | 0 | } |
853 | |
854 | 0 | return tensor; |
855 | |
856 | 0 | } |
857 | | |
858 | 0 | struct ggml_tensor * llama_model_loader::create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list<int64_t> & ne, size_t offset, bool required) { |
859 | 0 | const struct ggml_tensor * cur = check_tensor_dims(name, ne, required); |
860 | |
861 | 0 | if (cur == NULL) { |
862 | 0 | return NULL; |
863 | 0 | } |
864 | | |
865 | 0 | if (cur->type != base->type) { |
866 | 0 | throw std::runtime_error(format("%s: tensor '%s' has wrong type; expected %s, got %s", __func__, name.c_str(), ggml_type_name(base->type), ggml_type_name(cur->type))); |
867 | 0 | } |
868 | | |
869 | 0 | std::array<int64_t, GGML_MAX_DIMS> dims; |
870 | 0 | for (size_t i = 0; i < GGML_MAX_DIMS; ++i) { |
871 | 0 | dims[i] = i < ne.size() ? ne.begin()[i] : 1; |
872 | 0 | } |
873 | |
874 | 0 | struct ggml_tensor * tensor = ggml_view_4d(ctx, base, |
875 | 0 | dims[0], dims[1], dims[2], dims[3], |
876 | 0 | cur->nb[1], cur->nb[2], cur->nb[3], |
877 | 0 | offset); |
878 | |
879 | 0 | ggml_set_name(tensor, name.c_str()); |
880 | |
881 | 0 | n_created++; |
882 | |
883 | 0 | return tensor; |
884 | 0 | } |
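
create_tensor_as_view pads unspecified dimensions to 1 and reuses the byte strides of the tensor found in the file. A self-contained sketch of the underlying ggml call, assuming an existing ggml_context * ctx (values illustrative):

    // expose the second half of the rows of a 2D tensor as a separately named view
    struct ggml_tensor * base = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4096, 64);
    struct ggml_tensor * half = ggml_view_4d(ctx, base,
            4096, 32, 1, 1,                        // ne0..ne3, trailing dims padded to 1
            base->nb[1], base->nb[2], base->nb[3], // reuse the strides of the base tensor
            32 * base->nb[1]);                     // byte offset: skip the first 32 rows
    ggml_set_name(half, "base.second_half");
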
885 | | |
886 | 0 | void llama_model_loader::done_getting_tensors() const { |
887 | 0 | if (n_created != n_tensors) { |
888 | 0 | throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created)); |
889 | 0 | } |
890 | 0 | } |
891 | | |
892 | 0 | void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps) { |
893 | 0 | if (use_mmap) { |
894 | 0 | mappings.reserve(files.size()); |
895 | 0 | mmaps_used.reserve(files.size()); |
896 | 0 | for (const auto & file : files) { |
897 | 0 | bool is_numa = false; |
898 | |
899 | 0 | auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); |
900 | 0 | if (dev) { |
901 | 0 | auto * reg = ggml_backend_dev_backend_reg(dev); |
902 | 0 | auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa"); |
903 | 0 | if (is_numa_fn) { |
904 | 0 | is_numa = is_numa_fn(); |
905 | 0 | } |
906 | 0 | } |
907 | |
908 | 0 | std::unique_ptr<llama_mmap> mapping = std::make_unique<llama_mmap>(file.get(), prefetch ? -1 : 0, is_numa); |
909 | 0 | mmaps_used.emplace_back(mapping->size(), 0); |
910 | 0 | if (mlock_mmaps) { |
911 | 0 | std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock()); |
912 | 0 | mlock_mmap->init(mapping->addr()); |
913 | 0 | mlock_mmaps->emplace_back(std::move(mlock_mmap)); |
914 | 0 | } |
915 | 0 | mappings.emplace_back(std::move(mapping)); |
916 | 0 | } |
917 | 0 | } |
918 | | |
919 | | // compute the total size of all tensors for progress reporting |
920 | 0 | for (const auto & it : weights_map) { |
921 | 0 | size_data += ggml_nbytes(it.second.tensor); |
922 | 0 | } |
923 | 0 | } |
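
size_data accumulated here becomes the denominator of the progress fraction reported while load_all_data runs. A minimal caller-side callback matching the llama_progress_callback signature (hypothetical, not part of this file):

    #include <cstdio>

    // returning false asks the loader to cancel
    static bool print_progress(float progress, void * /* user_data */) {
        std::fprintf(stderr, "\rloading model: %5.1f %%", progress * 100.0f);
        return true;
    }
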
924 | | |
925 | 0 | void llama_model_loader::get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const { |
926 | 0 | GGML_ASSERT(!mappings.empty()); |
927 | 0 | const auto & mapping = mappings.at(idx); |
928 | |
929 | 0 | *first = mapping->size(); |
930 | 0 | *last = 0; |
931 | 0 | *addr = mapping->addr(); |
932 | 0 | for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) { |
933 | 0 | const auto * weight = get_weight(ggml_get_name(tensor)); |
934 | 0 | if (!weight || weight->idx != idx) { |
935 | 0 | continue; |
936 | 0 | } |
937 | 0 | *first = std::min(*first, weight->offs); |
938 | 0 | *last = std::max(*last, weight->offs + ggml_nbytes(tensor)); |
939 | 0 | } |
940 | 0 | } |
941 | | |
942 | 0 | void llama_model_loader::load_data_for(struct ggml_tensor * cur) const { |
943 | 0 | const auto & w = require_weight(ggml_get_name(cur)); |
944 | |
945 | 0 | if (use_mmap) { |
946 | 0 | const auto & mapping = mappings.at(w.idx); |
947 | 0 | if (cur->data == nullptr) { |
948 | 0 | cur->data = (uint8_t *)mapping->addr() + w.offs; |
949 | 0 | } else { |
950 | 0 | memcpy(cur->data, (uint8_t *)mapping->addr() + w.offs, ggml_nbytes(cur)); |
951 | 0 | } |
952 | 0 | } else { |
953 | 0 | GGML_ASSERT(cur->data != nullptr); |
954 | 0 | GGML_ASSERT(w.idx < files.size()); |
955 | 0 | const auto & file = files.at(w.idx); |
956 | 0 | file->seek(w.offs, SEEK_SET); |
957 | 0 | file->read_raw(cur->data, ggml_nbytes(cur)); |
958 | 0 | } |
959 | |
960 | 0 | if (check_tensors && !ggml_validate_row_data(cur->type, cur->data, ggml_nbytes(cur))) { |
961 | 0 | throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur))); |
962 | 0 | } |
963 | 0 | } |
964 | | |
965 | | bool llama_model_loader::load_all_data( |
966 | | struct ggml_context * ctx, |
967 | | llama_buf_map & bufs, |
968 | | llama_mlocks * lmlocks, |
969 | | llama_progress_callback progress_callback, |
970 | 0 | void * progress_callback_user_data) { |
971 | 0 | GGML_ASSERT(size_data != 0 && "call init_mappings() first"); |
972 | |
|
973 | 0 | std::vector<no_init<uint8_t>> read_buf; |
974 | 0 | std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result; |
975 | | |
976 | | // 4 staging buffers for async uploads; 1 MiB each seems to be a good default for single NVMe drives.
977 | | // NVMe RAID configurations might require more / larger buffers.
978 | 0 | constexpr size_t n_buffers = 4; |
979 | |
980 | 0 | size_t alignment = 1; |
981 | 0 | for (const auto & file : files) { |
982 | 0 | alignment = std::max(file->read_alignment(), alignment); |
983 | 0 | } |
984 | | |
985 | | // Buffer size: a balance between memory usage and I/O efficiency.
986 | | // 64 MiB works well for NVMe drives when reads must be aligned (direct I/O); otherwise 1 MiB buffers are used.
987 | 0 | const size_t buffer_size = alignment != 1 ? 64 * 1024 * 1024 + 2 * alignment : 1 * 1024 * 1024; |
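
Concretely (numbers illustrative): with direct I/O and a 4 KiB read alignment this yields staging buffers of 64 MiB + 8 KiB, the extra 2 * alignment leaving room to align the destination pointer inside the pinned buffer; with no alignment requirement it falls back to 1 MiB buffers.
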
988 | |
989 | 0 | std::vector<ggml_backend_buffer_t> host_buffers; |
990 | 0 | std::vector<ggml_backend_event_t> events; |
991 | 0 | std::vector<void *> host_ptrs; |
992 | 0 | size_t buffer_idx = 0; // buffer to use for async loads |
993 | 0 | ggml_backend_t upload_backend = [&](const char * func) -> ggml_backend_t { |
994 | 0 | if (use_mmap || check_tensors) { |
995 | 0 | return nullptr; |
996 | 0 | } |
997 | | // When not using mmapped I/O, use async uploads from pinned memory to GPU memory.
998 | | // First determine if the backend supports the necessary features for async uploads. |
999 | 0 | auto * buf = bufs.count(0) ? bufs.at(0) : nullptr; |
1000 | 0 | if (!buf) { |
1001 | 0 | LLAMA_LOG_DEBUG("%s: no buffer found for async uploads\n", func); |
1002 | 0 | return nullptr; |
1003 | 0 | } |
1004 | | |
1005 | 0 | auto * buft = ggml_backend_buffer_get_type(buf); |
1006 | 0 | auto * dev = ggml_backend_buft_get_device(buft); |
1007 | 0 | if (!dev) { |
1008 | 0 | LLAMA_LOG_DEBUG("%s: no device found for buffer type %s for async uploads\n", func, |
1009 | 0 | ggml_backend_buft_name(buft)); |
1010 | 0 | return nullptr; |
1011 | 0 | } |
1012 | | |
1013 | 0 | if (buft != ggml_backend_dev_buffer_type(dev)) { |
1014 | 0 | LLAMA_LOG_DEBUG("%s: buffer type %s is not the default buffer type for device %s for async uploads\n", func, |
1015 | 0 | ggml_backend_buft_name(buft), ggml_backend_dev_name(dev)); |
1016 | 0 | return nullptr; |
1017 | 0 | } |
1018 | | |
1019 | 0 | ggml_backend_dev_props props; |
1020 | 0 | ggml_backend_dev_get_props(dev, &props); |
1021 | 0 | if (!props.caps.async || !props.caps.host_buffer || !props.caps.events) { |
1022 | 0 | LLAMA_LOG_DEBUG("%s: device %s does not support async, host buffers or events\n", func, |
1023 | 0 | ggml_backend_dev_name(dev)); |
1024 | 0 | return nullptr; |
1025 | 0 | } |
1026 | | |
1027 | 0 | auto * host_buft = ggml_backend_dev_host_buffer_type(dev); |
1028 | 0 | if (!host_buft) { |
1029 | 0 | LLAMA_LOG_DEBUG("%s: no host buffer type found for device %s\n", func, |
1030 | 0 | ggml_backend_dev_name(dev)); |
1031 | 0 | return nullptr; |
1032 | 0 | } |
1033 | | |
1034 | | // If the backend is supported, create pinned memory buffers and events for synchronisation. |
1035 | 0 | for (size_t idx = 0; idx < n_buffers; ++idx) { |
1036 | 0 | auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size); |
1037 | |
1038 | 0 | if (!buf) { |
1039 | 0 | LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func, |
1040 | 0 | ggml_backend_dev_name(dev)); |
1041 | 0 | return nullptr; |
1042 | 0 | } |
1043 | | |
1044 | 0 | host_buffers.emplace_back(buf); |
1045 | 0 | host_ptrs.emplace_back(ggml_backend_buffer_get_base(buf)); |
1046 | |
1047 | 0 | auto * event = ggml_backend_event_new(dev); |
1048 | 0 | if (!event) { |
1049 | 0 | LLAMA_LOG_DEBUG("%s: failed to create event for async uploads for device %s\n", func, |
1050 | 0 | ggml_backend_dev_name(dev)); |
1051 | 0 | return nullptr; |
1052 | 0 | } |
1053 | | |
1054 | 0 | events.emplace_back(event); |
1055 | 0 | } |
1056 | | |
1057 | 0 | ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); |
1058 | 0 | if (!backend) { |
1059 | 0 | LLAMA_LOG_DEBUG("%s: failed to initialize backend for device %s for async uploads\n", func, |
1060 | 0 | ggml_backend_dev_name(dev)); |
1061 | 0 | return nullptr; |
1062 | 0 | } |
1063 | | |
1064 | 0 | return backend; |
1065 | 0 | }(__func__); |
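
upload_backend is initialized through an immediately-invoked lambda so that every unsupported configuration can bail out with an early return (and a debug log) while the surrounding function keeps a single assignment. The same idiom in isolation, with made-up names:

    const bool have_gpu   = false;   // made-up capability flags
    const bool have_async = false;
    const char * backend_name = [&]() -> const char * {
        if (!have_gpu)   return "cpu";   // each unsupported case exits early with a fallback
        if (!have_async) return "cpu";
        return "gpu";
    }();
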
1066 | |
1067 | 0 | if (upload_backend) { |
1068 | 0 | LLAMA_LOG_DEBUG("%s: using async uploads for device %s, buffer type %s, backend %s\n", __func__, |
1069 | 0 | ggml_backend_dev_name(ggml_backend_get_device(upload_backend)), |
1070 | 0 | ggml_backend_buft_name(ggml_backend_buffer_get_type(bufs.at(0))), |
1071 | 0 | ggml_backend_name(upload_backend)); |
1072 | 0 | } |
1073 | |
1074 | 0 | for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) { |
1075 | 0 | const auto * weight = get_weight(ggml_get_name(cur)); |
1076 | 0 | if (weight == nullptr) { |
1077 | | // this can happen with split-experts models
1078 | 0 | continue; |
1079 | 0 | } |
1080 | | |
1081 | 0 | if (progress_callback) { |
1082 | 0 | if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) { |
1083 | 0 | return false; |
1084 | 0 | } |
1085 | 0 | } |
1086 | | |
1087 | 0 | size_t n_size = ggml_nbytes(cur); |
1088 | |
1089 | 0 | if (use_mmap) { |
1090 | 0 | const auto & mapping = mappings.at(weight->idx); |
1091 | 0 | ggml_backend_buffer_t buf_mmap = nullptr; |
1092 | 0 | if (bufs.count(weight->idx)) { |
1093 | 0 | buf_mmap = bufs.at(weight->idx); |
1094 | 0 | } |
1095 | 0 | uint8_t * data = (uint8_t *) mapping->addr() + weight->offs; |
1096 | |
1097 | 0 | if (check_tensors) { |
1098 | 0 | validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] { |
1099 | 0 | return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size)); |
1100 | 0 | })); |
1101 | 0 | } |
1102 | |
1103 | 0 | GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated |
1104 | 0 | if (buf_mmap && cur->data == nullptr) { |
1105 | 0 | ggml_backend_tensor_alloc(buf_mmap, cur, data); |
1106 | 0 | if (lmlocks) { |
1107 | 0 | const auto & lmlock = lmlocks->at(weight->idx); |
1108 | 0 | lmlock->grow_to(weight->offs + n_size); |
1109 | 0 | } |
1110 | |
1111 | 0 | auto & mmap_used = mmaps_used[weight->idx]; |
1112 | 0 | mmap_used.first = std::min(mmap_used.first, weight->offs); |
1113 | 0 | mmap_used.second = std::max(mmap_used.second, weight->offs + n_size); |
1114 | 0 | } else { |
1115 | 0 | ggml_backend_tensor_set(cur, data, 0, n_size); |
1116 | 0 | } |
1117 | 0 | } else { |
1118 | 0 | const auto & file = files.at(weight->idx); |
1119 | |
1120 | 0 | if (ggml_backend_buffer_is_host(cur->buffer)) { |
1121 | 0 | file->seek(weight->offs, SEEK_SET); |
1122 | 0 | file->read_raw(cur->data, n_size); |
1123 | 0 | if (check_tensors) { |
1124 | 0 | validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] { |
1125 | 0 | return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size)); |
1126 | 0 | })); |
1127 | 0 | } |
1128 | 0 | } else { |
1129 | | // If upload_backend is valid, load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
1130 | 0 | if (upload_backend) { |
1131 | 0 | size_t offset = weight->offs; |
1132 | 0 | alignment = file->read_alignment(); |
1133 | 0 | size_t aligned_offset = offset & ~(alignment - 1); |
1134 | 0 | size_t offset_from_alignment = offset - aligned_offset; |
1135 | 0 | file->seek(aligned_offset, SEEK_SET); |
1136 | | |
1137 | | // Calculate aligned read boundaries |
1138 | 0 | size_t read_start = aligned_offset; |
1139 | 0 | size_t read_end = (offset + n_size + alignment - 1) & ~(alignment - 1); |
1140 | |
1141 | 0 | size_t bytes_read = 0; |
1142 | 0 | size_t data_read = 0; // Actual tensor data copied (excluding padding) |
1143 | |
1144 | 0 | while (bytes_read < read_end - read_start) { |
1145 | 0 | size_t read_size = std::min<size_t>(buffer_size, read_end - read_start - bytes_read); |
1146 | | |
1147 | | // Align the destination pointer within the pinned buffer |
1148 | 0 | uintptr_t ptr_dest_aligned = (reinterpret_cast<uintptr_t>(host_ptrs[buffer_idx]) + alignment - 1) & ~(alignment - 1); |
1149 | | |
1150 | | // Wait for previous upload to complete before reusing buffer |
1151 | 0 | ggml_backend_event_synchronize(events[buffer_idx]); |
1152 | | |
1153 | | // Read aligned chunk from file |
1154 | 0 | file->read_raw_unsafe(reinterpret_cast<void *>(ptr_dest_aligned), read_size); |
1155 | | |
1156 | | // Calculate actual data portion (excluding alignment padding) |
1157 | 0 | uintptr_t ptr_data = ptr_dest_aligned; |
1158 | 0 | size_t data_to_copy = read_size; |
1159 | | |
1160 | | // Skip alignment padding at start of first chunk |
1161 | 0 | if (bytes_read == 0) { |
1162 | 0 | ptr_data += offset_from_alignment; |
1163 | 0 | data_to_copy -= offset_from_alignment; |
1164 | 0 | } |
1165 | | |
1166 | | // Trim alignment padding at end of last chunk |
1167 | 0 | if (aligned_offset + bytes_read + read_size > offset + n_size) { |
1168 | 0 | data_to_copy -= (read_end - (offset + n_size)); |
1169 | 0 | } |
1170 | | |
1171 | | // Async upload actual data to GPU |
1172 | 0 | ggml_backend_tensor_set_async(upload_backend, cur, |
1173 | 0 | reinterpret_cast<void *>(ptr_data), data_read, data_to_copy); |
1174 | 0 | ggml_backend_event_record(events[buffer_idx], upload_backend); |
1175 | |
1176 | 0 | data_read += data_to_copy; |
1177 | 0 | bytes_read += read_size; |
1178 | |
1179 | 0 | ++buffer_idx; |
1180 | 0 | buffer_idx %= n_buffers; |
1181 | 0 | } |
1182 | 0 | } else { |
1183 | 0 | read_buf.resize(n_size); |
1184 | 0 | file->seek(weight->offs, SEEK_SET); |
1185 | 0 | file->read_raw(read_buf.data(), n_size); |
1186 | 0 | ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size); |
1187 | 0 | if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) { |
1188 | 0 | throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur))); |
1189 | 0 | } |
1190 | 0 | } |
1191 | 0 | } |
1192 | 0 | } |
1193 | | |
1194 | 0 | size_done += n_size; |
1195 | 0 | } |
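
For reference, one worked instance of the aligned-read bookkeeping used in the direct-I/O upload path above (all numbers illustrative):

    const size_t alignment             = 4096;                                                 // read alignment reported by the file
    const size_t offset                = 10000;                                                // tensor offset within the file
    const size_t n_size                = 5000;                                                 // tensor size in bytes
    const size_t aligned_offset        = offset & ~(alignment - 1);                            //  8192 (offset rounded down)
    const size_t offset_from_alignment = offset - aligned_offset;                              //  1808 bytes skipped in the first chunk
    const size_t read_end              = (offset + n_size + alignment - 1) & ~(alignment - 1); // 16384 (end rounded up)
    const size_t total_read            = read_end - aligned_offset;                            //  8192 bytes actually read from disk
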
1196 | | |
1197 | | // free temporary resources used for async uploads |
1198 | 0 | for (auto * event : events) { |
1199 | 0 | ggml_backend_event_synchronize(event); |
1200 | 0 | ggml_backend_event_free(event); |
1201 | 0 | } |
1202 | 0 | for (auto * buf : host_buffers) { |
1203 | 0 | ggml_backend_buffer_free(buf); |
1204 | 0 | } |
1205 | 0 | ggml_backend_free(upload_backend); |
1206 | | |
1207 | | // check validation results |
1208 | 0 | bool validation_failed = false; |
1209 | 0 | for (auto & future : validation_result) { |
1210 | 0 | auto result = future.get(); |
1211 | 0 | if (!result.second) { |
1212 | 0 | LLAMA_LOG_ERROR("%s: tensor '%s' has invalid data\n", __func__, ggml_get_name(result.first)); |
1213 | 0 | validation_failed = true; |
1214 | 0 | } |
1215 | 0 | } |
1216 | 0 | if (validation_failed) { |
1217 | 0 | throw std::runtime_error("found tensors with invalid data"); |
1218 | 0 | } |
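
Row validation is fanned out with std::async while loading continues, and every future is drained before deciding whether to throw. The same deferred-check pattern in isolation (illustrative, not llama.cpp API):

    #include <future>
    #include <vector>

    static bool all_valid(const std::vector<int> & chunks) {
        std::vector<std::future<bool>> checks;
        checks.reserve(chunks.size());
        for (int c : chunks) {
            checks.emplace_back(std::async(std::launch::async, [c] { return c >= 0; }));
        }
        bool ok = true;
        for (auto & f : checks) {
            ok = f.get() && ok;   // drain every future, even after a failure
        }
        return ok;
    }
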
1219 | | |
1220 | | // check if this is the last call and do final cleanup |
1221 | 0 | if (size_done >= size_data) { |
1222 | | // unmap offloaded tensors and metadata |
1223 | 0 | if (use_mmap) { |
1224 | 0 | for (uint32_t idx = 0; idx < mappings.size(); idx++) { |
1225 | 0 | const auto & mmap_used = mmaps_used.at(idx); |
1226 | 0 | auto & mapping = mappings.at(idx); |
1227 | 0 | mapping->unmap_fragment(0, mmap_used.first); |
1228 | 0 | if (mmap_used.second != 0) { |
1229 | 0 | mapping->unmap_fragment(mmap_used.second, mapping->size()); |
1230 | 0 | } |
1231 | 0 | } |
1232 | 0 | } |
1233 | 0 | if (progress_callback) { |
1234 | | // Even though the model is done loading, we still honor |
1235 | | // cancellation since we need to free allocations. |
1236 | 0 | return progress_callback(1.0f, progress_callback_user_data); |
1237 | 0 | } |
1238 | 0 | } |
1239 | | |
1240 | 0 | return true; |
1241 | 0 | } |
1242 | | |
1243 | 0 | std::string llama_model_loader::ftype_name() const { |
1244 | 0 | return llama_model_ftype_name(ftype); |
1245 | 0 | } |
1246 | | |
1247 | 90 | void llama_model_loader::print_info() const { |
1248 | 90 | LLAMA_LOG_INFO("%s: file format = %s\n", __func__, llama_file_version_name(fver)); |
1249 | 90 | LLAMA_LOG_INFO("%s: file type = %s\n", __func__, llama_model_ftype_name(ftype).c_str()); |
1250 | 90 | if (n_bytes < GiB) { |
1251 | 90 | LLAMA_LOG_INFO("%s: file size = %.2f MiB (%.2f BPW) \n", __func__, n_bytes/1024.0/1024.0, n_bytes*8.0/n_elements); |
1252 | 90 | } else { |
1253 | 0 | LLAMA_LOG_INFO("%s: file size = %.2f GiB (%.2f BPW) \n", __func__, n_bytes/1024.0/1024.0/1024.0, n_bytes*8.0/n_elements); |
1254 | 0 | } |
1255 | 90 | } |