/src/llama.cpp/src/llama-model-loader.cpp
Line | Count | Source |
1 | | #include "llama-model-loader.h" |
2 | | |
3 | | #include "ggml.h" |
4 | | |
5 | | #include <array> |
6 | | #include <cinttypes> |
7 | | #include <cstring> |
8 | | #include <future> |
9 | | |
10 | | static const size_t kiB = 1024; |
11 | | static const size_t MiB = 1024*kiB; |
12 | | static const size_t GiB = 1024*MiB; |
13 | | |
14 | 314 | const char * llama_file_version_name(llama_fver version) { |
15 | 314 | switch (version) { |
16 | 0 | case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)"; |
17 | 0 | case GGUF_FILE_VERSION_V2: return "GGUF V2"; |
18 | 314 | case GGUF_FILE_VERSION_V3: return "GGUF V3 (latest)"; |
19 | 314 | } |
20 | | |
21 | 0 | return "unknown"; |
22 | 314 | } |
23 | | |
24 | 314 | static std::string llama_model_ftype_name(llama_ftype ftype) { |
25 | 314 | if (ftype & LLAMA_FTYPE_GUESSED) { |
26 | 157 | return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)"; |
27 | 157 | } |
28 | | |
29 | 157 | switch (ftype) { |
30 | 156 | case LLAMA_FTYPE_ALL_F32: return "all F32"; |
31 | 1 | case LLAMA_FTYPE_MOSTLY_F16: return "F16"; |
32 | 0 | case LLAMA_FTYPE_MOSTLY_BF16: return "BF16"; |
33 | 0 | case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0"; |
34 | 0 | case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1"; |
35 | 0 | case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0"; |
36 | 0 | case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1"; |
37 | 0 | case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0"; |
38 | 0 | case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: return "MXFP4 MoE"; |
39 | 0 | case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium"; |
40 | 0 | case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small"; |
41 | 0 | case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small"; |
42 | 0 | case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium"; |
43 | 0 | case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large"; |
44 | 0 | case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small"; |
45 | 0 | case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium"; |
46 | 0 | case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small"; |
47 | 0 | case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium"; |
48 | 0 | case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K"; |
49 | 0 | case LLAMA_FTYPE_MOSTLY_TQ1_0: return "TQ1_0 - 1.69 bpw ternary"; |
50 | 0 | case LLAMA_FTYPE_MOSTLY_TQ2_0: return "TQ2_0 - 2.06 bpw ternary"; |
51 | 0 | case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw"; |
52 | 0 | case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw"; |
53 | 0 | case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw"; |
54 | 0 | case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw"; |
55 | 0 | case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw"; |
56 | 0 | case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw"; |
57 | 0 | case LLAMA_FTYPE_MOSTLY_IQ1_S: return "IQ1_S - 1.5625 bpw"; |
58 | 0 | case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw"; |
59 | 0 | case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw"; |
60 | 0 | case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw"; |
61 | 0 | case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw"; |
62 | 0 | case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw"; |
63 | | |
64 | 0 | default: return "unknown, may not work"; |
65 | 157 | } |
66 | 157 | } |
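The recursion on line 26 is how the "(guessed)" suffix is produced: when no general.file_type key is present, the constructor (line 673 below) ORs LLAMA_FTYPE_GUESSED into the guessed type, and this function strips the bit, names the base type, and appends the suffix. A minimal standalone sketch of that flag handling, not part of the listing; the helper name is made up and the numeric values (LLAMA_FTYPE_GUESSED = 1024, LLAMA_FTYPE_ALL_F32 = 0) are taken from llama.h as an assumption:

    // standalone sketch of the LLAMA_FTYPE_GUESSED handling above
    #include <cstdio>
    #include <string>

    static std::string ftype_name_sketch(int ftype) {
        const int kGuessed = 1024;                           // assumed value of LLAMA_FTYPE_GUESSED (llama.h)
        if (ftype & kGuessed) {
            return ftype_name_sketch(ftype & ~kGuessed) + " (guessed)";
        }
        return ftype == 0 ? "all F32" : "other";             // 0 == LLAMA_FTYPE_ALL_F32
    }

    int main() {
        std::printf("%s\n", ftype_name_sketch(0 | 1024));    // prints "all F32 (guessed)"
        return 0;
    }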
67 | | |
68 | | // return a list of splits for a given path |
69 | | // for example, given "<name>-00002-of-00004.gguf", returns list of all 4 splits |
70 | 0 | static std::vector<std::string> llama_get_list_splits(const std::string & path, const int idx, const int n_split) { |
71 | 0 | std::vector<std::string> paths; |
72 | 0 | std::string split_prefix; |
73 | 0 | std::vector<char> buf(llama_path_max(), 0); |
74 | |
75 | 0 | { |
76 | 0 | int ret = llama_split_prefix(buf.data(), buf.size(), path.c_str(), idx, n_split); |
77 | 0 | if (!ret) { |
78 | 0 | throw std::runtime_error(format("invalid split file name: %s", path.c_str())); |
79 | 0 | } |
80 | 0 | split_prefix = std::string(buf.data(), ret); |
81 | 0 | } |
82 | | |
83 | 0 | if (split_prefix.empty()) { |
84 | 0 | throw std::runtime_error(format("invalid split file: %s", path.c_str())); |
85 | 0 | } |
86 | | |
87 | 0 | for (int idx = 0; idx < n_split; ++idx) { |
88 | 0 | int ret = llama_split_path(buf.data(), buf.size(), split_prefix.c_str(), idx, n_split); |
89 | 0 | paths.push_back(std::string(buf.data(), ret)); |
90 | 0 | } |
91 | |
92 | 0 | return paths; |
93 | 0 | } |
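For context, this helper round-trips the "-%05d-of-%05d.gguf" shard naming through the public llama_split_prefix and llama_split_path functions declared in llama.h. A small usage sketch, not part of the listing; the file names and buffer size are illustrative:

    // usage sketch for the split-name helpers declared in llama.h
    #include "llama.h"
    #include <cstdio>
    #include <vector>

    int main() {
        std::vector<char> buf(1024, 0);

        // recover the prefix "model" from the 2nd shard (0-based split index 1) of a 4-way split
        int n = llama_split_prefix(buf.data(), buf.size(), "model-00002-of-00004.gguf", 1, 4);
        if (n > 0) {
            std::printf("prefix: %.*s\n", n, buf.data());    // model
        }

        // rebuild the path of the 3rd shard (0-based split index 2) from that prefix
        n = llama_split_path(buf.data(), buf.size(), "model", 2, 4);
        std::printf("path: %.*s\n", n, buf.data());          // model-00003-of-00004.gguf
        return 0;
    }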
94 | | |
95 | | namespace GGUFMeta { |
96 | | template <typename T, gguf_type gt_, T (*gfun)(const gguf_context *, const int64_t)> |
97 | | struct GKV_Base_Type { |
98 | | static constexpr gguf_type gt = gt_; |
99 | | |
100 | 0 | static T getter(const gguf_context * ctx, const int kid) { |
101 | 0 | return gfun(ctx, kid); |
102 | 0 | } |
Unexecuted instantiation: GGUFMeta::GKV_Base_Type<bool, (gguf_type)7, &gguf_get_val_bool>::getter(gguf_context const*, int)
Unexecuted instantiation: GGUFMeta::GKV_Base_Type<float, (gguf_type)6, &gguf_get_val_f32>::getter(gguf_context const*, int)
Unexecuted instantiation: GGUFMeta::GKV_Base_Type<unsigned int, (gguf_type)4, &gguf_get_val_u32>::getter(gguf_context const*, int)
Unexecuted instantiation: GGUFMeta::GKV_Base_Type<unsigned short, (gguf_type)2, &gguf_get_val_u16>::getter(gguf_context const*, int)
Unexecuted instantiation: GGUFMeta::GKV_Base_Type<int, (gguf_type)5, &gguf_get_val_i32>::getter(gguf_context const*, int)
103 | | }; |
104 | | |
105 | | template<typename T> struct GKV_Base; |
106 | | |
107 | | template<> struct GKV_Base<bool >: GKV_Base_Type<bool, GGUF_TYPE_BOOL, gguf_get_val_bool> {}; |
108 | | template<> struct GKV_Base<uint8_t >: GKV_Base_Type<uint8_t, GGUF_TYPE_UINT8, gguf_get_val_u8 > {}; |
109 | | template<> struct GKV_Base<uint16_t >: GKV_Base_Type<uint16_t, GGUF_TYPE_UINT16, gguf_get_val_u16 > {}; |
110 | | template<> struct GKV_Base<uint32_t >: GKV_Base_Type<uint32_t, GGUF_TYPE_UINT32, gguf_get_val_u32 > {}; |
111 | | template<> struct GKV_Base<uint64_t >: GKV_Base_Type<uint64_t, GGUF_TYPE_UINT64, gguf_get_val_u64 > {}; |
112 | | template<> struct GKV_Base<int8_t >: GKV_Base_Type<int8_t, GGUF_TYPE_INT8, gguf_get_val_i8 > {}; |
113 | | template<> struct GKV_Base<int16_t >: GKV_Base_Type<int16_t, GGUF_TYPE_INT16, gguf_get_val_i16 > {}; |
114 | | template<> struct GKV_Base<int32_t >: GKV_Base_Type<int32_t, GGUF_TYPE_INT32, gguf_get_val_i32 > {}; |
115 | | template<> struct GKV_Base<int64_t >: GKV_Base_Type<int64_t, GGUF_TYPE_INT64, gguf_get_val_i64 > {}; |
116 | | template<> struct GKV_Base<float >: GKV_Base_Type<float, GGUF_TYPE_FLOAT32, gguf_get_val_f32 > {}; |
117 | | template<> struct GKV_Base<double >: GKV_Base_Type<double, GGUF_TYPE_FLOAT64, gguf_get_val_f64 > {}; |
118 | | template<> struct GKV_Base<const char *>: GKV_Base_Type<const char *, GGUF_TYPE_STRING, gguf_get_val_str > {}; |
119 | | |
120 | | template<> struct GKV_Base<std::string> { |
121 | | static constexpr gguf_type gt = GGUF_TYPE_STRING; |
122 | | |
123 | 3 | static std::string getter(const gguf_context * ctx, const int kid) { |
124 | 3 | return gguf_get_val_str(ctx, kid); |
125 | 3 | } |
126 | | }; |
127 | | |
128 | | struct ArrayInfo { |
129 | | const gguf_type gt; |
130 | | const size_t length; |
131 | | const void * data; |
132 | | }; |
133 | | |
134 | | template<> struct GKV_Base<ArrayInfo> { |
135 | | public: |
136 | | static constexpr gguf_type gt = GGUF_TYPE_ARRAY; |
137 | 0 | static ArrayInfo getter(const gguf_context *ctx, const int k) { |
138 | 0 | const enum gguf_type arr_type = gguf_get_arr_type(ctx, k); |
139 | 0 | return ArrayInfo { |
140 | 0 | arr_type, |
141 | 0 | size_t(gguf_get_arr_n(ctx, k)), |
142 | 0 | arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx, k), |
143 | 0 | }; |
144 | 0 | } |
145 | | }; |
146 | | |
147 | | template<typename T> |
148 | | class GKV : public GKV_Base<T> { |
149 | | GKV() = delete; |
150 | | |
151 | | public: |
152 | 6 | static T get_kv(const gguf_context * ctx, const int k) { |
153 | 6 | const enum gguf_type kt = gguf_get_kv_type(ctx, k); |
154 | | |
155 | 6 | if (kt != GKV::gt) { |
156 | 3 | throw std::runtime_error(format("key %s has wrong type %s but expected type %s", |
157 | 3 | gguf_get_key(ctx, k), gguf_type_name(kt), gguf_type_name(GKV::gt))); |
158 | 3 | } |
159 | 3 | return GKV::getter(ctx, k); |
160 | 6 | } |
Unexecuted instantiation: GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(gguf_context const*, int)
Unexecuted instantiation: GGUFMeta::GKV<bool>::get_kv(gguf_context const*, int)
Unexecuted instantiation: GGUFMeta::GKV<float>::get_kv(gguf_context const*, int)
Unexecuted instantiation: GGUFMeta::GKV<unsigned int>::get_kv(gguf_context const*, int)
GGUFMeta::GKV<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >::get_kv(gguf_context const*, int)
Line | Count | Source |
152 | 5 | static T get_kv(const gguf_context * ctx, const int k) { |
153 | 5 | const enum gguf_type kt = gguf_get_kv_type(ctx, k); |
154 | | |
155 | 5 | if (kt != GKV::gt) { |
156 | 2 | throw std::runtime_error(format("key %s has wrong type %s but expected type %s", |
157 | 2 | gguf_get_key(ctx, k), gguf_type_name(kt), gguf_type_name(GKV::gt))); |
158 | 2 | } |
159 | 3 | return GKV::getter(ctx, k); |
160 | 5 | } |
GGUFMeta::GKV<unsigned short>::get_kv(gguf_context const*, int)
Line | Count | Source |
152 | 1 | static T get_kv(const gguf_context * ctx, const int k) { |
153 | 1 | const enum gguf_type kt = gguf_get_kv_type(ctx, k); |
154 | | |
155 | 1 | if (kt != GKV::gt) { |
156 | 1 | throw std::runtime_error(format("key %s has wrong type %s but expected type %s", |
157 | 1 | gguf_get_key(ctx, k), gguf_type_name(kt), gguf_type_name(GKV::gt))); |
158 | 1 | } |
159 | 0 | return GKV::getter(ctx, k); |
160 | 1 | } |
Unexecuted instantiation: GGUFMeta::GKV<int>::get_kv(gguf_context const*, int)
161 | | |
162 | 0 | static const char * override_type_to_str(const llama_model_kv_override_type ty) { |
163 | 0 | switch (ty) { |
164 | 0 | case LLAMA_KV_OVERRIDE_TYPE_BOOL: return "bool"; |
165 | 0 | case LLAMA_KV_OVERRIDE_TYPE_INT: return "int"; |
166 | 0 | case LLAMA_KV_OVERRIDE_TYPE_FLOAT: return "float"; |
167 | 0 | case LLAMA_KV_OVERRIDE_TYPE_STR: return "str"; |
168 | 0 | } |
169 | 0 | return "unknown"; |
170 | 0 | } |
Unexecuted instantiation: GGUFMeta::GKV<bool>::override_type_to_str(llama_model_kv_override_type)
Unexecuted instantiation: GGUFMeta::GKV<float>::override_type_to_str(llama_model_kv_override_type)
Unexecuted instantiation: GGUFMeta::GKV<unsigned int>::override_type_to_str(llama_model_kv_override_type)
Unexecuted instantiation: GGUFMeta::GKV<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >::override_type_to_str(llama_model_kv_override_type)
Unexecuted instantiation: GGUFMeta::GKV<unsigned short>::override_type_to_str(llama_model_kv_override_type)
Unexecuted instantiation: GGUFMeta::GKV<int>::override_type_to_str(llama_model_kv_override_type)
171 | | |
172 | 560 | static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override * ovrd) { |
173 | 560 | if (!ovrd) { return false; } |
174 | 0 | if (ovrd->tag == expected_type) { |
175 | 0 | LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ", |
176 | 0 | __func__, override_type_to_str(ovrd->tag), ovrd->key); |
177 | 0 | switch (ovrd->tag) { |
178 | 0 | case LLAMA_KV_OVERRIDE_TYPE_BOOL: { |
179 | 0 | LLAMA_LOG_INFO("%s\n", ovrd->val_bool ? "true" : "false"); |
180 | 0 | } break; |
181 | 0 | case LLAMA_KV_OVERRIDE_TYPE_INT: { |
182 | 0 | LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->val_i64); |
183 | 0 | } break; |
184 | 0 | case LLAMA_KV_OVERRIDE_TYPE_FLOAT: { |
185 | 0 | LLAMA_LOG_INFO("%.6f\n", ovrd->val_f64); |
186 | 0 | } break; |
187 | 0 | case LLAMA_KV_OVERRIDE_TYPE_STR: { |
188 | 0 | LLAMA_LOG_INFO("%s\n", ovrd->val_str); |
189 | 0 | } break; |
190 | 0 | default: |
191 | | // Shouldn't be possible to end up here, but just in case... |
192 | 0 | throw std::runtime_error( |
193 | 0 | format("Unsupported attempt to override %s type for metadata key %s\n", |
194 | 0 | override_type_to_str(ovrd->tag), ovrd->key)); |
195 | 0 | } |
196 | 0 | return true; |
197 | 0 | } |
198 | 0 | LLAMA_LOG_WARN("%s: Warning: Bad metadata override type for key '%s', expected %s but got %s\n", |
199 | 0 | __func__, ovrd->key, override_type_to_str(expected_type), override_type_to_str(ovrd->tag)); |
200 | 0 | return false; |
201 | 0 | } |
Unexecuted instantiation: GGUFMeta::GKV<bool>::validate_override(llama_model_kv_override_type, llama_model_kv_override const*)
Unexecuted instantiation: GGUFMeta::GKV<float>::validate_override(llama_model_kv_override_type, llama_model_kv_override const*)
GGUFMeta::GKV<unsigned int>::validate_override(llama_model_kv_override_type, llama_model_kv_override const*)
Line | Count | Source |
172 | 160 | static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override * ovrd) { |
173 | 160 | if (!ovrd) { return false; } |
174 | 0 | if (ovrd->tag == expected_type) { |
175 | 0 | LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ", |
176 | 0 | __func__, override_type_to_str(ovrd->tag), ovrd->key); |
177 | 0 | switch (ovrd->tag) { |
178 | 0 | case LLAMA_KV_OVERRIDE_TYPE_BOOL: { |
179 | 0 | LLAMA_LOG_INFO("%s\n", ovrd->val_bool ? "true" : "false"); |
180 | 0 | } break; |
181 | 0 | case LLAMA_KV_OVERRIDE_TYPE_INT: { |
182 | 0 | LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->val_i64); |
183 | 0 | } break; |
184 | 0 | case LLAMA_KV_OVERRIDE_TYPE_FLOAT: { |
185 | 0 | LLAMA_LOG_INFO("%.6f\n", ovrd->val_f64); |
186 | 0 | } break; |
187 | 0 | case LLAMA_KV_OVERRIDE_TYPE_STR: { |
188 | 0 | LLAMA_LOG_INFO("%s\n", ovrd->val_str); |
189 | 0 | } break; |
190 | 0 | default: |
191 | | // Shouldn't be possible to end up here, but just in case... |
192 | 0 | throw std::runtime_error( |
193 | 0 | format("Unsupported attempt to override %s type for metadata key %s\n", |
194 | 0 | override_type_to_str(ovrd->tag), ovrd->key)); |
195 | 0 | } |
196 | 0 | return true; |
197 | 0 | } |
198 | 0 | LLAMA_LOG_WARN("%s: Warning: Bad metadata override type for key '%s', expected %s but got %s\n", |
199 | 0 | __func__, ovrd->key, override_type_to_str(expected_type), override_type_to_str(ovrd->tag)); |
200 | 0 | return false; |
201 | 0 | } |
GGUFMeta::GKV<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >::validate_override(llama_model_kv_override_type, llama_model_kv_override const*)
Line | Count | Source |
172 | 242 | static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override * ovrd) { |
173 | 242 | if (!ovrd) { return false; } |
174 | 0 | if (ovrd->tag == expected_type) { |
175 | 0 | LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ", |
176 | 0 | __func__, override_type_to_str(ovrd->tag), ovrd->key); |
177 | 0 | switch (ovrd->tag) { |
178 | 0 | case LLAMA_KV_OVERRIDE_TYPE_BOOL: { |
179 | 0 | LLAMA_LOG_INFO("%s\n", ovrd->val_bool ? "true" : "false"); |
180 | 0 | } break; |
181 | 0 | case LLAMA_KV_OVERRIDE_TYPE_INT: { |
182 | 0 | LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->val_i64); |
183 | 0 | } break; |
184 | 0 | case LLAMA_KV_OVERRIDE_TYPE_FLOAT: { |
185 | 0 | LLAMA_LOG_INFO("%.6f\n", ovrd->val_f64); |
186 | 0 | } break; |
187 | 0 | case LLAMA_KV_OVERRIDE_TYPE_STR: { |
188 | 0 | LLAMA_LOG_INFO("%s\n", ovrd->val_str); |
189 | 0 | } break; |
190 | 0 | default: |
191 | | // Shouldn't be possible to end up here, but just in case... |
192 | 0 | throw std::runtime_error( |
193 | 0 | format("Unsupported attempt to override %s type for metadata key %s\n", |
194 | 0 | override_type_to_str(ovrd->tag), ovrd->key)); |
195 | 0 | } |
196 | 0 | return true; |
197 | 0 | } |
198 | 0 | LLAMA_LOG_WARN("%s: Warning: Bad metadata override type for key '%s', expected %s but got %s\n", |
199 | 0 | __func__, ovrd->key, override_type_to_str(expected_type), override_type_to_str(ovrd->tag)); |
200 | 0 | return false; |
201 | 0 | } |
GGUFMeta::GKV<unsigned short>::validate_override(llama_model_kv_override_type, llama_model_kv_override const*)
Line | Count | Source |
172 | 158 | static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override * ovrd) { |
173 | 158 | if (!ovrd) { return false; } |
174 | 0 | if (ovrd->tag == expected_type) { |
175 | 0 | LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ", |
176 | 0 | __func__, override_type_to_str(ovrd->tag), ovrd->key); |
177 | 0 | switch (ovrd->tag) { |
178 | 0 | case LLAMA_KV_OVERRIDE_TYPE_BOOL: { |
179 | 0 | LLAMA_LOG_INFO("%s\n", ovrd->val_bool ? "true" : "false"); |
180 | 0 | } break; |
181 | 0 | case LLAMA_KV_OVERRIDE_TYPE_INT: { |
182 | 0 | LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->val_i64); |
183 | 0 | } break; |
184 | 0 | case LLAMA_KV_OVERRIDE_TYPE_FLOAT: { |
185 | 0 | LLAMA_LOG_INFO("%.6f\n", ovrd->val_f64); |
186 | 0 | } break; |
187 | 0 | case LLAMA_KV_OVERRIDE_TYPE_STR: { |
188 | 0 | LLAMA_LOG_INFO("%s\n", ovrd->val_str); |
189 | 0 | } break; |
190 | 0 | default: |
191 | | // Shouldn't be possible to end up here, but just in case... |
192 | 0 | throw std::runtime_error( |
193 | 0 | format("Unsupported attempt to override %s type for metadata key %s\n", |
194 | 0 | override_type_to_str(ovrd->tag), ovrd->key)); |
195 | 0 | } |
196 | 0 | return true; |
197 | 0 | } |
198 | 0 | LLAMA_LOG_WARN("%s: Warning: Bad metadata override type for key '%s', expected %s but got %s\n", |
199 | 0 | __func__, ovrd->key, override_type_to_str(expected_type), override_type_to_str(ovrd->tag)); |
200 | 0 | return false; |
201 | 0 | } |
Unexecuted instantiation: GGUFMeta::GKV<int>::validate_override(llama_model_kv_override_type, llama_model_kv_override const*)
202 | | |
203 | | template<typename OT> |
204 | | static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type |
205 | 0 | try_override(OT & target, const struct llama_model_kv_override * ovrd) { |
206 | 0 | if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, ovrd)) { |
207 | 0 | target = ovrd->val_bool; |
208 | 0 | return true; |
209 | 0 | } |
210 | 0 | return false; |
211 | 0 | } |
212 | | |
213 | | template<typename OT> |
214 | | static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type |
215 | 318 | try_override(OT & target, const struct llama_model_kv_override * ovrd) { |
216 | 318 | if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, ovrd)) { |
217 | 0 | target = ovrd->val_i64; |
218 | 0 | return true; |
219 | 0 | } |
220 | 318 | return false; |
221 | 318 | } |
_ZN8GGUFMeta3GKVIjE12try_overrideIjEENSt3__19enable_ifIXaantsr3std7is_sameIT_bEE5valuesr3std11is_integralIS5_EE5valueEbE4typeERS5_PK23llama_model_kv_override
Line | Count | Source |
215 | 160 | try_override(OT & target, const struct llama_model_kv_override * ovrd) { |
216 | 160 | if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, ovrd)) { |
217 | 0 | target = ovrd->val_i64; |
218 | 0 | return true; |
219 | 0 | } |
220 | 160 | return false; |
221 | 160 | } |
_ZN8GGUFMeta3GKVItE12try_overrideItEENSt3__19enable_ifIXaantsr3std7is_sameIT_bEE5valuesr3std11is_integralIS5_EE5valueEbE4typeERS5_PK23llama_model_kv_override
Line | Count | Source |
215 | 158 | try_override(OT & target, const struct llama_model_kv_override * ovrd) { |
216 | 158 | if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, ovrd)) { |
217 | 0 | target = ovrd->val_i64; |
218 | 0 | return true; |
219 | 0 | } |
220 | 158 | return false; |
221 | 158 | } |
Unexecuted instantiation: _ZN8GGUFMeta3GKVIiE12try_overrideIiEENSt3__19enable_ifIXaantsr3std7is_sameIT_bEE5valuesr3std11is_integralIS5_EE5valueEbE4typeERS5_PK23llama_model_kv_override
222 | | |
223 | | template<typename OT> |
224 | | static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type |
225 | 0 | try_override(T & target, const struct llama_model_kv_override * ovrd) { |
226 | 0 | if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, ovrd)) { |
227 | 0 | target = ovrd->val_f64; |
228 | 0 | return true; |
229 | 0 | } |
230 | 0 | return false; |
231 | 0 | } |
232 | | |
233 | | template<typename OT> |
234 | | static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type |
235 | 242 | try_override(T & target, const struct llama_model_kv_override * ovrd) { |
236 | 242 | if (validate_override(LLAMA_KV_OVERRIDE_TYPE_STR, ovrd)) { |
237 | 0 | target = ovrd->val_str; |
238 | 0 | return true; |
239 | 0 | } |
240 | 242 | return false; |
241 | 242 | } |
242 | | |
243 | 560 | static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) { |
244 | 560 | if (try_override<T>(target, ovrd)) { |
245 | 0 | return true; |
246 | 0 | } |
247 | 560 | if (k < 0) { return false; } |
248 | 6 | target = get_kv(ctx, k); |
249 | 6 | return true; |
250 | 560 | } |
Unexecuted instantiation: GGUFMeta::GKV<bool>::set(gguf_context const*, int, bool&, llama_model_kv_override const*)
Unexecuted instantiation: GGUFMeta::GKV<float>::set(gguf_context const*, int, float&, llama_model_kv_override const*)
GGUFMeta::GKV<unsigned int>::set(gguf_context const*, int, unsigned int&, llama_model_kv_override const*)
Line | Count | Source |
243 | 160 | static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) { |
244 | 160 | if (try_override<T>(target, ovrd)) { |
245 | 0 | return true; |
246 | 0 | } |
247 | 160 | if (k < 0) { return false; } |
248 | 0 | target = get_kv(ctx, k); |
249 | 0 | return true; |
250 | 160 | } |
GGUFMeta::GKV<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >::set(gguf_context const*, int, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >&, llama_model_kv_override const*)
Line | Count | Source |
243 | 242 | static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) { |
244 | 242 | if (try_override<T>(target, ovrd)) { |
245 | 0 | return true; |
246 | 0 | } |
247 | 242 | if (k < 0) { return false; } |
248 | 5 | target = get_kv(ctx, k); |
249 | 5 | return true; |
250 | 242 | } |
GGUFMeta::GKV<unsigned short>::set(gguf_context const*, int, unsigned short&, llama_model_kv_override const*)
Line | Count | Source |
243 | 158 | static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) { |
244 | 158 | if (try_override<T>(target, ovrd)) { |
245 | 0 | return true; |
246 | 0 | } |
247 | 158 | if (k < 0) { return false; } |
248 | 1 | target = get_kv(ctx, k); |
249 | 1 | return true; |
250 | 158 | } |
Unexecuted instantiation: GGUFMeta::GKV<int>::set(gguf_context const*, int, int&, llama_model_kv_override const*)
251 | | |
252 | 560 | static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override * ovrd = nullptr) { |
253 | 560 | return set(ctx, gguf_find_key(ctx, key), target, ovrd); |
254 | 560 | } |
Unexecuted instantiation: GGUFMeta::GKV<bool>::set(gguf_context const*, char const*, bool&, llama_model_kv_override const*)
Unexecuted instantiation: GGUFMeta::GKV<float>::set(gguf_context const*, char const*, float&, llama_model_kv_override const*)
GGUFMeta::GKV<unsigned int>::set(gguf_context const*, char const*, unsigned int&, llama_model_kv_override const*)
Line | Count | Source |
252 | 160 | static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override * ovrd = nullptr) { |
253 | 160 | return set(ctx, gguf_find_key(ctx, key), target, ovrd); |
254 | 160 | } |
GGUFMeta::GKV<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >::set(gguf_context const*, char const*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >&, llama_model_kv_override const*)
Line | Count | Source |
252 | 242 | static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override * ovrd = nullptr) { |
253 | 242 | return set(ctx, gguf_find_key(ctx, key), target, ovrd); |
254 | 242 | } |
GGUFMeta::GKV<unsigned short>::set(gguf_context const*, char const*, unsigned short&, llama_model_kv_override const*)
Line | Count | Source |
252 | 158 | static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override * ovrd = nullptr) { |
253 | 158 | return set(ctx, gguf_find_key(ctx, key), target, ovrd); |
254 | 158 | } |
Unexecuted instantiation: GGUFMeta::GKV<int>::set(gguf_context const*, char const*, int&, llama_model_kv_override const*)
255 | | |
256 | 560 | static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override * ovrd = nullptr) { |
257 | 560 | return set(ctx, key.c_str(), target, ovrd); |
258 | 560 | } |
Unexecuted instantiation: GGUFMeta::GKV<bool>::set(gguf_context const*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, bool&, llama_model_kv_override const*)
Unexecuted instantiation: GGUFMeta::GKV<float>::set(gguf_context const*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, float&, llama_model_kv_override const*)
GGUFMeta::GKV<unsigned int>::set(gguf_context const*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, unsigned int&, llama_model_kv_override const*)
Line | Count | Source |
256 | 160 | static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override * ovrd = nullptr) { |
257 | 160 | return set(ctx, key.c_str(), target, ovrd); |
258 | 160 | } |
GGUFMeta::GKV<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >::set(gguf_context const*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >&, llama_model_kv_override const*)
Line | Count | Source |
256 | 242 | static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override * ovrd = nullptr) { |
257 | 242 | return set(ctx, key.c_str(), target, ovrd); |
258 | 242 | } |
GGUFMeta::GKV<unsigned short>::set(gguf_context const*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, unsigned short&, llama_model_kv_override const*)
Line | Count | Source |
256 | 158 | static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override * ovrd = nullptr) { |
257 | 158 | return set(ctx, key.c_str(), target, ovrd); |
258 | 158 | } |
Unexecuted instantiation: GGUFMeta::GKV<int>::set(gguf_context const*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, int&, llama_model_kv_override const*)
259 | | }; |
260 | | } |
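How these GKV helpers are reached from the public API, in brief: llama_model_params.kv_overrides points at an array of llama_model_kv_override entries terminated by an empty key, the loader constructor below copies them into kv_overrides, and each get_key call hands a matching entry to try_override/validate_override. A hedged caller-side sketch, not part of the listing; the key and file names are only examples:

    // caller-side sketch: building the override list consumed by validate_override/try_override
    #include "llama.h"
    #include <cstring>
    #include <vector>

    int main() {
        std::vector<llama_model_kv_override> overrides(2);   // value-initialized, i.e. zeroed

        std::strncpy(overrides[0].key, "tokenizer.ggml.add_bos_token", sizeof(overrides[0].key) - 1);
        overrides[0].tag      = LLAMA_KV_OVERRIDE_TYPE_BOOL;
        overrides[0].val_bool = false;
        // overrides[1] stays zeroed: the empty key terminates the list,
        // matching the constructor's loop condition p->key[0] != 0 below

        llama_model_params mparams = llama_model_default_params();
        mparams.kv_overrides = overrides.data();
        // loading a model with these params routes the key through
        // GKV<bool>::try_override / validate_override during metadata reads
        return 0;
    }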
261 | | |
262 | | template<typename T> |
263 | | typename std::enable_if<std::is_integral<T>::value, bool>::type |
264 | 0 | llama_model_loader::get_arr_n(const std::string & key, T & result, bool required) { |
265 | 0 | const int kid = gguf_find_key(meta.get(), key.c_str()); |
266 | |
267 | 0 | if (kid < 0) { |
268 | 0 | if (required) { |
269 | 0 | throw std::runtime_error(format("key not found in model: %s", key.c_str())); |
270 | 0 | } |
271 | 0 | return false; |
272 | 0 | } |
273 | | |
274 | 0 | struct GGUFMeta::ArrayInfo arr_info = |
275 | 0 | GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid); |
276 | | |
277 | |
278 | 0 | result = arr_info.length; |
279 | 0 | return true; |
280 | 0 | } |
281 | | |
282 | | template<typename T> |
283 | | typename std::enable_if<std::is_integral<T>::value, bool>::type |
284 | 0 | llama_model_loader::get_arr_n(enum llm_kv kid, T & result, bool required) { |
285 | 0 | return get_arr_n(llm_kv(kid), result, required); |
286 | 0 | } |
287 | | |
288 | | template bool llama_model_loader::get_arr_n(enum llm_kv kid, uint32_t & result, bool required); |
289 | | |
290 | | template<typename T> |
291 | 0 | bool llama_model_loader::get_arr(const std::string & key, std::vector<T> & result, bool required) { |
292 | 0 | const gguf_context * ctx = meta.get(); |
293 | 0 | const int kid = gguf_find_key(ctx, key.c_str()); |
294 | |
295 | 0 | if (kid < 0 || gguf_get_kv_type(ctx, kid) != GGUF_TYPE_ARRAY) { |
296 | 0 | if (required) { |
297 | 0 | throw std::runtime_error(format("array key not found in model: %s", key.c_str())); |
298 | 0 | } |
299 | 0 | return false; |
300 | 0 | } |
301 | | |
302 | 0 | struct GGUFMeta::ArrayInfo arr_info = |
303 | 0 | GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx, kid); |
304 | |
305 | 0 | switch (arr_info.gt) { |
306 | 0 | case GGUF_TYPE_UINT32: |
307 | 0 | case GGUF_TYPE_INT32: GGML_ASSERT((std::is_same<T, int32_t>::value) || |
308 | 0 | (std::is_same<T, uint32_t>::value)); break; |
309 | 0 | case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break; |
310 | 0 | case GGUF_TYPE_STRING: GGML_ASSERT((std::is_same<T, std::string>::value)); break; |
311 | 0 | default: |
312 | 0 | throw std::runtime_error(format("%s is not a string/float32/uint32/int32 array", key.c_str())); |
313 | 0 | } |
314 | | |
315 | 0 | if constexpr (std::is_same<T, std::string>::value) { |
316 | 0 | const size_t n_items = gguf_get_arr_n(ctx, kid); |
317 | 0 | result.clear(); |
318 | |
319 | 0 | for (size_t i = 0; i < n_items; i++) { |
320 | 0 | const T value = gguf_get_arr_str(ctx, kid, i); |
321 | 0 | result.emplace_back(value); |
322 | 0 | } |
323 | | } else { |
324 | | result.resize(arr_info.length); |
325 | | result.assign((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length); |
326 | | } |
327 | |
328 | 0 | return true; |
329 | 0 | } |
330 | | |
331 | | template<typename T, size_t N_MAX> |
332 | 0 | bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) { |
333 | 0 | const gguf_context * ctx = meta.get(); |
334 | 0 | const int kid = gguf_find_key(ctx, key.c_str()); |
335 | |
336 | 0 | if (kid < 0 || gguf_get_kv_type(ctx, kid) != GGUF_TYPE_ARRAY) { |
337 | 0 | if (required) { |
338 | 0 | throw std::runtime_error(format("array key not found in model: %s", key.c_str())); |
339 | 0 | } |
340 | 0 | return false; |
341 | 0 | } |
342 | | |
343 | 0 | struct GGUFMeta::ArrayInfo arr_info = |
344 | 0 | GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx, kid); |
345 | |
346 | 0 | switch (arr_info.gt) { |
347 | 0 | case GGUF_TYPE_UINT32: |
348 | 0 | case GGUF_TYPE_INT32: GGML_ASSERT((std::is_same<T, int32_t>::value) || |
349 | 0 | (std::is_same<T, uint32_t>::value)); break; |
350 | 0 | case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break; |
351 | 0 | case GGUF_TYPE_STRING: GGML_ASSERT((std::is_same<T, std::string>::value)); break; |
352 | 0 | default: |
353 | 0 | throw std::runtime_error(format("%s is not a string/float32/uint32/int32 array", key.c_str())); |
354 | 0 | } |
355 | | |
356 | 0 | if (arr_info.length > N_MAX) { |
357 | 0 | throw std::runtime_error(format("array length %u for key %s exceeds max %u", (uint32_t) arr_info.length, key.c_str(), (uint32_t) N_MAX)); |
358 | 0 | } |
359 | | |
360 | | if constexpr (std::is_same<T, std::string>::value) { |
361 | | const size_t n_items = gguf_get_arr_n(ctx, kid); |
362 | | |
363 | | for (size_t i = 0; i < n_items; i++) { |
364 | | const T value = gguf_get_arr_str(ctx, kid, i); |
365 | | result[i] = value; |
366 | | } |
367 | 0 | } else { |
368 | 0 | std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin()); |
369 | 0 | } |
370 | |
371 | 0 | return true; |
372 | 0 | } |
Unexecuted instantiation: bool llama_model_loader::get_arr<int, 4ul>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::array<int, 4ul>&, bool)
Unexecuted instantiation: bool llama_model_loader::get_arr<unsigned int, 512ul>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::array<unsigned int, 512ul>&, bool)
Unexecuted instantiation: bool llama_model_loader::get_arr<float, 512ul>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::array<float, 512ul>&, bool)
373 | | |
374 | | template<typename T> |
375 | 0 | bool llama_model_loader::get_arr(enum llm_kv kid, T & result, bool required) { |
376 | 0 | return get_arr(llm_kv(kid), result, required); |
377 | 0 | } |
378 | | |
379 | | template bool llama_model_loader::get_arr<std::vector<std::string>>(enum llm_kv kid, std::vector<std::string> & result, bool required); |
380 | | |
381 | | template<typename T> |
382 | 560 | bool llama_model_loader::get_key(const std::string & key, T & result, bool required) { |
383 | 560 | auto it = kv_overrides.find(key); |
384 | | |
385 | 560 | const struct llama_model_kv_override * override = |
386 | 560 | it != kv_overrides.end() ? &it->second : nullptr; |
387 | | |
388 | 560 | const bool found = GGUFMeta::GKV<T>::set(meta.get(), key, result, override); |
389 | | |
390 | 560 | if (required && !found) { |
391 | 3 | throw std::runtime_error(format("key not found in model: %s", key.c_str())); |
392 | 3 | } |
393 | | |
394 | 557 | return found; |
395 | 560 | } |
Unexecuted instantiation: bool llama_model_loader::get_key<bool>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, bool&, bool)
Unexecuted instantiation: bool llama_model_loader::get_key<float>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, float&, bool)
bool llama_model_loader::get_key<unsigned int>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, unsigned int&, bool)
Line | Count | Source |
382 | 160 | bool llama_model_loader::get_key(const std::string & key, T & result, bool required) { |
383 | 160 | auto it = kv_overrides.find(key); |
384 | | |
385 | 160 | const struct llama_model_kv_override * override = |
386 | 160 | it != kv_overrides.end() ? &it->second : nullptr; |
387 | | |
388 | 160 | const bool found = GGUFMeta::GKV<T>::set(meta.get(), key, result, override); |
389 | | |
390 | 160 | if (required && !found) { |
391 | 3 | throw std::runtime_error(format("key not found in model: %s", key.c_str())); |
392 | 3 | } |
393 | | |
394 | 157 | return found; |
395 | 160 | } |
bool llama_model_loader::get_key<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >&, bool)
Line | Count | Source |
382 | 242 | bool llama_model_loader::get_key(const std::string & key, T & result, bool required) { |
383 | 242 | auto it = kv_overrides.find(key); |
384 | | |
385 | 242 | const struct llama_model_kv_override * override = |
386 | 242 | it != kv_overrides.end() ? &it->second : nullptr; |
387 | | |
388 | 242 | const bool found = GGUFMeta::GKV<T>::set(meta.get(), key, result, override); |
389 | | |
390 | 242 | if (required && !found) { |
391 | 0 | throw std::runtime_error(format("key not found in model: %s", key.c_str())); |
392 | 0 | } |
393 | | |
394 | 242 | return found; |
395 | 242 | } |
bool llama_model_loader::get_key<unsigned short>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, unsigned short&, bool)
Line | Count | Source |
382 | 158 | bool llama_model_loader::get_key(const std::string & key, T & result, bool required) { |
383 | 158 | auto it = kv_overrides.find(key); |
384 | | |
385 | 158 | const struct llama_model_kv_override * override = |
386 | 158 | it != kv_overrides.end() ? &it->second : nullptr; |
387 | | |
388 | 158 | const bool found = GGUFMeta::GKV<T>::set(meta.get(), key, result, override); |
389 | | |
390 | 158 | if (required && !found) { |
391 | 0 | throw std::runtime_error(format("key not found in model: %s", key.c_str())); |
392 | 0 | } |
393 | | |
394 | 158 | return found; |
395 | 158 | } |
Unexecuted instantiation: bool llama_model_loader::get_key<int>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, int&, bool)
396 | | |
397 | | template<typename T> |
398 | 163 | bool llama_model_loader::get_key(enum llm_kv kid, T & result, bool required) { |
399 | 163 | return get_key(llm_kv(kid), result, required); |
400 | 163 | } |
Unexecuted instantiation: bool llama_model_loader::get_key<bool>(llm_kv, bool&, bool)
Unexecuted instantiation: bool llama_model_loader::get_key<float>(llm_kv, float&, bool)
bool llama_model_loader::get_key<unsigned int>(llm_kv, unsigned int&, bool)
Line | Count | Source |
398 | 160 | bool llama_model_loader::get_key(enum llm_kv kid, T & result, bool required) { |
399 | 160 | return get_key(llm_kv(kid), result, required); |
400 | 160 | } |
bool llama_model_loader::get_key<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >(llm_kv, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >&, bool)
Line | Count | Source |
398 | 3 | bool llama_model_loader::get_key(enum llm_kv kid, T & result, bool required) { |
399 | 3 | return get_key(llm_kv(kid), result, required); |
400 | 3 | } |
401 | | |
402 | | template bool llama_model_loader::get_key<bool> (enum llm_kv kid, bool & result, bool required); |
403 | | template bool llama_model_loader::get_key<float> (enum llm_kv kid, float & result, bool required); |
404 | | template bool llama_model_loader::get_key<uint32_t> (enum llm_kv kid, uint32_t & result, bool required); |
405 | | template bool llama_model_loader::get_key<std::string>(enum llm_kv kid, std::string & result, bool required); |
406 | | |
407 | | template<> |
408 | 0 | bool llama_model_loader::get_key(enum llm_kv kid, enum llama_pooling_type & result, bool required) { |
409 | 0 | uint32_t tmp; |
410 | 0 | const bool found = get_key(kid, tmp, required); |
411 | 0 | if (found) { |
412 | 0 | result = (enum llama_pooling_type) tmp; |
413 | 0 | } else { |
414 | 0 | result = LLAMA_POOLING_TYPE_UNSPECIFIED; |
415 | 0 | } |
416 | 0 | return found; |
417 | 0 | } |
418 | | |
419 | | // get array of n <= N_MAX elements, or a single element repeated n times |
420 | | template<typename T, size_t N_MAX> |
421 | 0 | bool llama_model_loader::get_key_or_arr(const std::string & key, std::array<T, N_MAX> & result, uint32_t n, bool required) { |
422 | 0 | const int kid = gguf_find_key(meta.get(), key.c_str()); |
423 | |
424 | 0 | if (kid < 0) { |
425 | 0 | if (required) { |
426 | 0 | throw std::runtime_error(format("key not found in model: %s", key.c_str())); |
427 | 0 | } |
428 | 0 | return false; |
429 | 0 | } |
430 | | |
431 | 0 | if (n > N_MAX) { |
432 | 0 | throw std::runtime_error(format("n > N_MAX: %u > %u for key %s", (uint32_t) n, (uint32_t) N_MAX, key.c_str())); |
433 | 0 | } |
434 | | |
435 | 0 | if (gguf_get_kv_type(meta.get(), kid) == GGUF_TYPE_ARRAY) { |
436 | 0 | struct GGUFMeta::ArrayInfo arr_info = |
437 | 0 | GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid); |
438 | |
439 | 0 | if (n != arr_info.length) { |
440 | 0 | throw std::runtime_error(format("key %s has wrong array length; expected %u, got %u", key.c_str(), n, (uint32_t) arr_info.length)); |
441 | 0 | } |
442 | | |
443 | 0 | return get_arr(key, result, required); |
444 | 0 | } |
445 | | |
446 | 0 | T value; |
447 | |
448 | 0 | bool ok = get_key(key, value, required); |
449 | 0 | if (!ok) { |
450 | 0 | return false; |
451 | 0 | } |
452 | | |
453 | 0 | for (uint32_t i = 0; i < n; i++) { |
454 | 0 | result[i] = value; |
455 | 0 | } |
456 | |
457 | 0 | return true; |
458 | 0 | } |
Unexecuted instantiation: bool llama_model_loader::get_key_or_arr<int, 4ul>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::array<int, 4ul>&, unsigned int, bool)
Unexecuted instantiation: bool llama_model_loader::get_key_or_arr<unsigned int, 512ul>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::array<unsigned int, 512ul>&, unsigned int, bool)
Unexecuted instantiation: bool llama_model_loader::get_key_or_arr<float, 512ul>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::array<float, 512ul>&, unsigned int, bool)
459 | | |
460 | | template<typename T> |
461 | 0 | bool llama_model_loader::get_key_or_arr(enum llm_kv kid, T & result, uint32_t n, bool required) { |
462 | 0 | return get_key_or_arr(llm_kv(kid), result, n, required); |
463 | 0 | } |
Unexecuted instantiation: bool llama_model_loader::get_key_or_arr<std::__1::array<int, 4ul> >(llm_kv, std::__1::array<int, 4ul>&, unsigned int, bool)
Unexecuted instantiation: bool llama_model_loader::get_key_or_arr<std::__1::array<unsigned int, 512ul> >(llm_kv, std::__1::array<unsigned int, 512ul>&, unsigned int, bool)
Unexecuted instantiation: bool llama_model_loader::get_key_or_arr<std::__1::array<float, 512ul> >(llm_kv, std::__1::array<float, 512ul>&, unsigned int, bool)
464 | | |
465 | | // TODO: this is not very clever - figure out something better |
466 | | template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required); |
467 | | template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required); |
468 | | template bool llama_model_loader::get_key_or_arr<std::array<float, 512>>(enum llm_kv kid, std::array<float, 512> & result, uint32_t n, bool required); |
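A sketch of how the get_key overloads above are typically called from the hyperparameter-loading code (the real call sites are in llama-model.cpp); the local variable names are illustrative and the llm_kv enumerators are assumed from llama-arch.h:

    // sketch of typical get_key call sites (the real ones live in llama-model.cpp)
    #include "llama-model-loader.h"

    static void load_hparams_sketch(llama_model_loader & ml) {
        uint32_t    n_ctx_train = 0;
        uint32_t    n_expert    = 0;
        std::string model_name;

        ml.get_key(LLM_KV_CONTEXT_LENGTH, n_ctx_train);      // required by default: throws "key not found in model" on a miss
        ml.get_key(LLM_KV_EXPERT_COUNT,   n_expert, false);  // optional: keeps the caller's default on a miss
        ml.get_key(LLM_KV_GENERAL_NAME,   model_name, false);
    }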
469 | | |
470 | | |
471 | | llama_model_loader::llama_model_loader( |
472 | | const std::string & fname, |
473 | | std::vector<std::string> & splits, |
474 | | bool use_mmap, |
475 | | bool check_tensors, |
476 | | const llama_model_kv_override * param_overrides_p, |
477 | 862 | const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) { |
478 | 862 | int trace = 0; |
479 | 862 | if (getenv("LLAMA_TRACE")) { |
480 | 0 | trace = atoi(getenv("LLAMA_TRACE")); |
481 | 0 | } |
482 | | |
483 | 862 | if (param_overrides_p != nullptr) { |
484 | 0 | for (const struct llama_model_kv_override * p = param_overrides_p; p->key[0] != 0; p++) { |
485 | 0 | kv_overrides.insert({std::string(p->key), *p}); |
486 | 0 | } |
487 | 0 | } |
488 | | |
489 | 862 | tensor_buft_overrides = param_tensor_buft_overrides_p; |
490 | | |
491 | | // Load the main GGUF |
492 | 862 | struct ggml_context * ctx = NULL; |
493 | 862 | struct gguf_init_params params = { |
494 | 862 | /*.no_alloc = */ true, |
495 | 862 | /*.ctx = */ &ctx, |
496 | 862 | }; |
497 | | |
498 | 862 | meta.reset(gguf_init_from_file(fname.c_str(), params)); |
499 | 862 | if (!meta) { |
500 | 568 | throw std::runtime_error(format("%s: failed to load model from %s", __func__, fname.c_str())); |
501 | 568 | } |
502 | | |
503 | 294 | get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false); |
504 | 294 | llm_kv = LLM_KV(llm_arch_from_string(arch_name)); |
505 | | |
506 | 294 | files.emplace_back(new llama_file(fname.c_str(), "rb")); |
507 | 294 | contexts.emplace_back(ctx); |
508 | | |
509 | | // Save tensors data offset of the main file. |
510 | | // For subsidiary files, `meta` tensor data offset must not be used, |
511 | | // so we build a unified tensors index for weights. |
512 | 932 | for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { |
513 | 638 | std::string tensor_name = std::string(cur->name); |
514 | | // make sure there is no duplicated tensor names |
515 | 638 | if (weights_map.find(tensor_name) != weights_map.end()) { |
516 | 0 | throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur))); |
517 | 0 | } |
518 | 638 | n_elements += ggml_nelements(cur); |
519 | 638 | n_bytes += ggml_nbytes(cur); |
520 | 638 | weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, meta.get(), cur)); |
521 | 638 | } |
522 | 294 | uint16_t n_split = 0; |
523 | 294 | get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false); |
524 | | |
525 | | // Load additional GGML contexts |
526 | 294 | if (n_split > 1) { |
527 | | // make sure the main file is loaded first |
528 | 0 | uint16_t idx = 0; |
529 | 0 | const std::string kv_split_no = llm_kv(LLM_KV_SPLIT_NO); |
530 | 0 | get_key(kv_split_no, idx); |
531 | 0 | if (idx != 0) { |
532 | 0 | throw std::runtime_error(format("illegal split file idx: %d (file: %s), model must be loaded with the first split", idx, fname.c_str())); |
533 | 0 | } |
534 | | |
535 | | // generate list of splits if needed |
536 | 0 | if (splits.empty()) { |
537 | 0 | splits = llama_get_list_splits(fname, idx, n_split); |
538 | 0 | } |
539 | | |
540 | | // in case user give a custom list of splits, check if it matches the expected number |
541 | 0 | if (n_split != (uint16_t)splits.size()) { |
542 | 0 | throw std::runtime_error(format("invalid split count, given: %zu splits, but expected %d", splits.size(), n_split)); |
543 | 0 | } |
544 | | |
545 | 0 | if (trace > 0) { |
546 | 0 | LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split); |
547 | 0 | } |
548 | | |
549 | | // load other splits |
550 | 0 | for (idx = 1; idx < n_split; idx++) { |
551 | 0 | const char * fname_split = splits[idx].c_str(); |
552 | |
553 | 0 | struct gguf_init_params split_params = { |
554 | 0 | /*.no_alloc = */ true, |
555 | 0 | /*.ctx = */ &ctx, |
556 | 0 | }; |
557 | 0 | gguf_context_ptr ctx_gguf { gguf_init_from_file(fname_split, split_params) }; |
558 | 0 | if (!ctx_gguf) { |
559 | 0 | throw std::runtime_error(format("%s: failed to load GGUF split from %s", __func__, fname_split)); |
560 | 0 | } |
561 | | |
562 | | // check idx |
563 | 0 | { |
564 | 0 | const int kid = gguf_find_key(ctx_gguf.get(), kv_split_no.c_str()); |
565 | 0 | if (kid < 0) { |
566 | 0 | throw std::runtime_error(format("missing key %s in GGUF split %s", kv_split_no.c_str(), fname_split)); |
567 | 0 | } |
568 | 0 | int idx_gguf = gguf_get_val_u16(ctx_gguf.get(), kid); |
569 | 0 | if (idx_gguf != idx) { |
570 | 0 | throw std::runtime_error(format("invalid split file idx: %d (file: %s), expected %d", idx_gguf, fname_split, idx)); |
571 | 0 | } |
572 | 0 | } |
573 | | |
574 | 0 | files.emplace_back(new llama_file(fname_split, "rb")); |
575 | 0 | contexts.emplace_back(ctx); |
576 | | |
577 | | // Save tensors data offset info of the shard. |
578 | 0 | for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { |
579 | 0 | std::string tensor_name = std::string(cur->name); |
580 | | // make sure there is no duplicated tensor names |
581 | 0 | if (weights_map.find(tensor_name) != weights_map.end()) { |
582 | 0 | throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur))); |
583 | 0 | } |
584 | 0 | n_elements += ggml_nelements(cur); |
585 | 0 | n_bytes += ggml_nbytes(cur); |
586 | 0 | weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), idx, ctx_gguf.get(), cur)); |
587 | 0 | } |
588 | 0 | } |
589 | | |
590 | 0 | get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors); |
591 | | |
592 | | // sanity check |
593 | 0 | { |
594 | 0 | const int n_tensors_loaded = (int) weights_map.size(); |
595 | 0 | if (n_tensors != n_tensors_loaded) { |
596 | 0 | throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded)); |
597 | 0 | } |
598 | 0 | } |
599 | | |
600 | 0 | LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1); |
601 | 0 | } |
602 | | |
603 | 294 | n_kv = gguf_get_n_kv(meta.get()); |
604 | 294 | n_tensors = weights_map.size(); |
605 | | |
606 | 294 | fver = (enum llama_fver) gguf_get_version(meta.get()); |
607 | | |
608 | 294 | LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n", |
609 | 294 | __func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver)); |
610 | | |
611 | | // determine file type based on the number of tensors for each quantization and print meta data |
612 | | // TODO: make optional |
613 | 294 | { |
614 | 294 | std::map<enum ggml_type, uint32_t> n_type; |
615 | | |
616 | 294 | uint32_t n_type_max = 0; |
617 | 294 | enum ggml_type type_max = GGML_TYPE_F32; |
618 | | |
619 | 294 | for (const auto & it : weights_map) { |
620 | 207 | const llama_tensor_weight & w = it.second; |
621 | 207 | const ggml_tensor * tensor = w.tensor; |
622 | | |
623 | 207 | enum ggml_type type = tensor->type; |
624 | | |
625 | 207 | n_type[type]++; |
626 | | |
627 | 207 | if (n_type_max < n_type[type]) { |
628 | 189 | n_type_max = n_type[type]; |
629 | 189 | type_max = type; |
630 | 189 | } |
631 | | |
632 | 207 | if (trace > 0) { |
633 | 0 | const uint16_t sid = w.idx; |
634 | 0 | LLAMA_LOG_INFO("%s: - tensor split %2d: %32s %-8s [ %s ] %8.2f MiB\n", __func__, |
635 | 0 | sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str(), |
636 | 0 | ggml_nbytes(tensor)/1024.0f/1024.0f); |
637 | 0 | } |
638 | 207 | } |
639 | | |
640 | 294 | switch (type_max) { |
641 | 154 | case GGML_TYPE_F32: ftype = LLAMA_FTYPE_ALL_F32; break; |
642 | 1 | case GGML_TYPE_F16: ftype = LLAMA_FTYPE_MOSTLY_F16; break; |
643 | 0 | case GGML_TYPE_BF16: ftype = LLAMA_FTYPE_MOSTLY_BF16; break; |
644 | 0 | case GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break; |
645 | 0 | case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break; |
646 | 0 | case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break; |
647 | 0 | case GGML_TYPE_Q5_1: ftype = LLAMA_FTYPE_MOSTLY_Q5_1; break; |
648 | 0 | case GGML_TYPE_Q8_0: ftype = LLAMA_FTYPE_MOSTLY_Q8_0; break; |
649 | 0 | case GGML_TYPE_Q2_K: ftype = LLAMA_FTYPE_MOSTLY_Q2_K; break; |
650 | 0 | case GGML_TYPE_Q3_K: ftype = LLAMA_FTYPE_MOSTLY_Q3_K_M; break; |
651 | 0 | case GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break; |
652 | 0 | case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break; |
653 | 0 | case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break; |
654 | 0 | case GGML_TYPE_TQ1_0: ftype = LLAMA_FTYPE_MOSTLY_TQ1_0; break; |
655 | 0 | case GGML_TYPE_TQ2_0: ftype = LLAMA_FTYPE_MOSTLY_TQ2_0; break; |
656 | 0 | case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break; |
657 | 0 | case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break; |
658 | 0 | case GGML_TYPE_IQ2_S: ftype = LLAMA_FTYPE_MOSTLY_IQ2_S; break; |
659 | 0 | case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break; |
660 | 0 | case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break; |
661 | 0 | case GGML_TYPE_IQ1_M: ftype = LLAMA_FTYPE_MOSTLY_IQ1_M; break; |
662 | 0 | case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break; |
663 | 0 | case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break; |
664 | 0 | case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break; |
665 | 2 | default: |
666 | 2 | { |
667 | 2 | LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max)); |
668 | 2 | ftype = LLAMA_FTYPE_ALL_F32; |
669 | 2 | } break; |
670 | 294 | } |
671 | | |
672 | | // this is a way to mark that we have "guessed" the file type |
673 | 157 | ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED); |
674 | | |
675 | 157 | { |
676 | 157 | uint32_t ftype_val = 0; |
677 | 157 | if (get_key(LLM_KV_GENERAL_FILE_TYPE, ftype_val, false)) { |
678 | 0 | ftype = (llama_ftype) ftype_val; |
679 | 0 | } |
680 | 157 | } |
681 | | |
682 | 157 | LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__); |
683 | | |
684 | 1.48k | for (int i = 0; i < n_kv; i++) { |
685 | 1.32k | const char * name = gguf_get_key(meta.get(), i); |
686 | 1.32k | const enum gguf_type type = gguf_get_kv_type(meta.get(), i); |
687 | 1.32k | const std::string type_name = |
688 | 1.32k | type == GGUF_TYPE_ARRAY |
689 | 1.32k | ? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta.get(), i)), gguf_get_arr_n(meta.get(), i)) |
690 | 1.32k | : gguf_type_name(type); |
691 | | |
692 | 1.32k | std::string value = gguf_kv_to_str(meta.get(), i); |
693 | 1.32k | const size_t MAX_VALUE_LEN = 40; |
694 | 1.32k | if (value.size() > MAX_VALUE_LEN) { |
695 | 69 | value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str()); |
696 | 69 | } |
697 | 1.32k | replace_all(value, "\n", "\\n"); |
698 | | |
699 | 1.32k | LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str()); |
700 | 1.32k | } |
701 | | |
702 | | // print type counts |
703 | 157 | for (auto & kv : n_type) { |
704 | 69 | if (kv.second == 0) { |
705 | 0 | continue; |
706 | 0 | } |
707 | | |
708 | 69 | LLAMA_LOG_INFO("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second); |
709 | 69 | } |
710 | 157 | } |
711 | | |
712 | 157 | if (!llama_mmap::SUPPORTED) { |
713 | 0 | LLAMA_LOG_WARN("%s: mmap is not supported on this platform\n", __func__); |
714 | 0 | use_mmap = false; |
715 | 0 | } |
716 | | |
717 | 157 | this->use_mmap = use_mmap; |
718 | 157 | this->check_tensors = check_tensors; |
719 | 157 | } |
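A construction sketch matching the signature above, not part of the listing; the path is illustrative and the logging line only echoes members that this file itself populates (n_kv, n_tensors, arch_name):

    // construction sketch for the loader defined above
    #include "llama-model-loader.h"

    static void open_model_sketch() {
        std::vector<std::string> splits;   // empty: derived from the split metadata when n_split > 1
        llama_model_loader ml("models/model-00001-of-00004.gguf", splits,
                              /*use_mmap      =*/ true,
                              /*check_tensors =*/ false,
                              /*param_overrides_p             =*/ nullptr,
                              /*param_tensor_buft_overrides_p =*/ nullptr);

        // after the constructor returns, the unified tensor index and metadata are available
        LLAMA_LOG_INFO("arch = %s, n_kv = %d, n_tensors = %d\n",
                       ml.get_arch_name().c_str(), (int) ml.n_kv, (int) ml.n_tensors);
    }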
720 | | |
721 | 154 | std::string llama_model_loader::get_arch_name() const { |
722 | 154 | return arch_name; |
723 | 154 | } |
724 | | |
725 | 160 | enum llm_arch llama_model_loader::get_arch() const { |
726 | 160 | return llm_kv.arch; |
727 | 160 | } |
728 | | |
729 | 0 | const llama_model_loader::llama_tensor_weight * llama_model_loader::get_weight(const char * name) const { |
730 | 0 | auto pos = weights_map.find(name); |
731 | 0 | if (pos != weights_map.end()) { |
732 | 0 | return &pos->second; |
733 | 0 | } |
734 | | |
735 | 0 | return nullptr; |
736 | 0 | } |
737 | | |
738 | 0 | const llama_model_loader::llama_tensor_weight & llama_model_loader::require_weight(const char * name) const { |
739 | 0 | const llama_tensor_weight * weight = get_weight(name); |
740 | 0 | if (!weight) { |
741 | 0 | throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name)); |
742 | 0 | } |
743 | 0 | return *weight; |
744 | 0 | } |
745 | | |
746 | 0 | struct ggml_tensor * llama_model_loader::get_tensor_meta(const char * name) const { |
747 | 0 | const auto * weight = get_weight(name); |
748 | 0 | if (!weight) { |
749 | 0 | return nullptr; |
750 | 0 | } |
751 | 0 | return weight->tensor; |
752 | 0 | } |
753 | | |
754 | 0 | struct ggml_tensor * llama_model_loader::require_tensor_meta(const std::string & name) const { |
755 | 0 | struct ggml_tensor * tensor = get_tensor_meta(name.c_str()); |
756 | 0 | if (!tensor) { |
757 | 0 | throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str())); |
758 | 0 | } |
759 | 0 | return tensor; |
760 | 0 | } |
761 | | |
762 | 0 | const struct ggml_tensor * llama_model_loader::check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const { |
763 | 0 | const struct ggml_tensor * cur = get_tensor_meta(name.c_str()); |
764 | |
765 | 0 | if (cur == NULL) { |
766 | 0 | if (!required) { |
767 | 0 | return NULL; |
768 | 0 | } |
769 | 0 | throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str())); |
770 | 0 | } |
771 | | |
772 | 0 | { |
773 | 0 | bool is_ok = true; |
774 | 0 | for (size_t i = 0; i < GGML_MAX_DIMS; ++i) { |
775 | 0 | if ((i < ne.size() && ne[i] != cur->ne[i]) || (i >= ne.size() && cur->ne[i] != 1)) { |
776 | 0 | is_ok = false; |
777 | 0 | break; |
778 | 0 | } |
779 | 0 | } |
780 | 0 | if (!is_ok) { |
781 | 0 | throw std::runtime_error( |
782 | 0 | format("%s: tensor '%s' has wrong shape; expected %s, got %s", |
783 | 0 | __func__, name.c_str(), |
784 | 0 | llama_format_tensor_shape(ne).c_str(), |
785 | 0 | llama_format_tensor_shape(cur).c_str())); |
786 | 0 | } |
787 | 0 | } |
788 | | |
789 | 0 | return cur; |
790 | 0 | } |
791 | | |
792 | 0 | struct ggml_tensor * llama_model_loader::create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags) { |
793 | 0 | LLAMA_LOG_DEBUG("%s: loading tensor %s\n", __func__, name.c_str()); |
794 | 0 | const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED)); |
795 | |
796 | 0 | if (cur == NULL) { |
797 | 0 | return NULL; |
798 | 0 | } |
799 | | |
800 | 0 | bool duplicated = flags & TENSOR_DUPLICATED; |
801 | |
802 | 0 | struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur); |
803 | 0 | ggml_set_name(tensor, ggml_get_name(cur)); |
804 | 0 | 
805 | 0 | if (duplicated) { |
806 | 0 | size_data += ggml_nbytes(cur); |
807 | 0 | } else { |
808 | 0 | n_created++; |
809 | 0 | } |
810 | 0 | 
811 | 0 | return tensor; |
812 | 0 | 
813 | 0 | } |
814 | | |
815 | 0 | struct ggml_tensor * llama_model_loader::create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list<int64_t> & ne, size_t offset, bool required) { |
816 | 0 | const struct ggml_tensor * cur = check_tensor_dims(name, ne, required); |
817 | 0 | 
818 | 0 | if (cur == NULL) { |
819 | 0 | return NULL; |
820 | 0 | } |
821 | | |
822 | 0 | if (cur->type != base->type) { |
823 | 0 | throw std::runtime_error(format("%s: tensor '%s' has wrong type; expected %s, got %s", __func__, name.c_str(), ggml_type_name(base->type), ggml_type_name(cur->type))); |
824 | 0 | } |
825 | | |
826 | 0 | std::array<int64_t, GGML_MAX_DIMS> dims; |
827 | 0 | for (size_t i = 0; i < GGML_MAX_DIMS; ++i) { |
828 | 0 | dims[i] = i < ne.size() ? ne.begin()[i] : 1; |
829 | 0 | } |
830 | 0 | 
831 | 0 | struct ggml_tensor * tensor = ggml_view_4d(ctx, base, |
832 | 0 | dims[0], dims[1], dims[2], dims[3], |
833 | 0 | cur->nb[1], cur->nb[2], cur->nb[3], |
834 | 0 | offset); |
835 | 0 | 
836 | 0 | ggml_set_name(tensor, name.c_str()); |
837 | 0 | 
838 | 0 | n_created++; |
839 | 0 | 
840 | 0 | return tensor; |
841 | 0 | } |
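Before the view is created, the requested shape is padded with 1s so it always fills all GGML_MAX_DIMS slots; the strides passed to ggml_view_4d come from the tensor found in the file, while the byte offset comes from the caller. A small sketch of just the padding step, with DEMO_MAX_DIMS standing in for GGML_MAX_DIMS (plain C++ only, no ggml):

// Pad a caller-supplied shape out to a fixed number of dimensions, defaulting
// unspecified dimensions to 1, as done before calling ggml_view_4d above.
#include <array>
#include <cstdint>
#include <cstdio>
#include <initializer_list>

constexpr size_t DEMO_MAX_DIMS = 4;

std::array<int64_t, DEMO_MAX_DIMS> demo_pad_dims(const std::initializer_list<int64_t> & ne) {
    std::array<int64_t, DEMO_MAX_DIMS> dims;
    for (size_t i = 0; i < DEMO_MAX_DIMS; ++i) {
        dims[i] = i < ne.size() ? ne.begin()[i] : 1; // unspecified dims default to 1
    }
    return dims;
}

int main() {
    const auto dims = demo_pad_dims({4096, 11008});
    std::printf("%lld %lld %lld %lld\n",
                (long long) dims[0], (long long) dims[1], (long long) dims[2], (long long) dims[3]);
    return 0;
}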
842 | | |
843 | 0 | void llama_model_loader::done_getting_tensors() const { |
844 | 0 | if (n_created != n_tensors) { |
845 | 0 | throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created)); |
846 | 0 | } |
847 | 0 | } |
848 | | |
849 | 0 | void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps) { |
850 | 0 | if (use_mmap) { |
851 | 0 | mappings.reserve(files.size()); |
852 | 0 | mmaps_used.reserve(files.size()); |
853 | 0 | for (const auto & file : files) { |
854 | 0 | bool is_numa = false; |
855 | 0 | 
856 | 0 | auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); |
857 | 0 | if (dev) { |
858 | 0 | auto * reg = ggml_backend_dev_backend_reg(dev); |
859 | 0 | auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa"); |
860 | 0 | if (is_numa_fn) { |
861 | 0 | is_numa = is_numa_fn(); |
862 | 0 | } |
863 | 0 | } |
864 | 0 | 
865 | 0 | std::unique_ptr<llama_mmap> mapping = std::make_unique<llama_mmap>(file.get(), prefetch ? -1 : 0, is_numa); |
866 | 0 | mmaps_used.emplace_back(mapping->size(), 0); |
867 | 0 | if (mlock_mmaps) { |
868 | 0 | std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock()); |
869 | 0 | mlock_mmap->init(mapping->addr()); |
870 | 0 | mlock_mmaps->emplace_back(std::move(mlock_mmap)); |
871 | 0 | } |
872 | 0 | mappings.emplace_back(std::move(mapping)); |
873 | 0 | } |
874 | 0 | } |
875 | | |
876 | | // compute the total size of all tensors for progress reporting |
877 | 0 | for (const auto & it : weights_map) { |
878 | 0 | size_data += ggml_nbytes(it.second.tensor); |
879 | 0 | } |
880 | 0 | } |
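init_mappings also seeds size_data with the byte size of every tensor so that later loading can report a fraction done. A tiny standalone sketch of that bookkeeping with made-up tensor sizes:

// Sum the byte size of every tensor once, then report size_done / size_data as
// loading proceeds. All sizes here are hypothetical.
#include <cstdio>
#include <map>
#include <string>

int main() {
    const std::map<std::string, size_t> tensor_bytes = {
        {"tok_embd.weight",     262144000},
        {"blk.0.attn_q.weight",  33554432},
        {"output.weight",       262144000},
    };

    size_t size_data = 0;
    for (const auto & it : tensor_bytes) {
        size_data += it.second;            // same role as size_data += ggml_nbytes(...)
    }

    size_t size_done = 0;
    for (const auto & it : tensor_bytes) {
        size_done += it.second;
        std::printf("progress: %.2f\n", (float) size_done / size_data);
    }
    return 0;
}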
881 | | |
882 | 0 | void llama_model_loader::get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const { |
883 | 0 | GGML_ASSERT(!mappings.empty()); |
884 | 0 | const auto & mapping = mappings.at(idx); |
885 | 0 | 
886 | 0 | *first = mapping->size(); |
887 | 0 | *last = 0; |
888 | 0 | *addr = mapping->addr(); |
889 | 0 | for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) { |
890 | 0 | const auto * weight = get_weight(ggml_get_name(tensor)); |
891 | 0 | if (!weight || weight->idx != idx) { |
892 | 0 | continue; |
893 | 0 | } |
894 | 0 | *first = std::min(*first, weight->offs); |
895 | 0 | *last = std::max(*last, weight->offs + ggml_nbytes(tensor)); |
896 | 0 | } |
897 | 0 | } |
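get_mapping_range shrinks [*first, *last) to the smallest byte range that covers all tensors belonging to one file mapping, starting from the degenerate range [size, 0). A standalone sketch of the same min/max scan over hypothetical (offset, size) pairs:

// Compute the smallest byte range [first, last) covering a set of tensors
// inside one file mapping. Offsets and sizes are made up.
#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

int main() {
    const size_t mapping_size = 1 << 30; // pretend 1 GiB mapping
    const std::vector<std::pair<size_t, size_t>> tensors = { // {offset, nbytes}
        {4096, 1024}, {1048576, 65536}, {524288, 2048},
    };

    size_t first = mapping_size; // start past the end, shrink downwards
    size_t last  = 0;            // start at zero, grow upwards
    for (const auto & t : tensors) {
        first = std::min(first, t.first);
        last  = std::max(last,  t.first + t.second);
    }

    std::printf("range: [%zu, %zu)\n", first, last);
    return 0;
}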
898 | | |
899 | 0 | void llama_model_loader::load_data_for(struct ggml_tensor * cur) const { |
900 | 0 | const auto & w = require_weight(ggml_get_name(cur)); |
901 | 0 | 
902 | 0 | if (use_mmap) { |
903 | 0 | const auto & mapping = mappings.at(w.idx); |
904 | 0 | if (cur->data == nullptr) { |
905 | 0 | cur->data = (uint8_t *)mapping->addr() + w.offs; |
906 | 0 | } else { |
907 | 0 | memcpy(cur->data, (uint8_t *)mapping->addr() + w.offs, ggml_nbytes(cur)); |
908 | 0 | } |
909 | 0 | } else { |
910 | 0 | GGML_ASSERT(cur->data != nullptr); |
911 | 0 | GGML_ASSERT(w.idx < files.size()); |
912 | 0 | const auto & file = files.at(w.idx); |
913 | 0 | file->seek(w.offs, SEEK_SET); |
914 | 0 | file->read_raw(cur->data, ggml_nbytes(cur)); |
915 | 0 | } |
916 | 0 | 
917 | 0 | if (check_tensors && !ggml_validate_row_data(cur->type, cur->data, ggml_nbytes(cur))) { |
918 | 0 | throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur))); |
919 | 0 | } |
920 | 0 | } |
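In the mmap path a tensor with no backing storage is simply pointed into the mapping, while a tensor that already has storage receives a memcpy; without mmap the bytes are read from the file at the recorded offset. A sketch of the mmap branch, with plain byte buffers standing in for llama_mmap and ggml tensors (demo_load is hypothetical):

// Either alias into the mapping (no storage yet) or copy into existing storage.
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

static const uint8_t * demo_load(const std::vector<uint8_t> & mapping, size_t offs, size_t nbytes, uint8_t * existing) {
    if (existing == nullptr) {
        return mapping.data() + offs;                          // tensor data points straight into the mapping
    }
    std::memcpy(existing, mapping.data() + offs, nbytes);      // tensor already has storage: copy the bytes
    return existing;
}

int main() {
    std::vector<uint8_t> mapping(256, 0xAB);   // pretend file mapping
    std::vector<uint8_t> storage(16);          // pretend pre-allocated tensor storage

    const uint8_t * aliased = demo_load(mapping, 64, 16, nullptr);
    const uint8_t * copied  = demo_load(mapping, 64, 16, storage.data());

    std::printf("aliased: 0x%02X, copied: 0x%02X\n", aliased[0], copied[0]);
    return 0;
}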
921 | | |
922 | | bool llama_model_loader::load_all_data( |
923 | | struct ggml_context * ctx, |
924 | | llama_buf_map & bufs, |
925 | | llama_mlocks * lmlocks, |
926 | | llama_progress_callback progress_callback, |
927 | 0 | void * progress_callback_user_data) { |
928 | 0 | GGML_ASSERT(size_data != 0 && "call init_mappings() first"); |
929 | 0 | 
930 | 0 | std::vector<no_init<uint8_t>> read_buf; |
931 | 0 | std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result; |
932 | | |
933 | | // 4 staging buffers for async uploads, each sized 1MB, seem to be a good default for single NVMe drives.
934 | | // NVMe raid configurations might require more / larger buffers. |
935 | 0 | constexpr size_t n_buffers = 4; |
936 | 0 | constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB |
937 | 0 | 
938 | 0 | std::vector<ggml_backend_buffer_t> host_buffers; |
939 | 0 | std::vector<ggml_backend_event_t> events; |
940 | 0 | std::vector<void *> host_ptrs; |
941 | 0 | size_t buffer_idx = 0; // buffer to use for async loads |
942 | 0 | ggml_backend_t upload_backend = [&](const char * func) -> ggml_backend_t { |
943 | 0 | if (use_mmap || check_tensors) { |
944 | 0 | return nullptr; |
945 | 0 | } |
946 | | // When not using mmapped I/O, use async uploads from pinned memory to GPU memory.
947 | | // First determine if the backend supports the necessary features for async uploads. |
948 | 0 | auto * buf = bufs.count(0) ? bufs.at(0) : nullptr; |
949 | 0 | if (!buf) { |
950 | 0 | LLAMA_LOG_DEBUG("%s: no buffer found for async uploads\n", func); |
951 | 0 | return nullptr; |
952 | 0 | } |
953 | | |
954 | 0 | auto * buft = ggml_backend_buffer_get_type(buf); |
955 | 0 | auto * dev = ggml_backend_buft_get_device(buft); |
956 | 0 | if (!dev) { |
957 | 0 | LLAMA_LOG_DEBUG("%s: no device found for buffer type %s for async uploads\n", func, |
958 | 0 | ggml_backend_buft_name(buft)); |
959 | 0 | return nullptr; |
960 | 0 | } |
961 | | |
962 | 0 | if (buft != ggml_backend_dev_buffer_type(dev)) { |
963 | 0 | LLAMA_LOG_DEBUG("%s: buffer type %s is not the default buffer type for device %s for async uploads\n", func, |
964 | 0 | ggml_backend_buft_name(buft), ggml_backend_dev_name(dev)); |
965 | 0 | return nullptr; |
966 | 0 | } |
967 | | |
968 | 0 | ggml_backend_dev_props props; |
969 | 0 | ggml_backend_dev_get_props(dev, &props); |
970 | 0 | if (!props.caps.async || !props.caps.host_buffer || !props.caps.events) { |
971 | 0 | LLAMA_LOG_DEBUG("%s: device %s does not support async, host buffers or events\n", func, |
972 | 0 | ggml_backend_dev_name(dev)); |
973 | 0 | return nullptr; |
974 | 0 | } |
975 | | |
976 | 0 | auto * host_buft = ggml_backend_dev_host_buffer_type(dev); |
977 | 0 | if (!host_buft) { |
978 | 0 | LLAMA_LOG_DEBUG("%s: no host buffer type found for device %s\n", func, |
979 | 0 | ggml_backend_dev_name(dev)); |
980 | 0 | return nullptr; |
981 | 0 | } |
982 | | |
983 | | // If the backend is supported, create pinned memory buffers and events for synchronisation. |
984 | 0 | for (size_t idx = 0; idx < n_buffers; ++idx) { |
985 | 0 | auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size); |
986 | 0 | if (!buf) { |
987 | 0 | LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func, |
988 | 0 | ggml_backend_dev_name(dev)); |
989 | 0 | return nullptr; |
990 | 0 | } |
991 | | |
992 | 0 | host_buffers.emplace_back(buf); |
993 | 0 | host_ptrs.emplace_back(ggml_backend_buffer_get_base(buf)); |
994 | 0 | 
995 | 0 | auto * event = ggml_backend_event_new(dev); |
996 | 0 | if (!event) { |
997 | 0 | LLAMA_LOG_DEBUG("%s: failed to create event for async uploads for device %s\n", func, |
998 | 0 | ggml_backend_dev_name(dev)); |
999 | 0 | return nullptr; |
1000 | 0 | } |
1001 | | |
1002 | 0 | events.emplace_back(event); |
1003 | 0 | } |
1004 | | |
1005 | 0 | ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); |
1006 | 0 | if (!backend) { |
1007 | 0 | LLAMA_LOG_DEBUG("%s: failed to initialize backend for device %s for async uploads\n", func, |
1008 | 0 | ggml_backend_dev_name(dev)); |
1009 | 0 | return nullptr; |
1010 | 0 | } |
1011 | | |
1012 | 0 | return backend; |
1013 | 0 | }(__func__); |
1014 | 0 | 
1015 | 0 | if (upload_backend) { |
1016 | 0 | LLAMA_LOG_DEBUG("%s: using async uploads for device %s, buffer type %s, backend %s\n", __func__, |
1017 | 0 | ggml_backend_dev_name(ggml_backend_get_device(upload_backend)), |
1018 | 0 | ggml_backend_buft_name(ggml_backend_buffer_get_type(bufs.at(0))), |
1019 | 0 | ggml_backend_name(upload_backend)); |
1020 | 0 | } |
1021 | 0 | 
1022 | 0 | for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) { |
1023 | 0 | const auto * weight = get_weight(ggml_get_name(cur)); |
1024 | 0 | if (weight == nullptr) { |
1025 | | // this can happen with split experts models |
1026 | 0 | continue; |
1027 | 0 | } |
1028 | | |
1029 | 0 | if (progress_callback) { |
1030 | 0 | if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) { |
1031 | 0 | return false; |
1032 | 0 | } |
1033 | 0 | } |
1034 | | |
1035 | 0 | size_t n_size = ggml_nbytes(cur); |
1036 | 0 | 
1037 | 0 | if (use_mmap) { |
1038 | 0 | const auto & mapping = mappings.at(weight->idx); |
1039 | 0 | ggml_backend_buffer_t buf_mmap = nullptr; |
1040 | 0 | if (bufs.count(weight->idx)) { |
1041 | 0 | buf_mmap = bufs.at(weight->idx); |
1042 | 0 | } |
1043 | 0 | uint8_t * data = (uint8_t *) mapping->addr() + weight->offs; |
1044 | 0 | 
1045 | 0 | if (check_tensors) { |
1046 | 0 | validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] { |
1047 | 0 | return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size)); |
1048 | 0 | })); |
1049 | 0 | } |
1050 | 0 | 
1051 | 0 | GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated |
1052 | 0 | if (buf_mmap && cur->data == nullptr) { |
1053 | 0 | ggml_backend_tensor_alloc(buf_mmap, cur, data); |
1054 | 0 | if (lmlocks) { |
1055 | 0 | const auto & lmlock = lmlocks->at(weight->idx); |
1056 | 0 | lmlock->grow_to(weight->offs + n_size); |
1057 | 0 | } |
1058 | 0 | 
1059 | 0 | auto & mmap_used = mmaps_used[weight->idx]; |
1060 | 0 | mmap_used.first = std::min(mmap_used.first, weight->offs); |
1061 | 0 | mmap_used.second = std::max(mmap_used.second, weight->offs + n_size); |
1062 | 0 | } else { |
1063 | 0 | ggml_backend_tensor_set(cur, data, 0, n_size); |
1064 | 0 | } |
1065 | 0 | } else { |
1066 | 0 | const auto & file = files.at(weight->idx); |
1067 | 0 | if (ggml_backend_buffer_is_host(cur->buffer)) { |
1068 | 0 | file->seek(weight->offs, SEEK_SET); |
1069 | 0 | file->read_raw(cur->data, n_size); |
1070 | 0 | if (check_tensors) { |
1071 | 0 | validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] { |
1072 | 0 | return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size)); |
1073 | 0 | })); |
1074 | 0 | } |
1075 | 0 | } else { |
1076 | | // If upload_backend is valid, load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
1077 | 0 | if (upload_backend) { |
1078 | 0 | file->seek(weight->offs, SEEK_SET); |
1079 | 0 | 
1080 | 0 | size_t bytes_read = 0; |
1081 | 0 | 
1082 | 0 | while (bytes_read < n_size) { |
1083 | 0 | size_t read_iteration = std::min<size_t>(buffer_size, n_size - bytes_read); |
1084 | 0 | 
1085 | 0 | ggml_backend_event_synchronize(events[buffer_idx]); |
1086 | 0 | file->read_raw(host_ptrs[buffer_idx], read_iteration); |
1087 | 0 | ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration); |
1088 | 0 | ggml_backend_event_record(events[buffer_idx], upload_backend); |
1089 | 0 | 
1090 | 0 | bytes_read += read_iteration; |
1091 | 0 | ++buffer_idx; |
1092 | 0 | buffer_idx %= n_buffers; |
1093 | 0 | } |
1094 | 0 | } else { |
1095 | 0 | read_buf.resize(n_size); |
1096 | 0 | file->seek(weight->offs, SEEK_SET); |
1097 | 0 | file->read_raw(read_buf.data(), n_size); |
1098 | 0 | ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size); |
1099 | 0 | if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) { |
1100 | 0 | throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur))); |
1101 | 0 | } |
1102 | 0 | } |
1103 | 0 | } |
1104 | 0 | } |
1105 | | |
1106 | 0 | size_done += n_size; |
1107 | 0 | } |
1108 | | |
1109 | | // free temporary resources used for async uploads |
1110 | 0 | for (auto * event : events) { |
1111 | 0 | ggml_backend_event_synchronize(event); |
1112 | 0 | ggml_backend_event_free(event); |
1113 | 0 | } |
1114 | 0 | for (auto * buf : host_buffers) { |
1115 | 0 | ggml_backend_buffer_free(buf); |
1116 | 0 | } |
1117 | 0 | ggml_backend_free(upload_backend); |
1118 | | |
1119 | | // check validation results |
1120 | 0 | bool validation_failed = false; |
1121 | 0 | for (auto & future : validation_result) { |
1122 | 0 | auto result = future.get(); |
1123 | 0 | if (!result.second) { |
1124 | 0 | LLAMA_LOG_ERROR("%s: tensor '%s' has invalid data\n", __func__, ggml_get_name(result.first)); |
1125 | 0 | validation_failed = true; |
1126 | 0 | } |
1127 | 0 | } |
1128 | 0 | if (validation_failed) { |
1129 | 0 | throw std::runtime_error("found tensors with invalid data"); |
1130 | 0 | } |
1131 | | |
1132 | | // check if this is the last call and do final cleanup |
1133 | 0 | if (size_done >= size_data) { |
1134 | | // unmap offloaded tensors and metadata |
1135 | 0 | if (use_mmap) { |
1136 | 0 | for (uint32_t idx = 0; idx < mappings.size(); idx++) { |
1137 | 0 | const auto & mmap_used = mmaps_used.at(idx); |
1138 | 0 | auto & mapping = mappings.at(idx); |
1139 | 0 | mapping->unmap_fragment(0, mmap_used.first); |
1140 | 0 | if (mmap_used.second != 0) { |
1141 | 0 | mapping->unmap_fragment(mmap_used.second, mapping->size()); |
1142 | 0 | } |
1143 | 0 | } |
1144 | 0 | } |
1145 | 0 | if (progress_callback) { |
1146 | | // Even though the model is done loading, we still honor |
1147 | | // cancellation since we need to free allocations. |
1148 | 0 | return progress_callback(1.0f, progress_callback_user_data); |
1149 | 0 | } |
1150 | 0 | } |
1151 | | |
1152 | 0 | return true; |
1153 | 0 | } |
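The async upload path streams each tensor through a small ring of pinned staging buffers: wait for the buffer's previous upload to finish, read the next chunk from the file into it, issue the asynchronous copy to the device, record an event, and move on to the next buffer. A standalone sketch of that round-robin chunking, with memcpy standing in for both the file read and the device upload and the event calls reduced to comments:

// Round-robin staging-buffer loop, as in load_all_data's async path. The buffer
// count and size mirror the constants above but carry no special meaning here.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

int main() {
    constexpr size_t n_buffers   = 4;
    constexpr size_t buffer_size = 1024;              // 1 KiB chunks for the demo

    std::vector<uint8_t> file_data(10000, 0x5A);      // pretend tensor bytes on disk
    std::vector<uint8_t> gpu_dest(file_data.size());  // pretend device allocation
    std::vector<std::vector<uint8_t>> staging(n_buffers, std::vector<uint8_t>(buffer_size));

    size_t bytes_read = 0;
    size_t buffer_idx = 0;
    while (bytes_read < file_data.size()) {
        const size_t chunk = std::min(buffer_size, file_data.size() - bytes_read);

        // real code: ggml_backend_event_synchronize(events[buffer_idx]);
        std::memcpy(staging[buffer_idx].data(), file_data.data() + bytes_read, chunk); // file->read_raw
        std::memcpy(gpu_dest.data() + bytes_read, staging[buffer_idx].data(), chunk);  // async tensor upload
        // real code: ggml_backend_event_record(events[buffer_idx], upload_backend);

        bytes_read += chunk;
        buffer_idx  = (buffer_idx + 1) % n_buffers;
    }

    std::printf("uploaded %zu bytes in %zu-byte chunks\n", bytes_read, buffer_size);
    return 0;
}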
1154 | | |
1155 | 0 | std::string llama_model_loader::ftype_name() const { |
1156 | 0 | return llama_model_ftype_name(ftype); |
1157 | 0 | } |
1158 | | |
1159 | 157 | void llama_model_loader::print_info() const { |
1160 | 157 | LLAMA_LOG_INFO("%s: file format = %s\n", __func__, llama_file_version_name(fver)); |
1161 | 157 | LLAMA_LOG_INFO("%s: file type = %s\n", __func__, llama_model_ftype_name(ftype).c_str()); |
1162 | 157 | if (n_bytes < GiB) { |
1163 | 157 | LLAMA_LOG_INFO("%s: file size = %.2f MiB (%.2f BPW) \n", __func__, n_bytes/1024.0/1024.0, n_bytes*8.0/n_elements); |
1164 | 157 | } else { |
1165 | 0 | LLAMA_LOG_INFO("%s: file size = %.2f GiB (%.2f BPW) \n", __func__, n_bytes/1024.0/1024.0/1024.0, n_bytes*8.0/n_elements); |
1166 | 0 | } |
1167 | 157 | } |
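The BPW figure printed here is simply the file size in bits divided by the total element count. A short worked sketch with made-up numbers:

// Worked example of the bits-per-weight calculation above; both inputs are hypothetical.
#include <cstdio>

int main() {
    const double n_bytes    = 4.08e9;   // pretend 4.08 GB file
    const double n_elements = 7.24e9;   // pretend 7.24 B parameters

    const double bpw = n_bytes * 8.0 / n_elements;
    std::printf("file size = %.2f GiB (%.2f BPW)\n", n_bytes / 1024.0 / 1024.0 / 1024.0, bpw);
    return 0;
}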