/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp
Line | Count | Source |
#include "ggml-backend.h"
#include "ggml-backend-impl.h"
#include "ggml-cpu.h"
#include "repack.h"
#include "traits.h"
#include "ggml-impl.h"
#include "amx/amx.h"

#include <cctype>
#include <new>
#include <string>
#include <vector>
12 | | |
13 | | #ifdef GGML_USE_CPU_HBM |
14 | | # include "hbm.h" |
15 | | #endif |
16 | | |
17 | | #ifdef GGML_USE_CPU_KLEIDIAI |
18 | | # include "kleidiai/kleidiai.h" |
19 | | #endif |
20 | | |
21 | | #ifdef GGML_USE_CPU_RISCV64_SPACEMIT |
22 | | # include "spacemit/ime.h" |
23 | | #endif |
24 | | |
25 | | #if defined(_WIN32) |
26 | | # define WIN32_LEAN_AND_MEAN |
27 | | # ifndef NOMINMAX |
28 | | # define NOMINMAX |
29 | | # endif |
30 | | # include <windows.h> |
31 | | #else |
32 | | # include <unistd.h> |
33 | | #endif |
34 | | |
35 | | #if defined(__APPLE__) |
36 | | # include <sys/sysctl.h> |
37 | | # include <sys/types.h> |
38 | | #endif |
39 | | |
40 | | // ggml-backend interface |
41 | | |
42 | 0 | std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffer_types() { |
43 | 0 | static std::vector<ggml_backend_buffer_type_t> bufts = []() { |
44 | 0 | std::vector<ggml_backend_buffer_type_t> bufts; |
45 | |
|
46 | | #if defined(__AMX_INT8__) && defined(__AVX512VNNI__) |
47 | | if (ggml_backend_amx_buffer_type()) { |
48 | | bufts.push_back(ggml_backend_amx_buffer_type()); |
49 | | } |
50 | | #endif |
51 | |
|
52 | | #ifdef GGML_USE_CPU_RISCV64_SPACEMIT |
53 | | if (ggml_backend_cpu_riscv64_spacemit_buffer_type()) { |
54 | | bufts.push_back(ggml_backend_cpu_riscv64_spacemit_buffer_type()); |
55 | | } |
56 | | #endif |
57 | |
|
58 | | #ifdef GGML_USE_CPU_KLEIDIAI |
59 | | if (ggml_backend_cpu_kleidiai_buffer_type()) { |
60 | | bufts.push_back(ggml_backend_cpu_kleidiai_buffer_type()); |
61 | | } |
62 | | #endif |
63 | |
|
64 | 0 | #ifdef GGML_USE_CPU_REPACK |
65 | 0 | if (ggml_backend_cpu_repack_buffer_type()) { |
66 | 0 | bufts.push_back(ggml_backend_cpu_repack_buffer_type()); |
67 | 0 | } |
68 | 0 | #endif |
69 | |
|
70 | 0 | return bufts; |
71 | 0 | }(); |
72 | |
|
73 | 0 | return bufts; |
74 | 0 | } |
75 | | |
76 | 0 | static ggml_backend_buffer_type_t * ggml_backend_cpu_device_get_extra_buffers_type(ggml_backend_dev_t device) { |
77 | 0 | static std::vector<ggml_backend_buffer_type_t> extra_bufts = [] { |
78 | 0 | std::vector<ggml_backend_buffer_type_t> bufts = ggml_backend_cpu_get_extra_buffer_types(); |
79 | 0 | bufts.push_back(nullptr); |
80 | 0 | return bufts; |
81 | 0 | }(); |
82 | |
|
83 | 0 | return extra_bufts.data(); |
84 | | |
85 | 0 | GGML_UNUSED(device); |
86 | 0 | } |
87 | | |
88 | 0 | static bool ggml_backend_cpu_is_extra_buffer_type(ggml_backend_buffer_type_t buft) { |
89 | 0 | for (auto * extra : ggml_backend_cpu_get_extra_buffer_types()) { |
90 | 0 | if (extra == buft) { |
91 | 0 | return true; |
92 | 0 | } |
93 | 0 | } |
94 | 0 | return false; |
95 | 0 | } |
96 | | |
97 | | // CPU backend - backend (stream) |
98 | | |
// Per-stream state for the CPU backend.
struct ggml_backend_cpu_context {
    int n_threads;                // number of threads used for graph compute
    ggml_threadpool_t threadpool; // optional threadpool; NULL -> default threading

    uint8_t * work_data;          // reusable scratch buffer (owned; freed with delete[])
    size_t work_size;             // current capacity of work_data in bytes

    ggml_abort_callback abort_callback; // optional callback to abort graph compute
    void * abort_callback_data;         // user data passed to abort_callback
};
109 | | |
110 | 0 | static const char * ggml_backend_cpu_get_name(ggml_backend_t backend) { |
111 | 0 | return "CPU"; |
112 | | |
113 | 0 | GGML_UNUSED(backend); |
114 | 0 | } |
115 | | |
116 | 0 | static void ggml_backend_cpu_free(ggml_backend_t backend) { |
117 | 0 | struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; |
118 | 0 | delete[] cpu_ctx->work_data; |
119 | 0 | delete cpu_ctx; |
120 | 0 | delete backend; |
121 | 0 | } |
122 | | |
// Precomputed plan for executing one graph on the CPU backend.
struct ggml_backend_plan_cpu {
    struct ggml_cplan cplan;   // compute plan: thread count, work buffer, abort callback
    struct ggml_cgraph cgraph; // shallow copy of the graph to execute
};
127 | | |
128 | 0 | static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) { |
129 | 0 | struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; |
130 | |
|
131 | 0 | struct ggml_backend_plan_cpu * cpu_plan = new ggml_backend_plan_cpu; |
132 | |
|
133 | 0 | cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool); |
134 | 0 | cpu_plan->cgraph = *cgraph; // FIXME: deep copy |
135 | |
|
136 | 0 | if (cpu_plan->cplan.work_size > 0) { |
137 | 0 | cpu_plan->cplan.work_data = new uint8_t[cpu_plan->cplan.work_size]; |
138 | 0 | if (cpu_plan->cplan.work_data == NULL) { |
139 | 0 | delete cpu_plan; |
140 | 0 | return NULL; |
141 | 0 | } |
142 | 0 | } |
143 | | |
144 | 0 | cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback; |
145 | 0 | cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data; |
146 | |
|
147 | 0 | return cpu_plan; |
148 | 0 | } |
149 | | |
150 | 0 | static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { |
151 | 0 | struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan; |
152 | |
|
153 | 0 | delete[] cpu_plan->cplan.work_data; |
154 | 0 | delete cpu_plan; |
155 | |
|
156 | 0 | GGML_UNUSED(backend); |
157 | 0 | } |
158 | | |
159 | 0 | static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { |
160 | 0 | struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan; |
161 | |
|
162 | 0 | return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan); |
163 | | |
164 | 0 | GGML_UNUSED(backend); |
165 | 0 | } |
166 | | |
167 | 0 | static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { |
168 | 0 | struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; |
169 | |
|
170 | 0 | struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool); |
171 | |
|
172 | 0 | if (cpu_ctx->work_size < cplan.work_size) { |
173 | 0 | delete[] cpu_ctx->work_data; |
174 | 0 | cpu_ctx->work_data = new uint8_t[cplan.work_size]; |
175 | 0 | if (cpu_ctx->work_data == NULL) { |
176 | 0 | cpu_ctx->work_size = 0; |
177 | 0 | return GGML_STATUS_ALLOC_FAILED; |
178 | 0 | } |
179 | 0 | cpu_ctx->work_size = cplan.work_size; |
180 | 0 | } |
181 | 0 | cplan.work_data = (uint8_t *)cpu_ctx->work_data; |
182 | |
|
183 | 0 | cplan.abort_callback = cpu_ctx->abort_callback; |
184 | 0 | cplan.abort_callback_data = cpu_ctx->abort_callback_data; |
185 | |
|
186 | 0 | return ggml_graph_compute(cgraph, &cplan); |
187 | 0 | } |
188 | | |
// Stream-interface vtable for the CPU backend; optional entry points the
// CPU backend does not implement (async transfers, events) are left NULL.
static const struct ggml_backend_i ggml_backend_cpu_i = {
    /* .get_name                = */ ggml_backend_cpu_get_name,
    /* .free                    = */ ggml_backend_cpu_free,
    /* .set_tensor_async        = */ NULL,
    /* .get_tensor_async        = */ NULL,
    /* .cpy_tensor_async        = */ NULL,
    /* .synchronize             = */ NULL,
    /* .graph_plan_create       = */ ggml_backend_cpu_graph_plan_create,
    /* .graph_plan_free         = */ ggml_backend_cpu_graph_plan_free,
    /* .graph_plan_update       = */ NULL,
    /* .graph_plan_compute      = */ ggml_backend_cpu_graph_plan_compute,
    /* .graph_compute           = */ ggml_backend_cpu_graph_compute,
    /* .event_record            = */ NULL,
    /* .event_wait              = */ NULL,
    /* .graph_optimize          = */ NULL,
};
205 | | |
// Returns the stable GUID identifying the CPU backend.
static ggml_guid_t ggml_backend_cpu_guid(void) {
    static ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 };
    return &guid;
}
210 | | |
211 | 0 | ggml_backend_t ggml_backend_cpu_init(void) { |
212 | | // initialize CPU backend now to avoid slowing the first graph computation |
213 | 0 | ggml_cpu_init(); |
214 | |
|
215 | 0 | struct ggml_backend_cpu_context * ctx = new ggml_backend_cpu_context; |
216 | 0 | if (ctx == NULL) { |
217 | 0 | return NULL; |
218 | 0 | } |
219 | | |
220 | 0 | ctx->n_threads = GGML_DEFAULT_N_THREADS; |
221 | 0 | ctx->threadpool = NULL; |
222 | 0 | ctx->work_data = NULL; |
223 | 0 | ctx->work_size = 0; |
224 | 0 | ctx->abort_callback = NULL; |
225 | 0 | ctx->abort_callback_data = NULL; |
226 | |
|
227 | 0 | ggml_backend_t cpu_backend = new ggml_backend { |
228 | 0 | /* .guid = */ ggml_backend_cpu_guid(), |
229 | 0 | /* .iface = */ ggml_backend_cpu_i, |
230 | 0 | /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), |
231 | 0 | /* .context = */ ctx, |
232 | 0 | }; |
233 | |
|
234 | 0 | if (cpu_backend == NULL) { |
235 | 0 | delete ctx; |
236 | 0 | return NULL; |
237 | 0 | } |
238 | | |
239 | 0 | return cpu_backend; |
240 | 0 | } |
241 | | |
242 | 0 | bool ggml_backend_is_cpu(ggml_backend_t backend) { |
243 | 0 | return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cpu_guid()); |
244 | 0 | } |
245 | | |
246 | 0 | void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) { |
247 | 0 | GGML_ASSERT(ggml_backend_is_cpu(backend_cpu)); |
248 | |
|
249 | 0 | struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context; |
250 | 0 | ctx->n_threads = n_threads; |
251 | 0 | } |
252 | | |
253 | 0 | void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) { |
254 | 0 | GGML_ASSERT(ggml_backend_is_cpu(backend_cpu)); |
255 | |
|
256 | 0 | struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context; |
257 | |
|
258 | 0 | if (ctx->threadpool && ctx->threadpool != threadpool) { |
259 | | // already had a different threadpool, pause/suspend it before switching |
260 | 0 | ggml_threadpool_pause(ctx->threadpool); |
261 | 0 | } |
262 | 0 | ctx->threadpool = threadpool; |
263 | 0 | } |
264 | | |
265 | 0 | void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) { |
266 | 0 | GGML_ASSERT(ggml_backend_is_cpu(backend_cpu)); |
267 | |
|
268 | 0 | struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context; |
269 | 0 | ctx->abort_callback = abort_callback; |
270 | 0 | ctx->abort_callback_data = abort_callback_data; |
271 | 0 | } |
272 | | |
273 | | // CPU backend - device |
274 | | |
// Device-level context: holds a human-readable CPU description obtained
// from the OS at construction time ("CPU" if it cannot be determined).
struct ggml_backend_cpu_device_context {
    std::string description = "CPU";

    ggml_backend_cpu_device_context() {
#ifdef __APPLE__
        size_t len = 0;
        if (!sysctlbyname("machdep.cpu.brand_string", NULL, &len, NULL, 0)) {
            description.resize(len);
            sysctlbyname("machdep.cpu.brand_string", &description[0], &len, NULL, 0); // NOLINT
        }
#elif defined(__linux__)
        FILE * f = fopen("/proc/cpuinfo", "r");
        if (f) {
            char buf[1024];
            while (fgets(buf, sizeof(buf), f)) {
                // look for the "model name" line and take the text after ':'
                if (strncmp(buf, "model name", 10) == 0) {
                    char * p = strchr(buf, ':');
                    if (p) {
                        p++;
                        // trim leading whitespace; cast to unsigned char because
                        // passing a negative char to isspace is undefined behavior
                        while (std::isspace((unsigned char) *p)) {
                            p++;
                        }
                        // trim trailing whitespace; track the length explicitly so
                        // an all-whitespace value cannot underflow strlen(p) - 1
                        size_t n = strlen(p);
                        while (n > 0 && std::isspace((unsigned char) p[n - 1])) {
                            p[--n] = '\0';
                        }
                        description = p;
                        break;
                    }
                }
            }
            fclose(f);
        }
#elif defined(_WIN32)
        HKEY hKey;
        if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
                         TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
                         0,
                         KEY_READ,
                         &hKey) == ERROR_SUCCESS) {
            DWORD cpu_brand_size = 0;
            if (RegQueryValueExA(hKey,
                                 "ProcessorNameString",
                                 NULL,
                                 NULL,
                                 NULL,
                                 &cpu_brand_size) == ERROR_SUCCESS) {
                description.resize(cpu_brand_size);
                if (RegQueryValueExA(hKey,
                                     "ProcessorNameString",
                                     NULL,
                                     NULL,
                                     (LPBYTE)&description[0], // NOLINT
                                     &cpu_brand_size) == ERROR_SUCCESS) {
                    // drop the embedded NUL terminator the registry returns
                    if (description.find('\0') != std::string::npos) {
                        description.resize(description.find('\0'));
                    }
                }
            }
            RegCloseKey(hKey);
        }
#endif
    }
};
338 | | |
339 | 0 | static const char * ggml_backend_cpu_device_get_name(ggml_backend_dev_t dev) { |
340 | 0 | return "CPU"; |
341 | | |
342 | 0 | GGML_UNUSED(dev); |
343 | 0 | } |
344 | | |
345 | 0 | static const char * ggml_backend_cpu_device_get_description(ggml_backend_dev_t dev) { |
346 | 0 | struct ggml_backend_cpu_device_context * ctx = (struct ggml_backend_cpu_device_context *)dev->context; |
347 | |
|
348 | 0 | return ctx->description.c_str(); |
349 | 0 | } |
350 | | |
351 | 0 | static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { |
352 | | #ifdef _WIN32 |
353 | | MEMORYSTATUSEX status; |
354 | | status.dwLength = sizeof(status); |
355 | | GlobalMemoryStatusEx(&status); |
356 | | *total = status.ullTotalPhys; |
357 | | *free = status.ullAvailPhys; |
358 | | #else |
359 | 0 | long pages = sysconf(_SC_PHYS_PAGES); |
360 | 0 | long page_size = sysconf(_SC_PAGE_SIZE); |
361 | 0 | *total = pages * page_size; |
362 | | |
363 | | // "free" system memory is ill-defined, for practical purposes assume that all of it is free: |
364 | 0 | *free = *total; |
365 | 0 | #endif // _WIN32 |
366 | |
|
367 | 0 | GGML_UNUSED(dev); |
368 | 0 | } |
369 | | |
370 | 872 | static enum ggml_backend_dev_type ggml_backend_cpu_device_get_type(ggml_backend_dev_t dev) { |
371 | 872 | return GGML_BACKEND_DEVICE_TYPE_CPU; |
372 | | |
373 | 0 | GGML_UNUSED(dev); |
374 | 0 | } |
375 | | |
376 | 0 | static void ggml_backend_cpu_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { |
377 | 0 | props->name = ggml_backend_cpu_device_get_name(dev); |
378 | 0 | props->description = ggml_backend_cpu_device_get_description(dev); |
379 | 0 | props->type = ggml_backend_cpu_device_get_type(dev); |
380 | 0 | ggml_backend_cpu_device_get_memory(dev, &props->memory_free, &props->memory_total); |
381 | 0 | props->caps = { |
382 | 0 | /* .async = */ false, |
383 | 0 | /* .host_buffer = */ false, |
384 | 0 | /* .buffer_from_host_ptr = */ true, |
385 | 0 | /* .events = */ false, |
386 | 0 | }; |
387 | 0 | } |
388 | | |
389 | 0 | static ggml_backend_t ggml_backend_cpu_device_init_backend(ggml_backend_dev_t dev, const char * params) { |
390 | 0 | return ggml_backend_cpu_init(); |
391 | | |
392 | 0 | GGML_UNUSED(dev); |
393 | 0 | GGML_UNUSED(params); |
394 | 0 | } |
395 | | |
396 | 0 | static ggml_backend_buffer_type_t ggml_backend_cpu_device_get_buffer_type(ggml_backend_dev_t dev) { |
397 | 0 | return ggml_backend_cpu_buffer_type(); |
398 | | |
399 | 0 | GGML_UNUSED(dev); |
400 | 0 | } |
401 | | |
402 | 0 | static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { |
403 | 0 | return ggml_backend_cpu_buffer_from_ptr(ptr, size); |
404 | | |
405 | 0 | GGML_UNUSED(dev); |
406 | 0 | GGML_UNUSED(max_tensor_size); |
407 | 0 | } |
408 | | |
// Reports whether the CPU backend can execute op.
// If any of the first sources lives in an extra buffer type, the decision
// is delegated to that buffer type; otherwise only a few ops have
// type-specific restrictions and everything else is supported.
static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
    const struct ggml_tensor * src0 = op->src[0];
    const struct ggml_tensor * src1 = op->src[1];

    // layout-only / no-op operations are always supported
    if (op->op == GGML_OP_NONE || op->op == GGML_OP_RESHAPE || op->op == GGML_OP_VIEW || op->op == GGML_OP_PERMUTE || op->op == GGML_OP_TRANSPOSE) {
        return true;
    }

    // check extra buffer types
    // note: only the first sources are checked for extra buffer types to reduce overhead, increase if necessary
    for (int i = 0; i < 4; i++) {
        if (op->src[i] && op->src[i]->buffer &&
            ggml_backend_cpu_is_extra_buffer_type(op->src[i]->buffer->buft)) {
            auto * buf_extra = (ggml::cpu::extra_buffer_type *) op->src[i]->buffer->buft->context;
            return buf_extra->supports_op(dev, op);
        }
    }

    switch (op->op) {
        case GGML_OP_CPY:
        case GGML_OP_SET_ROWS:
            // these ops need a from_float conversion for the destination type
            return
                op->type != GGML_TYPE_IQ3_XXS &&
                op->type != GGML_TYPE_IQ3_S &&
                op->type != GGML_TYPE_IQ2_XXS &&
                op->type != GGML_TYPE_IQ2_XS &&
                op->type != GGML_TYPE_IQ2_S &&
                op->type != GGML_TYPE_IQ1_S &&
                op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
        case GGML_OP_MUL_MAT:
            // src1 must be F32 or already in src0's vec_dot type
            return src1->type == GGML_TYPE_F32 || src1->type == ggml_get_type_traits_cpu(src0->type)->vec_dot_type;
        case GGML_OP_SOFT_MAX_BACK: {
            if (op->src[0]->type != GGML_TYPE_F32 || op->src[1]->type != GGML_TYPE_F32) {
                return false;
            }
            float max_bias = 0.0f;

            // max_bias is the second float in op_params; only 0 is supported
            memcpy(&max_bias, (const float *) op->op_params + 1, sizeof(float));

            return max_bias == 0.0f;
        }
        case GGML_OP_IM2COL_BACK:
            return src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32;
        case GGML_OP_GET_ROWS_BACK:
            return src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16;
        case GGML_OP_OUT_PROD:
            // quantized src0 requires matching outer dims with src1
            return (src0->type == GGML_TYPE_F32 || (ggml_is_quantized(src0->type) && src0->ne[2] == src1->ne[2] && src0->ne[3] == src1->ne[3])) &&
                src1->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
        default:
            return true;
    }
}
461 | | |
462 | 0 | static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { |
463 | 0 | return ggml_backend_buft_is_host(buft) || ggml_backend_cpu_is_extra_buffer_type(buft); |
464 | 0 | GGML_UNUSED(dev); |
465 | 0 | } |
466 | | |
// Device-interface vtable for the CPU device; unimplemented optional
// entry points (host buffers, offload, events) are left NULL.
static const struct ggml_backend_device_i ggml_backend_cpu_device_i = {
    /* .get_name             = */ ggml_backend_cpu_device_get_name,
    /* .get_description      = */ ggml_backend_cpu_device_get_description,
    /* .get_memory           = */ ggml_backend_cpu_device_get_memory,
    /* .get_type             = */ ggml_backend_cpu_device_get_type,
    /* .get_props            = */ ggml_backend_cpu_device_get_props,
    /* .init_backend         = */ ggml_backend_cpu_device_init_backend,
    /* .get_buffer_type      = */ ggml_backend_cpu_device_get_buffer_type,
    /* .get_host_buffer_type = */ NULL,
    /* .buffer_from_host_ptr = */ ggml_backend_cpu_device_buffer_from_host_ptr,
    /* .supports_op          = */ ggml_backend_cpu_device_supports_op,
    /* .supports_buft        = */ ggml_backend_cpu_device_supports_buft,
    /* .offload_op           = */ NULL,
    /* .event_new            = */ NULL,
    /* .event_free           = */ NULL,
    /* .event_synchronize    = */ NULL,
};
484 | | |
485 | | // CPU backend - backend (reg) |
486 | | |
487 | 0 | static const char * ggml_backend_cpu_reg_get_name(ggml_backend_reg_t reg) { |
488 | 0 | return "CPU"; |
489 | | |
490 | 0 | GGML_UNUSED(reg); |
491 | 0 | } |
492 | | |
493 | 6 | static size_t ggml_backend_cpu_reg_get_device_count(ggml_backend_reg_t reg) { |
494 | 6 | return 1; |
495 | | |
496 | 0 | GGML_UNUSED(reg); |
497 | 0 | } |
498 | | |
// Returns the singleton CPU device; only index 0 is valid.
// Both the device context and the device object are function-local statics,
// so they are created once (thread-safe since C++11) and live for the
// process lifetime.
static ggml_backend_dev_t ggml_backend_cpu_reg_get_device(ggml_backend_reg_t reg, size_t index) {
    GGML_ASSERT(index == 0);

    static ggml_backend_cpu_device_context ctx;
    static ggml_backend_device ggml_backend_cpu_device = {
        /* .iface   = */ ggml_backend_cpu_device_i,
        /* .reg     = */ reg,
        /* .context = */ &ctx,
    };

    return &ggml_backend_cpu_device;
}
511 | | |
// This is intended to replace the ggml_cpu_has_* functions when loading the CPU backend dynamically,
// and additionally to allow other backends to expose their own list of features that applications can query using the same API
514 | 0 | static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t reg) { |
515 | 0 | static std::vector<ggml_backend_feature> features = []() { |
516 | 0 | ggml_cpu_init(); |
517 | |
|
518 | 0 | std::vector<ggml_backend_feature> features; |
519 | 0 | if (ggml_cpu_has_sse3()) { |
520 | 0 | features.push_back({ "SSE3", "1" }); |
521 | 0 | } |
522 | 0 | if (ggml_cpu_has_ssse3()) { |
523 | 0 | features.push_back({ "SSSE3", "1" }); |
524 | 0 | } |
525 | 0 | if (ggml_cpu_has_avx()) { |
526 | 0 | features.push_back({ "AVX", "1" }); |
527 | 0 | } |
528 | 0 | if (ggml_cpu_has_avx_vnni()) { |
529 | 0 | features.push_back({ "AVX_VNNI", "1" }); |
530 | 0 | } |
531 | 0 | if (ggml_cpu_has_avx2()) { |
532 | 0 | features.push_back({ "AVX2", "1" }); |
533 | 0 | } |
534 | 0 | if (ggml_cpu_has_f16c()) { |
535 | 0 | features.push_back({ "F16C", "1" }); |
536 | 0 | } |
537 | 0 | if (ggml_cpu_has_fma()) { |
538 | 0 | features.push_back({ "FMA", "1" }); |
539 | 0 | } |
540 | 0 | if (ggml_cpu_has_bmi2()) { |
541 | 0 | features.push_back({ "BMI2", "1" }); |
542 | 0 | } |
543 | 0 | if (ggml_cpu_has_avx512()) { |
544 | 0 | features.push_back({ "AVX512", "1" }); |
545 | 0 | } |
546 | 0 | if (ggml_cpu_has_avx512_vbmi()) { |
547 | 0 | features.push_back({ "AVX512_VBMI", "1" }); |
548 | 0 | } |
549 | 0 | if (ggml_cpu_has_avx512_vnni()) { |
550 | 0 | features.push_back({ "AVX512_VNNI", "1" }); |
551 | 0 | } |
552 | 0 | if (ggml_cpu_has_avx512_bf16()) { |
553 | 0 | features.push_back({ "AVX512_BF16", "1" }); |
554 | 0 | } |
555 | 0 | if (ggml_cpu_has_amx_int8()) { |
556 | 0 | features.push_back({ "AMX_INT8", "1" }); |
557 | 0 | } |
558 | 0 | if (ggml_cpu_has_neon()) { |
559 | 0 | features.push_back({ "NEON", "1" }); |
560 | 0 | } |
561 | 0 | if (ggml_cpu_has_arm_fma()) { |
562 | 0 | features.push_back({ "ARM_FMA", "1" }); |
563 | 0 | } |
564 | 0 | if (ggml_cpu_has_fp16_va()) { |
565 | 0 | features.push_back({ "FP16_VA", "1" }); |
566 | 0 | } |
567 | 0 | if (ggml_cpu_has_matmul_int8()) { |
568 | 0 | features.push_back({ "MATMUL_INT8", "1" }); |
569 | 0 | } |
570 | 0 | if (ggml_cpu_has_sve()) { |
571 | 0 | features.push_back({ "SVE", "1" }); |
572 | 0 | } |
573 | 0 | if (ggml_cpu_has_dotprod()) { |
574 | 0 | features.push_back({ "DOTPROD", "1" }); |
575 | 0 | } |
576 | 0 | if (ggml_cpu_get_sve_cnt() > 0) { |
577 | 0 | static std::string sve_cnt = std::to_string(ggml_cpu_get_sve_cnt()); |
578 | 0 | features.push_back({ "SVE_CNT", sve_cnt.c_str() }); |
579 | 0 | } |
580 | 0 | if (ggml_cpu_has_sme()) { |
581 | 0 | features.push_back({ "SME", "1" }); |
582 | 0 | } |
583 | 0 | if (ggml_cpu_has_riscv_v()) { |
584 | 0 | features.push_back({ "RISCV_V", "1" }); |
585 | 0 | } |
586 | 0 | if (ggml_cpu_has_vsx()) { |
587 | 0 | features.push_back({ "VSX", "1" }); |
588 | 0 | } |
589 | 0 | if (ggml_cpu_has_vxe()) { |
590 | 0 | features.push_back({ "VXE", "1" }); |
591 | 0 | } |
592 | 0 | if (ggml_cpu_has_wasm_simd()) { |
593 | 0 | features.push_back({ "WASM_SIMD", "1" }); |
594 | 0 | } |
595 | 0 | if (ggml_cpu_has_llamafile()) { |
596 | 0 | features.push_back({ "LLAMAFILE", "1" }); |
597 | 0 | } |
598 | | #ifdef GGML_USE_ACCELERATE |
599 | | features.push_back({ "ACCELERATE", "1" }); |
600 | | #endif |
601 | | #ifdef GGML_USE_CPU_HBM |
602 | | features.push_back({ "CPU_HBM", "1" }); |
603 | | #endif |
604 | | #ifdef GGML_USE_OPENMP |
605 | | features.push_back({ "OPENMP", "1" }); |
606 | | #endif |
607 | | #ifdef GGML_USE_CPU_KLEIDIAI |
608 | | features.push_back({ "KLEIDIAI", "1" }); |
609 | | #endif |
610 | 0 | #ifdef GGML_USE_CPU_REPACK |
611 | 0 | features.push_back({ "REPACK", "1" }); |
612 | 0 | #endif |
613 | |
|
614 | 0 | features.push_back({ nullptr, nullptr }); |
615 | |
|
616 | 0 | return features; |
617 | 0 | }(); |
618 | |
|
619 | 0 | return features.data(); |
620 | | |
621 | 0 | GGML_UNUSED(reg); |
622 | 0 | } |
623 | | |
// Looks up optional backend entry points by name; used when the CPU
// backend is loaded dynamically. Returns NULL for unknown names.
static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const char * name) {
    // assigning through a typed function pointer first lets the compiler
    // check the signature before the cast to void *
    if (strcmp(name, "ggml_backend_set_n_threads") == 0) {
        ggml_backend_set_n_threads_t fct = ggml_backend_cpu_set_n_threads;
        return (void *)fct;
    }
    if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) {
        ggml_backend_dev_get_extra_bufts_t fct = ggml_backend_cpu_device_get_extra_buffers_type;
        return (void *)fct;
    }
    if (strcmp(name, "ggml_backend_get_features") == 0) {
        return (void *)ggml_backend_cpu_get_features;
    }
    if (strcmp(name, "ggml_backend_set_abort_callback") == 0) {
        return (void *)ggml_backend_cpu_set_abort_callback;
    }
    if (strcmp(name, "ggml_backend_cpu_numa_init") == 0) {
        return (void *)ggml_numa_init;
    }
    if (strcmp(name, "ggml_backend_cpu_is_numa") == 0) {
        return (void *)ggml_is_numa;
    }

    // threadpool - TODO: move to ggml-base
    if (strcmp(name, "ggml_threadpool_new") == 0) {
        return (void *)ggml_threadpool_new;
    }
    if (strcmp(name, "ggml_threadpool_free") == 0) {
        return (void *)ggml_threadpool_free;
    }
    if (strcmp(name, "ggml_backend_cpu_set_threadpool") == 0) {
        return (void *)ggml_backend_cpu_set_threadpool;
    }

    return NULL;

    GGML_UNUSED(reg);
}
661 | | |
// Registry-interface vtable for the CPU backend.
static const struct ggml_backend_reg_i ggml_backend_cpu_reg_i = {
    /* .get_name         = */ ggml_backend_cpu_reg_get_name,
    /* .get_device_count = */ ggml_backend_cpu_reg_get_device_count,
    /* .get_device       = */ ggml_backend_cpu_reg_get_device,
    /* .get_proc_address = */ ggml_backend_cpu_get_proc_address,
};
668 | | |
// Returns the singleton registry entry for the CPU backend.
ggml_backend_reg_t ggml_backend_cpu_reg(void) {
    // init CPU feature detection
    ggml_cpu_init();

    // function-local static: created once, lives for the process lifetime
    static struct ggml_backend_reg ggml_backend_cpu_reg = {
        /* .api_version = */ GGML_BACKEND_API_VERSION,
        /* .iface       = */ ggml_backend_cpu_reg_i,
        /* .context     = */ NULL,
    };

    return &ggml_backend_cpu_reg;
}
681 | | |
682 | | GGML_BACKEND_DL_IMPL(ggml_backend_cpu_reg) |