/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp
Line | Count | Source |
1 | | #include "ggml-backend.h" |
2 | | #include "ggml-backend-impl.h" |
3 | | #include "ggml-cpu.h" |
4 | | #include "repack.h" |
5 | | #include "traits.h" |
6 | | #include "ggml-impl.h" |
7 | | #include "amx/amx.h" |
8 | | |
#include <cctype>
#include <cstdio>
#include <cstring>
#include <new>
#include <string>
#include <vector>
12 | | |
13 | | #ifdef GGML_USE_CPU_HBM |
14 | | # include "hbm.h" |
15 | | #endif |
16 | | |
17 | | #ifdef GGML_USE_CPU_KLEIDIAI |
18 | | # include "kleidiai/kleidiai.h" |
19 | | #endif |
20 | | |
21 | | #ifdef GGML_USE_CPU_RISCV64_SPACEMIT |
22 | | # include "spacemit/ime.h" |
23 | | #endif |
24 | | |
25 | | #if defined(_WIN32) |
26 | | # define WIN32_LEAN_AND_MEAN |
27 | | # ifndef NOMINMAX |
28 | | # define NOMINMAX |
29 | | # endif |
30 | | # include <windows.h> |
31 | | #else |
32 | | # include <unistd.h> |
33 | | #endif |
34 | | |
35 | | #if defined(__APPLE__) |
36 | | # include <sys/sysctl.h> |
37 | | # include <sys/types.h> |
38 | | #endif |
39 | | |
40 | | // ggml-backend interface |
41 | | |
// Returns the process-wide list of "extra" (optimized) CPU buffer types.
// The list is built exactly once on first use (thread-safe C++11 static
// initialization) and its contents depend on compile-time feature macros;
// it may be empty. Callers receive a reference to the cached vector.
std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffer_types() {
    static std::vector<ggml_backend_buffer_type_t> bufts = []() {
        std::vector<ggml_backend_buffer_type_t> bufts;

#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
        // Intel AMX int8 buffers (only compiled in with AVX512-VNNI support)
        if (ggml_backend_amx_buffer_type()) {
            bufts.push_back(ggml_backend_amx_buffer_type());
        }
#endif

#ifdef GGML_USE_CPU_RISCV64_SPACEMIT
        // SpacemiT IME (RISC-V matrix extension) buffers
        if (ggml_backend_cpu_riscv64_spacemit_buffer_type()) {
            bufts.push_back(ggml_backend_cpu_riscv64_spacemit_buffer_type());
        }
#endif

#ifdef GGML_USE_CPU_KLEIDIAI
        // Arm KleidiAI micro-kernel buffers
        if (ggml_backend_cpu_kleidiai_buffer_type()) {
            bufts.push_back(ggml_backend_cpu_kleidiai_buffer_type());
        }
#endif

#ifdef GGML_USE_CPU_REPACK
        // generic weight-repacking buffers
        if (ggml_backend_cpu_repack_buffer_type()) {
            bufts.push_back(ggml_backend_cpu_repack_buffer_type());
        }
#endif

        return bufts;
    }();

    return bufts;
}
75 | | |
76 | 0 | static ggml_backend_buffer_type_t * ggml_backend_cpu_device_get_extra_buffers_type(ggml_backend_dev_t device) { |
77 | 0 | static std::vector<ggml_backend_buffer_type_t> extra_bufts = [] { |
78 | 0 | std::vector<ggml_backend_buffer_type_t> bufts = ggml_backend_cpu_get_extra_buffer_types(); |
79 | 0 | bufts.push_back(nullptr); |
80 | 0 | return bufts; |
81 | 0 | }(); |
82 | |
|
83 | 0 | return extra_bufts.data(); |
84 | | |
85 | 0 | GGML_UNUSED(device); |
86 | 0 | } |
87 | | |
88 | 0 | static bool ggml_backend_cpu_is_extra_buffer_type(ggml_backend_buffer_type_t buft) { |
89 | 0 | for (auto * extra : ggml_backend_cpu_get_extra_buffer_types()) { |
90 | 0 | if (extra == buft) { |
91 | 0 | return true; |
92 | 0 | } |
93 | 0 | } |
94 | 0 | return false; |
95 | 0 | } |
96 | | |
97 | | // CPU backend - backend (stream) |
98 | | |
// Per-stream state of a CPU backend instance.
struct ggml_backend_cpu_context {
    int                 n_threads;   // number of threads to use for graph computation
    ggml_threadpool_t   threadpool;  // optional external threadpool (NULL = internal threading)

    uint8_t *           work_data;   // scratch buffer reused across graph computations
    size_t              work_size;   // current capacity of work_data, in bytes

    ggml_abort_callback abort_callback;      // optional callback to abort computation early
    void *              abort_callback_data; // opaque user data passed to abort_callback

    bool use_ref; // use reference implementation
};
111 | | |
112 | 0 | static const char * ggml_backend_cpu_get_name(ggml_backend_t backend) { |
113 | 0 | return "CPU"; |
114 | | |
115 | 0 | GGML_UNUSED(backend); |
116 | 0 | } |
117 | | |
118 | 0 | static void ggml_backend_cpu_free(ggml_backend_t backend) { |
119 | 0 | struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; |
120 | 0 | delete[] cpu_ctx->work_data; |
121 | 0 | delete cpu_ctx; |
122 | 0 | delete backend; |
123 | 0 | } |
124 | | |
// A precomputed plan for executing a graph on the CPU backend.
struct ggml_backend_plan_cpu {
    struct ggml_cplan  cplan;  // compute plan (work buffer, threading, callbacks)
    struct ggml_cgraph cgraph; // shallow copy of the graph to execute
};
129 | | |
130 | 0 | static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) { |
131 | 0 | struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; |
132 | |
|
133 | 0 | struct ggml_backend_plan_cpu * cpu_plan = new ggml_backend_plan_cpu; |
134 | |
|
135 | 0 | cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool); |
136 | 0 | cpu_plan->cgraph = *cgraph; // FIXME: deep copy |
137 | |
|
138 | 0 | if (cpu_plan->cplan.work_size > 0) { |
139 | 0 | cpu_plan->cplan.work_data = new uint8_t[cpu_plan->cplan.work_size]; |
140 | 0 | if (cpu_plan->cplan.work_data == NULL) { |
141 | 0 | delete cpu_plan; |
142 | 0 | return NULL; |
143 | 0 | } |
144 | 0 | } |
145 | | |
146 | 0 | cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback; |
147 | 0 | cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data; |
148 | 0 | cpu_plan->cplan.use_ref = cpu_ctx->use_ref; |
149 | |
|
150 | 0 | return cpu_plan; |
151 | 0 | } |
152 | | |
153 | 0 | static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { |
154 | 0 | struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan; |
155 | |
|
156 | 0 | delete[] cpu_plan->cplan.work_data; |
157 | 0 | delete cpu_plan; |
158 | |
|
159 | 0 | GGML_UNUSED(backend); |
160 | 0 | } |
161 | | |
162 | 0 | static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { |
163 | 0 | struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan; |
164 | |
|
165 | 0 | return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan); |
166 | | |
167 | 0 | GGML_UNUSED(backend); |
168 | 0 | } |
169 | | |
170 | 0 | static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { |
171 | 0 | struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; |
172 | |
|
173 | 0 | struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool); |
174 | |
|
175 | 0 | if (cpu_ctx->work_size < cplan.work_size) { |
176 | 0 | delete[] cpu_ctx->work_data; |
177 | 0 | cpu_ctx->work_data = new uint8_t[cplan.work_size]; |
178 | 0 | if (cpu_ctx->work_data == NULL) { |
179 | 0 | cpu_ctx->work_size = 0; |
180 | 0 | return GGML_STATUS_ALLOC_FAILED; |
181 | 0 | } |
182 | 0 | cpu_ctx->work_size = cplan.work_size; |
183 | 0 | } |
184 | 0 | cplan.work_data = (uint8_t *)cpu_ctx->work_data; |
185 | |
|
186 | 0 | cplan.abort_callback = cpu_ctx->abort_callback; |
187 | 0 | cplan.abort_callback_data = cpu_ctx->abort_callback_data; |
188 | 0 | cplan.use_ref = cpu_ctx->use_ref; |
189 | |
|
190 | 0 | return ggml_graph_compute(cgraph, &cplan); |
191 | 0 | } |
192 | | |
// vtable for the CPU backend (stream) interface; NULL entries are
// optional operations the CPU backend does not need (it is synchronous
// and has no async copies or events).
static const struct ggml_backend_i ggml_backend_cpu_i = {
    /* .get_name                = */ ggml_backend_cpu_get_name,
    /* .free                    = */ ggml_backend_cpu_free,
    /* .set_tensor_async        = */ NULL,
    /* .get_tensor_async        = */ NULL,
    /* .cpy_tensor_async        = */ NULL,
    /* .synchronize             = */ NULL,
    /* .graph_plan_create       = */ ggml_backend_cpu_graph_plan_create,
    /* .graph_plan_free         = */ ggml_backend_cpu_graph_plan_free,
    /* .graph_plan_update       = */ NULL,
    /* .graph_plan_compute      = */ ggml_backend_cpu_graph_plan_compute,
    /* .graph_compute           = */ ggml_backend_cpu_graph_compute,
    /* .event_record            = */ NULL,
    /* .event_wait              = */ NULL,
    /* .graph_optimize          = */ NULL,
};
209 | | |
// Stable GUID identifying the CPU backend (used by ggml_backend_is_cpu).
static ggml_guid_t ggml_backend_cpu_guid(void) {
    static ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 };
    return &guid;
}
214 | | |
215 | 0 | ggml_backend_t ggml_backend_cpu_init(void) { |
216 | | // initialize CPU backend now to avoid slowing the first graph computation |
217 | 0 | ggml_cpu_init(); |
218 | |
|
219 | 0 | struct ggml_backend_cpu_context * ctx = new ggml_backend_cpu_context; |
220 | 0 | if (ctx == NULL) { |
221 | 0 | return NULL; |
222 | 0 | } |
223 | | |
224 | 0 | ctx->n_threads = GGML_DEFAULT_N_THREADS; |
225 | 0 | ctx->threadpool = NULL; |
226 | 0 | ctx->work_data = NULL; |
227 | 0 | ctx->work_size = 0; |
228 | 0 | ctx->abort_callback = NULL; |
229 | 0 | ctx->abort_callback_data = NULL; |
230 | 0 | ctx->use_ref = false; |
231 | |
|
232 | 0 | ggml_backend_t cpu_backend = new ggml_backend { |
233 | 0 | /* .guid = */ ggml_backend_cpu_guid(), |
234 | 0 | /* .iface = */ ggml_backend_cpu_i, |
235 | 0 | /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), |
236 | 0 | /* .context = */ ctx, |
237 | 0 | }; |
238 | |
|
239 | 0 | if (cpu_backend == NULL) { |
240 | 0 | delete ctx; |
241 | 0 | return NULL; |
242 | 0 | } |
243 | | |
244 | 0 | return cpu_backend; |
245 | 0 | } |
246 | | |
247 | 0 | bool ggml_backend_is_cpu(ggml_backend_t backend) { |
248 | 0 | return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cpu_guid()); |
249 | 0 | } |
250 | | |
251 | 0 | void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) { |
252 | 0 | GGML_ASSERT(ggml_backend_is_cpu(backend_cpu)); |
253 | |
|
254 | 0 | struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context; |
255 | 0 | ctx->n_threads = n_threads; |
256 | 0 | } |
257 | | |
// Attaches an external threadpool to the backend (NULL detaches it).
// If a different threadpool was previously attached, it is paused before
// the switch so its workers stop spinning.
void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) {
    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));

    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;

    if (ctx->threadpool && ctx->threadpool != threadpool) {
        // already had a different threadpool, pause/suspend it before switching
        ggml_threadpool_pause(ctx->threadpool);
    }
    ctx->threadpool = threadpool;
}
269 | | |
270 | 0 | void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) { |
271 | 0 | GGML_ASSERT(ggml_backend_is_cpu(backend_cpu)); |
272 | |
|
273 | 0 | struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context; |
274 | 0 | ctx->abort_callback = abort_callback; |
275 | 0 | ctx->abort_callback_data = abort_callback_data; |
276 | 0 | } |
277 | | |
278 | 0 | void ggml_backend_cpu_set_use_ref(ggml_backend_t backend_cpu, bool use_ref) { |
279 | 0 | GGML_ASSERT(ggml_backend_is_cpu(backend_cpu)); |
280 | |
|
281 | 0 | struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context; |
282 | 0 | ctx->use_ref = use_ref; |
283 | 0 | } |
284 | | |
285 | | // CPU backend - device |
286 | | |
// Device context holding a human-readable CPU description, detected once
// per platform (sysctl on macOS, /proc/cpuinfo on Linux, registry on Windows).
struct ggml_backend_cpu_device_context {
    std::string description = "CPU"; // fallback if detection fails

    ggml_backend_cpu_device_context() {
#ifdef __APPLE__
        size_t len = 0;
        if (!sysctlbyname("machdep.cpu.brand_string", NULL, &len, NULL, 0)) {
            description.resize(len);
            sysctlbyname("machdep.cpu.brand_string", &description[0], &len, NULL, 0); // NOLINT
            // the reported length includes the terminating NUL - drop it so the
            // string does not carry an embedded '\0'
            if (!description.empty() && description.back() == '\0') {
                description.pop_back();
            }
        }
#elif defined(__linux__)
        FILE * f = fopen("/proc/cpuinfo", "r");
        if (f) {
            char buf[1024];
            while (fgets(buf, sizeof(buf), f)) {
                if (strncmp(buf, "model name", 10) == 0) {
                    char * p = strchr(buf, ':');
                    if (p) {
                        p++;
                        // trim leading whitespace
                        // (cast to unsigned char: passing a negative char to isspace is UB)
                        while (std::isspace((unsigned char) *p)) {
                            p++;
                        }
                        // trim trailing whitespace; the length guard avoids reading
                        // p[-1] when the value is empty/all-whitespace
                        size_t n = strlen(p);
                        while (n > 0 && std::isspace((unsigned char) p[n - 1])) {
                            p[--n] = '\0';
                        }
                        description = p;
                        break;
                    }
                }
            }
            fclose(f);
        }
#elif defined(_WIN32)
        HKEY hKey;
        if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
                        TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
                        0,
                        KEY_READ,
                        &hKey) == ERROR_SUCCESS) {
            DWORD cpu_brand_size = 0;
            if (RegQueryValueExA(hKey,
                                "ProcessorNameString",
                                NULL,
                                NULL,
                                NULL,
                                &cpu_brand_size) == ERROR_SUCCESS) {
                description.resize(cpu_brand_size);
                if (RegQueryValueExA(hKey,
                                    "ProcessorNameString",
                                    NULL,
                                    NULL,
                                    (LPBYTE)&description[0], // NOLINT
                                    &cpu_brand_size) == ERROR_SUCCESS) {
                    // registry strings may include the terminating NUL - truncate there
                    if (description.find('\0') != std::string::npos) {
                        description.resize(description.find('\0'));
                    }
                }
            }
            RegCloseKey(hKey);
        }
#endif
    }
};
350 | | |
351 | 0 | static const char * ggml_backend_cpu_device_get_name(ggml_backend_dev_t dev) { |
352 | 0 | return "CPU"; |
353 | | |
354 | 0 | GGML_UNUSED(dev); |
355 | 0 | } |
356 | | |
357 | 0 | static const char * ggml_backend_cpu_device_get_description(ggml_backend_dev_t dev) { |
358 | 0 | struct ggml_backend_cpu_device_context * ctx = (struct ggml_backend_cpu_device_context *)dev->context; |
359 | |
|
360 | 0 | return ctx->description.c_str(); |
361 | 0 | } |
362 | | |
// Reports system physical memory in bytes. "Free" is deliberately set
// equal to "total" on non-Windows platforms (see comment below).
static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
#ifdef _WIN32
    MEMORYSTATUSEX status;
    status.dwLength = sizeof(status);
    GlobalMemoryStatusEx(&status);
    *total = status.ullTotalPhys;
    *free = status.ullAvailPhys;
#else
    // POSIX: total = page count * page size
    long pages = sysconf(_SC_PHYS_PAGES);
    long page_size = sysconf(_SC_PAGE_SIZE);
    *total = pages * page_size;

    // "free" system memory is ill-defined, for practical purposes assume that all of it is free:
    *free = *total;
#endif // _WIN32

    GGML_UNUSED(dev);
}
381 | | |
382 | 4.18k | static enum ggml_backend_dev_type ggml_backend_cpu_device_get_type(ggml_backend_dev_t dev) { |
383 | 4.18k | return GGML_BACKEND_DEVICE_TYPE_CPU; |
384 | | |
385 | 0 | GGML_UNUSED(dev); |
386 | 0 | } |
387 | | |
// Fills in the device properties: name, description, type, memory, and
// capability flags. The CPU device is synchronous (no async, no events),
// has no dedicated host buffer, and can wrap existing host pointers.
static void ggml_backend_cpu_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
    props->name        = ggml_backend_cpu_device_get_name(dev);
    props->description = ggml_backend_cpu_device_get_description(dev);
    props->type        = ggml_backend_cpu_device_get_type(dev);
    ggml_backend_cpu_device_get_memory(dev, &props->memory_free, &props->memory_total);
    props->caps = {
        /* .async                 = */ false,
        /* .host_buffer           = */ false,
        /* .buffer_from_host_ptr  = */ true,
        /* .events                = */ false,
    };
}
400 | | |
401 | 0 | static ggml_backend_t ggml_backend_cpu_device_init_backend(ggml_backend_dev_t dev, const char * params) { |
402 | 0 | return ggml_backend_cpu_init(); |
403 | | |
404 | 0 | GGML_UNUSED(dev); |
405 | 0 | GGML_UNUSED(params); |
406 | 0 | } |
407 | | |
408 | 0 | static ggml_backend_buffer_type_t ggml_backend_cpu_device_get_buffer_type(ggml_backend_dev_t dev) { |
409 | 0 | return ggml_backend_cpu_buffer_type(); |
410 | | |
411 | 0 | GGML_UNUSED(dev); |
412 | 0 | } |
413 | | |
414 | 0 | static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { |
415 | 0 | return ggml_backend_cpu_buffer_from_ptr(ptr, size); |
416 | | |
417 | 0 | GGML_UNUSED(dev); |
418 | 0 | GGML_UNUSED(max_tensor_size); |
419 | 0 | } |
420 | | |
// Decides whether the CPU backend can run `op`. Ops whose sources live in
// an extra (optimized) buffer type defer the decision to that buffer
// type's own supports_op; otherwise a per-op type check applies, and any
// op not listed is assumed supported.
static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
    const struct ggml_tensor * src0 = op->src[0];
    const struct ggml_tensor * src1 = op->src[1];

    // no-op / view-like ops are always supported
    if (op->op == GGML_OP_NONE || op->op == GGML_OP_RESHAPE || op->op == GGML_OP_VIEW || op->op == GGML_OP_PERMUTE || op->op == GGML_OP_TRANSPOSE) {
        return true;
    }

    // check extra buffer types
    // note: only the first sources are checked for extra buffer types to reduce overhead, increase if necessary
    for (int i = 0; i < 4; i++) {
        if (op->src[i] && op->src[i]->buffer &&
            ggml_backend_cpu_is_extra_buffer_type(op->src[i]->buffer->buft)) {
            auto * buf_extra = (ggml::cpu::extra_buffer_type *) op->src[i]->buffer->buft->context;
            return buf_extra->supports_op(dev, op);
        }
    }

    switch (op->op) {
        case GGML_OP_CPY:
        case GGML_OP_SET_ROWS:
            // IQ types cannot be written to: they have no from_float conversion
            return
                op->type != GGML_TYPE_IQ3_XXS &&
                op->type != GGML_TYPE_IQ3_S   &&
                op->type != GGML_TYPE_IQ2_XXS &&
                op->type != GGML_TYPE_IQ2_XS  &&
                op->type != GGML_TYPE_IQ2_S   &&
                op->type != GGML_TYPE_IQ1_S   &&
                op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
        case GGML_OP_MUL_MAT:
            // src1 must be f32 or already in src0's vec_dot format
            return src1->type == GGML_TYPE_F32 || src1->type == ggml_get_type_traits_cpu(src0->type)->vec_dot_type;
        case GGML_OP_SOFT_MAX_BACK: {
            if (op->src[0]->type != GGML_TYPE_F32 || op->src[1]->type != GGML_TYPE_F32) {
                return false;
            }
            float max_bias = 0.0f;

            // max_bias is the second float in op_params; only 0 (no ALiBi) is supported
            memcpy(&max_bias, (const float *) op->op_params + 1, sizeof(float));

            return max_bias == 0.0f;
        }
        case GGML_OP_IM2COL_BACK:
            return src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32;
        case GGML_OP_GET_ROWS_BACK:
            return src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16;
        case GGML_OP_OUT_PROD:
            return (src0->type == GGML_TYPE_F32 || (ggml_is_quantized(src0->type) && src0->ne[2] == src1->ne[2] && src0->ne[3] == src1->ne[3])) &&
                src1->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
        default:
            return true;
    }
}
473 | | |
474 | 0 | static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { |
475 | 0 | return ggml_backend_buft_is_host(buft) || ggml_backend_cpu_is_extra_buffer_type(buft); |
476 | 0 | GGML_UNUSED(dev); |
477 | 0 | } |
478 | | |
// vtable for the CPU device interface; NULL entries are optional
// operations (host buffers, offload heuristics, events) the CPU device
// does not implement.
static const struct ggml_backend_device_i ggml_backend_cpu_device_i = {
    /* .get_name             = */ ggml_backend_cpu_device_get_name,
    /* .get_description      = */ ggml_backend_cpu_device_get_description,
    /* .get_memory           = */ ggml_backend_cpu_device_get_memory,
    /* .get_type             = */ ggml_backend_cpu_device_get_type,
    /* .get_props            = */ ggml_backend_cpu_device_get_props,
    /* .init_backend         = */ ggml_backend_cpu_device_init_backend,
    /* .get_buffer_type      = */ ggml_backend_cpu_device_get_buffer_type,
    /* .get_host_buffer_type = */ NULL,
    /* .buffer_from_host_ptr = */ ggml_backend_cpu_device_buffer_from_host_ptr,
    /* .supports_op          = */ ggml_backend_cpu_device_supports_op,
    /* .supports_buft        = */ ggml_backend_cpu_device_supports_buft,
    /* .offload_op           = */ NULL,
    /* .event_new            = */ NULL,
    /* .event_free           = */ NULL,
    /* .event_synchronize    = */ NULL,
};
496 | | |
497 | | // CPU backend - backend (reg) |
498 | | |
499 | 0 | static const char * ggml_backend_cpu_reg_get_name(ggml_backend_reg_t reg) { |
500 | 0 | return "CPU"; |
501 | | |
502 | 0 | GGML_UNUSED(reg); |
503 | 0 | } |
504 | | |
505 | 2 | static size_t ggml_backend_cpu_reg_get_device_count(ggml_backend_reg_t reg) { |
506 | 2 | return 1; |
507 | | |
508 | 0 | GGML_UNUSED(reg); |
509 | 0 | } |
510 | | |
// Returns the singleton CPU device; only index 0 is valid.
// NOTE(review): `reg` is baked into the static device on the first call
// only - later calls return the same object regardless of the argument.
// In practice this function is always called with the same registry.
static ggml_backend_dev_t ggml_backend_cpu_reg_get_device(ggml_backend_reg_t reg, size_t index) {
    GGML_ASSERT(index == 0);

    static ggml_backend_cpu_device_context ctx;
    static ggml_backend_device ggml_backend_cpu_device = {
        /* .iface   = */ ggml_backend_cpu_device_i,
        /* .reg     = */ reg,
        /* .context = */ &ctx,
    };

    return &ggml_backend_cpu_device;
}
523 | | |
524 | | // This is intended to replace the ggml_cpu_has_* functions when loading the CPU backend dynamically,
525 | | // and additionally to allow other backends to expose their own list of features that applications can query using the same API |
// Enumerates the CPU features this build can use, as a list terminated by
// a { nullptr, nullptr } entry. Built once on first call; the returned
// pointers stay valid for the process lifetime (dynamic values such as
// SVE_CNT are kept alive in function-local statics so their c_str()
// pointers do not dangle).
static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t reg) {
    static std::vector<ggml_backend_feature> features = []() {
        // feature detection requires CPU init (e.g. cached cpuid results)
        ggml_cpu_init();

        std::vector<ggml_backend_feature> features;
        if (ggml_cpu_has_sse3()) {
            features.push_back({ "SSE3", "1" });
        }
        if (ggml_cpu_has_ssse3()) {
            features.push_back({ "SSSE3", "1" });
        }
        if (ggml_cpu_has_avx()) {
            features.push_back({ "AVX", "1" });
        }
        if (ggml_cpu_has_avx_vnni()) {
            features.push_back({ "AVX_VNNI", "1" });
        }
        if (ggml_cpu_has_avx2()) {
            features.push_back({ "AVX2", "1" });
        }
        if (ggml_cpu_has_f16c()) {
            features.push_back({ "F16C", "1" });
        }
        if (ggml_cpu_has_fma()) {
            features.push_back({ "FMA", "1" });
        }
        if (ggml_cpu_has_bmi2()) {
            features.push_back({ "BMI2", "1" });
        }
        if (ggml_cpu_has_avx512()) {
            features.push_back({ "AVX512", "1" });
        }
        if (ggml_cpu_has_avx512_vbmi()) {
            features.push_back({ "AVX512_VBMI", "1" });
        }
        if (ggml_cpu_has_avx512_vnni()) {
            features.push_back({ "AVX512_VNNI", "1" });
        }
        if (ggml_cpu_has_avx512_bf16()) {
            features.push_back({ "AVX512_BF16", "1" });
        }
        if (ggml_cpu_has_amx_int8()) {
            features.push_back({ "AMX_INT8", "1" });
        }
        if (ggml_cpu_has_neon()) {
            features.push_back({ "NEON", "1" });
        }
        if (ggml_cpu_has_arm_fma()) {
            features.push_back({ "ARM_FMA", "1" });
        }
        if (ggml_cpu_has_fp16_va()) {
            features.push_back({ "FP16_VA", "1" });
        }
        if (ggml_cpu_has_matmul_int8()) {
            features.push_back({ "MATMUL_INT8", "1" });
        }
        if (ggml_cpu_has_sve()) {
            features.push_back({ "SVE", "1" });
        }
        if (ggml_cpu_has_dotprod()) {
            features.push_back({ "DOTPROD", "1" });
        }
        if (ggml_cpu_get_sve_cnt() > 0) {
            // static: the c_str() pointer must outlive this lambda
            static std::string sve_cnt = std::to_string(ggml_cpu_get_sve_cnt());
            features.push_back({ "SVE_CNT", sve_cnt.c_str() });
        }
        if (ggml_cpu_has_sme()) {
            features.push_back({ "SME", "1" });
        }
        if (ggml_cpu_has_riscv_v()) {
            features.push_back({ "RISCV_V", "1" });
        }
        if (ggml_cpu_get_rvv_vlen() > 0) {
            // static: the c_str() pointer must outlive this lambda
            static std::string rvv_vlen = std::to_string(ggml_cpu_get_rvv_vlen());
            features.push_back({ "RVV_VLEN", rvv_vlen.c_str() });
        }
        if (ggml_cpu_has_vsx()) {
            features.push_back({ "VSX", "1" });
        }
        if (ggml_cpu_has_vxe()) {
            features.push_back({ "VXE", "1" });
        }
        if (ggml_cpu_has_wasm_simd()) {
            features.push_back({ "WASM_SIMD", "1" });
        }
        if (ggml_cpu_has_llamafile()) {
            features.push_back({ "LLAMAFILE", "1" });
        }
    #ifdef GGML_USE_ACCELERATE
        features.push_back({ "ACCELERATE", "1" });
    #endif
    #ifdef GGML_USE_CPU_HBM
        features.push_back({ "CPU_HBM", "1" });
    #endif
    #ifdef GGML_USE_OPENMP
        features.push_back({ "OPENMP", "1" });
    #endif
    #ifdef GGML_USE_CPU_KLEIDIAI
        features.push_back({ "KLEIDIAI", "1" });
    #endif
    #ifdef GGML_USE_CPU_REPACK
        features.push_back({ "REPACK", "1" });
    #endif

        // list terminator expected by callers
        features.push_back({ nullptr, nullptr });

        return features;
    }();

    return features.data();

    GGML_UNUSED(reg);
}
639 | | |
// Looks up CPU-backend entry points by name for dynamic loading.
// Returns NULL for unknown names. Assignments through typed function-pointer
// variables (rather than casting directly) let the compiler verify the
// signatures match the published typedefs.
static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const char * name) {
    if (strcmp(name, "ggml_backend_set_n_threads") == 0) {
        ggml_backend_set_n_threads_t fct = ggml_backend_cpu_set_n_threads;
        return (void *)fct;
    }
    if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) {
        ggml_backend_dev_get_extra_bufts_t fct = ggml_backend_cpu_device_get_extra_buffers_type;
        return (void *)fct;
    }
    if (strcmp(name, "ggml_backend_get_features") == 0) {
        return (void *)ggml_backend_cpu_get_features;
    }
    if (strcmp(name, "ggml_backend_set_abort_callback") == 0) {
        return (void *)ggml_backend_cpu_set_abort_callback;
    }
    if (strcmp(name, "ggml_backend_cpu_numa_init") == 0) {
        return (void *)ggml_numa_init;
    }
    if (strcmp(name, "ggml_backend_cpu_is_numa") == 0) {
        return (void *)ggml_is_numa;
    }
    if (strcmp(name, "ggml_backend_cpu_set_use_ref") == 0) {
        return (void *)ggml_backend_cpu_set_use_ref;
    }

    // threadpool - TODO: move to ggml-base
    if (strcmp(name, "ggml_threadpool_new") == 0) {
        return (void *)ggml_threadpool_new;
    }
    if (strcmp(name, "ggml_threadpool_free") == 0) {
        return (void *)ggml_threadpool_free;
    }
    if (strcmp(name, "ggml_backend_cpu_set_threadpool") == 0) {
        return (void *)ggml_backend_cpu_set_threadpool;
    }

    return NULL;

    GGML_UNUSED(reg);
}
680 | | |
// vtable for the CPU backend registry interface.
static const struct ggml_backend_reg_i ggml_backend_cpu_reg_i = {
    /* .get_name         = */ ggml_backend_cpu_reg_get_name,
    /* .get_device_count = */ ggml_backend_cpu_reg_get_device_count,
    /* .get_device       = */ ggml_backend_cpu_reg_get_device,
    /* .get_proc_address = */ ggml_backend_cpu_get_proc_address,
};
687 | | |
688 | 1 | ggml_backend_reg_t ggml_backend_cpu_reg(void) { |
689 | | // init CPU feature detection |
690 | 1 | ggml_cpu_init(); |
691 | | |
692 | 1 | static struct ggml_backend_reg ggml_backend_cpu_reg = { |
693 | 1 | /* .api_version = */ GGML_BACKEND_API_VERSION, |
694 | 1 | /* .iface = */ ggml_backend_cpu_reg_i, |
695 | 1 | /* .context = */ NULL, |
696 | 1 | }; |
697 | | |
698 | 1 | return &ggml_backend_cpu_reg; |
699 | 1 | } |
700 | | |
701 | | GGML_BACKEND_DL_IMPL(ggml_backend_cpu_reg) |