/src/llama.cpp/ggml/src/ggml-alloc.c
Line | Count | Source |
1 | | #include "ggml-alloc.h" |
2 | | #include "ggml-backend-impl.h" |
3 | | #include "ggml.h" |
4 | | #include "ggml-impl.h" |
5 | | #include <assert.h> |
6 | | #include <limits.h> |
7 | | #include <stdarg.h> |
8 | | #include <stdio.h> |
9 | | #include <stdlib.h> |
10 | | #include <string.h> |
11 | | |
12 | 0 | #define MAX(a, b) ((a) > (b) ? (a) : (b)) |
13 | | #define MAX_FREE_BLOCKS 256 |
14 | | |
15 | | //#define GGML_ALLOCATOR_DEBUG |
16 | | |
17 | | //#define AT_PRINTF(...) GGML_LOG_DEBUG(__VA_ARGS__) |
18 | | #define AT_PRINTF(...) |
19 | | |
20 | | |
21 | 0 | static bool ggml_is_view(const struct ggml_tensor * t) { |
22 | 0 | return t->view_src != NULL; |
23 | 0 | } |
24 | | |
25 | | // ops for which this function returns true must not use restrict pointers in their backend implementations |
26 | 0 | bool ggml_op_can_inplace(enum ggml_op op) { |
27 | 0 | switch (op) { |
28 | 0 | case GGML_OP_FILL: |
29 | 0 | case GGML_OP_SCALE: |
30 | 0 | case GGML_OP_DIAG_MASK_ZERO: |
31 | 0 | case GGML_OP_DIAG_MASK_INF: |
32 | 0 | case GGML_OP_ADD: |
33 | 0 | case GGML_OP_ADD_ID: |
34 | 0 | case GGML_OP_ADD1: |
35 | 0 | case GGML_OP_SUB: |
36 | 0 | case GGML_OP_MUL: |
37 | 0 | case GGML_OP_DIV: |
38 | 0 | case GGML_OP_SQR: |
39 | 0 | case GGML_OP_SQRT: |
40 | 0 | case GGML_OP_LOG: |
41 | 0 | case GGML_OP_UNARY: |
42 | 0 | case GGML_OP_ROPE: |
43 | 0 | case GGML_OP_ROPE_BACK: |
44 | 0 | case GGML_OP_SILU_BACK: |
45 | 0 | case GGML_OP_RMS_NORM: |
46 | 0 | case GGML_OP_RMS_NORM_BACK: |
47 | 0 | case GGML_OP_SOFT_MAX: |
48 | 0 | case GGML_OP_SOFT_MAX_BACK: |
49 | 0 | return true; |
50 | | |
51 | 0 | default: |
52 | 0 | return false; |
53 | 0 | } |
54 | 0 | } |
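The ops listed above are element-wise or row-wise, so their output may safely alias one of their inputs. An illustrative check (hypothetical usage, compiled against this file where ggml_op_can_inplace is defined):

    assert( ggml_op_can_inplace(GGML_OP_SCALE));   // element-wise: the output can reuse the input buffer
    assert(!ggml_op_can_inplace(GGML_OP_MUL_MAT)); // re-reads its inputs, so the output must not alias them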
55 | | |
56 | 0 | static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) { |
57 | 0 | assert(alignment && !(alignment & (alignment - 1))); // power of 2 |
58 | 0 | size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment; |
59 | 0 | return offset + align; |
60 | 0 | } |
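aligned_offset rounds the absolute address (base + offset) up to the next multiple of alignment and returns the adjusted offset; with a NULL base it is plain offset rounding. A couple of illustrative values (hypothetical, 32-byte alignment):

    assert(aligned_offset(NULL, 100, 32) == 128); // 100 rounded up to the next multiple of 32
    assert(aligned_offset(NULL,  96, 32) ==  96); // already aligned, returned unchanged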
61 | | |
62 | | // tallocr |
63 | | |
64 | 0 | struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer) { |
65 | 0 | void * base = ggml_backend_buffer_get_base(buffer); |
66 | 0 | size_t align = ggml_backend_buffer_get_alignment(buffer); |
67 | | |
68 | 0 | assert(align && !(align & (align - 1))); // power of 2 |
69 | | |
70 | 0 | struct ggml_tallocr talloc = (struct ggml_tallocr) { |
71 | 0 | /*.buffer = */ buffer, |
72 | 0 | /*.base = */ base, |
73 | 0 | /*.alignment = */ align, |
74 | 0 | /*.offset = */ aligned_offset(base, 0, align), |
75 | 0 | }; |
76 | 0 | return talloc; |
77 | 0 | } |
78 | | |
79 | 0 | enum ggml_status ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor) { |
80 | 0 | size_t size = ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor); |
81 | 0 | size = GGML_PAD(size, talloc->alignment); |
82 | | |
83 | 0 | if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) { |
84 | 0 | GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n", |
85 | 0 | __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset); |
86 | 0 | GGML_ABORT("not enough space in the buffer"); |
87 | 0 | } |
88 | | |
89 | 0 | void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset; |
90 | 0 | talloc->offset += size; |
91 | | |
92 | 0 | assert(((uintptr_t)addr % talloc->alignment) == 0); |
93 | | |
94 | 0 | return ggml_backend_tensor_alloc(talloc->buffer, tensor, addr); |
95 | 0 | } |
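Together, ggml_tallocr_new and ggml_tallocr_alloc form a simple bump allocator over one pre-allocated backend buffer. A minimal usage sketch, assuming `buft` is a valid ggml_backend_buffer_type_t, `ctx` was created with no_alloc = true, and `buf_size` is large enough for every tensor in the context:

    ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, buf_size);
    struct ggml_tallocr talloc = ggml_tallocr_new(buf);
    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
        if (t->data == NULL && t->view_src == NULL) {
            ggml_tallocr_alloc(&talloc, t); // places t at the current offset, then advances the offset
        }
    }
    // ... use the tensors, then release the buffer ...
    ggml_backend_buffer_free(buf);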
96 | | |
97 | | // dynamic tensor allocator |
98 | | |
99 | 0 | #define GGML_VBUFFER_MAX_CHUNKS 16 |
100 | | |
101 | | // relative memory address within an allocation that can be split into multiple buffers (chunks) |
102 | | struct buffer_address { |
103 | | int chunk; // index of a backend buffer |
104 | | size_t offset; // local memory offset within the buffer |
105 | | }; |
106 | | |
107 | | static const struct buffer_address GGML_BUFFER_ADDRESS_INVALID = { -1, SIZE_MAX }; |
108 | | |
109 | 0 | static bool ggml_buffer_address_less(struct buffer_address a, struct buffer_address b) { |
110 | 0 | return a.chunk != b.chunk ? a.chunk < b.chunk : a.offset < b.offset; |
111 | 0 | } |
112 | | |
113 | | struct free_block { |
114 | | size_t offset; |
115 | | size_t size; |
116 | | }; |
117 | | |
118 | | struct tallocr_chunk { |
119 | | struct free_block free_blocks[MAX_FREE_BLOCKS]; |
120 | | int n_free_blocks; |
121 | | size_t max_size; |
122 | | }; |
123 | | |
124 | | struct ggml_dyn_tallocr { |
125 | | size_t alignment; |
126 | | size_t max_chunk_size; |
127 | | struct tallocr_chunk * chunks[GGML_VBUFFER_MAX_CHUNKS]; |
128 | | int n_chunks; |
129 | | |
130 | | #ifdef GGML_ALLOCATOR_DEBUG |
131 | | struct { |
132 | | const struct ggml_tensor * tensor; |
133 | | struct buffer_address addr; |
134 | | } allocated_tensors[1024]; |
135 | | #endif |
136 | | }; |
137 | | |
138 | 0 | static void ggml_dyn_tallocr_insert_block(struct tallocr_chunk * chunk, size_t offset, size_t size) { |
139 | 0 | GGML_ASSERT(chunk->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks"); |
140 | | // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster) |
141 | 0 | int insert_pos = 0; |
142 | 0 | while (insert_pos < chunk->n_free_blocks && chunk->free_blocks[insert_pos].offset < offset) { |
143 | 0 | insert_pos++; |
144 | 0 | } |
145 | | // shift all blocks from insert_pos onward to make room for the new block |
146 | 0 | for (int i = chunk->n_free_blocks; i > insert_pos; i--) { |
147 | 0 | chunk->free_blocks[i] = chunk->free_blocks[i-1]; |
148 | 0 | } |
149 | | // insert the new block |
150 | 0 | chunk->free_blocks[insert_pos].offset = offset; |
151 | 0 | chunk->free_blocks[insert_pos].size = size; |
152 | 0 | chunk->n_free_blocks++; |
153 | 0 | } |
154 | | |
155 | 0 | static void ggml_dyn_tallocr_remove_block(struct tallocr_chunk * chunk, int idx) { |
156 | | // shift all elements after idx by 1 to the left, overwriting the element at idx |
157 | 0 | for (int i = idx; i < chunk->n_free_blocks; i++) { |
158 | 0 | chunk->free_blocks[i] = chunk->free_blocks[i+1]; |
159 | 0 | } |
160 | 0 | chunk->n_free_blocks--; |
161 | 0 | } |
162 | | |
163 | 0 | static int ggml_dyn_tallocr_new_chunk(struct ggml_dyn_tallocr * alloc, size_t min_size) { |
164 | 0 | if (alloc->n_chunks >= GGML_VBUFFER_MAX_CHUNKS) { |
165 | 0 | return -1; |
166 | 0 | } |
167 | 0 | struct tallocr_chunk * chunk = calloc(1, sizeof(struct tallocr_chunk)); |
168 | 0 | chunk->n_free_blocks = 1; |
169 | 0 | chunk->free_blocks[0].offset = 0; |
170 | | // available space in a chunk is limited to max_chunk_size, but can be higher if: |
171 | | // 1. a single tensor exceeds the maximum, and cannot fit any other way |
172 | | // 2. we are running out of chunks |
173 | | // backends will either manage to allocate the larger size, or report an error. |
174 | 0 | chunk->free_blocks[0].size = MAX(min_size, alloc->max_chunk_size); |
175 | 0 | if (alloc->n_chunks == GGML_VBUFFER_MAX_CHUNKS - 1) { |
176 | 0 | chunk->free_blocks[0].size = SIZE_MAX/2; |
177 | 0 | } |
178 | 0 | alloc->chunks[alloc->n_chunks] = chunk; |
179 | 0 | alloc->n_chunks++; |
180 | 0 | return alloc->n_chunks - 1; |
181 | 0 | } |
182 | | |
183 | | #ifdef GGML_ALLOCATOR_DEBUG |
184 | | static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, const struct ggml_tensor * tensor) { |
185 | | for (int i = 0; i < 1024; i++) { |
186 | | if (alloc->allocated_tensors[i].tensor == NULL) { |
187 | | alloc->allocated_tensors[i].tensor = tensor; |
188 | | alloc->allocated_tensors[i].addr = addr; |
189 | | return; |
190 | | } |
191 | | } |
192 | | GGML_ABORT("out of allocated_tensors"); |
193 | | } |
194 | | static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, const struct ggml_tensor * tensor) { |
195 | | for (int i = 0; i < 1024; i++) { |
196 | | if (alloc->allocated_tensors[i].addr.chunk == addr.chunk && alloc->allocated_tensors[i].addr.offset == addr.offset) { |
197 | | alloc->allocated_tensors[i].tensor = NULL; |
198 | | return; |
199 | | } |
200 | | } |
201 | | GGML_ABORT("tried to free tensor %s not found\n", tensor->name); |
202 | | } |
203 | | #endif |
204 | | |
205 | 0 | static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) { |
206 | 0 | size = aligned_offset(NULL, size, alloc->alignment); |
207 | | |
208 | 0 | AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size); |
209 | | |
210 | 0 | int best_fit_chunk = -1; |
211 | 0 | int best_fit_block = -1; |
212 | 0 | size_t max_avail = 0; |
213 | | |
214 | | // find the best fitting free block besides the last block, within any chunk |
215 | 0 | for (int c = 0; c < alloc->n_chunks; ++c) { |
216 | 0 | struct tallocr_chunk * chunk = alloc->chunks[c]; |
217 | 0 | size_t best_fit_size = SIZE_MAX; |
218 | 0 | for (int i = 0; i < chunk->n_free_blocks - 1; i++) { |
219 | 0 | struct free_block * block = &chunk->free_blocks[i]; |
220 | 0 | max_avail = MAX(max_avail, block->size); |
221 | 0 | if (block->size >= size && block->size <= best_fit_size) { |
222 | 0 | best_fit_chunk = c; |
223 | 0 | best_fit_block = i; |
224 | 0 | best_fit_size = block->size; |
225 | 0 | } |
226 | 0 | } |
227 | 0 | } |
228 | | |
229 | 0 | if (best_fit_block == -1) { |
230 | | // no suitable block found, try the last block (this may grow a chunk's size) |
231 | 0 | int64_t best_reuse = INT64_MIN; |
232 | 0 | for (int c = 0; c < alloc->n_chunks; ++c) { |
233 | 0 | struct tallocr_chunk * chunk = alloc->chunks[c]; |
234 | 0 | if (chunk->n_free_blocks > 0) { |
235 | 0 | struct free_block * block = &chunk->free_blocks[chunk->n_free_blocks - 1]; |
236 | 0 | max_avail = MAX(max_avail, block->size); |
237 | 0 | int64_t reuse_factor = chunk->max_size - block->offset - size; |
238 | | // reuse_factor < 0 : amount of extra memory that needs to be allocated |
239 | | // reuse_factor = 0 : allocated free space exactly matches tensor size |
240 | | // reuse_factor > 0 : superfluous memory that will remain unused |
241 | 0 | bool better_reuse = best_reuse < 0 && reuse_factor > best_reuse; |
242 | 0 | bool better_fit = reuse_factor >= 0 && reuse_factor < best_reuse; |
243 | 0 | if (block->size >= size && (better_reuse || better_fit)) { |
244 | 0 | best_fit_chunk = c; |
245 | 0 | best_fit_block = chunk->n_free_blocks - 1; |
246 | 0 | best_reuse = reuse_factor; |
247 | 0 | } |
248 | 0 | } |
249 | 0 | } |
250 | 0 | } |
251 | | |
252 | 0 | if (best_fit_block == -1) { |
253 | | // none of the existing chunks have enough space left |
254 | 0 | best_fit_chunk = ggml_dyn_tallocr_new_chunk(alloc, size); |
255 | 0 | best_fit_block = 0; |
256 | 0 | } |
257 | 0 | if (best_fit_chunk == -1) { |
258 | | // since the last chunk always has virtually endless memory, this should never happen |
259 | 0 | GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n", |
260 | 0 | __func__, size, max_avail); |
261 | 0 | GGML_ABORT("graph allocation: failed to reserve memory"); |
262 | 0 | } |
263 | | |
264 | 0 | struct tallocr_chunk * chunk = alloc->chunks[best_fit_chunk]; |
265 | 0 | struct free_block * block = &chunk->free_blocks[best_fit_block]; |
266 | 0 | struct buffer_address addr = {.chunk = best_fit_chunk, .offset = block->offset }; |
267 | 0 | block->offset += size; |
268 | 0 | block->size -= size; |
269 | 0 | if (block->size == 0) { |
270 | | // remove block if empty |
271 | 0 | ggml_dyn_tallocr_remove_block(chunk, best_fit_block); |
272 | 0 | } |
273 | | |
274 | 0 | AT_PRINTF("block %d, offset %zu, chunk %d\n", best_fit_block, addr.offset, addr.chunk); |
275 | | |
276 | | #ifdef GGML_ALLOCATOR_DEBUG |
277 | | add_allocated_tensor(alloc, addr, tensor); |
278 | | size_t cur_max = addr.offset + size; |
279 | | if (cur_max > chunk->max_size) { |
280 | | // sort allocated_tensors by chunk/offset |
281 | | for (int i = 0; i < 1024; i++) { |
282 | | for (int j = i + 1; j < 1024; j++) { |
283 | | if (ggml_buffer_address_less(alloc->allocated_tensors[j].addr, alloc->allocated_tensors[i].addr)) { |
284 | | const struct ggml_tensor * tmp_tensor = alloc->allocated_tensors[i].tensor; |
285 | | struct buffer_address tmp_addr = alloc->allocated_tensors[i].addr; |
286 | | alloc->allocated_tensors[i].tensor = alloc->allocated_tensors[j].tensor; |
287 | | alloc->allocated_tensors[i].addr = alloc->allocated_tensors[j].addr; |
288 | | alloc->allocated_tensors[j].tensor = tmp_tensor; |
289 | | alloc->allocated_tensors[j].addr = tmp_addr; |
290 | | } |
291 | | } |
292 | | } |
293 | | GGML_LOG_DEBUG("max_size[%d] = %.2f MB: tensors: ", addr.chunk, cur_max / 1024.0 / 1024.0); |
294 | | for (int i = 0; i < 1024; i++) { |
295 | | if (alloc->allocated_tensors[i].tensor) { |
296 | | GGML_LOG_DEBUG("%s [%d: %zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name, |
297 | | alloc->allocated_tensors[i].addr.chunk, |
298 | | alloc->allocated_tensors[i].addr.offset, |
299 | | alloc->allocated_tensors[i].addr.offset + ggml_nbytes(alloc->allocated_tensors[i].tensor), |
300 | | ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0); |
301 | | } |
302 | | } |
303 | | GGML_LOG_DEBUG("\n"); |
304 | | } |
305 | | #endif |
306 | | |
307 | 0 | chunk->max_size = MAX(chunk->max_size, addr.offset + size); |
308 | | |
309 | 0 | return addr; |
310 | | |
311 | 0 | GGML_UNUSED(tensor); |
312 | 0 | } |
313 | | |
314 | | // this is a very naive implementation, but for our case the number of free blocks should be very small |
315 | 0 | static void ggml_dyn_tallocr_free_bytes(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, size_t size) { |
316 | 0 | size = aligned_offset(NULL, size, alloc->alignment); |
317 | | |
318 | 0 | struct tallocr_chunk * chunk = alloc->chunks[addr.chunk]; |
319 | | |
320 | | // see if we can merge with an existing block |
321 | 0 | for (int i = 0; i < chunk->n_free_blocks; i++) { |
322 | 0 | struct free_block * block = &chunk->free_blocks[i]; |
323 | | // check if ptr is at the end of the block |
324 | 0 | if (block->offset + block->size == addr.offset) { |
325 | 0 | block->size += size; |
326 | | // check if we can merge with the next block |
327 | 0 | if (i < chunk->n_free_blocks - 1) { |
328 | 0 | struct free_block * next = &chunk->free_blocks[i+1]; |
329 | 0 | if (block->offset + block->size == next->offset) { |
330 | 0 | block->size += next->size; |
331 | 0 | ggml_dyn_tallocr_remove_block(chunk, i+1); |
332 | 0 | } |
333 | 0 | } |
334 | 0 | return; |
335 | 0 | } |
336 | | // check if ptr is at the beginning of the block |
337 | 0 | if (addr.offset + size == block->offset) { |
338 | 0 | block->offset = addr.offset; |
339 | 0 | block->size += size; |
340 | | // check if we can merge with the previous block |
341 | 0 | if (i > 0) { |
342 | 0 | struct free_block * prev = &chunk->free_blocks[i-1]; |
343 | 0 | if (prev->offset + prev->size == block->offset) { |
344 | 0 | prev->size += block->size; |
345 | 0 | ggml_dyn_tallocr_remove_block(chunk, i); |
346 | 0 | } |
347 | 0 | } |
348 | 0 | return; |
349 | 0 | } |
350 | 0 | } |
351 | | // otherwise, add a new block |
352 | 0 | ggml_dyn_tallocr_insert_block(chunk, addr.offset, size); |
353 | 0 | } |
354 | | |
355 | 0 | static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) { |
356 | 0 | for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS; i++) { |
357 | 0 | free(alloc->chunks[i]); |
358 | 0 | alloc->chunks[i] = NULL; |
359 | 0 | } |
360 | 0 | alloc->n_chunks = 0; |
361 | | |
362 | | #ifdef GGML_ALLOCATOR_DEBUG |
363 | | for (int i = 0; i < 1024; i++) { |
364 | | alloc->allocated_tensors[i].tensor = NULL; |
365 | | } |
366 | | #endif |
367 | 0 | } |
368 | | |
369 | 0 | static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment, size_t max_buffer_size) { |
370 | 0 | struct ggml_dyn_tallocr * alloc = (struct ggml_dyn_tallocr *)malloc(sizeof(struct ggml_dyn_tallocr)); |
371 | | |
372 | 0 | *alloc = (struct ggml_dyn_tallocr) { |
373 | 0 | /*.alignment = */ alignment, |
374 | 0 | /*.max_chunk_size = */ MIN(max_buffer_size, SIZE_MAX/2), // clamp to avoid overflows |
375 | 0 | /*.chunks = */ {NULL}, |
376 | 0 | /*.n_chunks = */ 0, |
377 | | #ifdef GGML_ALLOCATOR_DEBUG |
378 | | /*.allocated_tensors = */ {{0}}, |
379 | | #endif |
380 | 0 | }; |
381 | | |
382 | 0 | ggml_dyn_tallocr_reset(alloc); |
383 | | |
384 | 0 | return alloc; |
385 | 0 | } |
386 | | |
387 | 0 | static void ggml_dyn_tallocr_free(struct ggml_dyn_tallocr * alloc) { |
388 | 0 | for (int i = 0; i < alloc->n_chunks; ++i) { |
389 | 0 | free(alloc->chunks[i]); |
390 | 0 | } |
391 | 0 | free(alloc); |
392 | 0 | } |
393 | | |
394 | 0 | static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc, int chunk) { |
395 | 0 | return chunk < alloc->n_chunks ? alloc->chunks[chunk]->max_size : 0; |
396 | 0 | } |
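The dynamic allocator is only a planner: ggml_dyn_tallocr_alloc and ggml_dyn_tallocr_free_bytes move offsets around in the per-chunk free lists and track each chunk's high-water mark in max_size, and no backend memory is touched until a vbuffer is created from the recorded sizes (see below). A minimal sketch of that measure phase (internal static API, so it only compiles inside this translation unit; `t` is a hypothetical tensor used solely for debug logging):

    struct ggml_dyn_tallocr * a = ggml_dyn_tallocr_new(/*alignment =*/ 32, /*max_buffer_size =*/ SIZE_MAX);
    struct buffer_address addr = ggml_dyn_tallocr_alloc(a, 1024, t); // plan 1 KiB, nothing is really allocated
    ggml_dyn_tallocr_free_bytes(a, addr, 1024);                      // return the range to the free list
    size_t needed = ggml_dyn_tallocr_max_size(a, 0);                 // high-water mark of chunk 0, here 1024
    ggml_dyn_tallocr_free(a);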
397 | | |
398 | | |
399 | | // virtual buffer with contiguous memory range, split into multiple backend buffers (chunks) |
400 | | |
401 | | struct vbuffer { |
402 | | ggml_backend_buffer_t chunks[GGML_VBUFFER_MAX_CHUNKS]; |
403 | | }; |
404 | | |
405 | 0 | static void ggml_vbuffer_free(struct vbuffer * buf) { |
406 | 0 | if (buf == NULL) { |
407 | 0 | return; |
408 | 0 | } |
409 | 0 | for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS; ++i) { |
410 | 0 | ggml_backend_buffer_free(buf->chunks[i]); |
411 | 0 | } |
412 | 0 | free(buf); |
413 | 0 | } |
414 | | |
415 | 0 | static size_t ggml_vbuffer_chunk_size(struct vbuffer * buf, int chunk) { |
416 | 0 | return buf->chunks[chunk] ? ggml_backend_buffer_get_size(buf->chunks[chunk]) : 0; |
417 | 0 | } |
418 | | |
419 | 0 | static size_t ggml_vbuffer_size(struct vbuffer * buf) { |
420 | 0 | size_t size = 0; |
421 | 0 | for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS && buf->chunks[i]; ++i) { |
422 | 0 | size += ggml_backend_buffer_get_size(buf->chunks[i]); |
423 | 0 | } |
424 | 0 | return size; |
425 | 0 | } |
426 | | |
427 | 0 | static struct vbuffer * ggml_vbuffer_alloc(ggml_backend_buffer_type_t buft, const struct ggml_dyn_tallocr * talloc, enum ggml_backend_buffer_usage usage) { |
428 | 0 | struct vbuffer * buf = (struct vbuffer *)calloc(1, sizeof(struct vbuffer)); |
429 | 0 | if (buf == NULL) { |
430 | 0 | return NULL; |
431 | 0 | } |
432 | | |
433 | 0 | for (int n = 0; n < talloc->n_chunks; n++) { |
434 | 0 | size_t chunk_size = talloc->chunks[n]->max_size; |
435 | 0 | buf->chunks[n] = ggml_backend_buft_alloc_buffer(buft, chunk_size); |
436 | 0 | if (buf->chunks[n] == NULL) { |
437 | 0 | ggml_vbuffer_free(buf); |
438 | 0 | return NULL; |
439 | 0 | } |
440 | 0 | ggml_backend_buffer_set_usage(buf->chunks[n], usage); |
441 | 0 | } |
442 | 0 | return buf; |
443 | 0 | } |
444 | | |
445 | 0 | static void ggml_vbuffer_tensor_alloc(struct vbuffer * buf, struct ggml_tensor * tensor, struct buffer_address buf_addr) { |
446 | 0 | void * base = ggml_backend_buffer_get_base(buf->chunks[buf_addr.chunk]); |
447 | 0 | void * addr = (char *)base + buf_addr.offset; |
448 | 0 | ggml_backend_tensor_alloc(buf->chunks[buf_addr.chunk], tensor, addr); |
449 | 0 | } |
450 | | |
451 | 0 | static void ggml_vbuffer_reset(struct vbuffer * buf) { |
452 | 0 | for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS && buf->chunks[i]; ++i) { |
453 | 0 | ggml_backend_buffer_reset(buf->chunks[i]); |
454 | 0 | } |
455 | 0 | } |
456 | | |
457 | | |
458 | | ///////////////////////////////////// |
459 | | |
460 | | // graph allocator |
461 | | |
462 | | struct hash_node { |
463 | | int n_children; |
464 | | int n_views; |
465 | | int buffer_id; |
466 | | struct buffer_address addr; |
467 | | bool allocated; |
468 | | }; |
469 | | |
470 | | struct tensor_alloc { |
471 | | int buffer_id; |
472 | | struct buffer_address addr; |
473 | | size_t size_max; // 0 = pre-allocated, unused, or view |
474 | | }; |
475 | | |
476 | | struct leaf_alloc { |
477 | | struct tensor_alloc leaf; |
478 | | }; |
479 | | |
480 | | struct node_alloc { |
481 | | struct tensor_alloc dst; |
482 | | struct tensor_alloc src[GGML_MAX_SRC]; |
483 | | }; |
484 | | |
485 | | struct ggml_gallocr { |
486 | | ggml_backend_buffer_type_t * bufts; // [n_buffers] |
487 | | struct vbuffer ** buffers; // [n_buffers] |
488 | | struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers] |
489 | | int n_buffers; |
490 | | |
491 | | struct ggml_hash_set hash_set; |
492 | | struct hash_node * hash_values; // [hash_set.size] |
493 | | |
494 | | struct node_alloc * node_allocs; // [n_nodes] |
495 | | int n_nodes; |
496 | | |
497 | | struct leaf_alloc * leaf_allocs; // [n_leafs] |
498 | | int n_leafs; |
499 | | }; |
500 | | |
501 | 0 | ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) { |
502 | 0 | ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(1, sizeof(struct ggml_gallocr)); |
503 | 0 | GGML_ASSERT(galloc != NULL); |
504 | | |
505 | 0 | galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t)); |
506 | 0 | GGML_ASSERT(galloc->bufts != NULL); |
507 | | |
508 | 0 | galloc->buffers = calloc(n_bufs, sizeof(struct vbuffer *)); |
509 | 0 | GGML_ASSERT(galloc->buffers != NULL); |
510 | | |
511 | 0 | galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *)); |
512 | 0 | GGML_ASSERT(galloc->buf_tallocs != NULL); |
513 | | |
514 | 0 | for (int i = 0; i < n_bufs; i++) { |
515 | 0 | galloc->bufts[i] = bufts[i]; |
516 | 0 | galloc->buffers[i] = NULL; |
517 | | |
518 | | // check if the same buffer type is used multiple times and reuse the same allocator |
519 | 0 | for (int j = 0; j < i; j++) { |
520 | 0 | if (bufts[i] == bufts[j]) { |
521 | 0 | galloc->buf_tallocs[i] = galloc->buf_tallocs[j]; |
522 | 0 | break; |
523 | 0 | } |
524 | 0 | } |
525 | | |
526 | 0 | if (galloc->buf_tallocs[i] == NULL) { |
527 | 0 | size_t alignment = ggml_backend_buft_get_alignment(bufts[i]); |
528 | 0 | size_t max_size = ggml_backend_buft_get_max_size(bufts[i]); |
529 | 0 | galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment, max_size); |
530 | 0 | } |
531 | 0 | } |
532 | 0 | galloc->n_buffers = n_bufs; |
533 | | |
534 | 0 | return galloc; |
535 | 0 | } |
536 | | |
537 | 0 | ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft) { |
538 | 0 | return ggml_gallocr_new_n(&buft, 1); |
539 | 0 | } |
540 | | |
541 | 0 | void ggml_gallocr_free(ggml_gallocr_t galloc) { |
542 | 0 | if (galloc == NULL) { |
543 | 0 | return; |
544 | 0 | } |
545 | | |
546 | 0 | for (int i = 0; i < galloc->n_buffers; i++) { |
547 | 0 | if (galloc->buffers != NULL) { |
548 | | // skip if already freed |
549 | 0 | bool freed = false; |
550 | 0 | for (int j = 0; j < i; j++) { |
551 | 0 | if (galloc->buffers[j] == galloc->buffers[i]) { |
552 | 0 | freed = true; |
553 | 0 | break; |
554 | 0 | } |
555 | 0 | } |
556 | 0 | if (!freed) { |
557 | 0 | ggml_vbuffer_free(galloc->buffers[i]); |
558 | 0 | } |
559 | 0 | } |
560 | 0 | if (galloc->buf_tallocs != NULL) { |
561 | | // skip if already freed |
562 | 0 | bool freed = false; |
563 | 0 | for (int j = 0; j < i; j++) { |
564 | 0 | if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) { |
565 | 0 | freed = true; |
566 | 0 | break; |
567 | 0 | } |
568 | 0 | } |
569 | 0 | if (!freed) { |
570 | 0 | ggml_dyn_tallocr_free(galloc->buf_tallocs[i]); |
571 | 0 | } |
572 | 0 | } |
573 | 0 | } |
574 | | |
575 | 0 | ggml_hash_set_free(&galloc->hash_set); |
576 | 0 | free(galloc->hash_values); |
577 | 0 | free(galloc->bufts); |
578 | 0 | free(galloc->buffers); |
579 | 0 | free(galloc->buf_tallocs); |
580 | 0 | free(galloc->node_allocs); |
581 | 0 | free(galloc->leaf_allocs); |
582 | 0 | free(galloc); |
583 | 0 | } |
584 | | |
585 | | typedef struct ggml_gallocr * ggml_gallocr_t; |
586 | | |
587 | 0 | static struct hash_node * ggml_gallocr_hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) { |
588 | 0 | size_t i = ggml_hash_find_or_insert(&galloc->hash_set, t); |
589 | 0 | return &galloc->hash_values[i]; |
590 | 0 | } |
591 | | |
592 | 0 | static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) { |
593 | 0 | return ggml_gallocr_hash_get(galloc, t)->allocated; |
594 | 0 | } |
595 | | |
596 | 0 | static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) { |
597 | 0 | return t->data != NULL // tensor data already set externally |
598 | 0 | || t->buffer // tensor on external buffer (but not yet allocated) |
599 | 0 | || ggml_gallocr_is_own(galloc, t); // tensor will be allocated by galloc |
600 | 0 | } |
601 | | |
602 | | // free the extra space at the end if the new tensor is smaller |
603 | 0 | static void ggml_gallocr_free_extra_space(ggml_gallocr_t galloc, struct ggml_tensor * node, struct ggml_tensor * parent) { |
604 | 0 | struct hash_node * hn = ggml_gallocr_hash_get(galloc, node); |
605 | 0 | struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent); |
606 | | |
607 | 0 | size_t parent_size = ggml_backend_buft_get_alloc_size(galloc->bufts[p_hn->buffer_id], parent); |
608 | 0 | size_t node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node); |
609 | | |
610 | 0 | GGML_ASSERT(parent_size >= node_size); |
611 | | |
612 | | // note: after freeing, we want the chunks to remain aligned |
613 | 0 | struct ggml_dyn_tallocr * p_alloc = galloc->buf_tallocs[p_hn->buffer_id]; |
614 | 0 | parent_size = aligned_offset(NULL, parent_size, p_alloc->alignment); |
615 | 0 | node_size = aligned_offset(NULL, node_size, p_alloc->alignment); |
616 | | |
617 | 0 | if (parent_size > node_size) { |
618 | 0 | struct buffer_address p_addr = p_hn->addr; |
619 | 0 | p_addr.offset += node_size; |
620 | 0 | size_t extra_size = parent_size - node_size; |
621 | 0 | AT_PRINTF("freeing extra %zu bytes from parent %s for %s\n", extra_size, parent->name, node->name); |
622 | 0 | ggml_dyn_tallocr_free_bytes(p_alloc, p_addr, extra_size); |
623 | 0 | } |
624 | 0 | } |
625 | | |
626 | 0 | static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) { |
627 | 0 | GGML_ASSERT(buffer_id >= 0); |
628 | 0 | struct hash_node * hn = ggml_gallocr_hash_get(galloc, node); |
629 | | |
630 | 0 | if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) { |
631 | 0 | hn->allocated = true; |
632 | 0 | assert(hn->addr.offset == 0); |
633 | | |
634 | | // try to reuse a parent's buffer (inplace) |
635 | 0 | if (ggml_op_can_inplace(node->op)) { |
636 | 0 | for (int i = 0; i < GGML_MAX_SRC; i++) { |
637 | 0 | struct ggml_tensor * parent = node->src[i]; |
638 | 0 | if (parent == NULL) { |
639 | 0 | continue; |
640 | 0 | } |
641 | | |
642 | | // if the node's data is external, then we cannot re-use it |
643 | 0 | if (!ggml_gallocr_is_own(galloc, parent)) { |
644 | 0 | AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data); |
645 | 0 | continue; |
646 | 0 | } |
647 | | |
648 | | // outputs cannot be reused |
649 | 0 | if (parent->flags & GGML_TENSOR_FLAG_OUTPUT || (parent->view_src != NULL && parent->view_src->flags & GGML_TENSOR_FLAG_OUTPUT)) { |
650 | 0 | AT_PRINTF("not reusing parent %s for %s as it is an output\n", parent->name, node->name); |
651 | 0 | continue; |
652 | 0 | } |
653 | | |
654 | 0 | if (!ggml_are_same_layout(node, parent)) { |
655 | 0 | AT_PRINTF("not reusing parent %s for %s as layouts are different\n", parent->name, node->name); |
656 | 0 | continue; |
657 | 0 | } |
658 | | |
659 | 0 | struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent); |
660 | 0 | if (p_hn->n_children == 1 && p_hn->n_views == 0) { |
661 | 0 | if (ggml_is_view(parent)) { |
662 | 0 | struct ggml_tensor * view_src = parent->view_src; |
663 | 0 | struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src); |
664 | 0 | if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) { |
665 | 0 | AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name); |
666 | 0 | assert(view_src_hn->addr.chunk == p_hn->addr.chunk && view_src_hn->addr.offset == p_hn->addr.offset); |
667 | 0 | hn->buffer_id = p_hn->buffer_id; |
668 | 0 | hn->addr = p_hn->addr; |
669 | 0 | p_hn->allocated = false; // avoid freeing the parent |
670 | 0 | view_src_hn->allocated = false; |
671 | 0 | ggml_gallocr_free_extra_space(galloc, node, view_src); |
672 | 0 | return; |
673 | 0 | } |
674 | 0 | } else { |
675 | 0 | AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name); |
676 | 0 | hn->buffer_id = p_hn->buffer_id; |
677 | 0 | hn->addr = p_hn->addr; |
678 | 0 | p_hn->allocated = false; // avoid freeing the parent |
679 | 0 | ggml_gallocr_free_extra_space(galloc, node, parent); |
680 | 0 | return; |
681 | 0 | } |
682 | 0 | } |
683 | 0 | } |
684 | 0 | } |
685 | | // allocate tensor from the buffer |
686 | 0 | struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id]; |
687 | 0 | ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id]; |
688 | 0 | size_t size = ggml_backend_buft_get_alloc_size(buft, node); |
689 | 0 | hn->buffer_id = buffer_id; |
690 | 0 | hn->addr = ggml_dyn_tallocr_alloc(alloc, size, node); |
691 | 0 | } |
692 | 0 | } |
693 | | |
694 | 0 | static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) { |
695 | | // graph outputs are never freed |
696 | 0 | if (node->flags & GGML_TENSOR_FLAG_OUTPUT) { |
697 | 0 | AT_PRINTF("not freeing output %s\n", node->name); |
698 | 0 | return; |
699 | 0 | } |
700 | | |
701 | 0 | struct hash_node * hn = ggml_gallocr_hash_get(galloc, node); |
702 | 0 | int buffer_id = hn->buffer_id; |
703 | 0 | struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id]; |
704 | 0 | ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id]; |
705 | 0 | size_t size = ggml_backend_buft_get_alloc_size(buft, node); |
706 | | |
707 | 0 | AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n", |
708 | 0 | __func__, node->name, hn->addr.chunk, hn->addr.offset, size, alloc->chunks[hn->addr.chunk]->n_free_blocks); |
709 | | #ifdef GGML_ALLOCATOR_DEBUG |
710 | | remove_allocated_tensor(alloc, hn->addr, node); |
711 | | #endif |
712 | | |
713 | 0 | ggml_dyn_tallocr_free_bytes(alloc, hn->addr, size); |
714 | 0 | hn->allocated = false; |
715 | 0 | } |
716 | | |
717 | 0 | static int get_node_buffer_id(const int * node_buffer_ids, int i) { |
718 | 0 | return node_buffer_ids ? node_buffer_ids[i] : 0; |
719 | 0 | } |
720 | | |
721 | 0 | static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) { |
722 | | // clear hash tables |
723 | 0 | ggml_hash_set_reset(&galloc->hash_set); |
724 | 0 | memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size); |
725 | | |
726 | | // allocate leafs |
727 | | // these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes |
728 | 0 | for (int i = 0; i < graph->n_leafs; i++) { |
729 | 0 | struct ggml_tensor * leaf = graph->leafs[i]; |
730 | 0 | ggml_gallocr_allocate_node(galloc, leaf, get_node_buffer_id(leaf_buffer_ids, i)); |
731 | 0 | } |
732 | | |
733 | | // count number of children and views |
734 | | // allocate other graph inputs and leafs first to avoid overwriting them |
735 | 0 | for (int i = 0; i < graph->n_nodes; i++) { |
736 | 0 | struct ggml_tensor * node = graph->nodes[i]; |
737 | | |
738 | | // TODO: better way to add external dependencies |
739 | | // GGML_OP_NONE does not appear normally in the graph nodes, but is used by ggml-backend to add dependencies to |
740 | | // control when some tensors are allocated and freed. in this case, the dependencies are in `src`, but the node |
741 | | // itself is never used and should not be considered a dependency |
742 | 0 | if (ggml_is_view(node) && node->op != GGML_OP_NONE) { |
743 | 0 | struct ggml_tensor * view_src = node->view_src; |
744 | 0 | ggml_gallocr_hash_get(galloc, view_src)->n_views += 1; |
745 | 0 | } |
746 | | |
747 | 0 | if (node->flags & GGML_TENSOR_FLAG_INPUT) { |
748 | 0 | ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i)); |
749 | 0 | } |
750 | | |
751 | 0 | for (int j = 0; j < GGML_MAX_SRC; j++) { |
752 | 0 | struct ggml_tensor * src = node->src[j]; |
753 | 0 | if (src == NULL) { |
754 | 0 | continue; |
755 | 0 | } |
756 | | |
757 | 0 | ggml_gallocr_hash_get(galloc, src)->n_children += 1; |
758 | | |
759 | | // allocate explicit inputs |
760 | 0 | if (src->flags & GGML_TENSOR_FLAG_INPUT) { |
761 | 0 | ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i)); |
762 | 0 | } |
763 | 0 | } |
764 | 0 | } |
765 | | |
766 | | // allocate tensors |
767 | 0 | for (int i = 0; i < graph->n_nodes; i++) { |
768 | 0 | struct ggml_tensor * node = graph->nodes[i]; |
769 | 0 | int buffer_id = get_node_buffer_id(node_buffer_ids, i); |
770 | | |
771 | | // allocate parents (only leafs need to be allocated at this point) |
772 | 0 | for (int j = 0; j < GGML_MAX_SRC; j++) { |
773 | 0 | struct ggml_tensor * parent = node->src[j]; |
774 | 0 | if (parent == NULL) { |
775 | 0 | continue; |
776 | 0 | } |
777 | 0 | ggml_gallocr_allocate_node(galloc, parent, buffer_id); |
778 | 0 | } |
779 | | |
780 | | // allocate node |
781 | 0 | ggml_gallocr_allocate_node(galloc, node, buffer_id); |
782 | | |
783 | 0 | AT_PRINTF("exec: %s (%s) <= ", ggml_op_desc(node), node->name); |
784 | 0 | for (int j = 0; j < GGML_MAX_SRC; j++) { |
785 | 0 | struct ggml_tensor * parent = node->src[j]; |
786 | 0 | if (parent == NULL) { |
787 | 0 | continue; |
788 | 0 | } |
789 | 0 | AT_PRINTF("%s", parent->name); |
790 | 0 | if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) { |
791 | 0 | AT_PRINTF(", "); |
792 | 0 | } |
793 | 0 | } |
794 | 0 | AT_PRINTF("\n"); |
795 | | |
796 | | // update parents |
797 | 0 | for (int j = 0; j < GGML_MAX_SRC; j++) { |
798 | 0 | struct ggml_tensor * parent = node->src[j]; |
799 | 0 | if (parent == NULL) { |
800 | 0 | continue; |
801 | 0 | } |
802 | 0 | struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent); |
803 | 0 | p_hn->n_children -= 1; |
804 | | |
805 | 0 | AT_PRINTF("parent %s: %d children, %d views, allocated: %d\n", |
806 | 0 | parent->name, p_hn->n_children, p_hn->n_views, p_hn->allocated); |
807 | | |
808 | 0 | if (p_hn->n_children == 0 && p_hn->n_views == 0) { |
809 | 0 | if (ggml_is_view(parent)) { |
810 | 0 | struct ggml_tensor * view_src = parent->view_src; |
811 | 0 | struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src); |
812 | 0 | view_src_hn->n_views -= 1; |
813 | 0 | AT_PRINTF("view_src %s: %d children, %d views\n", |
814 | 0 | view_src->name, view_src_hn->n_children, view_src_hn->n_views); |
815 | 0 | if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) { |
816 | 0 | ggml_gallocr_free_node(galloc, view_src); |
817 | 0 | } |
818 | 0 | } |
819 | 0 | else if (p_hn->allocated) { |
820 | 0 | ggml_gallocr_free_node(galloc, parent); |
821 | 0 | } |
822 | 0 | } |
823 | 0 | AT_PRINTF("\n"); |
824 | 0 | } |
825 | 0 | } |
826 | 0 | } |
827 | | |
828 | | static bool ggml_gallocr_reserve_n_impl( |
829 | 0 | ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids, bool no_alloc) { |
830 | 0 | size_t min_hash_size = graph->n_nodes + graph->n_leafs; |
831 | | // add 25% margin to avoid hash collisions |
832 | 0 | min_hash_size += min_hash_size / 4; |
833 | | |
834 | | // initialize hash table |
835 | 0 | if (galloc->hash_set.size < min_hash_size) { |
836 | 0 | ggml_hash_set_free(&galloc->hash_set); |
837 | 0 | galloc->hash_set = ggml_hash_set_new(min_hash_size); |
838 | 0 | GGML_ASSERT(galloc->hash_set.keys != NULL); |
839 | | |
840 | 0 | free(galloc->hash_values); |
841 | 0 | galloc->hash_values = malloc(sizeof(struct hash_node) * galloc->hash_set.size); |
842 | 0 | GGML_ASSERT(galloc->hash_values != NULL); |
843 | 0 | } |
844 | | |
845 | | // reset allocators |
846 | 0 | for (int i = 0; i < galloc->n_buffers; i++) { |
847 | 0 | ggml_dyn_tallocr_reset(galloc->buf_tallocs[i]); |
848 | 0 | } |
849 | | |
850 | | // allocate in hash table |
851 | 0 | ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids); |
852 | | |
853 | | // set the node_allocs from the hash table |
854 | 0 | if (galloc->n_nodes < graph->n_nodes) { |
855 | 0 | free(galloc->node_allocs); |
856 | 0 | galloc->node_allocs = calloc(graph->n_nodes, sizeof(struct node_alloc)); |
857 | 0 | GGML_ASSERT(galloc->node_allocs != NULL); |
858 | 0 | } |
859 | 0 | galloc->n_nodes = graph->n_nodes; |
860 | 0 | for (int i = 0; i < graph->n_nodes; i++) { |
861 | 0 | struct ggml_tensor * node = graph->nodes[i]; |
862 | 0 | struct node_alloc * node_alloc = &galloc->node_allocs[i]; |
863 | 0 | if (node->view_src || node->data) { |
864 | 0 | node_alloc->dst.buffer_id = -1; |
865 | 0 | node_alloc->dst.addr = GGML_BUFFER_ADDRESS_INVALID; |
866 | 0 | node_alloc->dst.size_max = 0; |
867 | 0 | } else { |
868 | 0 | struct hash_node * hn = ggml_gallocr_hash_get(galloc, node); |
869 | 0 | node_alloc->dst.buffer_id = hn->buffer_id; |
870 | 0 | node_alloc->dst.addr = hn->addr; |
871 | 0 | node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node); |
872 | 0 | } |
873 | 0 | for (int j = 0; j < GGML_MAX_SRC; j++) { |
874 | 0 | struct ggml_tensor * src = node->src[j]; |
875 | 0 | if (!src || src->view_src || src->data) { |
876 | 0 | node_alloc->src[j].buffer_id = -1; |
877 | 0 | node_alloc->src[j].addr = GGML_BUFFER_ADDRESS_INVALID; |
878 | 0 | node_alloc->src[j].size_max = 0; |
879 | 0 | } else { |
880 | 0 | struct hash_node * hn = ggml_gallocr_hash_get(galloc, src); |
881 | 0 | node_alloc->src[j].buffer_id = hn->buffer_id; |
882 | 0 | node_alloc->src[j].addr = hn->addr; |
883 | 0 | node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src); |
884 | 0 | } |
885 | 0 | } |
886 | 0 | } |
887 | 0 | if (galloc->n_leafs < graph->n_leafs) { |
888 | 0 | free(galloc->leaf_allocs); |
889 | 0 | galloc->leaf_allocs = calloc(graph->n_leafs, sizeof(galloc->leaf_allocs[0])); |
890 | 0 | GGML_ASSERT(galloc->leaf_allocs != NULL); |
891 | 0 | } |
892 | 0 | galloc->n_leafs = graph->n_leafs; |
893 | 0 | for (int i = 0; i < graph->n_leafs; i++) { |
894 | 0 | struct ggml_tensor * leaf = graph->leafs[i]; |
895 | 0 | struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf); |
896 | 0 | if (leaf->view_src || leaf->data) { |
897 | 0 | galloc->leaf_allocs[i].leaf.buffer_id = -1; |
898 | 0 | galloc->leaf_allocs[i].leaf.addr = GGML_BUFFER_ADDRESS_INVALID; |
899 | 0 | galloc->leaf_allocs[i].leaf.size_max = 0; |
900 | 0 | } else { |
901 | 0 | galloc->leaf_allocs[i].leaf.buffer_id = hn->buffer_id; |
902 | 0 | galloc->leaf_allocs[i].leaf.addr = hn->addr; |
903 | 0 | galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf); |
904 | 0 | } |
905 | 0 | } |
906 | | |
907 | | // reallocate buffers if needed |
908 | 0 | for (int i = 0; i < galloc->n_buffers; i++) { |
909 | | // if the buffer type is used multiple times, we reuse the same buffer |
910 | 0 | for (int j = 0; j < i; j++) { |
911 | 0 | if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) { |
912 | 0 | galloc->buffers[i] = galloc->buffers[j]; |
913 | 0 | break; |
914 | 0 | } |
915 | 0 | } |
916 | | |
917 | | // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views |
918 | 0 | bool realloc = galloc->buffers[i] == NULL; |
919 | 0 | size_t new_size = 0; |
920 | 0 | for (int c = 0; c < galloc->buf_tallocs[i]->n_chunks; c++) { |
921 | 0 | size_t cur_chunk_size = galloc->buffers[i] ? ggml_vbuffer_chunk_size(galloc->buffers[i], c) : 0; |
922 | 0 | size_t new_chunk_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i], c); |
923 | 0 | new_size += new_chunk_size; |
924 | 0 | if (new_chunk_size > cur_chunk_size) { |
925 | 0 | realloc = true; |
926 | 0 | } |
927 | 0 | } |
928 | 0 | if (realloc) { |
929 | | #ifndef NDEBUG |
930 | | { |
931 | | size_t cur_size = galloc->buffers[i] ? ggml_vbuffer_size(galloc->buffers[i]) : 0; |
932 | | if (cur_size > 0) { |
933 | | GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", |
934 | | __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); |
935 | | } |
936 | | } |
937 | | #endif |
938 | 0 | ggml_vbuffer_free(galloc->buffers[i]); |
939 | 0 | if (no_alloc) { |
940 | 0 | galloc->buffers[i] = NULL; |
941 | 0 | } else { |
942 | 0 | galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE); |
943 | 0 | if (galloc->buffers[i] == NULL) { |
944 | 0 | GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size); |
945 | 0 | return false; |
946 | 0 | } |
947 | 0 | } |
948 | 0 | } |
949 | 0 | } |
950 | | |
951 | 0 | return true; |
952 | 0 | } |
953 | | |
954 | | void ggml_gallocr_reserve_n_size( |
955 | 0 | ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids, size_t * sizes) { |
956 | 0 | GGML_ASSERT(ggml_gallocr_reserve_n_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids, /*no_alloc =*/ true)); |
957 | 0 | for (int i = 0; i < galloc->n_buffers; i++) { |
958 | 0 | sizes[i] = 0; |
959 | 0 | for (int c = 0; c < galloc->buf_tallocs[i]->n_chunks; c++) { |
960 | 0 | sizes[i] += galloc->buf_tallocs[i]->chunks[c]->max_size; |
961 | 0 | } |
962 | 0 | } |
963 | 0 | } |
964 | | |
965 | 0 | bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) { |
966 | 0 | return ggml_gallocr_reserve_n_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids, /*no_alloc =*/ false); |
967 | 0 | } |
968 | | |
969 | 0 | bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) { |
970 | 0 | return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL); |
971 | 0 | } |
972 | | |
973 | 0 | static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) { |
974 | 0 | int buffer_id = tensor_alloc->buffer_id; |
975 | 0 | assert(tensor->data || tensor->view_src || ggml_backend_buft_get_alloc_size(galloc->bufts[buffer_id], tensor) <= tensor_alloc->size_max); |
976 | | |
977 | 0 | if (tensor->view_src != NULL) { |
978 | 0 | if (tensor->buffer == NULL) { |
979 | 0 | assert(tensor_alloc->addr.offset == SIZE_MAX); |
980 | 0 | if (tensor->view_src->buffer == NULL) { |
981 | | // this tensor was allocated without ggml-backend |
982 | 0 | return; |
983 | 0 | } |
984 | 0 | ggml_backend_view_init(tensor); |
985 | 0 | } |
986 | 0 | } else { |
987 | 0 | if (tensor->data == NULL) { |
988 | 0 | assert(tensor_alloc->addr.offset != SIZE_MAX); |
989 | 0 | assert(ggml_backend_buft_get_alloc_size(galloc->bufts[buffer_id], tensor) <= tensor_alloc->size_max); |
990 | 0 | ggml_vbuffer_tensor_alloc(galloc->buffers[buffer_id], tensor, tensor_alloc->addr); |
991 | 0 | } else { |
992 | 0 | if (tensor->buffer == NULL) { |
993 | | // this tensor was allocated without ggml-backend |
994 | 0 | return; |
995 | 0 | } |
996 | 0 | } |
997 | 0 | } |
998 | 0 | } |
999 | | |
1000 | 0 | static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) { |
1001 | 0 | size_t node_size = 0; |
1002 | 0 | if (!node->data && !node->view_src) { |
1003 | | // if we previously had data but don't now, then reallocate |
1004 | 0 | if (talloc->buffer_id < 0) { |
1005 | 0 | return false; |
1006 | 0 | } |
1007 | 0 | node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node); |
1008 | 0 | } |
1009 | 0 | return talloc->size_max >= node_size; |
1010 | 0 | } |
1011 | | |
1012 | 0 | static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph * graph) { |
1013 | 0 | if (galloc->n_nodes != graph->n_nodes) { |
1014 | | #ifndef NDEBUG |
1015 | | GGML_LOG_DEBUG("%s: graph has different number of nodes\n", __func__); |
1016 | | #endif |
1017 | 0 | return true; |
1018 | 0 | } |
1019 | | |
1020 | 0 | if (galloc->n_leafs != graph->n_leafs) { |
1021 | | #ifndef NDEBUG |
1022 | | GGML_LOG_DEBUG("%s: graph has different number of leafs\n", __func__); |
1023 | | #endif |
1024 | 0 | return true; |
1025 | 0 | } |
1026 | | |
1027 | 0 | for (int i = 0; i < graph->n_nodes; i++) { |
1028 | 0 | struct ggml_tensor * node = graph->nodes[i]; |
1029 | 0 | struct node_alloc * node_alloc = &galloc->node_allocs[i]; |
1030 | | |
1031 | 0 | if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) { |
1032 | | #ifndef NDEBUG |
1033 | | GGML_LOG_DEBUG("%s: node %s is not valid\n", __func__, node->name); |
1034 | | #endif |
1035 | 0 | return true; |
1036 | 0 | } |
1037 | | |
1038 | 0 | for (int j = 0; j < GGML_MAX_SRC; j++) { |
1039 | 0 | struct ggml_tensor * src = node->src[j]; |
1040 | 0 | if (src == NULL) { |
1041 | 0 | continue; |
1042 | 0 | } |
1043 | 0 | if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) { |
1044 | | #ifndef NDEBUG |
1045 | | GGML_LOG_DEBUG("%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name); |
1046 | | #endif |
1047 | 0 | return true; |
1048 | 0 | } |
1049 | 0 | } |
1050 | 0 | } |
1051 | | |
1052 | 0 | return false; |
1053 | 0 | } |
1054 | | |
1055 | 0 | bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) { |
1056 | 0 | if (ggml_gallocr_needs_realloc(galloc, graph)) { |
1057 | 0 | if (galloc->n_buffers == 1) { |
1058 | | #ifndef NDEBUG |
1059 | | GGML_LOG_DEBUG("%s: reallocating buffers automatically\n", __func__); |
1060 | | #endif |
1061 | 0 | if (!ggml_gallocr_reserve(galloc, graph)) { |
1062 | 0 | return false; |
1063 | 0 | } |
1064 | 0 | } else { |
1065 | | #ifndef NDEBUG |
1066 | | GGML_LOG_DEBUG("%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__); |
1067 | | #endif |
1068 | 0 | return false; |
1069 | 0 | } |
1070 | 0 | } |
1071 | | |
1072 | | // reset buffers |
1073 | 0 | for (int i = 0; i < galloc->n_buffers; i++) { |
1074 | 0 | if (galloc->buffers[i] != NULL) { |
1075 | 0 | ggml_vbuffer_reset(galloc->buffers[i]); |
1076 | 0 | } |
1077 | 0 | } |
1078 | | |
1079 | | // allocate the graph tensors from the previous assignments |
1080 | | // leafs |
1081 | 0 | for (int i = 0; i < graph->n_leafs; i++) { |
1082 | 0 | struct ggml_tensor * leaf = graph->leafs[i]; |
1083 | 0 | struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i]; |
1084 | 0 | ggml_gallocr_init_tensor(galloc, leaf, &leaf_alloc->leaf); |
1085 | 0 | } |
1086 | | // nodes |
1087 | 0 | for (int i = 0; i < graph->n_nodes; i++) { |
1088 | 0 | struct ggml_tensor * node = graph->nodes[i]; |
1089 | 0 | struct node_alloc * node_alloc = &galloc->node_allocs[i]; |
1090 | 0 | for (int j = 0; j < GGML_MAX_SRC; j++) { |
1091 | 0 | struct ggml_tensor * src = node->src[j]; |
1092 | 0 | if (src == NULL) { |
1093 | 0 | continue; |
1094 | 0 | } |
1095 | 0 | ggml_gallocr_init_tensor(galloc, src, &node_alloc->src[j]); |
1096 | 0 | } |
1097 | 0 | ggml_gallocr_init_tensor(galloc, node, &node_alloc->dst); |
1098 | 0 | } |
1099 | | |
1100 | 0 | return true; |
1101 | 0 | } |
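The intended lifecycle of the graph allocator: create it for a buffer type, reserve once with a worst-case graph so the backing buffers get sized, then call ggml_gallocr_alloc_graph for every graph that is evaluated; as long as each new graph fits the recorded assignments, no reallocation takes place. A minimal sketch, assuming the CPU buffer type and a hypothetical build_graph() helper that constructs a ggml_cgraph:

    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());

    // reserve once with the largest graph that will ever be evaluated
    struct ggml_cgraph * worst_case = build_graph(); // hypothetical helper
    ggml_gallocr_reserve(galloc, worst_case);

    // per evaluation: build the actual graph and allocate its tensors from the reserved buffers
    struct ggml_cgraph * graph = build_graph();      // hypothetical helper
    ggml_gallocr_alloc_graph(galloc, graph);
    // ... compute the graph with a backend ...

    ggml_gallocr_free(galloc);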
1102 | | |
1103 | 0 | size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) { |
1104 | 0 | GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers); |
1105 | | |
1106 | 0 | if (galloc->buffers[buffer_id] == NULL) { |
1107 | 0 | return 0; |
1108 | 0 | } |
1109 | | |
1110 | 0 | for (int i = 0; i < buffer_id; i++) { |
1111 | 0 | if (galloc->buffers[i] == galloc->buffers[buffer_id]) { |
1112 | | // this buffer is the same as a previous one due to the same buffer type being used multiple times |
1113 | | // only return the buffer size the first time it appears to avoid double counting |
1114 | 0 | return 0; |
1115 | 0 | } |
1116 | 0 | } |
1117 | | |
1118 | 0 | return ggml_vbuffer_size(galloc->buffers[buffer_id]); |
1119 | 0 | } |
1120 | | |
1121 | | // utils |
1122 | | |
1123 | 0 | static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) { |
1124 | 0 | for (size_t i = 0; i < *n_buffers; i++) { |
1125 | 0 | ggml_backend_buffer_free((*buffers)[i]); |
1126 | 0 | } |
1127 | 0 | free(*buffers); |
1128 | 0 | } |
1129 | | |
1130 | | static bool alloc_tensor_range(struct ggml_context * ctx, |
1131 | | struct ggml_tensor * first, struct ggml_tensor * last, |
1132 | | ggml_backend_buffer_type_t buft, size_t size, |
1133 | 0 | ggml_backend_buffer_t ** buffers, size_t * n_buffers) { |
1134 | | |
1135 | 0 | ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size); |
1136 | 0 | if (buffer == NULL) { |
1137 | 0 | GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size); |
1138 | 0 | free_buffers(buffers, n_buffers); |
1139 | 0 | return false; |
1140 | 0 | } |
1141 | | |
1142 | 0 | *buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1)); |
1143 | 0 | (*buffers)[(*n_buffers)++] = buffer; |
1144 | | |
1145 | 0 | struct ggml_tallocr tallocr = ggml_tallocr_new(buffer); |
1146 | | |
1147 | 0 | for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) { |
1148 | 0 | enum ggml_status status = GGML_STATUS_SUCCESS; |
1149 | 0 | if (t->data == NULL) { |
1150 | 0 | if (t->view_src == NULL) { |
1151 | 0 | status = ggml_tallocr_alloc(&tallocr, t); |
1152 | 0 | } else if (t->buffer == NULL) { |
1153 | 0 | status = ggml_backend_view_init(t); |
1154 | 0 | } |
1155 | 0 | } else { |
1156 | 0 | if (t->view_src != NULL && t->buffer == NULL) { |
1157 | | // view of a pre-allocated tensor |
1158 | 0 | status = ggml_backend_view_init(t); |
1159 | 0 | } |
1160 | 0 | } |
1161 | 0 | if (status != GGML_STATUS_SUCCESS) { |
1162 | 0 | GGML_LOG_ERROR("%s: failed to initialize tensor %s\n", __func__, t->name); |
1163 | 0 | free_buffers(buffers, n_buffers); |
1164 | 0 | return false; |
1165 | 0 | } |
1166 | 0 | } |
1167 | | |
1168 | 0 | return true; |
1169 | 0 | } |
1170 | | |
1171 | | static ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft_impl( |
1172 | 0 | struct ggml_context * ctx, ggml_backend_buffer_type_t buft, size_t * nbytes_total, bool no_alloc) { |
1173 | 0 | GGML_ASSERT(ggml_get_no_alloc(ctx) == true); |
1174 | | |
1175 | 0 | size_t alignment = ggml_backend_buft_get_alignment(buft); |
1176 | 0 | size_t max_size = ggml_backend_buft_get_max_size(buft); |
1177 | | |
1178 | 0 | ggml_backend_buffer_t * buffers = NULL; |
1179 | 0 | size_t n_buffers = 0; |
1180 | 0 | *nbytes_total = 0; |
1181 | | |
1182 | 0 | size_t cur_buf_size = 0; |
1183 | 0 | struct ggml_tensor * first = ggml_get_first_tensor(ctx); |
1184 | 0 | for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) { |
1185 | 0 | size_t this_size = 0; |
1186 | 0 | if (t->data == NULL && t->view_src == NULL) { |
1187 | 0 | this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment); |
1188 | 0 | } |
1189 | | |
1190 | 0 | if (cur_buf_size > 0 && (cur_buf_size + this_size) > max_size) { |
1191 | | // allocate tensors in the current buffer |
1192 | 0 | if (!no_alloc && !alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) { |
1193 | 0 | return NULL; |
1194 | 0 | } |
1195 | 0 | first = t; |
1196 | 0 | *nbytes_total += cur_buf_size; |
1197 | 0 | cur_buf_size = this_size; |
1198 | 0 | } else { |
1199 | 0 | cur_buf_size += this_size; |
1200 | 0 | } |
1201 | 0 | } |
1202 | | |
1203 | | // allocate remaining tensors |
1204 | 0 | if (cur_buf_size > 0) { |
1205 | 0 | *nbytes_total += cur_buf_size; |
1206 | 0 | if (!no_alloc && !alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) { |
1207 | 0 | return NULL; |
1208 | 0 | } |
1209 | 0 | } |
1210 | | |
1211 | 0 | if (no_alloc) { |
1212 | 0 | return NULL; |
1213 | 0 | } |
1214 | | |
1215 | 0 | if (n_buffers == 0) { |
1216 | | #ifndef NDEBUG |
1217 | | GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__); |
1218 | | #endif |
1219 | 0 | GGML_ASSERT(!buffers); |
1220 | 0 | return NULL; |
1221 | 0 | } |
1222 | | |
1223 | 0 | ggml_backend_buffer_t buffer; |
1224 | 0 | if (n_buffers == 1) { |
1225 | 0 | buffer = buffers[0]; |
1226 | 0 | } else { |
1227 | 0 | buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers); |
1228 | 0 | } |
1229 | 0 | if (buffers) { |
1230 | 0 | free(buffers); // can be NULL if context is empty or no_alloc |
1231 | 0 | } |
1232 | 0 | return buffer; |
1233 | 0 | } |
1234 | | |
1235 | 0 | size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) { |
1236 | 0 | size_t nbytes_total = 0; |
1237 | 0 | ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*no_alloc=*/ true); |
1238 | 0 | GGML_ASSERT(!buf); |
1239 | 0 | return nbytes_total; |
1240 | 0 | } |
1241 | | |
1242 | 0 | ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) { |
1243 | 0 | size_t nbytes_total = 0; |
1244 | 0 | return ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*no_alloc =*/ false); |
1245 | 0 | } |
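A minimal usage sketch for allocating every tensor of a context in a single backend buffer (hypothetical sizes, assuming the CPU buffer type):

    struct ggml_init_params params = {
        /*.mem_size   =*/ 2*ggml_tensor_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true, // required: tensor data lives in the backend buffer, not in the context
    };
    struct ggml_context * ctx = ggml_init(params);
    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);

    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type());
    // ... use a and b, then release everything ...
    ggml_backend_buffer_free(buf);
    ggml_free(ctx);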
1246 | | |
1247 | 0 | ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) { |
1248 | 0 | return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_get_default_buffer_type(backend)); |
1249 | 0 | } |