Coverage Report

Created: 2026-03-21 06:49

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/llama.cpp/ggml/src/ggml-alloc.c
Line
Count
Source
1
#include "ggml-alloc.h"
2
#include "ggml-backend-impl.h"
3
#include "ggml.h"
4
#include "ggml-impl.h"
5
#include <assert.h>
6
#include <limits.h>
7
#include <stdarg.h>
8
#include <stdio.h>
9
#include <stdlib.h>
10
#include <string.h>
11
12
0
#define MAX(a, b) ((a) > (b) ? (a) : (b))
13
#define MAX_FREE_BLOCKS 256
14
15
//#define GGML_ALLOCATOR_DEBUG
16
17
//#define AT_PRINTF(...) GGML_LOG_DEBUG(__VA_ARGS__)
18
#define AT_PRINTF(...)
19
20
// ops that return true for this function must not use restrict pointers for their backend implementations
21
0
bool ggml_op_can_inplace(enum ggml_op op) {
22
0
    switch (op) {
23
0
        case GGML_OP_FILL:
24
0
        case GGML_OP_SCALE:
25
0
        case GGML_OP_DIAG_MASK_ZERO:
26
0
        case GGML_OP_DIAG_MASK_INF:
27
0
        case GGML_OP_ADD:
28
0
        case GGML_OP_ADD_ID:
29
0
        case GGML_OP_ADD1:
30
0
        case GGML_OP_SUB:
31
0
        case GGML_OP_MUL:
32
0
        case GGML_OP_DIV:
33
0
        case GGML_OP_SQR:
34
0
        case GGML_OP_SQRT:
35
0
        case GGML_OP_LOG:
36
0
        case GGML_OP_UNARY:
37
0
        case GGML_OP_ROPE:
38
0
        case GGML_OP_ROPE_BACK:
39
0
        case GGML_OP_SILU_BACK:
40
0
        case GGML_OP_RMS_NORM:
41
0
        case GGML_OP_RMS_NORM_BACK:
42
0
        case GGML_OP_SOFT_MAX:
43
0
        case GGML_OP_SOFT_MAX_BACK:
44
0
            return true;
45
46
0
        default:
47
0
            return false;
48
0
    }
49
0
}
50
51
0
static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
52
0
    assert(alignment && !(alignment & (alignment - 1))); // power of 2
53
0
    size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment;
54
0
    return offset + align;
55
0
}
56
57
// tallocr
58
59
0
struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer) {
60
0
    void * base = ggml_backend_buffer_get_base(buffer);
61
0
    size_t align = ggml_backend_buffer_get_alignment(buffer);
62
63
0
    assert(align && !(align & (align - 1))); // power of 2
64
65
0
    struct ggml_tallocr talloc = (struct ggml_tallocr) {
66
0
        /*.buffer    = */ buffer,
67
0
        /*.base      = */ base,
68
0
        /*.alignment = */ align,
69
0
        /*.offset    = */ aligned_offset(base, 0, align),
70
0
    };
71
0
    return talloc;
72
0
}
73
74
0
enum ggml_status ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor) {
75
0
    size_t size = ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor);
76
0
    size = GGML_PAD(size, talloc->alignment);
77
78
0
    if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
79
0
        GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
80
0
                __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
81
0
        GGML_ABORT("not enough space in the buffer");
82
0
    }
83
84
0
    void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset;
85
0
    talloc->offset += size;
86
87
0
    assert(((uintptr_t)addr % talloc->alignment) == 0);
88
89
0
    return ggml_backend_tensor_alloc(talloc->buffer, tensor, addr);
90
0
}
91
92
// dynamic tensor allocator
93
94
0
#define GGML_VBUFFER_MAX_CHUNKS 16
95
96
// relative memory address within an allocation that can be split into multiple buffers (chunks)
97
struct buffer_address {
98
    int chunk;     // index of a backend buffer
99
    size_t offset; // local memory offset within the buffer
100
};
101
102
static const struct buffer_address GGML_BUFFER_ADDRESS_INVALID = { -1, SIZE_MAX };
103
104
0
static bool ggml_buffer_address_less(struct buffer_address a, struct buffer_address b) {
105
0
    return a.chunk != b.chunk ? a.chunk < b.chunk : a.offset < b.offset;
106
0
}
107
108
struct free_block {
109
    size_t offset;
110
    size_t size;
111
};
112
113
struct tallocr_chunk {
114
    struct free_block free_blocks[MAX_FREE_BLOCKS];
115
    int n_free_blocks;
116
    size_t max_size;
117
};
118
119
struct ggml_dyn_tallocr {
120
    size_t alignment;
121
    size_t max_chunk_size;
122
    struct tallocr_chunk * chunks[GGML_VBUFFER_MAX_CHUNKS];
123
    int n_chunks;
124
125
#ifdef GGML_ALLOCATOR_DEBUG
126
    struct {
127
        const struct ggml_tensor * tensor;
128
        struct buffer_address addr;
129
    } allocated_tensors[1024];
130
#endif
131
};
132
133
0
static void ggml_dyn_tallocr_insert_block(struct tallocr_chunk * chunk, size_t offset, size_t size) {
134
0
    GGML_ASSERT(chunk->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
135
    // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
136
0
    int insert_pos = 0;
137
0
    while (insert_pos < chunk->n_free_blocks && chunk->free_blocks[insert_pos].offset < offset) {
138
0
        insert_pos++;
139
0
    }
140
    // shift all blocks from insert_pos onward to make room for the new block
141
0
    for (int i = chunk->n_free_blocks; i > insert_pos; i--) {
142
0
        chunk->free_blocks[i] = chunk->free_blocks[i-1];
143
0
    }
144
    // insert the new block
145
0
    chunk->free_blocks[insert_pos].offset = offset;
146
0
    chunk->free_blocks[insert_pos].size = size;
147
0
    chunk->n_free_blocks++;
148
0
}
149
150
0
static void ggml_dyn_tallocr_remove_block(struct tallocr_chunk * chunk, int idx) {
151
    // shift all elements after idx by 1 to the left, overwriting the element at idx
152
0
    for (int i = idx; i < chunk->n_free_blocks; i++) {
153
0
        chunk->free_blocks[i] = chunk->free_blocks[i+1];
154
0
    }
155
0
    chunk->n_free_blocks--;
156
0
}
157
158
0
static int ggml_dyn_tallocr_new_chunk(struct ggml_dyn_tallocr * alloc, size_t min_size) {
159
0
    if (alloc->n_chunks >= GGML_VBUFFER_MAX_CHUNKS) {
160
0
        return -1;
161
0
    }
162
0
    struct tallocr_chunk * chunk = calloc(1, sizeof(struct tallocr_chunk));
163
0
    chunk->n_free_blocks = 1;
164
0
    chunk->free_blocks[0].offset = 0;
165
    // available space in a chunk is limited to max_chunk_size, but can be higher if:
166
    // 1. a single tensor exceeds the maximum, and cannot fit any other way
167
    // 2. we are running out of chunks
168
    // backends will either manage to allocate the larger size, or report an error.
169
0
    chunk->free_blocks[0].size = MAX(min_size, alloc->max_chunk_size);
170
0
    if (alloc->n_chunks == GGML_VBUFFER_MAX_CHUNKS - 1) {
171
0
        chunk->free_blocks[0].size = SIZE_MAX/2;
172
0
    }
173
0
    alloc->chunks[alloc->n_chunks] = chunk;
174
0
    alloc->n_chunks++;
175
0
    return alloc->n_chunks - 1;
176
0
}
177
178
#ifdef GGML_ALLOCATOR_DEBUG
179
static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, const struct ggml_tensor * tensor) {
180
    for (int i = 0; i < 1024; i++) {
181
        if (alloc->allocated_tensors[i].tensor == NULL) {
182
            alloc->allocated_tensors[i].tensor = tensor;
183
            alloc->allocated_tensors[i].addr = addr;
184
            return;
185
        }
186
    }
187
    GGML_ABORT("out of allocated_tensors");
188
}
189
static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, const struct ggml_tensor * tensor) {
190
    for (int i = 0; i < 1024; i++) {
191
        if (alloc->allocated_tensors[i].addr.chunk == addr.chunk && alloc->allocated_tensors[i].addr.offset == addr.offset) {
192
            alloc->allocated_tensors[i].tensor = NULL;
193
            return;
194
        }
195
    }
196
    GGML_ABORT("tried to free tensor %s not found\n", tensor->name);
197
}
198
#endif
199
200
0
static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) {
201
0
    size = aligned_offset(NULL, size, alloc->alignment);
202
203
0
    AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
204
205
0
    int best_fit_chunk = -1;
206
0
    int best_fit_block = -1;
207
0
    size_t max_avail = 0;
208
209
    // find the best fitting free block besides the last block, within any chunk
210
0
    for (int c = 0; c < alloc->n_chunks; ++c) {
211
0
        struct tallocr_chunk * chunk = alloc->chunks[c];
212
0
        size_t best_fit_size = SIZE_MAX;
213
0
        for (int i = 0; i < chunk->n_free_blocks - 1; i++) {
214
0
            struct free_block * block = &chunk->free_blocks[i];
215
0
            max_avail = MAX(max_avail, block->size);
216
0
            if (block->size >= size && block->size <= best_fit_size) {
217
0
                best_fit_chunk = c;
218
0
                best_fit_block = i;
219
0
                best_fit_size = block->size;
220
0
            }
221
0
        }
222
0
    }
223
224
0
    if (best_fit_block == -1) {
225
        // no suitable block found, try the last block (this may grow a chunks size)
226
0
        int64_t best_reuse = INT64_MIN;
227
0
        for (int c = 0; c < alloc->n_chunks; ++c) {
228
0
            struct tallocr_chunk * chunk = alloc->chunks[c];
229
0
            if (chunk->n_free_blocks > 0) {
230
0
                struct free_block * block = &chunk->free_blocks[chunk->n_free_blocks - 1];
231
0
                max_avail = MAX(max_avail, block->size);
232
0
                int64_t reuse_factor = chunk->max_size - block->offset - size;
233
                // reuse_factor < 0 : amount of extra memory that needs to be allocated
234
                // reuse_factor = 0 : allocated free space exactly matches tensor size
235
                // reuse_factor > 0 : superfluous memory that will remain unused
236
0
                bool better_reuse = best_reuse < 0 && reuse_factor > best_reuse;
237
0
                bool better_fit = reuse_factor >= 0 && reuse_factor < best_reuse;
238
0
                if (block->size >= size && (better_reuse || better_fit)) {
239
0
                    best_fit_chunk = c;
240
0
                    best_fit_block = chunk->n_free_blocks - 1;
241
0
                    best_reuse = reuse_factor;
242
0
                }
243
0
            }
244
0
        }
245
0
    }
246
247
0
    if (best_fit_block == -1) {
248
        // none of the existing chunks have enough space left
249
0
        best_fit_chunk = ggml_dyn_tallocr_new_chunk(alloc, size);
250
0
        best_fit_block = 0;
251
0
    }
252
0
    if (best_fit_chunk == -1) {
253
        // since the last chunk always has virtually endless memory, this should never happen
254
0
        GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
255
0
            __func__, size, max_avail);
256
0
        GGML_ABORT("graph allocation: failed to reserve memory");
257
0
    }
258
259
0
    struct tallocr_chunk * chunk = alloc->chunks[best_fit_chunk];
260
0
    struct free_block    * block = &chunk->free_blocks[best_fit_block];
261
0
    struct buffer_address  addr  = {.chunk = best_fit_chunk, .offset = block->offset };
262
0
    block->offset += size;
263
0
    block->size -= size;
264
0
    if (block->size == 0) {
265
        // remove block if empty
266
0
        ggml_dyn_tallocr_remove_block(chunk, best_fit_block);
267
0
    }
268
269
0
    AT_PRINTF("block %d, offset %zu, chunk %d\n", best_fit_block, addr.offset, addr.chunk);
270
271
#ifdef GGML_ALLOCATOR_DEBUG
272
    add_allocated_tensor(alloc, addr, tensor);
273
    size_t cur_max = addr.offset + size;
274
    if (cur_max > chunk->max_size) {
275
        // sort allocated_tensors by chunk/offset
276
        for (int i = 0; i < 1024; i++) {
277
            for (int j = i + 1; j < 1024; j++) {
278
                if (ggml_buffer_address_less(alloc->allocated_tensors[j].addr, alloc->allocated_tensors[i].addr)) {
279
                    const struct ggml_tensor * tmp_tensor = alloc->allocated_tensors[i].tensor;
280
                    struct buffer_address tmp_addr = alloc->allocated_tensors[i].addr;
281
                    alloc->allocated_tensors[i].tensor = alloc->allocated_tensors[j].tensor;
282
                    alloc->allocated_tensors[i].addr = alloc->allocated_tensors[j].addr;
283
                    alloc->allocated_tensors[j].tensor = tmp_tensor;
284
                    alloc->allocated_tensors[j].addr = tmp_addr;
285
                }
286
            }
287
        }
288
        GGML_LOG_DEBUG("max_size[%d] = %.2f MB: tensors: ", addr.chunk, cur_max / 1024.0 / 1024.0);
289
        for (int i = 0; i < 1024; i++) {
290
            if (alloc->allocated_tensors[i].tensor) {
291
                GGML_LOG_DEBUG("%s [%d: %zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
292
                    alloc->allocated_tensors[i].addr.chunk,
293
                    alloc->allocated_tensors[i].addr.offset,
294
                    alloc->allocated_tensors[i].addr.offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
295
                    ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
296
            }
297
        }
298
        GGML_LOG_DEBUG("\n");
299
    }
300
#endif
301
302
0
    chunk->max_size = MAX(chunk->max_size, addr.offset + size);
303
304
0
    return addr;
305
306
0
    GGML_UNUSED(tensor);
307
0
}
308
309
// this is a very naive implementation, but for our case the number of free blocks should be very small
310
0
static void ggml_dyn_tallocr_free_bytes(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, size_t size) {
311
0
    size = aligned_offset(NULL, size, alloc->alignment);
312
313
0
    struct tallocr_chunk * chunk = alloc->chunks[addr.chunk];
314
315
    // see if we can merge with an existing block
316
0
    for (int i = 0; i < chunk->n_free_blocks; i++) {
317
0
        struct free_block * block = &chunk->free_blocks[i];
318
        // check if ptr is at the end of the block
319
0
        if (block->offset + block->size == addr.offset) {
320
0
            block->size += size;
321
            // check if we can merge with the next block
322
0
            if (i < chunk->n_free_blocks - 1) {
323
0
                struct free_block * next = &chunk->free_blocks[i+1];
324
0
                if (block->offset + block->size == next->offset) {
325
0
                    block->size += next->size;
326
0
                    ggml_dyn_tallocr_remove_block(chunk, i+1);
327
0
                }
328
0
            }
329
0
            return;
330
0
        }
331
        // check if ptr is at the beginning of the block
332
0
        if (addr.offset + size == block->offset) {
333
0
            block->offset = addr.offset;
334
0
            block->size += size;
335
            // check if we can merge with the previous block
336
0
            if (i > 0) {
337
0
                struct free_block * prev = &chunk->free_blocks[i-1];
338
0
                if (prev->offset + prev->size == block->offset) {
339
0
                    prev->size += block->size;
340
0
                    ggml_dyn_tallocr_remove_block(chunk, i);
341
0
                }
342
0
            }
343
0
            return;
344
0
        }
345
0
    }
346
    // otherwise, add a new block
347
0
    ggml_dyn_tallocr_insert_block(chunk, addr.offset, size);
348
0
}
349
350
0
static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
351
0
    for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS; i++) {
352
0
        free(alloc->chunks[i]);
353
0
        alloc->chunks[i] = NULL;
354
0
    }
355
0
    alloc->n_chunks = 0;
356
357
#ifdef GGML_ALLOCATOR_DEBUG
358
    for (int i = 0; i < 1024; i++) {
359
        alloc->allocated_tensors[i].tensor = NULL;
360
    }
361
#endif
362
0
}
363
364
0
static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment, size_t max_buffer_size) {
365
0
    struct ggml_dyn_tallocr * alloc = (struct ggml_dyn_tallocr *)malloc(sizeof(struct ggml_dyn_tallocr));
366
367
0
    *alloc = (struct ggml_dyn_tallocr) {
368
0
        /*.alignment      = */ alignment,
369
0
        /*.max_chunk_size = */ MIN(max_buffer_size, SIZE_MAX/2), // clamp to avoid overflows
370
0
        /*.chunks         = */ {NULL},
371
0
        /*.n_chunks       = */ 0,
372
#ifdef GGML_ALLOCATOR_DEBUG
373
        /*.allocated_tensors = */ {{0}},
374
#endif
375
0
    };
376
377
0
    ggml_dyn_tallocr_reset(alloc);
378
379
0
    return alloc;
380
0
}
381
382
0
static void ggml_dyn_tallocr_free(struct ggml_dyn_tallocr * alloc) {
383
0
    for (int i = 0; i < alloc->n_chunks; ++i) {
384
0
        free(alloc->chunks[i]);
385
0
    }
386
0
    free(alloc);
387
0
}
388
389
0
static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc, int chunk) {
390
0
    return chunk < alloc->n_chunks ? alloc->chunks[chunk]->max_size : 0;
391
0
}
392
393
394
// virtual buffer with contiguous memory range, split into multiple backend buffers (chunks)
395
396
struct vbuffer {
397
    ggml_backend_buffer_t chunks[GGML_VBUFFER_MAX_CHUNKS];
398
};
399
400
0
static void ggml_vbuffer_free(struct vbuffer * buf) {
401
0
    if (buf == NULL) {
402
0
        return;
403
0
    }
404
0
    for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS; ++i) {
405
0
        ggml_backend_buffer_free(buf->chunks[i]);
406
0
    }
407
0
    free(buf);
408
0
}
409
410
0
static size_t ggml_vbuffer_chunk_size(struct vbuffer * buf, int chunk) {
411
0
    return buf->chunks[chunk] ? ggml_backend_buffer_get_size(buf->chunks[chunk]) : 0;
412
0
}
413
414
0
static size_t ggml_vbuffer_size(struct vbuffer * buf) {
415
0
    size_t size = 0;
416
0
    for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS && buf->chunks[i]; ++i) {
417
0
        size += ggml_backend_buffer_get_size(buf->chunks[i]);
418
0
    }
419
0
    return size;
420
0
}
421
422
0
static struct vbuffer * ggml_vbuffer_alloc(ggml_backend_buffer_type_t buft, const struct ggml_dyn_tallocr * talloc, enum ggml_backend_buffer_usage usage) {
423
0
    struct vbuffer * buf = (struct vbuffer *)calloc(1, sizeof(struct vbuffer));
424
0
    if (buf == NULL) {
425
0
        return NULL;
426
0
    }
427
428
0
    for (int n = 0; n < talloc->n_chunks; n++) {
429
0
        size_t chunk_size = talloc->chunks[n]->max_size;
430
0
        buf->chunks[n] = ggml_backend_buft_alloc_buffer(buft, chunk_size);
431
0
        if (buf->chunks[n] == NULL) {
432
0
            ggml_vbuffer_free(buf);
433
0
            return NULL;
434
0
        }
435
0
        ggml_backend_buffer_set_usage(buf->chunks[n], usage);
436
0
    }
437
0
    return buf;
438
0
}
439
440
0
static void ggml_vbuffer_tensor_alloc(struct vbuffer * buf, struct ggml_tensor * tensor, struct buffer_address buf_addr) {
441
0
    void * base = ggml_backend_buffer_get_base(buf->chunks[buf_addr.chunk]);
442
0
    void * addr = (char *)base + buf_addr.offset;
443
0
    ggml_backend_tensor_alloc(buf->chunks[buf_addr.chunk], tensor, addr);
444
0
}
445
446
0
static void ggml_vbuffer_reset(struct vbuffer * buf) {
447
0
    for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS && buf->chunks[i]; ++i) {
448
0
        ggml_backend_buffer_reset(buf->chunks[i]);
449
0
    }
450
0
}
451
452
453
/////////////////////////////////////
454
455
// graph allocator
456
457
struct hash_node {
458
    int n_children;
459
    int n_views;
460
    int buffer_id;
461
    struct buffer_address addr;
462
    bool allocated;
463
};
464
465
struct tensor_alloc {
466
    int buffer_id;
467
    struct buffer_address addr;
468
    size_t size_max; // 0 = pre-allocated, unused, or view
469
};
470
471
struct leaf_alloc {
472
    struct tensor_alloc leaf;
473
};
474
475
struct node_alloc {
476
    struct tensor_alloc dst;
477
    struct tensor_alloc src[GGML_MAX_SRC];
478
};
479
480
struct ggml_gallocr {
481
    ggml_backend_buffer_type_t * bufts; // [n_buffers]
482
    struct vbuffer ** buffers; // [n_buffers]
483
    struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
484
    int n_buffers;
485
486
    struct ggml_hash_set hash_set;
487
    struct hash_node * hash_values; // [hash_set.size]
488
489
    struct node_alloc * node_allocs; // [n_nodes]
490
    int n_nodes;
491
492
    struct leaf_alloc * leaf_allocs; // [n_leafs]
493
    int n_leafs;
494
};
495
496
0
ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
497
0
    ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(1, sizeof(struct ggml_gallocr));
498
0
    GGML_ASSERT(galloc != NULL);
499
500
0
    galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
501
0
    GGML_ASSERT(galloc->bufts != NULL);
502
503
0
    galloc->buffers = calloc(n_bufs, sizeof(struct vbuffer *));
504
0
    GGML_ASSERT(galloc->buffers != NULL);
505
506
0
    galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
507
0
    GGML_ASSERT(galloc->buf_tallocs != NULL);
508
509
0
    for (int i = 0; i < n_bufs; i++) {
510
0
        galloc->bufts[i] = bufts[i];
511
0
        galloc->buffers[i] = NULL;
512
513
        // check if the same buffer type is used multiple times and reuse the same allocator
514
0
        for (int j = 0; j < i; j++) {
515
0
            if (bufts[i] == bufts[j]) {
516
0
                galloc->buf_tallocs[i] = galloc->buf_tallocs[j];
517
0
                break;
518
0
            }
519
0
        }
520
521
0
        if (galloc->buf_tallocs[i] == NULL) {
522
0
            size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
523
0
            size_t max_size = ggml_backend_buft_get_max_size(bufts[i]);
524
0
            galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment, max_size);
525
0
        }
526
0
    }
527
0
    galloc->n_buffers = n_bufs;
528
529
0
    return galloc;
530
0
}
531
532
0
ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft) {
533
0
    return ggml_gallocr_new_n(&buft, 1);
534
0
}
535
536
0
void ggml_gallocr_free(ggml_gallocr_t galloc) {
537
0
    if (galloc == NULL) {
538
0
        return;
539
0
    }
540
541
0
    for (int i = 0; i < galloc->n_buffers; i++) {
542
0
        if (galloc->buffers != NULL) {
543
            // skip if already freed
544
0
            bool freed = false;
545
0
            for (int j = 0; j < i; j++) {
546
0
                if (galloc->buffers[j] == galloc->buffers[i]) {
547
0
                    freed = true;
548
0
                    break;
549
0
                }
550
0
            }
551
0
            if (!freed) {
552
0
                ggml_vbuffer_free(galloc->buffers[i]);
553
0
            }
554
0
        }
555
0
        if (galloc->buf_tallocs != NULL) {
556
            // skip if already freed
557
0
            bool freed = false;
558
0
            for (int j = 0; j < i; j++) {
559
0
                if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) {
560
0
                    freed = true;
561
0
                    break;
562
0
                }
563
0
            }
564
0
            if (!freed) {
565
0
                ggml_dyn_tallocr_free(galloc->buf_tallocs[i]);
566
0
            }
567
0
        }
568
0
    }
569
570
0
    ggml_hash_set_free(&galloc->hash_set);
571
0
    free(galloc->hash_values);
572
0
    free(galloc->bufts);
573
0
    free(galloc->buffers);
574
0
    free(galloc->buf_tallocs);
575
0
    free(galloc->node_allocs);
576
0
    free(galloc->leaf_allocs);
577
0
    free(galloc);
578
0
}
579
580
typedef struct ggml_gallocr * ggml_gallocr_t;
581
582
0
static struct hash_node * ggml_gallocr_hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
583
0
    size_t i = ggml_hash_find_or_insert(&galloc->hash_set, t);
584
0
    return &galloc->hash_values[i];
585
0
}
586
587
0
static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) {
588
0
    return ggml_gallocr_hash_get(galloc, t)->allocated;
589
0
}
590
591
0
static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) {
592
0
    return t->data != NULL // tensor data already set externally
593
0
        || t->buffer // tensor on external buffer (but not yet allocated)
594
0
        || ggml_gallocr_is_own(galloc, t); // tensor will be allocated by galloc
595
0
}
596
597
// free the extra space at the end if the new tensor is smaller
598
0
static void ggml_gallocr_free_extra_space(ggml_gallocr_t galloc, struct ggml_tensor * node, struct ggml_tensor * parent) {
599
0
    struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
600
0
    struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
601
602
0
    size_t parent_size = ggml_backend_buft_get_alloc_size(galloc->bufts[p_hn->buffer_id], parent);
603
0
    size_t node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
604
605
0
    GGML_ASSERT(parent_size >= node_size);
606
607
    // note: we want after the freeing the chunks to continue to be aligned
608
0
    struct ggml_dyn_tallocr * p_alloc = galloc->buf_tallocs[p_hn->buffer_id];
609
0
    parent_size = aligned_offset(NULL, parent_size, p_alloc->alignment);
610
0
    node_size = aligned_offset(NULL, node_size, p_alloc->alignment);
611
612
0
    if (parent_size > node_size) {
613
0
        struct buffer_address p_addr = p_hn->addr;
614
0
        p_addr.offset += node_size;
615
0
        size_t extra_size = parent_size - node_size;
616
0
        AT_PRINTF("freeing extra %zu bytes from parent %s for %s\n", extra_size, parent->name, node->name);
617
0
        ggml_dyn_tallocr_free_bytes(p_alloc, p_addr, extra_size);
618
0
    }
619
0
}
620
621
0
static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
622
0
    GGML_ASSERT(buffer_id >= 0);
623
0
    struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
624
625
0
    if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_impl_is_view(node)) {
626
0
        hn->allocated = true;
627
0
        assert(hn->addr.offset == 0);
628
629
        // try to reuse a parent's buffer (inplace)
630
0
        if (ggml_op_can_inplace(node->op)) {
631
0
            for (int i = 0; i < GGML_MAX_SRC; i++) {
632
0
                struct ggml_tensor * parent = node->src[i];
633
0
                if (parent == NULL) {
634
0
                    continue;
635
0
                }
636
637
                // if the node's data is external, then we cannot re-use it
638
0
                if (!ggml_gallocr_is_own(galloc, parent)) {
639
0
                    AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
640
0
                    continue;
641
0
                }
642
643
                // outputs cannot be reused
644
0
                if (parent->flags & GGML_TENSOR_FLAG_OUTPUT || (parent->view_src != NULL && parent->view_src->flags & GGML_TENSOR_FLAG_OUTPUT)) {
645
0
                    AT_PRINTF("not reusing parent %s for %s as it is an output\n", parent->name, node->name);
646
0
                    continue;
647
0
                }
648
649
0
                if (!ggml_are_same_layout(node, parent)) {
650
0
                    AT_PRINTF("not reusing parent %s for %s as layouts are different\n", parent->name, node->name);
651
0
                    continue;
652
0
                }
653
654
0
                struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
655
0
                if (p_hn->n_children == 1 && p_hn->n_views == 0) {
656
0
                    if (ggml_impl_is_view(parent)) {
657
0
                        struct ggml_tensor * view_src = parent->view_src;
658
0
                        struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
659
0
                        if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
660
0
                            AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
661
0
                            assert(view_src_hn->addr.chunk == p_hn->addr.chunk && view_src_hn->addr.offset == p_hn->addr.offset);
662
0
                            hn->buffer_id = p_hn->buffer_id;
663
0
                            hn->addr = p_hn->addr;
664
0
                            p_hn->allocated = false; // avoid freeing the parent
665
0
                            view_src_hn->allocated = false;
666
0
                            ggml_gallocr_free_extra_space(galloc, node, view_src);
667
0
                            return;
668
0
                        }
669
0
                    } else {
670
0
                        AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
671
0
                        hn->buffer_id = p_hn->buffer_id;
672
0
                        hn->addr = p_hn->addr;
673
0
                        p_hn->allocated = false; // avoid freeing the parent
674
0
                        ggml_gallocr_free_extra_space(galloc, node, parent);
675
0
                        return;
676
0
                    }
677
0
                }
678
0
            }
679
0
        }
680
        // allocate tensor from the buffer
681
0
        struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
682
0
        ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
683
0
        size_t size = ggml_backend_buft_get_alloc_size(buft, node);
684
0
        hn->buffer_id = buffer_id;
685
0
        hn->addr = ggml_dyn_tallocr_alloc(alloc, size, node);
686
0
    }
687
0
}
688
689
0
static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
690
    // graph outputs are never freed
691
0
    if (node->flags & GGML_TENSOR_FLAG_OUTPUT) {
692
0
        AT_PRINTF("not freeing output %s\n", node->name);
693
0
        return;
694
0
    }
695
696
0
    struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
697
0
    int buffer_id = hn->buffer_id;
698
0
    struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
699
0
    ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
700
0
    size_t size = ggml_backend_buft_get_alloc_size(buft, node);
701
702
0
    AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n",
703
0
        __func__, node->name, hn->addr.chunk, hn->addr.offset, size, alloc->chunks[hn->addr.chunk]->n_free_blocks);
704
#ifdef GGML_ALLOCATOR_DEBUG
705
    remove_allocated_tensor(alloc, hn->addr, node);
706
#endif
707
708
0
    ggml_dyn_tallocr_free_bytes(alloc, hn->addr, size);
709
0
    hn->allocated = false;
710
0
}
711
712
0
static int get_node_buffer_id(const int * node_buffer_ids, int i) {
713
0
    return node_buffer_ids ? node_buffer_ids[i] : 0;
714
0
}
715
716
0
static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
717
    // clear hash tables
718
0
    ggml_hash_set_reset(&galloc->hash_set);
719
0
    memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);
720
721
    // allocate leafs
722
    // these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes
723
0
    for (int i = 0; i < graph->n_leafs; i++) {
724
0
        struct ggml_tensor * leaf = graph->leafs[i];
725
0
        ggml_gallocr_allocate_node(galloc, leaf, get_node_buffer_id(leaf_buffer_ids, i));
726
0
    }
727
728
    // count number of children and views
729
    // allocate other graph inputs and leafs first to avoid overwriting them
730
0
    for (int i = 0; i < graph->n_nodes; i++) {
731
0
        struct ggml_tensor * node = graph->nodes[i];
732
733
        // TODO: better way to add external dependencies
734
        // GGML_OP_NONE does not appear normally in the graph nodes, but is used by ggml-backend to add dependencies to
735
        // control when some tensors are allocated and freed. in this case, the dependencies are in `src`, but the node
736
        // itself is never used and should not be considered a dependency
737
0
        if (ggml_impl_is_view(node) && node->op != GGML_OP_NONE) {
738
0
            struct ggml_tensor * view_src = node->view_src;
739
0
            ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
740
0
        }
741
742
0
        if (node->flags & GGML_TENSOR_FLAG_INPUT) {
743
0
            ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
744
0
        }
745
746
0
        for (int j = 0; j < GGML_MAX_SRC; j++) {
747
0
            struct ggml_tensor * src = node->src[j];
748
0
            if (src == NULL) {
749
0
                continue;
750
0
            }
751
752
0
            ggml_gallocr_hash_get(galloc, src)->n_children += 1;
753
754
            // allocate explicit inputs
755
0
            if (src->flags & GGML_TENSOR_FLAG_INPUT) {
756
0
                ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
757
0
            }
758
0
        }
759
0
    }
760
761
    // allocate tensors
762
0
    for (int i = 0; i < graph->n_nodes; i++) {
763
0
        struct ggml_tensor * node = graph->nodes[i];
764
0
        int buffer_id = get_node_buffer_id(node_buffer_ids, i);
765
766
        // allocate parents (only leafs need to be allocated at this point)
767
0
        for (int j = 0; j < GGML_MAX_SRC; j++) {
768
0
            struct ggml_tensor * parent = node->src[j];
769
0
            if (parent == NULL) {
770
0
                continue;
771
0
            }
772
0
            ggml_gallocr_allocate_node(galloc, parent, buffer_id);
773
0
        }
774
775
        // allocate node
776
0
        ggml_gallocr_allocate_node(galloc, node, buffer_id);
777
778
0
        AT_PRINTF("exec: %s (%s) <= ", ggml_op_desc(node), node->name);
779
0
        for (int j = 0; j < GGML_MAX_SRC; j++) {
780
0
            struct ggml_tensor * parent = node->src[j];
781
0
            if (parent == NULL) {
782
0
                continue;
783
0
            }
784
0
            AT_PRINTF("%s", parent->name);
785
0
            if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
786
0
                AT_PRINTF(", ");
787
0
            }
788
0
        }
789
0
        AT_PRINTF("\n");
790
791
        // update parents
792
0
        for (int j = 0; j < GGML_MAX_SRC; j++) {
793
0
            struct ggml_tensor * parent = node->src[j];
794
0
            if (parent == NULL) {
795
0
                continue;
796
0
            }
797
0
            struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
798
0
            p_hn->n_children -= 1;
799
800
0
            AT_PRINTF("parent %s: %d children, %d views, allocated: %d\n",
801
0
                parent->name, p_hn->n_children, p_hn->n_views, p_hn->allocated);
802
803
0
            if (p_hn->n_children == 0 && p_hn->n_views == 0) {
804
0
                if (ggml_impl_is_view(parent)) {
805
0
                    struct ggml_tensor * view_src = parent->view_src;
806
0
                    struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
807
0
                    view_src_hn->n_views -= 1;
808
0
                    AT_PRINTF("view_src %s: %d children, %d views\n",
809
0
                        view_src->name, view_src_hn->n_children, view_src_hn->n_views);
810
0
                    if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) {
811
0
                        ggml_gallocr_free_node(galloc, view_src);
812
0
                    }
813
0
                }
814
0
                else if (p_hn->allocated) {
815
0
                    ggml_gallocr_free_node(galloc, parent);
816
0
                }
817
0
            }
818
0
            AT_PRINTF("\n");
819
0
        }
820
0
    }
821
0
}
822
823
static bool ggml_gallocr_reserve_n_impl(
824
0
        ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids, bool no_alloc) {
825
0
    size_t min_hash_size = graph->n_nodes + graph->n_leafs;
826
    // add 25% margin to avoid hash collisions
827
0
    min_hash_size += min_hash_size / 4;
828
829
    // initialize hash table
830
0
    if (galloc->hash_set.size < min_hash_size) {
831
0
        ggml_hash_set_free(&galloc->hash_set);
832
0
        galloc->hash_set = ggml_hash_set_new(min_hash_size);
833
0
        GGML_ASSERT(galloc->hash_set.keys != NULL);
834
835
0
        free(galloc->hash_values);
836
0
        galloc->hash_values = malloc(sizeof(struct hash_node) * galloc->hash_set.size);
837
0
        GGML_ASSERT(galloc->hash_values != NULL);
838
0
    }
839
840
    // reset allocators
841
0
    for (int i = 0; i < galloc->n_buffers; i++) {
842
0
        ggml_dyn_tallocr_reset(galloc->buf_tallocs[i]);
843
0
    }
844
845
    // allocate in hash table
846
0
    ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids);
847
848
    // set the node_allocs from the hash table
849
0
    if (galloc->n_nodes < graph->n_nodes) {
850
0
        free(galloc->node_allocs);
851
0
        galloc->node_allocs = calloc(graph->n_nodes, sizeof(struct node_alloc));
852
0
        GGML_ASSERT(galloc->node_allocs != NULL);
853
0
    }
854
0
    galloc->n_nodes = graph->n_nodes;
855
0
    for (int i = 0; i < graph->n_nodes; i++) {
856
0
        struct ggml_tensor * node = graph->nodes[i];
857
0
        struct node_alloc * node_alloc = &galloc->node_allocs[i];
858
0
        if (node->view_src || node->data) {
859
0
            node_alloc->dst.buffer_id = -1;
860
0
            node_alloc->dst.addr = GGML_BUFFER_ADDRESS_INVALID;
861
0
            node_alloc->dst.size_max = 0;
862
0
        } else {
863
0
            struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
864
0
            node_alloc->dst.buffer_id = hn->buffer_id;
865
0
            node_alloc->dst.addr = hn->addr;
866
0
            node_alloc->dst.size_max  = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
867
0
        }
868
0
        for (int j = 0; j < GGML_MAX_SRC; j++) {
869
0
            struct ggml_tensor * src = node->src[j];
870
0
            if (!src || src->view_src || src->data) {
871
0
                node_alloc->src[j].buffer_id = -1;
872
0
                node_alloc->src[j].addr = GGML_BUFFER_ADDRESS_INVALID;
873
0
                node_alloc->src[j].size_max = 0;
874
0
            } else {
875
0
                struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
876
0
                node_alloc->src[j].buffer_id = hn->buffer_id;
877
0
                node_alloc->src[j].addr = hn->addr;
878
0
                node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
879
0
            }
880
0
        }
881
0
    }
882
0
    if (galloc->n_leafs < graph->n_leafs) {
883
0
        free(galloc->leaf_allocs);
884
0
        galloc->leaf_allocs = calloc(graph->n_leafs, sizeof(galloc->leaf_allocs[0]));
885
0
        GGML_ASSERT(galloc->leaf_allocs != NULL);
886
0
    }
887
0
    galloc->n_leafs = graph->n_leafs;
888
0
    for (int i = 0; i < graph->n_leafs; i++) {
889
0
        struct ggml_tensor * leaf = graph->leafs[i];
890
0
        struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
891
0
        if (leaf->view_src || leaf->data) {
892
0
            galloc->leaf_allocs[i].leaf.buffer_id = -1;
893
0
            galloc->leaf_allocs[i].leaf.addr = GGML_BUFFER_ADDRESS_INVALID;
894
0
            galloc->leaf_allocs[i].leaf.size_max = 0;
895
0
        } else {
896
0
            galloc->leaf_allocs[i].leaf.buffer_id = hn->buffer_id;
897
0
            galloc->leaf_allocs[i].leaf.addr = hn->addr;
898
0
            galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
899
0
        }
900
0
    }
901
902
    // reallocate buffers if needed
903
0
    for (int i = 0; i < galloc->n_buffers; i++) {
904
        // if the buffer type is used multiple times, we reuse the same buffer
905
0
        for (int j = 0; j < i; j++) {
906
0
            if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) {
907
0
                galloc->buffers[i] = galloc->buffers[j];
908
0
                break;
909
0
            }
910
0
        }
911
912
        // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
913
0
        bool realloc = galloc->buffers[i] == NULL;
914
0
        size_t new_size = 0;
915
0
        for (int c = 0; c < galloc->buf_tallocs[i]->n_chunks; c++) {
916
0
            size_t cur_chunk_size = galloc->buffers[i] ? ggml_vbuffer_chunk_size(galloc->buffers[i], c) : 0;
917
0
            size_t new_chunk_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i], c);
918
0
            new_size += new_chunk_size;
919
0
            if (new_chunk_size > cur_chunk_size) {
920
0
                realloc = true;
921
0
            }
922
0
        }
923
0
        if (realloc) {
924
#ifndef NDEBUG
925
            {
926
                size_t cur_size = galloc->buffers[i] ? ggml_vbuffer_size(galloc->buffers[i]) : 0;
927
                if (cur_size > 0) {
928
                    GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n",
929
                        __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
930
                }
931
            }
932
#endif
933
0
            ggml_vbuffer_free(galloc->buffers[i]);
934
0
            if (no_alloc) {
935
0
                galloc->buffers[i] = NULL;
936
0
            } else {
937
0
                galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
938
0
                if (galloc->buffers[i] == NULL) {
939
0
                    GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
940
0
                    return false;
941
0
                }
942
0
            }
943
0
        }
944
0
    }
945
946
0
    return true;
947
0
}
948
949
void ggml_gallocr_reserve_n_size(
950
0
        ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids, size_t * sizes) {
951
0
    GGML_ASSERT(ggml_gallocr_reserve_n_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids, /*no_alloc =*/ true));
952
0
    for (int i = 0; i < galloc->n_buffers; i++) {
953
0
        sizes[i] = 0;
954
0
        for (int c = 0; c < galloc->buf_tallocs[i]->n_chunks; c++) {
955
0
            sizes[i] += galloc->buf_tallocs[i]->chunks[c]->max_size;
956
0
        }
957
0
    }
958
0
}
959
960
0
bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
961
0
    return ggml_gallocr_reserve_n_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids, /*no_alloc =*/ false);
962
0
}
963
964
0
bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
965
0
    return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
966
0
}
967
968
0
static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) {
969
0
    int buffer_id = tensor_alloc->buffer_id;
970
0
    assert(tensor->data || tensor->view_src || ggml_backend_buft_get_alloc_size(galloc->bufts[buffer_id], tensor) <= tensor_alloc->size_max);
971
972
0
    if (tensor->view_src != NULL) {
973
0
        if (tensor->buffer == NULL) {
974
0
            assert(tensor_alloc->addr.offset == SIZE_MAX);
975
0
            if (tensor->view_src->buffer == NULL) {
976
                // this tensor was allocated without ggml-backend
977
0
                return;
978
0
            }
979
0
            ggml_backend_view_init(tensor);
980
0
        }
981
0
    } else {
982
0
        if (tensor->data == NULL) {
983
0
            assert(tensor_alloc->addr.offset != SIZE_MAX);
984
0
            assert(ggml_backend_buft_get_alloc_size(galloc->bufts[buffer_id], tensor) <= tensor_alloc->size_max);
985
0
            ggml_vbuffer_tensor_alloc(galloc->buffers[buffer_id], tensor, tensor_alloc->addr);
986
0
        } else {
987
0
            if (tensor->buffer == NULL) {
988
                // this tensor was allocated without ggml-backend
989
0
                return;
990
0
            }
991
0
        }
992
0
    }
993
0
}
994
995
0
static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
996
0
    size_t node_size = 0;
997
0
    if (!node->data && !node->view_src) {
998
        // If we previously had data but don't now then reallocate
999
0
        if (talloc->buffer_id < 0) {
1000
0
            return false;
1001
0
        }
1002
0
        node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
1003
0
    }
1004
0
    return talloc->size_max >= node_size;
1005
0
}
1006
1007
0
static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
1008
0
    if (galloc->n_nodes != graph->n_nodes) {
1009
#ifndef NDEBUG
1010
        GGML_LOG_DEBUG("%s: graph has different number of nodes\n", __func__);
1011
#endif
1012
0
        return true;
1013
0
    }
1014
1015
0
    if (galloc->n_leafs != graph->n_leafs) {
1016
#ifndef NDEBUG
1017
        GGML_LOG_DEBUG("%s: graph has different number of leafs\n", __func__);
1018
#endif
1019
0
        return true;
1020
0
    }
1021
1022
0
    for (int i = 0; i < graph->n_nodes; i++) {
1023
0
        struct ggml_tensor * node = graph->nodes[i];
1024
0
        struct node_alloc * node_alloc = &galloc->node_allocs[i];
1025
1026
0
        if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) {
1027
#ifndef NDEBUG
1028
            GGML_LOG_DEBUG("%s: node %s is not valid\n", __func__, node->name);
1029
#endif
1030
0
            return true;
1031
0
        }
1032
1033
0
        for (int j = 0; j < GGML_MAX_SRC; j++) {
1034
0
            struct ggml_tensor * src = node->src[j];
1035
0
            if (src == NULL) {
1036
0
                continue;
1037
0
            }
1038
0
            if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) {
1039
#ifndef NDEBUG
1040
                GGML_LOG_DEBUG("%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
1041
#endif
1042
0
                return true;
1043
0
            }
1044
0
        }
1045
0
    }
1046
1047
0
    return false;
1048
0
}
1049
1050
0
bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
1051
0
    if (ggml_gallocr_needs_realloc(galloc, graph)) {
1052
0
        if (galloc->n_buffers == 1) {
1053
#ifndef NDEBUG
1054
            GGML_LOG_DEBUG("%s: reallocating buffers automatically\n", __func__);
1055
#endif
1056
0
            if (!ggml_gallocr_reserve(galloc, graph)) {
1057
0
                return false;
1058
0
            }
1059
0
        } else {
1060
#ifndef NDEBUG
1061
            GGML_LOG_DEBUG("%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
1062
#endif
1063
0
            return false;
1064
0
        }
1065
0
    }
1066
1067
    // reset buffers
1068
0
    for (int i = 0; i < galloc->n_buffers; i++) {
1069
0
        if (galloc->buffers[i] != NULL) {
1070
0
            ggml_vbuffer_reset(galloc->buffers[i]);
1071
0
        }
1072
0
    }
1073
1074
    // allocate the graph tensors from the previous assignments
1075
    // leafs
1076
0
    for (int i = 0; i < graph->n_leafs; i++) {
1077
0
        struct ggml_tensor * leaf = graph->leafs[i];
1078
0
        struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i];
1079
0
        ggml_gallocr_init_tensor(galloc, leaf, &leaf_alloc->leaf);
1080
0
    }
1081
    // nodes
1082
0
    for (int i = 0; i < graph->n_nodes; i++) {
1083
0
        struct ggml_tensor * node = graph->nodes[i];
1084
0
        struct node_alloc * node_alloc = &galloc->node_allocs[i];
1085
0
        for (int j = 0; j < GGML_MAX_SRC; j++) {
1086
0
            struct ggml_tensor * src = node->src[j];
1087
0
            if (src == NULL) {
1088
0
                continue;
1089
0
            }
1090
0
            ggml_gallocr_init_tensor(galloc, src, &node_alloc->src[j]);
1091
0
        }
1092
0
        ggml_gallocr_init_tensor(galloc, node, &node_alloc->dst);
1093
0
    }
1094
1095
0
    return true;
1096
0
}
1097
1098
0
size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
1099
0
    GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers);
1100
1101
0
    if (galloc->buffers[buffer_id] == NULL) {
1102
0
        return 0;
1103
0
    }
1104
1105
0
    for (int i = 0; i < buffer_id; i++) {
1106
0
        if (galloc->buffers[i] == galloc->buffers[buffer_id]) {
1107
            // this buffer is the same as a previous one due to the same buffer type being used multiple times
1108
            // only return the buffer size the first time it appears to avoid double counting
1109
0
            return 0;
1110
0
        }
1111
0
    }
1112
1113
0
    return ggml_vbuffer_size(galloc->buffers[buffer_id]);
1114
0
}
1115
1116
// utils
1117
1118
0
static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
1119
0
    for (size_t i = 0; i < *n_buffers; i++) {
1120
0
        ggml_backend_buffer_free((*buffers)[i]);
1121
0
    }
1122
0
    free(*buffers);
1123
0
}
1124
1125
static bool alloc_tensor_range(struct ggml_context * ctx,
1126
        struct ggml_tensor * first, struct ggml_tensor * last,
1127
        ggml_backend_buffer_type_t buft, size_t size,
1128
0
        ggml_backend_buffer_t ** buffers, size_t * n_buffers) {
1129
1130
0
    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
1131
0
    if (buffer == NULL) {
1132
0
        GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
1133
0
        free_buffers(buffers, n_buffers);
1134
0
        return false;
1135
0
    }
1136
1137
0
    *buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
1138
0
    (*buffers)[(*n_buffers)++] = buffer;
1139
1140
0
    struct ggml_tallocr tallocr = ggml_tallocr_new(buffer);
1141
1142
0
    for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
1143
0
        enum ggml_status status = GGML_STATUS_SUCCESS;
1144
0
        if (t->data == NULL) {
1145
0
            if (t->view_src == NULL) {
1146
0
                status = ggml_tallocr_alloc(&tallocr, t);
1147
0
            } else if (t->buffer == NULL) {
1148
0
                status = ggml_backend_view_init(t);
1149
0
            }
1150
0
        } else {
1151
0
            if (t->view_src != NULL && t->buffer == NULL) {
1152
                // view of a pre-allocated tensor
1153
0
                status = ggml_backend_view_init(t);
1154
0
            }
1155
0
        }
1156
0
        if (status != GGML_STATUS_SUCCESS) {
1157
0
            GGML_LOG_ERROR("%s: failed to initialize tensor %s\n", __func__, t->name);
1158
0
            free_buffers(buffers, n_buffers);
1159
0
            return false;
1160
0
        }
1161
0
    }
1162
1163
0
    return true;
1164
0
}
1165
1166
static ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft_impl(
1167
0
        struct ggml_context * ctx, ggml_backend_buffer_type_t buft, size_t * nbytes_total, bool no_alloc) {
1168
0
    GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
1169
1170
0
    size_t alignment = ggml_backend_buft_get_alignment(buft);
1171
0
    size_t max_size = ggml_backend_buft_get_max_size(buft);
1172
1173
0
    ggml_backend_buffer_t * buffers = NULL;
1174
0
    size_t n_buffers = 0;
1175
0
    *nbytes_total = 0;
1176
1177
0
    size_t cur_buf_size = 0;
1178
0
    struct ggml_tensor * first = ggml_get_first_tensor(ctx);
1179
0
    for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) {
1180
0
        size_t this_size = 0;
1181
0
        if (t->data == NULL && t->view_src == NULL) {
1182
0
            this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
1183
0
        }
1184
1185
0
        if (cur_buf_size > 0 && (cur_buf_size + this_size) > max_size) {
1186
            // allocate tensors in the current buffer
1187
0
            if (!no_alloc && !alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
1188
0
                return NULL;
1189
0
            }
1190
0
            first = t;
1191
0
            *nbytes_total += cur_buf_size;
1192
0
            cur_buf_size = this_size;
1193
0
        } else {
1194
0
            cur_buf_size += this_size;
1195
0
        }
1196
0
    }
1197
1198
    // allocate remaining tensors
1199
0
    if (cur_buf_size > 0) {
1200
0
        *nbytes_total += cur_buf_size;
1201
0
        if (!no_alloc && !alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
1202
0
            return NULL;
1203
0
        }
1204
0
    }
1205
1206
0
    if (no_alloc) {
1207
0
        return NULL;
1208
0
    }
1209
1210
0
    if (n_buffers == 0) {
1211
#ifndef NDEBUG
1212
        GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__);
1213
#endif
1214
0
        GGML_ASSERT(!buffers);
1215
0
        return NULL;
1216
0
    }
1217
1218
0
    ggml_backend_buffer_t buffer;
1219
0
    if (n_buffers == 1) {
1220
0
        buffer = buffers[0];
1221
0
    } else {
1222
0
        buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
1223
0
    }
1224
0
    if (buffers) {
1225
0
        free(buffers); // can be NULL if context is empty or no_alloc
1226
0
    }
1227
0
    return buffer;
1228
0
}
1229
1230
0
size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
1231
0
    size_t nbytes_total = 0;
1232
0
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*no_alloc=*/ true);
1233
0
    GGML_ASSERT(!buf);
1234
0
    return nbytes_total;
1235
0
}
1236
1237
0
ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
1238
0
    size_t nbytes_total = 0;
1239
0
    return ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*no_alloc =*/ false);
1240
0
}
1241
1242
0
ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) {
1243
0
    return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_get_default_buffer_type(backend));
1244
0
}