Coverage Report

Created: 2026-06-13 06:24

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/llama.cpp/src/llama-graph.h
Line
Count
Source
1
#pragma once
2
3
#include "llama-arch.h"
4
#include "llama-batch.h"
5
#include "llama-hparams.h"
6
#include "llama-adapter.h"
7
8
#include <cstdint>
9
#include <vector>
10
#include <memory>
11
#include <set>
12
#include <functional>
13
#include <map>
14
15
struct ggml_cgraph;
16
struct ggml_context;
17
struct ggml_tensor;
18
19
struct llama_cparams;
20
struct llama_layer;
21
22
struct llama_memory_context_i;
23
24
class llama_kv_cache_context;
25
class llama_kv_cache_dsa_context;
26
class llama_kv_cache_iswa_context;
27
class llama_memory_recurrent_context;
28
class llama_memory_hybrid_context;
29
class llama_memory_hybrid_iswa_context;
30
31
// certain models (typically multi-modal) can produce different types of graphs
// the enumerators keep their implicit values (0, 1, 2, ...) in declaration order
enum llm_graph_type {
    LLM_GRAPH_TYPE_DEFAULT,
    LLM_GRAPH_TYPE_ENCODER,
    LLM_GRAPH_TYPE_DECODER,
    LLM_GRAPH_TYPE_DECODER_MTP,
};
38
39
// operation selector for the feed-forward block
// the underlying type is fixed to int and LLM_FFN_NONE is pinned to 0 so that a
// zero-initialized value reads as "unset"
enum llm_ffn_op_type : int {
    LLM_FFN_NONE = 0,           // sentinel: unset; archs must assign before use
    LLM_FFN_SILU,
    LLM_FFN_GELU,
    LLM_FFN_RELU,
    LLM_FFN_RELU_SQR,
    LLM_FFN_SWIGLU,
    LLM_FFN_GEGLU,
    LLM_FFN_REGLU,
    LLM_FFN_SWIGLU_OAI_MOE,
};
50
51
// how the ffn gate is arranged relative to ffn_up
enum llm_ffn_gate_type {
    LLM_FFN_SEQ,
    LLM_FFN_PAR, // ffn_gate is parallel to ffn_up
};
55
56
// normalization variants selectable when building a graph
enum llm_norm_type {
    LLM_NORM,
    LLM_NORM_RMS,
    LLM_NORM_GROUP,
};
61
62
// TODO: tmp - need something better to pass the data from the encoder to the decoder
63
struct llama_cross {
64
    // the output embeddings from the encoder as a ggml tensor
65
    // TODO: this needs more work to be correct, for now copy the embeddings data to host memory
66
    //       ref: https://github.com/ggml-org/llama.cpp/pull/11213#discussion_r1969892524
67
    //ggml_tensor * t_embd = nullptr;
68
69
    int64_t n_embd = 0;
70
    int64_t n_enc  = 0;
71
72
    // embeddings data copied to host memory (tmp)
73
    std::vector<float> v_embd;
74
75
    // needed to construct the cross-attention mask in the decoder
76
    std::vector<std::set<llama_seq_id>> seq_ids_enc;
77
};
78
79
struct llm_graph_params;
80
81
//
82
// llm_graph_input
83
//
84
85
class llm_graph_input_i {
86
public:
87
0
    llm_graph_input_i() {
88
0
        const char * LLAMA_GRAPH_INPUT_DEBUG = getenv("LLAMA_GRAPH_INPUT_DEBUG");
89
0
        debug = LLAMA_GRAPH_INPUT_DEBUG ? atoi(LLAMA_GRAPH_INPUT_DEBUG) : 0;
90
0
    }
91
92
0
    virtual ~llm_graph_input_i() = default;
93
94
    virtual void set_input(const llama_ubatch * ubatch) = 0;
95
96
    // return true if the resulting input tensors using the provided graph parameters would be
97
    //   the same as the previous input tensors that we have currently stored in the object
98
0
    virtual bool can_reuse(const llm_graph_params & params) {
99
        // returning false here by default will prevent from reusing the graph if the check
100
        //   for the input type has not been implemented yet
101
0
        GGML_UNUSED(params);
102
0
        return false;
103
0
    }
104
protected:
105
    // env: LLAMA_GRAPH_INPUT_DEBUG
106
    int debug = 0;
107
};
108
109
using llm_graph_input_ptr = std::unique_ptr<llm_graph_input_i>;
110
111
class llm_graph_input_embd : public llm_graph_input_i {
112
public:
113
0
    llm_graph_input_embd(int64_t n_embd) : n_embd(n_embd) {}
114
    virtual ~llm_graph_input_embd() = default;
115
116
    void set_input(const llama_ubatch * ubatch) override;
117
118
    bool can_reuse(const llm_graph_params & params) override;
119
120
    ggml_tensor * tokens = nullptr; // I32 [n_batch]
121
    ggml_tensor * embd   = nullptr; // F32 [n_embd, n_batch]
122
123
    const int64_t n_embd = 0;
124
};
125
126
// similar to llm_graph_input_embd but with an additional hidden state input
127
class llm_graph_input_embd_h : public llm_graph_input_i {
128
public:
129
0
    llm_graph_input_embd_h(int64_t n_embd) : n_embd(n_embd) {}
130
    virtual ~llm_graph_input_embd_h() = default;
131
132
    void set_input(const llama_ubatch * ubatch) override;
133
134
    bool can_reuse(const llm_graph_params & params) override;
135
136
    ggml_tensor * tokens = nullptr; // I32 [n_batch]
137
    ggml_tensor * embd   = nullptr; // F32 [n_embd, n_batch]
138
    ggml_tensor * h      = nullptr; // F32 [n_embd, n_batch]
139
140
    const int64_t n_embd = 0;
141
};
142
143
class llm_graph_input_pos : public llm_graph_input_i {
144
public:
145
0
    llm_graph_input_pos(uint32_t n_pos_per_embd) : n_pos_per_embd(n_pos_per_embd) {}
146
    virtual ~llm_graph_input_pos() = default;
147
148
    void set_input(const llama_ubatch * ubatch) override;
149
150
    bool can_reuse(const llm_graph_params & params) override;
151
152
    ggml_tensor * pos = nullptr; // I32 [n_batch]
153
154
    const uint32_t n_pos_per_embd = 1;
155
};
156
157
// temperature tuning, used by llama4
158
class llm_graph_input_attn_temp : public llm_graph_input_i {
159
public:
160
    llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale, float f_attn_temp_offset)
161
0
        : n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale), f_attn_temp_offset(f_attn_temp_offset) {}
162
    virtual ~llm_graph_input_attn_temp() = default;
163
164
    void set_input(const llama_ubatch * ubatch) override;
165
166
    ggml_tensor * attn_scale = nullptr; // F32 [n_batch]
167
168
    const uint32_t n_attn_temp_floor_scale;
169
    const float    f_attn_temp_scale;
170
    const float    f_attn_temp_offset;
171
};
172
173
class llm_graph_input_pos_bucket : public llm_graph_input_i {
174
public:
175
0
    llm_graph_input_pos_bucket(const llama_hparams & hparams) : hparams(hparams) {}
176
    virtual ~llm_graph_input_pos_bucket() = default;
177
178
    void set_input(const llama_ubatch * ubatch) override;
179
180
    ggml_tensor * pos_bucket = nullptr; // I32 [n_batch, n_batch]
181
182
    const llama_hparams hparams;
183
};
184
185
class llm_graph_input_pos_bucket_kv : public llm_graph_input_i {
186
public:
187
    llm_graph_input_pos_bucket_kv(
188
            const llama_hparams & hparams,
189
0
            const llama_kv_cache_context * mctx) : hparams(hparams), mctx(mctx) {}
190
    virtual ~llm_graph_input_pos_bucket_kv() = default;
191
192
    void set_input(const llama_ubatch * ubatch) override;
193
194
    ggml_tensor * pos_bucket = nullptr; // I32 [n_kv, n_batch]
195
196
    const llama_hparams hparams;
197
198
    const llama_kv_cache_context * mctx;
199
};
200
201
class llm_graph_input_out_ids : public llm_graph_input_i {
202
public:
203
    llm_graph_input_out_ids(
204
            const llama_hparams & hparams,
205
            const llama_cparams & cparams,
206
0
            uint32_t n_outputs) : hparams(hparams), cparams(cparams), n_outputs(n_outputs) {}
207
0
    virtual ~llm_graph_input_out_ids() = default;
208
209
    void set_input(const llama_ubatch * ubatch) override;
210
211
    bool can_reuse(const llm_graph_params & params) override;
212
213
    ggml_tensor * out_ids; // I32 [n_outputs]
214
215
    const llama_hparams hparams;
216
    const llama_cparams cparams;
217
218
    const uint32_t n_outputs;
219
};
220
221
class llm_graph_input_mean : public llm_graph_input_i {
222
public:
223
0
    llm_graph_input_mean(const llama_cparams & cparams) : cparams(cparams) {}
224
0
    virtual ~llm_graph_input_mean() = default;
225
226
    void set_input(const llama_ubatch * ubatch) override;
227
228
    ggml_tensor * mean; // F32 [n_batch, n_batch]
229
230
    const llama_cparams cparams;
231
};
232
233
class llm_graph_input_cls : public llm_graph_input_i {
234
public:
235
0
    llm_graph_input_cls(const llama_cparams & cparams, const llm_arch arch) : cparams(cparams), arch(arch) {}
236
0
    virtual ~llm_graph_input_cls() = default;
237
238
    void set_input(const llama_ubatch * ubatch) override;
239
240
    ggml_tensor * cls; // I32 [n_batch]
241
242
    const llama_cparams cparams;
243
    const llm_arch arch;
244
};
245
246
class llm_graph_input_rs : public llm_graph_input_i {
247
public:
248
0
    llm_graph_input_rs(const llama_memory_recurrent_context * mctx) : mctx(mctx) {}
249
    virtual ~llm_graph_input_rs() = default;
250
251
    void set_input(const llama_ubatch * ubatch) override;
252
253
    bool can_reuse(const llm_graph_params & params) override;
254
255
    ggml_tensor * s_copy;  // I32 [n_rs]
256
257
    // views of s_copy, computed once per graph
258
    // and shared across layers which use build_rs
259
    ggml_tensor * s_copy_main;   // I32 [n_seqs]
260
    ggml_tensor * s_copy_extra;  // I32 [n_rs - n_seqs]
261
262
    const llama_memory_recurrent_context * mctx;
263
264
    // used in view offsets, need to match for valid graph reuse
265
    uint32_t head;
266
    int32_t rs_z;
267
};
268
269
class llm_graph_input_cross_embd : public llm_graph_input_i {
270
public:
271
    llm_graph_input_cross_embd(
272
0
            const llama_cross * cross) : cross(cross) {}
273
    virtual ~llm_graph_input_cross_embd() = default;
274
275
    void set_input(const llama_ubatch * ubatch) override;
276
277
    ggml_tensor * cross_embd; // F32 [n_embd, n_outputs_enc]
278
279
    const llama_cross * cross;
280
};
281
282
class llm_graph_input_attn_no_cache : public llm_graph_input_i {
283
public:
284
    llm_graph_input_attn_no_cache(const llama_hparams & hparams, const llama_cparams & cparams) :
285
0
        hparams(hparams),
286
0
        cparams(cparams) {
287
0
    }
288
0
    ~llm_graph_input_attn_no_cache() = default;
289
290
    void set_input(const llama_ubatch * ubatch) override;
291
292
0
    ggml_tensor * get_kq_mask()     const { return self_kq_mask_cnv; }
293
0
    ggml_tensor * get_kq_mask_swa() const { return self_kq_mask_swa_cnv; }
294
295
    // n_tokens == n_batch
296
    ggml_tensor * self_kq_mask         = nullptr; // F32/F16 [n_tokens, n_batch/n_stream, 1, n_stream]
297
    ggml_tensor * self_kq_mask_cnv     = nullptr; //         [n_tokens, n_batch/n_stream, 1, n_stream]
298
    ggml_tensor * self_kq_mask_swa     = nullptr; // F32/F16 [n_tokens, n_batch/n_stream, 1, n_stream]
299
    ggml_tensor * self_kq_mask_swa_cnv = nullptr; //         [n_tokens, n_batch/n_stream, 1, n_stream]
300
301
    const llama_hparams hparams;
302
    const llama_cparams cparams;
303
};
304
305
class llm_graph_input_attn_kv : public llm_graph_input_i {
306
public:
307
    llm_graph_input_attn_kv(
308
            const llama_hparams & hparams,
309
            const llama_cparams & cparams,
310
            const llama_kv_cache_context * mctx) :
311
0
        hparams(hparams),
312
0
        cparams(cparams),
313
0
        mctx(mctx) {
314
0
    }
315
0
    ~llm_graph_input_attn_kv() = default;
316
317
    void set_input(const llama_ubatch * ubatch) override;
318
319
    bool can_reuse(const llm_graph_params & params) override;
320
321
0
    ggml_tensor * get_k_idxs() const { return self_k_idxs; }
322
0
    ggml_tensor * get_v_idxs() const { return self_v_idxs; }
323
324
0
    ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
325
326
    ggml_tensor * self_k_idxs = nullptr; // I64 [n_batch]
327
    ggml_tensor * self_v_idxs = nullptr; // I64 [n_batch] or [n_batch*n_embd_v_gqa]
328
329
    ggml_tensor * self_kq_mask     = nullptr; // F32/F16 [n_kv, n_batch/n_stream, 1, n_stream]
330
    ggml_tensor * self_kq_mask_cnv = nullptr; //         [n_kv, n_batch/n_stream, 1, n_stream]
331
332
    // note: assumes v_rot^2 == I
333
    ggml_tensor * self_k_rot = nullptr;
334
    ggml_tensor * self_v_rot = nullptr;
335
336
    // note: these have to be copies because in order to be able to reuse a graph, its inputs
337
    //       need to carry these parameters with them. otherwise, they can point to freed
338
    //       llm_graph_params from a previous batch, causing stack-use-after-return
339
    const llama_hparams hparams;
340
    const llama_cparams cparams;
341
342
    const llama_kv_cache_context * mctx;
343
};
344
345
// V-less input for the KV cache
346
// ref: https://github.com/ggml-org/llama.cpp/pull/19067
347
class llm_graph_input_attn_k : public llm_graph_input_i {
348
public:
349
    llm_graph_input_attn_k(
350
            const llama_hparams & hparams,
351
            const llama_cparams & cparams,
352
            const llama_kv_cache_context * mctx) :
353
0
        hparams(hparams),
354
0
        cparams(cparams),
355
0
        mctx(mctx) {
356
0
    }
357
0
    ~llm_graph_input_attn_k() = default;
358
359
    void set_input(const llama_ubatch * ubatch) override;
360
361
    bool can_reuse(const llm_graph_params & params) override;
362
363
0
    ggml_tensor * get_k_idxs() const { return self_k_idxs; }
364
365
0
    ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
366
367
    ggml_tensor * self_k_idxs = nullptr; // I64 [n_batch]
368
369
    ggml_tensor * self_kq_mask     = nullptr; // F32/F16 [n_kv, n_batch/n_stream, 1, n_stream]
370
    ggml_tensor * self_kq_mask_cnv = nullptr; //         [n_kv, n_batch/n_stream, 1, n_stream]
371
372
    const llama_hparams hparams;
373
    const llama_cparams cparams;
374
375
    const llama_kv_cache_context * mctx;
376
};
377
378
class llm_graph_input_attn_k_dsa : public llm_graph_input_i {
379
public:
380
    llm_graph_input_attn_k_dsa(
381
            const llama_hparams & hparams,
382
            const llama_cparams & cparams,
383
            const llama_kv_cache_dsa_context * mctx) :
384
0
        hparams(hparams),
385
0
        cparams(cparams),
386
0
        mctx(mctx) {
387
0
    }
388
0
    ~llm_graph_input_attn_k_dsa() = default;
389
390
    void set_input(const llama_ubatch * ubatch) override;
391
392
    bool can_reuse(const llm_graph_params & params) override;
393
394
0
    ggml_tensor * get_k_idxs_mla() const { return self_k_idxs_mla; }
395
0
    ggml_tensor * get_k_idxs_lid() const { return self_k_idxs_lid; }
396
397
0
    ggml_tensor * get_kq_mask_mla() const { return self_kq_mask_mla_cnv; }
398
0
    ggml_tensor * get_kq_mask_lid() const { return self_kq_mask_lid; }
399
400
    ggml_tensor * self_k_idxs_mla = nullptr; // I64 [n_batch]
401
    ggml_tensor * self_k_idxs_lid = nullptr; // I64 [n_batch]
402
403
    ggml_tensor * self_kq_mask_mla     = nullptr; // F32/F16 [n_kv, n_batch/n_stream, 1, n_stream]
404
    ggml_tensor * self_kq_mask_mla_cnv = nullptr; //         [n_kv, n_batch/n_stream, 1, n_stream]
405
    ggml_tensor * self_kq_mask_lid     = nullptr; // F32     [n_kv, n_batch/n_stream, 1, n_stream]
406
    ggml_tensor * self_kq_mask_lid_cnv = nullptr; //         [n_kv, n_batch/n_stream, 1, n_stream]
407
408
    ggml_tensor * self_k_rot_lid = nullptr;
409
410
    const llama_hparams hparams;
411
    const llama_cparams cparams;
412
413
    const llama_kv_cache_dsa_context * mctx;
414
};
415
416
class llm_graph_input_attn_kv_iswa : public llm_graph_input_i {
417
public:
418
    llm_graph_input_attn_kv_iswa(
419
            const llama_hparams & hparams,
420
            const llama_cparams & cparams,
421
            const llama_kv_cache_iswa_context * mctx) :
422
0
        hparams(hparams),
423
0
        cparams(cparams),
424
0
        mctx(mctx) {
425
0
    }
426
0
    ~llm_graph_input_attn_kv_iswa() = default;
427
428
    void set_input(const llama_ubatch * ubatch) override;
429
430
    bool can_reuse(const llm_graph_params & params) override;
431
432
0
    ggml_tensor * get_k_idxs()     const { return self_k_idxs; }
433
0
    ggml_tensor * get_v_idxs()     const { return self_v_idxs; }
434
0
    ggml_tensor * get_k_idxs_swa() const { return self_k_idxs_swa; }
435
0
    ggml_tensor * get_v_idxs_swa() const { return self_v_idxs_swa; }
436
437
0
    ggml_tensor * get_kq_mask()     const { return self_kq_mask_cnv; }
438
0
    ggml_tensor * get_kq_mask_swa() const { return self_kq_mask_swa_cnv; }
439
440
    ggml_tensor * self_k_idxs     = nullptr; // I64 [n_batch]
441
    ggml_tensor * self_v_idxs     = nullptr; // I64 [n_batch] or [n_batch*n_embd_v_gqa]
442
    ggml_tensor * self_k_idxs_swa = nullptr; // I64 [n_batch]
443
    ggml_tensor * self_v_idxs_swa = nullptr; // I64 [n_batch] or [n_batch*n_embd_v_gqa]
444
445
    ggml_tensor * self_kq_mask         = nullptr; // F32/F16 [n_kv, n_batch/n_stream, 1, n_stream]
446
    ggml_tensor * self_kq_mask_cnv     = nullptr; //         [n_kv, n_batch/n_stream, 1, n_stream]
447
    ggml_tensor * self_kq_mask_swa     = nullptr; // F32/F16 [n_kv, n_batch/n_stream, 1, n_stream]
448
    ggml_tensor * self_kq_mask_swa_cnv = nullptr; //         [n_kv, n_batch/n_stream, 1, n_stream]
449
450
    ggml_tensor * self_k_rot = nullptr;
451
    ggml_tensor * self_v_rot = nullptr;
452
453
    ggml_tensor * self_k_rot_swa = nullptr;
454
    ggml_tensor * self_v_rot_swa = nullptr;
455
456
    const llama_hparams hparams;
457
    const llama_cparams cparams;
458
459
    const llama_kv_cache_iswa_context * mctx;
460
};
461
462
class llm_graph_input_attn_cross : public llm_graph_input_i {
463
public:
464
0
    llm_graph_input_attn_cross(const llama_cross * cross) : cross(cross) {}
465
    ~llm_graph_input_attn_cross() = default;
466
467
    void set_input(const llama_ubatch * ubatch) override;
468
469
0
    ggml_tensor * get_kq_mask_cross() const { return cross_kq_mask_cnv; }
470
471
    ggml_tensor * cross_kq_mask     = nullptr; // F32/F16 [n_outputs_enc, n_batch, 1, 1]
472
    ggml_tensor * cross_kq_mask_cnv = nullptr; // F32/F16 [n_outputs_enc, n_batch, 1, 1]
473
474
    const llama_cross * cross = nullptr;
475
};
476
477
class llm_graph_input_mem_hybrid : public llm_graph_input_i {
478
public:
479
    llm_graph_input_mem_hybrid(
480
            const llama_cparams & cparams,
481
            std::unique_ptr<llm_graph_input_attn_kv> inp_attn,
482
            std::unique_ptr<llm_graph_input_rs>      inp_rs,
483
            const llama_memory_hybrid_context *      mctx) :
484
0
        inp_attn(std::move(inp_attn)),
485
0
        inp_rs(std::move(inp_rs)),
486
0
        cparams(cparams),
487
0
        mctx(mctx) { }
488
0
    virtual ~llm_graph_input_mem_hybrid() = default;
489
490
    void set_input(const llama_ubatch * ubatch) override;
491
492
    bool can_reuse(const llm_graph_params & params) override;
493
494
    std::unique_ptr<llm_graph_input_attn_kv> inp_attn;
495
    std::unique_ptr<llm_graph_input_rs>      inp_rs;
496
497
0
    llm_graph_input_attn_kv * get_attn() const { return inp_attn.get(); }
498
0
    llm_graph_input_rs      * get_recr() const { return inp_rs.get(); }
499
500
    const llama_cparams cparams;
501
502
    const llama_memory_hybrid_context * mctx;
503
};
504
505
class llm_graph_input_mem_hybrid_k : public llm_graph_input_i {
506
public:
507
    llm_graph_input_mem_hybrid_k(
508
            const llama_cparams & cparams,
509
            std::unique_ptr<llm_graph_input_attn_k> inp_attn,
510
            std::unique_ptr<llm_graph_input_rs>      inp_rs,
511
            const llama_memory_hybrid_context *      mctx) :
512
0
        inp_attn(std::move(inp_attn)),
513
0
        inp_rs(std::move(inp_rs)),
514
0
        cparams(cparams),
515
0
        mctx(mctx) { }
516
0
    virtual ~llm_graph_input_mem_hybrid_k() = default;
517
518
    void set_input(const llama_ubatch * ubatch) override;
519
520
    bool can_reuse(const llm_graph_params & params) override;
521
522
    std::unique_ptr<llm_graph_input_attn_k> inp_attn;
523
    std::unique_ptr<llm_graph_input_rs>      inp_rs;
524
525
0
    llm_graph_input_attn_k * get_attn() const { return inp_attn.get(); }
526
0
    llm_graph_input_rs      * get_recr() const { return inp_rs.get(); }
527
528
    const llama_cparams cparams;
529
530
    const llama_memory_hybrid_context * mctx;
531
};
532
533
class llm_graph_input_mem_hybrid_iswa : public llm_graph_input_i {
534
public:
535
    llm_graph_input_mem_hybrid_iswa(
536
            const llama_cparams & cparams,
537
            std::unique_ptr<llm_graph_input_attn_kv_iswa> inp_attn,
538
            std::unique_ptr<llm_graph_input_rs>          inp_rs,
539
            const llama_memory_hybrid_iswa_context *     mctx) :
540
0
        inp_attn(std::move(inp_attn)),
541
0
        inp_rs(std::move(inp_rs)),
542
0
        cparams(cparams),
543
0
        mctx(mctx) { }
544
0
    virtual ~llm_graph_input_mem_hybrid_iswa() = default;
545
546
    void set_input(const llama_ubatch * ubatch) override;
547
548
    bool can_reuse(const llm_graph_params & params) override;
549
550
    std::unique_ptr<llm_graph_input_attn_kv_iswa> inp_attn;
551
    std::unique_ptr<llm_graph_input_rs>          inp_rs;
552
553
0
    llm_graph_input_attn_kv_iswa * get_attn() const { return inp_attn.get(); }
554
0
    llm_graph_input_rs           * get_recr() const { return inp_rs.get(); }
555
556
    const llama_cparams cparams;
557
558
    const llama_memory_hybrid_iswa_context * mctx;
559
};
560
561
class llm_graph_input_sampling : public llm_graph_input_i {
562
public:
563
    llm_graph_input_sampling(std::map<llama_seq_id, llama_sampler *> samplers) :
564
0
        samplers(std::move(samplers)) { }
565
0
    virtual ~llm_graph_input_sampling() = default;
566
567
    void set_input(const llama_ubatch * ubatch) override;
568
    bool can_reuse(const llm_graph_params & params) override;
569
570
    std::map<llama_seq_id, llama_sampler *> samplers;
571
};
572
573
//
574
// llm_graph_result
575
//
576
577
// these objects deliver the result from the graph build process back to the llama_context
578
// note that the input tensors created for the graph are referenced here - the goal is to be able to populate their
579
//   specific data, by calling the set_inputs() method
580
// along with the input tensors, the object also provides commonly used output tensors, such as logits, embeddings, etc.
581
//   these are used by the llama_context to extract the relevant data, based on the compute parameters
582
583
// callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
584
using llm_graph_cb = std::function<void(const llama_ubatch & ubatch, ggml_tensor * cur, const char * name, int il)>;
585
586
class llm_graph_result;
587
588
struct llm_graph_params {
589
    llm_arch arch = LLM_ARCH_UNKNOWN;
590
591
    llama_hparams hparams;
592
    llama_cparams cparams;
593
594
    llama_ubatch ubatch; // note: intentionally make a copy
595
596
    llm_graph_type gtype;
597
598
    ggml_backend_sched_t sched;
599
    ggml_backend_t backend_cpu;
600
601
    const llama_adapter_cvec     * cvec;
602
    const llama_adapter_loras    * loras;
603
    const llama_memory_context_i * mctx;
604
    const llama_cross            * cross;
605
606
    std::map<llama_seq_id, llama_sampler *> samplers;
607
608
    static bool samplers_equal(
609
          const std::map<llama_seq_id, llama_sampler *> & lhs,
610
0
          const std::map<llama_seq_id, llama_sampler *> & rhs) {
611
0
        if (lhs.size() != rhs.size()) {
612
0
            return false;
613
0
        }
614
0
        for (const auto & [seq_id, sampler] : lhs) {
615
0
            auto it = rhs.find(seq_id);
616
0
            if (it == rhs.end() || it->second != sampler) {
617
0
                return false;
618
0
            }
619
0
        }
620
0
        return true;
621
0
    }
622
623
    uint32_t n_outputs;
624
625
    llm_graph_cb cb;
626
627
    llm_graph_result * res;
628
629
    // return true if the "other" params would result in a graph with the same topology as with the current params
630
    //   having the same topology allows us to reuse the graph in some cases
631
0
    bool allow_reuse(const llm_graph_params & other) const {
632
        // first check the ubatch
633
0
        bool can_reuse_ubatch =
634
0
            ubatch.equal_seqs() == other.ubatch.equal_seqs() &&
635
0
            ubatch.n_tokens     == other.ubatch.n_tokens &&
636
0
            ubatch.n_seq_tokens == other.ubatch.n_seq_tokens &&
637
0
            ubatch.n_seqs       == other.ubatch.n_seqs &&
638
0
            ubatch.n_seqs_unq   == other.ubatch.n_seqs_unq &&
639
0
            (
640
0
                (!ubatch.token && !other.ubatch.token) ||
641
0
                (!ubatch.embd  && !other.ubatch.embd)  ||
642
0
                (ubatch.token && other.ubatch.token && ubatch.embd && other.ubatch.embd)
643
0
            );
644
645
        // when we split the batch using "equal_seqs" we have to verify that the participating sequences are the same
646
        //   the reason is because the set of attention streams would be different for different sequences
647
0
        if (can_reuse_ubatch && ubatch.equal_seqs()) {
648
0
            if (!ubatch.data) {
649
                // if the old ubatch does not own it's data, then we cannot guarantee that it is still alive, and
650
                //   therefore we cannot perform the sequence id check. normally should never happen
651
0
                can_reuse_ubatch = false;
652
0
            } else {
653
0
                for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) {
654
0
                    can_reuse_ubatch &= ubatch.seq_id_unq[s] == other.ubatch.seq_id_unq[s];
655
0
                }
656
0
            }
657
0
        }
658
659
0
        if (!can_reuse_ubatch) {
660
0
            return false;
661
0
        }
662
663
0
        if (n_outputs != other.n_outputs) {
664
0
            return false;
665
0
        }
666
667
0
        if (!samplers_equal(samplers, other.samplers)) {
668
0
            return false;
669
0
        }
670
671
0
        if (samplers.size() > 0) {
672
0
            if (!ubatch.data || !other.ubatch.data) {
673
0
                return false;
674
0
            }
675
676
            // check that the outputs are the same for all samplers
677
0
            for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
678
0
                if (ubatch.output[i]    != other.ubatch.output[i] ||
679
0
                    ubatch.seq_id[i][0] != other.ubatch.seq_id[i][0]) {
680
0
                    return false;
681
0
                }
682
0
            }
683
0
        }
684
685
0
        return
686
0
            cparams.embeddings  == other.cparams.embeddings  &&
687
0
            cparams.causal_attn == other.cparams.causal_attn &&
688
0
            arch  == other.arch  &&
689
0
            gtype == other.gtype &&
690
0
            cvec  == other.cvec  &&
691
0
            loras == other.loras &&
692
0
            cross == other.cross;
693
0
    }
694
};
695
696
class llm_graph_result {
697
public:
698
    llm_graph_result(int64_t max_nodes);
699
700
0
    virtual ~llm_graph_result() = default;
701
702
0
    ggml_tensor * get_inp_tokens()  const { return t_inp_tokens; }
703
0
    ggml_tensor * get_logits()      const { return t_logits; }
704
0
    ggml_tensor * get_embd()        const { return t_embd; }
705
0
    ggml_tensor * get_embd_pooled() const { return t_embd_pooled; }
706
0
    ggml_tensor * get_h_nextn()     const { return t_h_nextn; }
707
708
0
    ggml_tensor * get_layer_inp(int il) const { return t_layer_inp[il]; }
709
710
0
    ggml_cgraph  * get_gf()  const { return gf; }
711
0
    ggml_context * get_ctx() const { return ctx_compute.get(); }
712
713
    int64_t get_max_nodes() const;
714
715
    void reset();
716
717
    void set_inputs(const llama_ubatch * ubatch);
718
    void set_outputs(const llm_graph_params & params);
719
720
    // try to update the existing graph result using the new graph parameters in order to reuse it
721
    // this can only be done if we determine that the resulting graph using the new graph parameters
722
    //   would be identical to the existing graph. in that case, we simply have to update the memory
723
    //   contexts of the input tensors of the graph and we can reuse it for another computation
724
    // return true if the graph was updated and can be reused
725
    bool can_reuse(const llm_graph_params & params);
726
727
    llm_graph_input_i * add_input(llm_graph_input_ptr input);
728
729
    void set_params(const llm_graph_params & params);
730
731
    // important graph nodes
732
    ggml_tensor * t_inp_tokens  = nullptr;
733
    ggml_tensor * t_inp_embd    = nullptr; // [n_embd_inp, n_tokens]
734
    ggml_tensor * t_logits      = nullptr;
735
    ggml_tensor * t_embd        = nullptr;
736
    ggml_tensor * t_embd_pooled = nullptr;
737
    ggml_tensor * t_h_nextn     = nullptr; // [n_embd, n_outputs] hidden state before final output norm
738
739
    std::vector<ggml_tensor *> t_layer_inp;
740
741
    std::map<llama_seq_id, ggml_tensor *> t_sampled_logits;
742
    std::map<llama_seq_id, ggml_tensor *> t_candidates;
743
    std::map<llama_seq_id, ggml_tensor *> t_sampled;
744
    std::map<llama_seq_id, ggml_tensor *> t_sampled_probs;
745
746
    std::vector<llm_graph_input_ptr> inputs;
747
748
    ggml_context_ptr ctx_compute;
749
750
    // memory buffers used to evaluate the model
751
    std::vector<uint8_t> buf_compute_meta;
752
753
    ggml_cgraph * gf;
754
755
    int64_t max_nodes;
756
757
private:
758
    // keep a copy of the previous graph parameters
759
    // we will use this to determine whether the graph can be reused by comparing them with the new parameters
760
    // note: these are updated after constructing the new graph
761
    llm_graph_params params;
762
763
    // env: LLAMA_GRAPH_RESULT_DEBUG
764
    int debug = 0;
765
};
766
767
using llm_graph_result_ptr = std::unique_ptr<llm_graph_result>;
768
769
//
770
// llm_graph_context
771
//
772
773
// used in build_rs to properly order writes and avoid unnecessary copies
774
using llm_graph_get_rows_fn = std::function<ggml_tensor * (ggml_context *, ggml_tensor * states, ggml_tensor * ids)>;
775
776
struct llm_graph_qkv {
777
    ggml_tensor * q; // [n_embd_head, n_head,    n_tokens]
778
    ggml_tensor * k; // [n_embd_head, n_head_kv, n_tokens]
779
    ggml_tensor * v; // [n_embd_head, n_head_kv, n_tokens]
780
};
781
782
// context used while building a compute graph: bundles the parameters, cached sizes and the
// build_* helpers declared below
// - holds reference members (hparams, cparams, ubatch, cb_func) and const members, so the referenced
//   objects must outlive the context and the context cannot be copy/move-assigned
// - has a virtual destructor, i.e. it can be used as a polymorphic base
// - every build_* helper declared here is a const member function
struct llm_graph_context {
783
    const llm_arch arch;
784
785
    const llama_hparams & hparams;
786
    const llama_cparams & cparams;
787
    const llama_ubatch  & ubatch;
788
789
    // NOTE(review): the sizes and factors below are presumably cached from hparams/cparams/ubatch by the constructor - confirm
    const int64_t n_embd;
790
    const int64_t n_layer;
791
    const int64_t n_layer_nextn;
792
    const int64_t n_rot;
793
    const int64_t n_ctx;       // user-specified context size (can be different from n_ctx_train)
794
    const int64_t n_head;
795
    const int64_t n_head_kv;
796
    const int64_t n_embd_head_k;
797
    const int64_t n_embd_k_gqa;
798
    const int64_t n_embd_head_v;
799
    const int64_t n_embd_v_gqa;
800
    const int64_t n_expert;
801
    const int64_t n_expert_used;
802
803
    const float freq_base;
804
    const float freq_scale;
805
    const float ext_factor;
806
    const float attn_factor;
807
    const float beta_fast;
808
    const float beta_slow;
809
    const float norm_eps;
810
    const float norm_rms_eps;
811
812
    const int64_t n_tokens;
813
    const int64_t n_outputs;
814
    const int32_t n_ctx_orig; // yarn
815
816
    const enum llama_pooling_type pooling_type;
817
    const enum llama_rope_type    rope_type;
818
819
    ggml_backend_sched_t sched;
820
821
    ggml_backend_t backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?
822
823
    // NOTE(review): raw pointers, presumably non-owning and taken from the params - confirm
    const llama_adapter_cvec     * cvec;
824
    const llama_adapter_loras    * loras;
825
    const llama_memory_context_i * mctx;
826
    const llama_cross            * cross;
827
828
    // samplers keyed by sequence id
    std::map<llama_seq_id, llama_sampler *> samplers;
829
830
    // reference member: the callback object must outlive this context
    const llm_graph_cb & cb_func;
831
832
    // NOTE(review): raw pointer, presumably the (non-owning) result object that the build_* helpers fill in - confirm
    llm_graph_result * res;
833
834
    ggml_context * ctx0 = nullptr;
835
    ggml_cgraph  * gf   = nullptr;
836
837
    // NOTE(review): single-argument and not `explicit`, so llm_graph_params converts implicitly - confirm this is intended
    llm_graph_context(const llm_graph_params & params);
838
0
    virtual ~llm_graph_context() = default;
839
840
    // NOTE(review): presumably forwards (cur, name, il) to cb_func - confirm
    void cb(ggml_tensor * cur, const char * name, int il) const;
841
842
    //
843
    // common
844
    //
845
846
    // NOTE(review): presumably applies the control vector (cvec) of layer il to cur - confirm
    ggml_tensor * build_cvec(
847
             ggml_tensor * cur,
848
                     int   il) const;
849
850
    // do mat_mul, optionally applying lora and a per-tensor scale
    // w_s is optional (defaults to nullptr)
851
    ggml_tensor * build_lora_mm(
852
              ggml_tensor * w,
853
              ggml_tensor * cur,
854
              ggml_tensor * w_s = nullptr) const;
855
856
    // do mat_mul_id, optionally applying lora
857
    ggml_tensor * build_lora_mm_id(
858
              ggml_tensor * w,   // ggml_tensor * as
859
              ggml_tensor * cur, // ggml_tensor * b
860
              ggml_tensor * ids) const;
861
862
    // type is one of LLM_NORM, LLM_NORM_RMS, LLM_NORM_GROUP
    // NOTE(review): mw / mb are presumably the norm weight and bias - confirm
    ggml_tensor * build_norm(
863
             ggml_tensor * cur,
864
             ggml_tensor * mw,
865
             ggml_tensor * mb,
866
           llm_norm_type   type,
867
                     int   il) const;
868
869
870
    // compute Q, K, V projections with optional bias and reshape
871
    // supports both fused wqkv and separate wq/wk/wv paths
872
    llm_graph_qkv build_qkv(
873
        const llama_layer & layer,
874
              ggml_tensor * cur,
875
                  int64_t   n_embd_head,
876
                  int64_t   n_head,
877
                  int64_t   n_head_kv,
878
                      int   il) const;
879
880
    // type_gate is LLM_FFN_SEQ or LLM_FFN_PAR (ffn_gate parallel to ffn_up)
    // NOTE(review): the *_b / *_s arguments are presumably the bias / scale tensors of the matching weight - confirm
    ggml_tensor * build_ffn(
881
             ggml_tensor * cur,
882
             ggml_tensor * up,
883
             ggml_tensor * up_b,
884
             ggml_tensor * up_s,
885
             ggml_tensor * gate,
886
             ggml_tensor * gate_b,
887
             ggml_tensor * gate_s,
888
             ggml_tensor * down,
889
             ggml_tensor * down_b,
890
             ggml_tensor * down_s,
891
             ggml_tensor * act_scales,
892
         llm_ffn_op_type   type_op,
893
       llm_ffn_gate_type   type_gate,
894
                     int   il) const;
895
896
    // build MoE FFN without bias tensors
    // the trailing probs_in / gate_up_exps / *_exps_s arguments are optional (default to nullptr)
897
    ggml_tensor * build_moe_ffn(
898
             ggml_tensor * cur,
899
             ggml_tensor * gate_inp,
900
             ggml_tensor * up_exps,
901
             ggml_tensor * gate_exps,
902
             ggml_tensor * down_exps,
903
             ggml_tensor * exp_probs_b,
904
                 int64_t   n_expert,
905
                 int64_t   n_expert_used,
906
         llm_ffn_op_type   type_op,
907
                    bool   norm_w,
908
                   float   w_scale,
909
            llama_expert_gating_func_type gating_op,
910
                     int   il,
911
             ggml_tensor * probs_in = nullptr,
912
             ggml_tensor * gate_up_exps = nullptr,
913
             ggml_tensor * up_exps_s = nullptr,
914
             ggml_tensor * gate_exps_s = nullptr,
915
             ggml_tensor * down_exps_s = nullptr) const;
916
917
    // overload that additionally accepts bias tensors
    // (gate_inp_b, up_exps_b, gate_exps_b, down_exps_b and the optional gate_up_exps_b)
    ggml_tensor * build_moe_ffn(
918
             ggml_tensor * cur,
919
             ggml_tensor * gate_inp,
920
             ggml_tensor * gate_inp_b,
921
             ggml_tensor * up_exps,
922
             ggml_tensor * up_exps_b,
923
             ggml_tensor * gate_exps,
924
             ggml_tensor * gate_exps_b,
925
             ggml_tensor * down_exps,
926
             ggml_tensor * down_exps_b,
927
             ggml_tensor * exp_probs_b,
928
                 int64_t   n_expert,
929
                 int64_t   n_expert_used,
930
         llm_ffn_op_type   type_op,
931
                    bool   norm_w,
932
                   float   w_scale,
933
            llama_expert_gating_func_type gating_op,
934
                     int   il,
935
             ggml_tensor * probs_in = nullptr,
936
             ggml_tensor * gate_up_exps = nullptr,
937
             ggml_tensor * gate_up_exps_b = nullptr,
938
             ggml_tensor * up_exps_s = nullptr,
939
             ggml_tensor * gate_exps_s = nullptr,
940
             ggml_tensor * down_exps_s = nullptr) const;
941
942
    //
943
    // inputs
944
    //
945
946
    ggml_tensor * build_inp_embd(ggml_tensor * tok_embd) const;
947
    ggml_tensor * build_inp_pos() const;
948
    ggml_tensor * build_inp_attn_scale() const;
949
    ggml_tensor * build_inp_out_ids() const;
950
    ggml_tensor * build_inp_mean() const;
951
    ggml_tensor * build_inp_cls() const;
952
953
    ggml_tensor * build_inp_cross_embd() const;
954
    ggml_tensor * build_inp_pos_bucket_enc() const;
955
    ggml_tensor * build_inp_pos_bucket_dec() const;
956
    ggml_tensor * build_pos_bias(ggml_tensor * pos_bucket, ggml_tensor * attn_rel_b) const;
957
958
    //
959
    // attention
960
    //
961
962
    ggml_tensor * build_attn_mha(
963
            ggml_tensor * q,       // [n_embd_head_q, n_head_q, n_tokens]
964
            ggml_tensor * k,       // [n_embd_head_k, n_head_k, n_tokens]
965
            ggml_tensor * v,       // [n_embd_head_v, n_head_v, n_tokens] (v_trans == false)
966
            ggml_tensor * kq_b,
967
            ggml_tensor * kq_mask,
968
            ggml_tensor * sinks,   // [n_head_q]
969
            ggml_tensor * v_mla,   // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
970
                  float   kq_scale,
971
                    int   il) const;
972
973
    // each build_attn overload below takes the input object type returned by the
    // build_attn_inp_*() helper that is declared right before it
    llm_graph_input_attn_no_cache * build_attn_inp_no_cache() const;
974
975
    ggml_tensor * build_attn(
976
            llm_graph_input_attn_no_cache * inp,
977
            ggml_tensor * wo,
978
            ggml_tensor * wo_b,
979
            ggml_tensor * wo_s,
980
            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
981
            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
982
            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
983
            ggml_tensor * kq_b,
984
            ggml_tensor * sinks, // [n_head_q]
985
            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
986
                  float   kq_scale,
987
                    int   il) const;
988
989
    llm_graph_input_attn_kv * build_attn_inp_kv() const;
990
991
    ggml_tensor * build_attn(
992
            llm_graph_input_attn_kv * inp,
993
            ggml_tensor * wo,
994
            ggml_tensor * wo_b,
995
            ggml_tensor * wo_s,
996
            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
997
            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
998
            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
999
            ggml_tensor * kq_b,
1000
            ggml_tensor * sinks, // [n_head_q]
1001
            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v] // TODO: remove
1002
                  float   kq_scale,
1003
                    int   il) const;
1004
1005
    llm_graph_input_attn_k  * build_attn_inp_k() const;
1006
1007
    ggml_tensor * build_attn(
1008
            llm_graph_input_attn_k * inp,
1009
            ggml_tensor * wo,
1010
            ggml_tensor * wo_b,
1011
            ggml_tensor * wo_s,
1012
            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
1013
            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
1014
            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
1015
            ggml_tensor * kq_b,
1016
            ggml_tensor * sinks, // [n_head_q]
1017
            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
1018
                  float   kq_scale,
1019
                    int   il) const;
1020
1021
    llm_graph_input_attn_k_dsa * build_attn_inp_k_dsa() const;
1022
1023
    // unlike the other build_attn overloads, this one takes an extra top_k tensor
    ggml_tensor * build_attn(
1024
            llm_graph_input_attn_k_dsa * inp,
1025
            ggml_tensor * wo,
1026
            ggml_tensor * wo_b,
1027
            ggml_tensor * wo_s,
1028
            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
1029
            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
1030
            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
1031
            ggml_tensor * kq_b,
1032
            ggml_tensor * sinks, // [n_head_q]
1033
            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
1034
            ggml_tensor * top_k, // [n_indexer_top_k, n_tokens]
1035
                  float   kq_scale,
1036
                    int   il) const;
1037
1038
    llm_graph_input_attn_kv_iswa * build_attn_inp_kv_iswa() const;
1039
1040
    // note: if k_cur or v_cur are not provided, they will not be stored in the memory
1041
    ggml_tensor * build_attn(
1042
            llm_graph_input_attn_kv_iswa * inp,
1043
            ggml_tensor * wo,
1044
            ggml_tensor * wo_b,
1045
            ggml_tensor * wo_s,
1046
            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
1047
            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] optional
1048
            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] optional
1049
            ggml_tensor * kq_b,
1050
            ggml_tensor * sinks, // [n_head_q]
1051
            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
1052
                  float   kq_scale,
1053
                    int   il) const;
1054
1055
    llm_graph_input_attn_cross * build_attn_inp_cross() const;
1056
1057
    ggml_tensor * build_attn(
1058
            llm_graph_input_attn_cross * inp,
1059
            ggml_tensor * wo,
1060
            ggml_tensor * wo_b,
1061
            ggml_tensor * wo_s,
1062
            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
1063
            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
1064
            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
1065
            ggml_tensor * kq_b,
1066
            ggml_tensor * sinks, // [n_head_q]
1067
            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
1068
                  float   kq_scale,
1069
                    int   il) const;
1070
1071
    //
1072
    // recurrent
1073
    //
1074
1075
    // TODO: move this implementation to llama_memory_recurrent.
1076
    //       this is analogous to llama_kv_cache::cpy_k / cpy_v
1077
    //       when moving, avoid passing `ggml_cgraph` - only pass `ggml_context`. would likely need to split the
1078
    //         implementation in 2 separate methods. the goal is to avoid calling `ggml_build_forward_expand` in
1079
    //         `llama_memory_recurrent`
1080
    ggml_tensor * build_rs(
1081
            ggml_tensor * s,
1082
            ggml_tensor * state_copy_main,
1083
            ggml_tensor * state_copy_extra,
1084
                int32_t   state_size,
1085
                int32_t   n_seqs,
1086
               uint32_t   n_rs,
1087
               uint32_t   rs_head,
1088
               uint32_t   rs_size,
1089
                int32_t   rs_zero,
1090
            const llm_graph_get_rows_fn & get_state_rows = ggml_get_rows) const;
1091
1092
    llm_graph_input_rs * build_rs_inp() const;
1093
1094
    // overload taking the llm_graph_input_rs returned by build_rs_inp() instead of the explicit
    // state_copy_* tensors and rs_* values
    ggml_tensor * build_rs(
1095
            llm_graph_input_rs * inp,
1096
            ggml_tensor * s,
1097
                int32_t   state_size,
1098
                int32_t   n_seqs,
1099
            const llm_graph_get_rows_fn & get_state_rows = ggml_get_rows) const;
1100
1101
    // NOTE(review): the `ubatch` parameter has the same name as the `ubatch` member and hides it
    //               inside the definition - confirm that this is intended
    ggml_tensor * build_rwkv_token_shift_load(
1102
        llm_graph_input_rs * inp,
1103
        const llama_ubatch & ubatch,
1104
                       int   il) const;
1105
1106
    // NOTE(review): same `ubatch` parameter/member name clash as in build_rwkv_token_shift_load
    ggml_tensor * build_rwkv_token_shift_store(
1107
             ggml_tensor * token_shift,
1108
      const llama_ubatch & ubatch,
1109
                     int   il) const;
1110
    //
1111
    // hybrid
1112
    //
1113
1114
    llm_graph_input_mem_hybrid * build_inp_mem_hybrid() const;
1115
    llm_graph_input_mem_hybrid_k * build_inp_mem_hybrid_k() const;
1116
1117
    llm_graph_input_mem_hybrid_iswa * build_inp_mem_hybrid_iswa() const;
1118
1119
    //
1120
    // pooling
1121
    //
1122
1123
    // NOTE(review): returns void - presumably the pooled output is stored through `res` - confirm
    void build_pooling(
1124
            ggml_tensor * cls,
1125
            ggml_tensor * cls_b,
1126
            ggml_tensor * cls_out,
1127
            ggml_tensor * cls_out_b,
1128
            ggml_tensor * cls_norm) const;
1129
1130
    //
1131
    // sampling (backend sampling)
1132
    //
1133
1134
    // NOTE(review): returns void - presumably uses `samplers` and stores its outputs through `res` - confirm
    void build_sampling() const;
1135
1136
    //
1137
    // dense (out)
1138
    //
1139
1140
    // NOTE(review): returns void - presumably the result is stored through `res` - confirm
    void build_dense_out(
1141
            ggml_tensor * dense_2,
1142
            ggml_tensor * dense_2_b,
1143
            ggml_tensor * dense_3) const;
1144
};
1145
1146
// TODO: better name
1147
// NOTE(review): presumably maps the pair of positions (x, y) to one of n_buckets relative-position buckets,
//               with `bidirectional` selecting the bucketing mode, and is what feeds the pos_bucket inputs
//               (build_inp_pos_bucket_enc / build_inp_pos_bucket_dec) - confirm in the implementation
int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional);