/src/llama.cpp/src/llama-batch.h
#pragma once

#include "llama.h"

#include "llama-cparams.h"

#include <array>
#include <vector>
#include <set>
#include <bitset>
#include <memory>
#include <unordered_map>

// keep this struct lightweight
struct llama_ubatch {
    bool equal_seqs() const {
        return b_equal_seqs != 0;
    }

    // typical for M-RoPE cases:
    //   0 - sequential position of the tokens/embeddings in the sequence
    //   1 - y position in the image
    //   2 - x position in the image
    //   3 - other
    bool is_pos_2d() const {
        // TODO @ngxson : we may need to check for model arch when more models use >1 positions
        return n_pos >= 3;
    }
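
    // illustration (based on the comment above): an M-RoPE vision ubatch typically carries
    // n_pos >= 3 position values per token (sequential position + image y + image x [+ other]),
    // so is_pos_2d() returns true, while a plain text ubatch typically uses a single position
    // per token and is_pos_2d() returns false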

    uint32_t b_equal_seqs; // note: this is a boolean, but we use a uint32_t for alignment
                           //       otherwise address sanitizer complains
    // TODO: whole_seqs for embeddings?

    uint32_t n_tokens;     // total tokens (n_seq_tokens * n_seqs)
    uint32_t n_seq_tokens; // tokens per sequence set
    uint32_t n_seqs;       // sequence sets in the ubatch
    uint32_t n_seqs_unq;   // unique sequence ids in the ubatch
    uint32_t n_pos;        // number of position inputs for each token/embedding

    // seq_id_unq: unique sequence ids in the ubatch
    // seq_idx:    indices of the unique sequence ids in the ubatch in [0, n_seqs_unq)
    //             used for extracting sequence pooled embeddings

    //                          // size               | idx | val
    llama_token  *  token;      // [n_tokens]         | i   | id, token
    float        *  embd;       // [n_embd, n_tokens] | i   | embd
    llama_pos    *  pos;        // [n_tokens*n_pos]   | i   | pos
    int32_t      *  n_seq_id;   // [n_tokens]         | i   | -
    llama_seq_id ** seq_id;     // [n_tokens]         | s   | s0, s1, seq_id
    llama_seq_id *  seq_id_unq; // [n_seqs_unq]       | s   | seq_id
    int32_t      *  seq_idx;    // [LLAMA_MAX_SEQ]    | -   | seq_idx
    int8_t       *  output;     // [n_tokens]         | i   | -

    struct data_t {
        std::vector<llama_token>    token;
        std::vector<float>          embd;
        std::vector<llama_pos>      pos;
        std::vector<int32_t>        n_seq_id;
        std::vector<llama_seq_id *> seq_id; // these point into the seq_id_data below
        std::vector<llama_seq_id>   seq_id_unq;
        std::vector<int32_t>        seq_idx;
        std::vector<int8_t>         output;

        std::vector<llama_seq_id>   seq_id_data;
    };

    // if set, the llama_ubatch pointers above point into this data; otherwise they point to external, non-owning data
    std::shared_ptr<data_t> data;
};
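
// illustrative sketch (not part of the header itself): reading the per-token data of a
// ubatch `ub` produced by one of the split_* methods of llama_batch_allocr below
//
//     for (uint32_t i = 0; i < ub.n_tokens; ++i) {
//         // when ub.token is set, the ubatch carries token ids; otherwise ub.embd carries embeddings
//         const llama_token id = ub.token[i];
//
//         // the sequences that token i belongs to
//         for (int32_t s = 0; s < ub.n_seq_id[i]; ++s) {
//             const llama_seq_id sid = ub.seq_id[i][s];
//             (void) sid;
//         }
//
//         // whether output (logits/embeddings) is requested for token i
//         const bool out = ub.output[i] != 0;
//         (void) id; (void) out;
//     }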

// a helper for sanitizing, filling in missing data and splitting a batch
class llama_batch_allocr {
public:
    llama_batch_allocr(uint32_t n_pos_per_embd);

    // sanitize and auto-generate missing data in the input batch
    // memory is optional; if provided, it is used to check for sequence continuity and to determine the positions
    bool init(
            const llama_batch & batch_inp,
            const llama_vocab & vocab,
            const llama_memory_i * memory,
            uint32_t n_embd,
            uint32_t n_seq_max,
            bool output_all);

    const llama_batch & get_batch() const;

    uint32_t get_n_tokens()  const;
    uint32_t get_n_outputs() const;
    uint32_t get_n_used()    const;

    // the array of output indices in the order they were encountered during the ubatch splitting
    std::vector<int32_t> & get_out_ids();

    // min/max positions of each sequence in the current ubatch
    llama_pos seq_pos_min(llama_seq_id seq_id) const;
    llama_pos seq_pos_max(llama_seq_id seq_id) const;

    // call once before splitting the batch to reset the internal state
    void split_reset();

    // simple split, unknown number of sequence sets of unequal lengths
    llama_ubatch split_simple(uint32_t n_ubatch);

    // make ubatches of equal-length sequence sets
    // if sequential == true, the tokens in the ubatch will have increasing sequential sequence ids
    llama_ubatch split_equal(uint32_t n_ubatch, bool sequential);

    // sequence-set-wise split - each ubatch contains a single sequence set
    llama_ubatch split_seq(uint32_t n_ubatch);

    // a helper method for creating a well-defined ubatch of tokens
    // TODO: support embeddings if needed in the future
    llama_ubatch ubatch_reserve(uint32_t n_seq_tokens, uint32_t n_seqs);

private:
    void clear();

    // create the next ubatch based on the provided batch indices (idxs) and the number of sequence sets (n_seqs)
    // return llama_ubatch.n_tokens == 0 if the entire batch was consumed
    llama_ubatch ubatch_add(const std::vector<int32_t> & idxs, uint32_t n_seqs, bool equal_seqs);

    // for debugging, start with LLAMA_BATCH_DEBUG=2
    void ubatch_print(const llama_ubatch & ubatch, int debug);

    llama_batch batch;

    // only for debugging purposes
    const llama_vocab * vocab;

    // TODO: this is more of a temporary solution until we have a better way to handle multiple positions per token/embd
    // ref: https://github.com/ggml-org/llama.cpp/issues/13694#issuecomment-2983871762
    const uint32_t n_pos_per_embd;

    uint32_t n_embd;
    uint32_t n_seq_max;
    uint32_t n_outputs;

    std::array<llama_seq_id, 1> seq_id_0 = {{ 0 }}; // default sequence id

    std::vector<llama_pos>      pos;
    std::vector<int32_t>        n_seq_id;
    std::vector<llama_seq_id *> seq_id;
    std::vector<llama_seq_id>   seq_id_unq;
    std::vector<int32_t>        seq_idx;
    std::vector<int8_t>         output;

    using pos_set_t = std::set<llama_pos>;
    using seq_cpl_t = std::vector<bool>;

    // helper flag to quickly determine if there are any coupled sequences in the batch
    bool has_cpl = false;

    std::vector<pos_set_t> seq_pos; // seq_pos[s]: the set of positions in sequence s
    std::vector<seq_cpl_t> seq_cpl; // seq_cpl[s0][s1]: if sequence s0 is coupled to sequence s1

    using idx_vec_t = std::vector<int32_t>;
    using seq_set_t = std::bitset<LLAMA_MAX_SEQ>;

    std::vector<seq_set_t> seq_set; // seq_set[i]: the sequence set of token i

    std::unordered_map<seq_set_t, idx_vec_t> seq_set_map; // the indices at which the sequence set appears

    // batch indices of the output
    std::vector<int32_t> out_ids;

    uint32_t n_used;

    // used[i] indicates if token i has already been used in a previous ubatch
    std::vector<bool> used;

    int debug;
};
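
// typical usage (illustrative sketch only, based on the declarations above):
//
//     llama_batch_allocr balloc(/*n_pos_per_embd =*/ 1);
//
//     if (!balloc.init(batch, vocab, memory, n_embd, n_seq_max, /*output_all =*/ false)) {
//         // the input batch failed validation
//     }
//
//     balloc.split_reset();
//
//     while (true) {
//         llama_ubatch ub = balloc.split_simple(n_ubatch);
//         if (ub.n_tokens == 0) {
//             break; // the entire batch has been consumed
//         }
//
//         // ... process ub ...
//     }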