/src/llama.cpp/src/llama-graph.h
Line | Count | Source |
1 | | #pragma once |
2 | | |
3 | | #include "llama-arch.h" |
4 | | #include "llama-batch.h" |
5 | | #include "llama-hparams.h" |
6 | | #include "llama-adapter.h" |
7 | | |
#include <cstdint>
#include <cstdlib>
#include <vector>
#include <memory>
#include <set>
#include <functional>
#include <map>
14 | | |
15 | | struct ggml_cgraph; |
16 | | struct ggml_context; |
17 | | struct ggml_tensor; |
18 | | |
19 | | struct llama_cparams; |
20 | | struct llama_layer; |
21 | | |
22 | | struct llama_memory_context_i; |
23 | | |
24 | | class llama_kv_cache_context; |
25 | | class llama_kv_cache_dsa_context; |
26 | | class llama_kv_cache_iswa_context; |
27 | | class llama_memory_recurrent_context; |
28 | | class llama_memory_hybrid_context; |
29 | | class llama_memory_hybrid_iswa_context; |
30 | | |
// kinds of graphs that can be built
// certain models (typically multi-modal) can produce different types of graphs
enum llm_graph_type {
    LLM_GRAPH_TYPE_DEFAULT     = 0,
    LLM_GRAPH_TYPE_ENCODER     = 1,
    LLM_GRAPH_TYPE_DECODER     = 2,
    LLM_GRAPH_TYPE_DECODER_MTP = 3,
};
38 | | |
// ffn operation selector; the underlying type is fixed to int
enum llm_ffn_op_type : int {
    LLM_FFN_NONE           = 0, // sentinel: unset; archs must assign before use
    LLM_FFN_SILU           = 1,
    LLM_FFN_GELU           = 2,
    LLM_FFN_RELU           = 3,
    LLM_FFN_RELU_SQR       = 4,
    LLM_FFN_SWIGLU         = 5,
    LLM_FFN_GEGLU          = 6,
    LLM_FFN_REGLU          = 7,
    LLM_FFN_SWIGLU_OAI_MOE = 8,
};
50 | | |
// how the ffn gate is arranged relative to ffn_up
enum llm_ffn_gate_type {
    LLM_FFN_SEQ = 0,
    LLM_FFN_PAR = 1, // ffn_gate is parallel to ffn_up
};
55 | | |
// normalization selector
enum llm_norm_type {
    LLM_NORM       = 0,
    LLM_NORM_RMS   = 1,
    LLM_NORM_GROUP = 2,
};
61 | | |
62 | | // TODO: tmp - need something better to pass the data from the encoder to the decoder |
63 | | struct llama_cross { |
64 | | // the output embeddings from the encoder as a ggml tensor |
65 | | // TODO: this needs more work to be correct, for now copy the embeddings data to host memory |
66 | | // ref: https://github.com/ggml-org/llama.cpp/pull/11213#discussion_r1969892524 |
67 | | //ggml_tensor * t_embd = nullptr; |
68 | | |
69 | | int64_t n_embd = 0; |
70 | | int64_t n_enc = 0; |
71 | | |
72 | | // embeddings data copied to host memory (tmp) |
73 | | std::vector<float> v_embd; |
74 | | |
75 | | // needed to construct the cross-attention mask in the decoder |
76 | | std::vector<std::set<llama_seq_id>> seq_ids_enc; |
77 | | }; |
78 | | |
79 | | struct llm_graph_params; |
80 | | |
81 | | // |
82 | | // llm_graph_input |
83 | | // |
84 | | |
85 | | class llm_graph_input_i { |
86 | | public: |
87 | 0 | llm_graph_input_i() { |
88 | 0 | const char * LLAMA_GRAPH_INPUT_DEBUG = getenv("LLAMA_GRAPH_INPUT_DEBUG"); |
89 | 0 | debug = LLAMA_GRAPH_INPUT_DEBUG ? atoi(LLAMA_GRAPH_INPUT_DEBUG) : 0; |
90 | 0 | } |
91 | | |
92 | 0 | virtual ~llm_graph_input_i() = default; |
93 | | |
94 | | virtual void set_input(const llama_ubatch * ubatch) = 0; |
95 | | |
96 | | // return true if the resulting input tensors using the provided graph parameters would be |
97 | | // the same as the previous input tensors that we have currently stored in the object |
98 | 0 | virtual bool can_reuse(const llm_graph_params & params) { |
99 | | // returning false here by default will prevent from reusing the graph if the check |
100 | | // for the input type has not been implemented yet |
101 | 0 | GGML_UNUSED(params); |
102 | 0 | return false; |
103 | 0 | } |
104 | | protected: |
105 | | // env: LLAMA_GRAPH_INPUT_DEBUG |
106 | | int debug = 0; |
107 | | }; |
108 | | |
109 | | using llm_graph_input_ptr = std::unique_ptr<llm_graph_input_i>; |
110 | | |
111 | | class llm_graph_input_embd : public llm_graph_input_i { |
112 | | public: |
113 | 0 | llm_graph_input_embd(int64_t n_embd) : n_embd(n_embd) {} |
114 | | virtual ~llm_graph_input_embd() = default; |
115 | | |
116 | | void set_input(const llama_ubatch * ubatch) override; |
117 | | |
118 | | bool can_reuse(const llm_graph_params & params) override; |
119 | | |
120 | | ggml_tensor * tokens = nullptr; // I32 [n_batch] |
121 | | ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch] |
122 | | |
123 | | const int64_t n_embd = 0; |
124 | | }; |
125 | | |
126 | | // similar to llm_graph_input_embd but with an additional hidden state input |
127 | | class llm_graph_input_embd_h : public llm_graph_input_i { |
128 | | public: |
129 | 0 | llm_graph_input_embd_h(int64_t n_embd) : n_embd(n_embd) {} |
130 | | virtual ~llm_graph_input_embd_h() = default; |
131 | | |
132 | | void set_input(const llama_ubatch * ubatch) override; |
133 | | |
134 | | bool can_reuse(const llm_graph_params & params) override; |
135 | | |
136 | | ggml_tensor * tokens = nullptr; // I32 [n_batch] |
137 | | ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch] |
138 | | ggml_tensor * h = nullptr; // F32 [n_embd, n_batch] |
139 | | |
140 | | const int64_t n_embd = 0; |
141 | | }; |
142 | | |
143 | | class llm_graph_input_pos : public llm_graph_input_i { |
144 | | public: |
145 | 0 | llm_graph_input_pos(uint32_t n_pos_per_embd) : n_pos_per_embd(n_pos_per_embd) {} |
146 | | virtual ~llm_graph_input_pos() = default; |
147 | | |
148 | | void set_input(const llama_ubatch * ubatch) override; |
149 | | |
150 | | bool can_reuse(const llm_graph_params & params) override; |
151 | | |
152 | | ggml_tensor * pos = nullptr; // I32 [n_batch] |
153 | | |
154 | | const uint32_t n_pos_per_embd = 1; |
155 | | }; |
156 | | |
157 | | // temperature tuning, used by llama4 |
158 | | class llm_graph_input_attn_temp : public llm_graph_input_i { |
159 | | public: |
160 | | llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale, float f_attn_temp_offset) |
161 | 0 | : n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale), f_attn_temp_offset(f_attn_temp_offset) {} |
162 | | virtual ~llm_graph_input_attn_temp() = default; |
163 | | |
164 | | void set_input(const llama_ubatch * ubatch) override; |
165 | | |
166 | | ggml_tensor * attn_scale = nullptr; // F32 [n_batch] |
167 | | |
168 | | const uint32_t n_attn_temp_floor_scale; |
169 | | const float f_attn_temp_scale; |
170 | | const float f_attn_temp_offset; |
171 | | }; |
172 | | |
173 | | class llm_graph_input_pos_bucket : public llm_graph_input_i { |
174 | | public: |
175 | 0 | llm_graph_input_pos_bucket(const llama_hparams & hparams) : hparams(hparams) {} |
176 | | virtual ~llm_graph_input_pos_bucket() = default; |
177 | | |
178 | | void set_input(const llama_ubatch * ubatch) override; |
179 | | |
180 | | ggml_tensor * pos_bucket = nullptr; // I32 [n_batch, n_batch] |
181 | | |
182 | | const llama_hparams hparams; |
183 | | }; |
184 | | |
185 | | class llm_graph_input_pos_bucket_kv : public llm_graph_input_i { |
186 | | public: |
187 | | llm_graph_input_pos_bucket_kv( |
188 | | const llama_hparams & hparams, |
189 | 0 | const llama_kv_cache_context * mctx) : hparams(hparams), mctx(mctx) {} |
190 | | virtual ~llm_graph_input_pos_bucket_kv() = default; |
191 | | |
192 | | void set_input(const llama_ubatch * ubatch) override; |
193 | | |
194 | | ggml_tensor * pos_bucket = nullptr; // I32 [n_kv, n_batch] |
195 | | |
196 | | const llama_hparams hparams; |
197 | | |
198 | | const llama_kv_cache_context * mctx; |
199 | | }; |
200 | | |
201 | | class llm_graph_input_out_ids : public llm_graph_input_i { |
202 | | public: |
203 | | llm_graph_input_out_ids( |
204 | | const llama_hparams & hparams, |
205 | | const llama_cparams & cparams, |
206 | 0 | uint32_t n_outputs) : hparams(hparams), cparams(cparams), n_outputs(n_outputs) {} |
207 | 0 | virtual ~llm_graph_input_out_ids() = default; |
208 | | |
209 | | void set_input(const llama_ubatch * ubatch) override; |
210 | | |
211 | | bool can_reuse(const llm_graph_params & params) override; |
212 | | |
213 | | ggml_tensor * out_ids; // I32 [n_outputs] |
214 | | |
215 | | const llama_hparams hparams; |
216 | | const llama_cparams cparams; |
217 | | |
218 | | const uint32_t n_outputs; |
219 | | }; |
220 | | |
221 | | class llm_graph_input_mean : public llm_graph_input_i { |
222 | | public: |
223 | 0 | llm_graph_input_mean(const llama_cparams & cparams) : cparams(cparams) {} |
224 | 0 | virtual ~llm_graph_input_mean() = default; |
225 | | |
226 | | void set_input(const llama_ubatch * ubatch) override; |
227 | | |
228 | | ggml_tensor * mean; // F32 [n_batch, n_batch] |
229 | | |
230 | | const llama_cparams cparams; |
231 | | }; |
232 | | |
233 | | class llm_graph_input_cls : public llm_graph_input_i { |
234 | | public: |
235 | 0 | llm_graph_input_cls(const llama_cparams & cparams, const llm_arch arch) : cparams(cparams), arch(arch) {} |
236 | 0 | virtual ~llm_graph_input_cls() = default; |
237 | | |
238 | | void set_input(const llama_ubatch * ubatch) override; |
239 | | |
240 | | ggml_tensor * cls; // I32 [n_batch] |
241 | | |
242 | | const llama_cparams cparams; |
243 | | const llm_arch arch; |
244 | | }; |
245 | | |
246 | | class llm_graph_input_rs : public llm_graph_input_i { |
247 | | public: |
248 | 0 | llm_graph_input_rs(const llama_memory_recurrent_context * mctx) : mctx(mctx) {} |
249 | | virtual ~llm_graph_input_rs() = default; |
250 | | |
251 | | void set_input(const llama_ubatch * ubatch) override; |
252 | | |
253 | | bool can_reuse(const llm_graph_params & params) override; |
254 | | |
255 | | ggml_tensor * s_copy; // I32 [n_rs] |
256 | | |
257 | | // views of s_copy, computed once per graph |
258 | | // and shared across layers which use build_rs |
259 | | ggml_tensor * s_copy_main; // I32 [n_seqs] |
260 | | ggml_tensor * s_copy_extra; // I32 [n_rs - n_seqs] |
261 | | |
262 | | const llama_memory_recurrent_context * mctx; |
263 | | |
264 | | // used in view offsets, need to match for valid graph reuse |
265 | | uint32_t head; |
266 | | int32_t rs_z; |
267 | | }; |
268 | | |
269 | | class llm_graph_input_cross_embd : public llm_graph_input_i { |
270 | | public: |
271 | | llm_graph_input_cross_embd( |
272 | 0 | const llama_cross * cross) : cross(cross) {} |
273 | | virtual ~llm_graph_input_cross_embd() = default; |
274 | | |
275 | | void set_input(const llama_ubatch * ubatch) override; |
276 | | |
277 | | ggml_tensor * cross_embd; // F32 [n_embd, n_outputs_enc] |
278 | | |
279 | | const llama_cross * cross; |
280 | | }; |
281 | | |
282 | | class llm_graph_input_attn_no_cache : public llm_graph_input_i { |
283 | | public: |
284 | | llm_graph_input_attn_no_cache(const llama_hparams & hparams, const llama_cparams & cparams) : |
285 | 0 | hparams(hparams), |
286 | 0 | cparams(cparams) { |
287 | 0 | } |
288 | 0 | ~llm_graph_input_attn_no_cache() = default; |
289 | | |
290 | | void set_input(const llama_ubatch * ubatch) override; |
291 | | |
292 | 0 | ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; } |
293 | 0 | ggml_tensor * get_kq_mask_swa() const { return self_kq_mask_swa_cnv; } |
294 | | |
295 | | // n_tokens == n_batch |
296 | | ggml_tensor * self_kq_mask = nullptr; // F32/F16 [n_tokens, n_batch/n_stream, 1, n_stream] |
297 | | ggml_tensor * self_kq_mask_cnv = nullptr; // [n_tokens, n_batch/n_stream, 1, n_stream] |
298 | | ggml_tensor * self_kq_mask_swa = nullptr; // F32/F16 [n_tokens, n_batch/n_stream, 1, n_stream] |
299 | | ggml_tensor * self_kq_mask_swa_cnv = nullptr; // [n_tokens, n_batch/n_stream, 1, n_stream] |
300 | | |
301 | | const llama_hparams hparams; |
302 | | const llama_cparams cparams; |
303 | | }; |
304 | | |
305 | | class llm_graph_input_attn_kv : public llm_graph_input_i { |
306 | | public: |
307 | | llm_graph_input_attn_kv( |
308 | | const llama_hparams & hparams, |
309 | | const llama_cparams & cparams, |
310 | | const llama_kv_cache_context * mctx) : |
311 | 0 | hparams(hparams), |
312 | 0 | cparams(cparams), |
313 | 0 | mctx(mctx) { |
314 | 0 | } |
315 | 0 | ~llm_graph_input_attn_kv() = default; |
316 | | |
317 | | void set_input(const llama_ubatch * ubatch) override; |
318 | | |
319 | | bool can_reuse(const llm_graph_params & params) override; |
320 | | |
321 | 0 | ggml_tensor * get_k_idxs() const { return self_k_idxs; } |
322 | 0 | ggml_tensor * get_v_idxs() const { return self_v_idxs; } |
323 | | |
324 | 0 | ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; } |
325 | | |
326 | | ggml_tensor * self_k_idxs = nullptr; // I64 [n_batch] |
327 | | ggml_tensor * self_v_idxs = nullptr; // I64 [n_batch] or [n_batch*n_embd_v_gqa] |
328 | | |
329 | | ggml_tensor * self_kq_mask = nullptr; // F32/F16 [n_kv, n_batch/n_stream, 1, n_stream] |
330 | | ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream] |
331 | | |
332 | | // note: assumes v_rot^2 == I |
333 | | ggml_tensor * self_k_rot = nullptr; |
334 | | ggml_tensor * self_v_rot = nullptr; |
335 | | |
336 | | // note: these have to be copies because in order to be able to reuse a graph, its inputs |
337 | | // need to carry these parameters with them. otherwise, they can point to freed |
338 | | // llm_graph_params from a previous batch, causing stack-use-after-return |
339 | | const llama_hparams hparams; |
340 | | const llama_cparams cparams; |
341 | | |
342 | | const llama_kv_cache_context * mctx; |
343 | | }; |
344 | | |
345 | | // V-less input for the KV cache |
346 | | // ref: https://github.com/ggml-org/llama.cpp/pull/19067 |
347 | | class llm_graph_input_attn_k : public llm_graph_input_i { |
348 | | public: |
349 | | llm_graph_input_attn_k( |
350 | | const llama_hparams & hparams, |
351 | | const llama_cparams & cparams, |
352 | | const llama_kv_cache_context * mctx) : |
353 | 0 | hparams(hparams), |
354 | 0 | cparams(cparams), |
355 | 0 | mctx(mctx) { |
356 | 0 | } |
357 | 0 | ~llm_graph_input_attn_k() = default; |
358 | | |
359 | | void set_input(const llama_ubatch * ubatch) override; |
360 | | |
361 | | bool can_reuse(const llm_graph_params & params) override; |
362 | | |
363 | 0 | ggml_tensor * get_k_idxs() const { return self_k_idxs; } |
364 | | |
365 | 0 | ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; } |
366 | | |
367 | | ggml_tensor * self_k_idxs = nullptr; // I64 [n_batch] |
368 | | |
369 | | ggml_tensor * self_kq_mask = nullptr; // F32/F16 [n_kv, n_batch/n_stream, 1, n_stream] |
370 | | ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream] |
371 | | |
372 | | const llama_hparams hparams; |
373 | | const llama_cparams cparams; |
374 | | |
375 | | const llama_kv_cache_context * mctx; |
376 | | }; |
377 | | |
378 | | class llm_graph_input_attn_k_dsa : public llm_graph_input_i { |
379 | | public: |
380 | | llm_graph_input_attn_k_dsa( |
381 | | const llama_hparams & hparams, |
382 | | const llama_cparams & cparams, |
383 | | const llama_kv_cache_dsa_context * mctx) : |
384 | 0 | hparams(hparams), |
385 | 0 | cparams(cparams), |
386 | 0 | mctx(mctx) { |
387 | 0 | } |
388 | 0 | ~llm_graph_input_attn_k_dsa() = default; |
389 | | |
390 | | void set_input(const llama_ubatch * ubatch) override; |
391 | | |
392 | | bool can_reuse(const llm_graph_params & params) override; |
393 | | |
394 | 0 | ggml_tensor * get_k_idxs_mla() const { return self_k_idxs_mla; } |
395 | 0 | ggml_tensor * get_k_idxs_lid() const { return self_k_idxs_lid; } |
396 | | |
397 | 0 | ggml_tensor * get_kq_mask_mla() const { return self_kq_mask_mla_cnv; } |
398 | 0 | ggml_tensor * get_kq_mask_lid() const { return self_kq_mask_lid; } |
399 | | |
400 | | ggml_tensor * self_k_idxs_mla = nullptr; // I64 [n_batch] |
401 | | ggml_tensor * self_k_idxs_lid = nullptr; // I64 [n_batch] |
402 | | |
403 | | ggml_tensor * self_kq_mask_mla = nullptr; // F32/F16 [n_kv, n_batch/n_stream, 1, n_stream] |
404 | | ggml_tensor * self_kq_mask_mla_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream] |
405 | | ggml_tensor * self_kq_mask_lid = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream] |
406 | | ggml_tensor * self_kq_mask_lid_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream] |
407 | | |
408 | | ggml_tensor * self_k_rot_lid = nullptr; |
409 | | |
410 | | const llama_hparams hparams; |
411 | | const llama_cparams cparams; |
412 | | |
413 | | const llama_kv_cache_dsa_context * mctx; |
414 | | }; |
415 | | |
416 | | class llm_graph_input_attn_kv_iswa : public llm_graph_input_i { |
417 | | public: |
418 | | llm_graph_input_attn_kv_iswa( |
419 | | const llama_hparams & hparams, |
420 | | const llama_cparams & cparams, |
421 | | const llama_kv_cache_iswa_context * mctx) : |
422 | 0 | hparams(hparams), |
423 | 0 | cparams(cparams), |
424 | 0 | mctx(mctx) { |
425 | 0 | } |
426 | 0 | ~llm_graph_input_attn_kv_iswa() = default; |
427 | | |
428 | | void set_input(const llama_ubatch * ubatch) override; |
429 | | |
430 | | bool can_reuse(const llm_graph_params & params) override; |
431 | | |
432 | 0 | ggml_tensor * get_k_idxs() const { return self_k_idxs; } |
433 | 0 | ggml_tensor * get_v_idxs() const { return self_v_idxs; } |
434 | 0 | ggml_tensor * get_k_idxs_swa() const { return self_k_idxs_swa; } |
435 | 0 | ggml_tensor * get_v_idxs_swa() const { return self_v_idxs_swa; } |
436 | | |
437 | 0 | ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; } |
438 | 0 | ggml_tensor * get_kq_mask_swa() const { return self_kq_mask_swa_cnv; } |
439 | | |
440 | | ggml_tensor * self_k_idxs = nullptr; // I64 [n_batch] |
441 | | ggml_tensor * self_v_idxs = nullptr; // I64 [n_batch] or [n_batch*n_embd_v_gqa] |
442 | | ggml_tensor * self_k_idxs_swa = nullptr; // I64 [n_batch] |
443 | | ggml_tensor * self_v_idxs_swa = nullptr; // I64 [n_batch] or [n_batch*n_embd_v_gqa] |
444 | | |
445 | | ggml_tensor * self_kq_mask = nullptr; // F32/F16 [n_kv, n_batch/n_stream, 1, n_stream] |
446 | | ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream] |
447 | | ggml_tensor * self_kq_mask_swa = nullptr; // F32/F16 [n_kv, n_batch/n_stream, 1, n_stream] |
448 | | ggml_tensor * self_kq_mask_swa_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream] |
449 | | |
450 | | ggml_tensor * self_k_rot = nullptr; |
451 | | ggml_tensor * self_v_rot = nullptr; |
452 | | |
453 | | ggml_tensor * self_k_rot_swa = nullptr; |
454 | | ggml_tensor * self_v_rot_swa = nullptr; |
455 | | |
456 | | const llama_hparams hparams; |
457 | | const llama_cparams cparams; |
458 | | |
459 | | const llama_kv_cache_iswa_context * mctx; |
460 | | }; |
461 | | |
462 | | class llm_graph_input_attn_cross : public llm_graph_input_i { |
463 | | public: |
464 | 0 | llm_graph_input_attn_cross(const llama_cross * cross) : cross(cross) {} |
465 | | ~llm_graph_input_attn_cross() = default; |
466 | | |
467 | | void set_input(const llama_ubatch * ubatch) override; |
468 | | |
469 | 0 | ggml_tensor * get_kq_mask_cross() const { return cross_kq_mask_cnv; } |
470 | | |
471 | | ggml_tensor * cross_kq_mask = nullptr; // F32/F16 [n_outputs_enc, n_batch, 1, 1] |
472 | | ggml_tensor * cross_kq_mask_cnv = nullptr; // F32/F16 [n_outputs_enc, n_batch, 1, 1] |
473 | | |
474 | | const llama_cross * cross = nullptr; |
475 | | }; |
476 | | |
477 | | class llm_graph_input_mem_hybrid : public llm_graph_input_i { |
478 | | public: |
479 | | llm_graph_input_mem_hybrid( |
480 | | const llama_cparams & cparams, |
481 | | std::unique_ptr<llm_graph_input_attn_kv> inp_attn, |
482 | | std::unique_ptr<llm_graph_input_rs> inp_rs, |
483 | | const llama_memory_hybrid_context * mctx) : |
484 | 0 | inp_attn(std::move(inp_attn)), |
485 | 0 | inp_rs(std::move(inp_rs)), |
486 | 0 | cparams(cparams), |
487 | 0 | mctx(mctx) { } |
488 | 0 | virtual ~llm_graph_input_mem_hybrid() = default; |
489 | | |
490 | | void set_input(const llama_ubatch * ubatch) override; |
491 | | |
492 | | bool can_reuse(const llm_graph_params & params) override; |
493 | | |
494 | | std::unique_ptr<llm_graph_input_attn_kv> inp_attn; |
495 | | std::unique_ptr<llm_graph_input_rs> inp_rs; |
496 | | |
497 | 0 | llm_graph_input_attn_kv * get_attn() const { return inp_attn.get(); } |
498 | 0 | llm_graph_input_rs * get_recr() const { return inp_rs.get(); } |
499 | | |
500 | | const llama_cparams cparams; |
501 | | |
502 | | const llama_memory_hybrid_context * mctx; |
503 | | }; |
504 | | |
505 | | class llm_graph_input_mem_hybrid_k : public llm_graph_input_i { |
506 | | public: |
507 | | llm_graph_input_mem_hybrid_k( |
508 | | const llama_cparams & cparams, |
509 | | std::unique_ptr<llm_graph_input_attn_k> inp_attn, |
510 | | std::unique_ptr<llm_graph_input_rs> inp_rs, |
511 | | const llama_memory_hybrid_context * mctx) : |
512 | 0 | inp_attn(std::move(inp_attn)), |
513 | 0 | inp_rs(std::move(inp_rs)), |
514 | 0 | cparams(cparams), |
515 | 0 | mctx(mctx) { } |
516 | 0 | virtual ~llm_graph_input_mem_hybrid_k() = default; |
517 | | |
518 | | void set_input(const llama_ubatch * ubatch) override; |
519 | | |
520 | | bool can_reuse(const llm_graph_params & params) override; |
521 | | |
522 | | std::unique_ptr<llm_graph_input_attn_k> inp_attn; |
523 | | std::unique_ptr<llm_graph_input_rs> inp_rs; |
524 | | |
525 | 0 | llm_graph_input_attn_k * get_attn() const { return inp_attn.get(); } |
526 | 0 | llm_graph_input_rs * get_recr() const { return inp_rs.get(); } |
527 | | |
528 | | const llama_cparams cparams; |
529 | | |
530 | | const llama_memory_hybrid_context * mctx; |
531 | | }; |
532 | | |
533 | | class llm_graph_input_mem_hybrid_iswa : public llm_graph_input_i { |
534 | | public: |
535 | | llm_graph_input_mem_hybrid_iswa( |
536 | | const llama_cparams & cparams, |
537 | | std::unique_ptr<llm_graph_input_attn_kv_iswa> inp_attn, |
538 | | std::unique_ptr<llm_graph_input_rs> inp_rs, |
539 | | const llama_memory_hybrid_iswa_context * mctx) : |
540 | 0 | inp_attn(std::move(inp_attn)), |
541 | 0 | inp_rs(std::move(inp_rs)), |
542 | 0 | cparams(cparams), |
543 | 0 | mctx(mctx) { } |
544 | 0 | virtual ~llm_graph_input_mem_hybrid_iswa() = default; |
545 | | |
546 | | void set_input(const llama_ubatch * ubatch) override; |
547 | | |
548 | | bool can_reuse(const llm_graph_params & params) override; |
549 | | |
550 | | std::unique_ptr<llm_graph_input_attn_kv_iswa> inp_attn; |
551 | | std::unique_ptr<llm_graph_input_rs> inp_rs; |
552 | | |
553 | 0 | llm_graph_input_attn_kv_iswa * get_attn() const { return inp_attn.get(); } |
554 | 0 | llm_graph_input_rs * get_recr() const { return inp_rs.get(); } |
555 | | |
556 | | const llama_cparams cparams; |
557 | | |
558 | | const llama_memory_hybrid_iswa_context * mctx; |
559 | | }; |
560 | | |
561 | | class llm_graph_input_sampling : public llm_graph_input_i { |
562 | | public: |
563 | | llm_graph_input_sampling(std::map<llama_seq_id, llama_sampler *> samplers) : |
564 | 0 | samplers(std::move(samplers)) { } |
565 | 0 | virtual ~llm_graph_input_sampling() = default; |
566 | | |
567 | | void set_input(const llama_ubatch * ubatch) override; |
568 | | bool can_reuse(const llm_graph_params & params) override; |
569 | | |
570 | | std::map<llama_seq_id, llama_sampler *> samplers; |
571 | | }; |
572 | | |
573 | | // |
574 | | // llm_graph_result |
575 | | // |
576 | | |
577 | | // these objects deliver the result from the graph build process back to the llama_context |
578 | | // note that the input tensors created for the graph are referenced here - the goal is to be able to populate their |
579 | | // specific data, by calling the set_inputs() method |
580 | | // along with the input tensors, the object also provides commonly used outputs tensors, such as logits, embeddings, etc. |
581 | | // these are used by the llama_context to extract the relevant data, based on the compute parameters |
582 | | |
583 | | // callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.) |
584 | | using llm_graph_cb = std::function<void(const llama_ubatch & ubatch, ggml_tensor * cur, const char * name, int il)>; |
585 | | |
586 | | class llm_graph_result; |
587 | | |
588 | | struct llm_graph_params { |
589 | | llm_arch arch = LLM_ARCH_UNKNOWN; |
590 | | |
591 | | llama_hparams hparams; |
592 | | llama_cparams cparams; |
593 | | |
594 | | llama_ubatch ubatch; // note: intentionally make a copy |
595 | | |
596 | | llm_graph_type gtype; |
597 | | |
598 | | ggml_backend_sched_t sched; |
599 | | ggml_backend_t backend_cpu; |
600 | | |
601 | | const llama_adapter_cvec * cvec; |
602 | | const llama_adapter_loras * loras; |
603 | | const llama_memory_context_i * mctx; |
604 | | const llama_cross * cross; |
605 | | |
606 | | std::map<llama_seq_id, llama_sampler *> samplers; |
607 | | |
608 | | static bool samplers_equal( |
609 | | const std::map<llama_seq_id, llama_sampler *> & lhs, |
610 | 0 | const std::map<llama_seq_id, llama_sampler *> & rhs) { |
611 | 0 | if (lhs.size() != rhs.size()) { |
612 | 0 | return false; |
613 | 0 | } |
614 | 0 | for (const auto & [seq_id, sampler] : lhs) { |
615 | 0 | auto it = rhs.find(seq_id); |
616 | 0 | if (it == rhs.end() || it->second != sampler) { |
617 | 0 | return false; |
618 | 0 | } |
619 | 0 | } |
620 | 0 | return true; |
621 | 0 | } |
622 | | |
623 | | uint32_t n_outputs; |
624 | | |
625 | | llm_graph_cb cb; |
626 | | |
627 | | llm_graph_result * res; |
628 | | |
629 | | // return true if the "other" params would result in a graph with the same topology as with the current params |
630 | | // having the same topology allows us to reuse the graph in some cases |
631 | 0 | bool allow_reuse(const llm_graph_params & other) const { |
632 | | // first check the ubatch |
633 | 0 | bool can_reuse_ubatch = |
634 | 0 | ubatch.equal_seqs() == other.ubatch.equal_seqs() && |
635 | 0 | ubatch.n_tokens == other.ubatch.n_tokens && |
636 | 0 | ubatch.n_seq_tokens == other.ubatch.n_seq_tokens && |
637 | 0 | ubatch.n_seqs == other.ubatch.n_seqs && |
638 | 0 | ubatch.n_seqs_unq == other.ubatch.n_seqs_unq && |
639 | 0 | ( |
640 | 0 | (!ubatch.token && !other.ubatch.token) || |
641 | 0 | (!ubatch.embd && !other.ubatch.embd) || |
642 | 0 | (ubatch.token && other.ubatch.token && ubatch.embd && other.ubatch.embd) |
643 | 0 | ); |
644 | | |
645 | | // when we split the batch using "equal_seqs" we have to verify that the participating sequences are the same |
646 | | // the reason is because the set of attention streams would be different for different sequences |
647 | 0 | if (can_reuse_ubatch && ubatch.equal_seqs()) { |
648 | 0 | if (!ubatch.data) { |
649 | | // if the old ubatch does not own it's data, then we cannot guarantee that it is still alive, and |
650 | | // therefore we cannot perform the sequence id check. normally should never happen |
651 | 0 | can_reuse_ubatch = false; |
652 | 0 | } else { |
653 | 0 | for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) { |
654 | 0 | can_reuse_ubatch &= ubatch.seq_id_unq[s] == other.ubatch.seq_id_unq[s]; |
655 | 0 | } |
656 | 0 | } |
657 | 0 | } |
658 | |
|
659 | 0 | if (!can_reuse_ubatch) { |
660 | 0 | return false; |
661 | 0 | } |
662 | | |
663 | 0 | if (n_outputs != other.n_outputs) { |
664 | 0 | return false; |
665 | 0 | } |
666 | | |
667 | 0 | if (!samplers_equal(samplers, other.samplers)) { |
668 | 0 | return false; |
669 | 0 | } |
670 | | |
671 | 0 | if (samplers.size() > 0) { |
672 | 0 | if (!ubatch.data || !other.ubatch.data) { |
673 | 0 | return false; |
674 | 0 | } |
675 | | |
676 | | // check that the outputs are the same for all samplers |
677 | 0 | for (uint32_t i = 0; i < ubatch.n_tokens; ++i) { |
678 | 0 | if (ubatch.output[i] != other.ubatch.output[i] || |
679 | 0 | ubatch.seq_id[i][0] != other.ubatch.seq_id[i][0]) { |
680 | 0 | return false; |
681 | 0 | } |
682 | 0 | } |
683 | 0 | } |
684 | | |
685 | 0 | return |
686 | 0 | cparams.embeddings == other.cparams.embeddings && |
687 | 0 | cparams.causal_attn == other.cparams.causal_attn && |
688 | 0 | arch == other.arch && |
689 | 0 | gtype == other.gtype && |
690 | 0 | cvec == other.cvec && |
691 | 0 | loras == other.loras && |
692 | 0 | cross == other.cross; |
693 | 0 | } |
694 | | }; |
695 | | |
696 | | class llm_graph_result { |
697 | | public: |
698 | | llm_graph_result(int64_t max_nodes); |
699 | | |
700 | 0 | virtual ~llm_graph_result() = default; |
701 | | |
702 | 0 | ggml_tensor * get_inp_tokens() const { return t_inp_tokens; } |
703 | 0 | ggml_tensor * get_logits() const { return t_logits; } |
704 | 0 | ggml_tensor * get_embd() const { return t_embd; } |
705 | 0 | ggml_tensor * get_embd_pooled() const { return t_embd_pooled; } |
706 | 0 | ggml_tensor * get_h_nextn() const { return t_h_nextn; } |
707 | | |
708 | 0 | ggml_tensor * get_layer_inp(int il) const { return t_layer_inp[il]; } |
709 | | |
710 | 0 | ggml_cgraph * get_gf() const { return gf; } |
711 | 0 | ggml_context * get_ctx() const { return ctx_compute.get(); } |
712 | | |
713 | | int64_t get_max_nodes() const; |
714 | | |
715 | | void reset(); |
716 | | |
717 | | void set_inputs(const llama_ubatch * ubatch); |
718 | | void set_outputs(const llm_graph_params & params); |
719 | | |
720 | | // try to update the existing graph result using the new graph parameters in order to reuse it |
721 | | // this can only be done if we determine that the resulting graph using the new graph parameters |
722 | | // would be identical to the existing graph. in that case, we simply have to update the memory |
723 | | // contexts of the input tensors of the graph and we can reuse it for another computation |
724 | | // return true if the graph was updated and can be reused |
725 | | bool can_reuse(const llm_graph_params & params); |
726 | | |
727 | | llm_graph_input_i * add_input(llm_graph_input_ptr input); |
728 | | |
729 | | void set_params(const llm_graph_params & params); |
730 | | |
731 | | // important graph nodes |
732 | | ggml_tensor * t_inp_tokens = nullptr; |
733 | | ggml_tensor * t_inp_embd = nullptr; // [n_embd_inp, n_tokens] |
734 | | ggml_tensor * t_logits = nullptr; |
735 | | ggml_tensor * t_embd = nullptr; |
736 | | ggml_tensor * t_embd_pooled = nullptr; |
737 | | ggml_tensor * t_h_nextn = nullptr; // [n_embd, n_outputs] hidden state before final output norm |
738 | | |
739 | | std::vector<ggml_tensor *> t_layer_inp; |
740 | | |
741 | | std::map<llama_seq_id, ggml_tensor *> t_sampled_logits; |
742 | | std::map<llama_seq_id, ggml_tensor *> t_candidates; |
743 | | std::map<llama_seq_id, ggml_tensor *> t_sampled; |
744 | | std::map<llama_seq_id, ggml_tensor *> t_sampled_probs; |
745 | | |
746 | | std::vector<llm_graph_input_ptr> inputs; |
747 | | |
748 | | ggml_context_ptr ctx_compute; |
749 | | |
750 | | // memory buffers used to evaluate the model |
751 | | std::vector<uint8_t> buf_compute_meta; |
752 | | |
753 | | ggml_cgraph * gf; |
754 | | |
755 | | int64_t max_nodes; |
756 | | |
757 | | private: |
758 | | // keep a copy of the previous graph parameters |
759 | | // we will use this to determine whether the graph can be reused by comparing them with the new parameters |
760 | | // note: these are updated after constructing the new graph |
761 | | llm_graph_params params; |
762 | | |
763 | | // env: LLAMA_GRAPH_RESULT_DEBUG |
764 | | int debug = 0; |
765 | | }; |
766 | | |
767 | | using llm_graph_result_ptr = std::unique_ptr<llm_graph_result>; |
768 | | |
769 | | // |
770 | | // llm_graph_context |
771 | | // |
772 | | |
773 | | // used in build_rs to properly order writes and avoid unnecessary copies |
774 | | using llm_graph_get_rows_fn = std::function<ggml_tensor * (ggml_context *, ggml_tensor * states, ggml_tensor * ids)>; |
775 | | |
776 | | struct llm_graph_qkv { |
777 | | ggml_tensor * q; // [n_embd_head, n_head, n_tokens] |
778 | | ggml_tensor * k; // [n_embd_head, n_head_kv, n_tokens] |
779 | | ggml_tensor * v; // [n_embd_head, n_head_kv, n_tokens] |
780 | | }; |
781 | | |
782 | | struct llm_graph_context { |
783 | | const llm_arch arch; |
784 | | |
785 | | const llama_hparams & hparams; |
786 | | const llama_cparams & cparams; |
787 | | const llama_ubatch & ubatch; |
788 | | |
789 | | const int64_t n_embd; |
790 | | const int64_t n_layer; |
791 | | const int64_t n_layer_nextn; |
792 | | const int64_t n_rot; |
793 | | const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train) |
794 | | const int64_t n_head; |
795 | | const int64_t n_head_kv; |
796 | | const int64_t n_embd_head_k; |
797 | | const int64_t n_embd_k_gqa; |
798 | | const int64_t n_embd_head_v; |
799 | | const int64_t n_embd_v_gqa; |
800 | | const int64_t n_expert; |
801 | | const int64_t n_expert_used; |
802 | | |
803 | | const float freq_base; |
804 | | const float freq_scale; |
805 | | const float ext_factor; |
806 | | const float attn_factor; |
807 | | const float beta_fast; |
808 | | const float beta_slow; |
809 | | const float norm_eps; |
810 | | const float norm_rms_eps; |
811 | | |
812 | | const int64_t n_tokens; |
813 | | const int64_t n_outputs; |
814 | | const int32_t n_ctx_orig; // original context size (yarn) |
815 | | |
816 | | const enum llama_pooling_type pooling_type; |
817 | | const enum llama_rope_type rope_type; |
818 | | |
819 | | ggml_backend_sched_t sched; |
820 | | |
821 | | ggml_backend_t backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove? |
822 | | |
823 | | const llama_adapter_cvec * cvec; |
824 | | const llama_adapter_loras * loras; |
825 | | const llama_memory_context_i * mctx; |
826 | | const llama_cross * cross; |
827 | | |
828 | | std::map<llama_seq_id, llama_sampler *> samplers; |
829 | | |
830 | | const llm_graph_cb & cb_func; |
831 | | |
832 | | llm_graph_result * res; |
833 | | |
834 | | ggml_context * ctx0 = nullptr; |
835 | | ggml_cgraph * gf = nullptr; |
836 | | |
837 | | llm_graph_context(const llm_graph_params & params); |
838 | 0 | virtual ~llm_graph_context() = default; |
839 | | |
840 | | void cb(ggml_tensor * cur, const char * name, int il) const; |
841 | | |
842 | | // |
843 | | // common (build_cvec, LoRA-aware mat_mul helpers, norm, QKV, FFN and MoE FFN builders) |
844 | | // |
845 | | |
846 | | ggml_tensor * build_cvec( |
847 | | ggml_tensor * cur, |
848 | | int il) const; |
849 | | |
850 | | // perform mat_mul, optionally applying LoRA and a per-tensor scale (w_s defaults to nullptr) |
851 | | ggml_tensor * build_lora_mm( |
852 | | ggml_tensor * w, |
853 | | ggml_tensor * cur, |
854 | | ggml_tensor * w_s = nullptr) const; |
855 | | |
856 | | // perform mat_mul_id, optionally applying LoRA |
857 | | ggml_tensor * build_lora_mm_id( |
858 | | ggml_tensor * w, // ggml_tensor * as |
859 | | ggml_tensor * cur, // ggml_tensor * b |
860 | | ggml_tensor * ids) const; |
861 | | |
862 | | ggml_tensor * build_norm( |
863 | | ggml_tensor * cur, |
864 | | ggml_tensor * mw, |
865 | | ggml_tensor * mb, |
866 | | llm_norm_type type, |
867 | | int il) const; |
868 | | |
869 | | |
870 | | // compute Q, K, V projections with optional bias and reshape |
871 | | // supports both fused wqkv and separate wq/wk/wv paths; the result is returned as an llm_graph_qkv |
872 | | llm_graph_qkv build_qkv( |
873 | | const llama_layer & layer, |
874 | | ggml_tensor * cur, |
875 | | int64_t n_embd_head, |
876 | | int64_t n_head, |
877 | | int64_t n_head_kv, |
878 | | int il) const; |
879 | | |
880 | | ggml_tensor * build_ffn( |
881 | | ggml_tensor * cur, |
882 | | ggml_tensor * up, |
883 | | ggml_tensor * up_b, |
884 | | ggml_tensor * up_s, |
885 | | ggml_tensor * gate, |
886 | | ggml_tensor * gate_b, |
887 | | ggml_tensor * gate_s, |
888 | | ggml_tensor * down, |
889 | | ggml_tensor * down_b, |
890 | | ggml_tensor * down_s, |
891 | | ggml_tensor * act_scales, |
892 | | llm_ffn_op_type type_op, |
893 | | llm_ffn_gate_type type_gate, |
894 | | int il) const; |
895 | | |
896 | | // build MoE FFN without bias tensors (the overload declared after this one additionally takes the *_b bias tensors) |
897 | | ggml_tensor * build_moe_ffn( |
898 | | ggml_tensor * cur, |
899 | | ggml_tensor * gate_inp, |
900 | | ggml_tensor * up_exps, |
901 | | ggml_tensor * gate_exps, |
902 | | ggml_tensor * down_exps, |
903 | | ggml_tensor * exp_probs_b, |
904 | | int64_t n_expert, |
905 | | int64_t n_expert_used, |
906 | | llm_ffn_op_type type_op, |
907 | | bool norm_w, |
908 | | float w_scale, |
909 | | llama_expert_gating_func_type gating_op, |
910 | | int il, |
911 | | ggml_tensor * probs_in = nullptr, |
912 | | ggml_tensor * gate_up_exps = nullptr, |
913 | | ggml_tensor * up_exps_s = nullptr, |
914 | | ggml_tensor * gate_exps_s = nullptr, |
915 | | ggml_tensor * down_exps_s = nullptr) const; |
916 | | |
917 | | ggml_tensor * build_moe_ffn( |
918 | | ggml_tensor * cur, |
919 | | ggml_tensor * gate_inp, |
920 | | ggml_tensor * gate_inp_b, |
921 | | ggml_tensor * up_exps, |
922 | | ggml_tensor * up_exps_b, |
923 | | ggml_tensor * gate_exps, |
924 | | ggml_tensor * gate_exps_b, |
925 | | ggml_tensor * down_exps, |
926 | | ggml_tensor * down_exps_b, |
927 | | ggml_tensor * exp_probs_b, |
928 | | int64_t n_expert, |
929 | | int64_t n_expert_used, |
930 | | llm_ffn_op_type type_op, |
931 | | bool norm_w, |
932 | | float w_scale, |
933 | | llama_expert_gating_func_type gating_op, |
934 | | int il, |
935 | | ggml_tensor * probs_in = nullptr, |
936 | | ggml_tensor * gate_up_exps = nullptr, |
937 | | ggml_tensor * gate_up_exps_b = nullptr, |
938 | | ggml_tensor * up_exps_s = nullptr, |
939 | | ggml_tensor * gate_exps_s = nullptr, |
940 | | ggml_tensor * down_exps_s = nullptr) const; |
941 | | |
942 | | // |
943 | | // inputs (every builder in this section returns a ggml_tensor *) |
944 | | // |
945 | | |
946 | | ggml_tensor * build_inp_embd(ggml_tensor * tok_embd) const; |
947 | | ggml_tensor * build_inp_pos() const; |
948 | | ggml_tensor * build_inp_attn_scale() const; |
949 | | ggml_tensor * build_inp_out_ids() const; |
950 | | ggml_tensor * build_inp_mean() const; |
951 | | ggml_tensor * build_inp_cls() const; |
952 | | |
953 | | ggml_tensor * build_inp_cross_embd() const; |
954 | | ggml_tensor * build_inp_pos_bucket_enc() const; |
955 | | ggml_tensor * build_inp_pos_bucket_dec() const; |
956 | | ggml_tensor * build_pos_bias(ggml_tensor * pos_bucket, ggml_tensor * attn_rel_b) const; |
957 | | |
958 | | // |
959 | | // attention (each build_attn_inp_*() returns the input type accepted by the matching build_attn overload) |
960 | | // |
961 | | |
962 | | ggml_tensor * build_attn_mha( |
963 | | ggml_tensor * q, // [n_embd_head_q, n_head_q, n_tokens] |
964 | | ggml_tensor * k, // [n_embd_head_k, n_head_k, n_tokens] |
965 | | ggml_tensor * v, // [n_embd_head_v, n_head_v, n_tokens] (v_trans == false) |
966 | | ggml_tensor * kq_b, |
967 | | ggml_tensor * kq_mask, |
968 | | ggml_tensor * sinks, // [n_head_q] |
969 | | ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v] |
970 | | float kq_scale, |
971 | | int il) const; |
972 | | |
973 | | llm_graph_input_attn_no_cache * build_attn_inp_no_cache() const; |
974 | | |
975 | | ggml_tensor * build_attn( |
976 | | llm_graph_input_attn_no_cache * inp, |
977 | | ggml_tensor * wo, |
978 | | ggml_tensor * wo_b, |
979 | | ggml_tensor * wo_s, |
980 | | ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens] |
981 | | ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] |
982 | | ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] |
983 | | ggml_tensor * kq_b, |
984 | | ggml_tensor * sinks, // [n_head_q] |
985 | | ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v] |
986 | | float kq_scale, |
987 | | int il) const; |
988 | | |
989 | | llm_graph_input_attn_kv * build_attn_inp_kv() const; |
990 | | |
991 | | ggml_tensor * build_attn( |
992 | | llm_graph_input_attn_kv * inp, |
993 | | ggml_tensor * wo, |
994 | | ggml_tensor * wo_b, |
995 | | ggml_tensor * wo_s, |
996 | | ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens] |
997 | | ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] |
998 | | ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] |
999 | | ggml_tensor * kq_b, |
1000 | | ggml_tensor * sinks, // [n_head_q] |
1001 | | ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v] // TODO: remove |
1002 | | float kq_scale, |
1003 | | int il) const; |
1004 | | |
1005 | | llm_graph_input_attn_k * build_attn_inp_k() const; |
1006 | | |
1007 | | ggml_tensor * build_attn( |
1008 | | llm_graph_input_attn_k * inp, |
1009 | | ggml_tensor * wo, |
1010 | | ggml_tensor * wo_b, |
1011 | | ggml_tensor * wo_s, |
1012 | | ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens] |
1013 | | ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] |
1014 | | ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] |
1015 | | ggml_tensor * kq_b, |
1016 | | ggml_tensor * sinks, // [n_head_q] |
1017 | | ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v] |
1018 | | float kq_scale, |
1019 | | int il) const; |
1020 | | |
1021 | | llm_graph_input_attn_k_dsa * build_attn_inp_k_dsa() const; |
1022 | | |
1023 | | ggml_tensor * build_attn( |
1024 | | llm_graph_input_attn_k_dsa * inp, |
1025 | | ggml_tensor * wo, |
1026 | | ggml_tensor * wo_b, |
1027 | | ggml_tensor * wo_s, |
1028 | | ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens] |
1029 | | ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] |
1030 | | ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] |
1031 | | ggml_tensor * kq_b, |
1032 | | ggml_tensor * sinks, // [n_head_q] |
1033 | | ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v] |
1034 | | ggml_tensor * top_k, // [n_indexer_top_k, n_tokens] |
1035 | | float kq_scale, |
1036 | | int il) const; |
1037 | | |
1038 | | llm_graph_input_attn_kv_iswa * build_attn_inp_kv_iswa() const; |
1039 | | |
1040 | | // note: if k_cur or v_cur are not provided, they will not be stored in the memory |
1041 | | ggml_tensor * build_attn( |
1042 | | llm_graph_input_attn_kv_iswa * inp, |
1043 | | ggml_tensor * wo, |
1044 | | ggml_tensor * wo_b, |
1045 | | ggml_tensor * wo_s, |
1046 | | ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens] |
1047 | | ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] optional |
1048 | | ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] optional |
1049 | | ggml_tensor * kq_b, |
1050 | | ggml_tensor * sinks, // [n_head_q] |
1051 | | ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v] |
1052 | | float kq_scale, |
1053 | | int il) const; |
1054 | | |
1055 | | llm_graph_input_attn_cross * build_attn_inp_cross() const; |
1056 | | |
1057 | | ggml_tensor * build_attn( |
1058 | | llm_graph_input_attn_cross * inp, |
1059 | | ggml_tensor * wo, |
1060 | | ggml_tensor * wo_b, |
1061 | | ggml_tensor * wo_s, |
1062 | | ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens] |
1063 | | ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] |
1064 | | ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] |
1065 | | ggml_tensor * kq_b, |
1066 | | ggml_tensor * sinks, // [n_head_q] |
1067 | | ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v] |
1068 | | float kq_scale, |
1069 | | int il) const; |
1070 | | |
1071 | | // |
1072 | | // recurrent |
1073 | | // |
1074 | | |
1075 | | // TODO: move this implementation to llama_memory_recurrent. |
1076 | | // this is analogous to llama_kv_cache::cpy_k / cpy_v |
1077 | | // when moving, avoid passing `ggml_cgraph` - only pass `ggml_context`. would likely need to split the |
1078 | | // implementation into 2 separate methods. the goal is to avoid calling `ggml_build_forward_expand` in |
1079 | | // `llama_memory_recurrent` |
1080 | | ggml_tensor * build_rs( |
1081 | | ggml_tensor * s, |
1082 | | ggml_tensor * state_copy_main, |
1083 | | ggml_tensor * state_copy_extra, |
1084 | | int32_t state_size, |
1085 | | int32_t n_seqs, |
1086 | | uint32_t n_rs, |
1087 | | uint32_t rs_head, |
1088 | | uint32_t rs_size, |
1089 | | int32_t rs_zero, |
1090 | | const llm_graph_get_rows_fn & get_state_rows = ggml_get_rows) const; |
1091 | | |
1092 | | llm_graph_input_rs * build_rs_inp() const; |
1093 | | |
1094 | | ggml_tensor * build_rs( |
1095 | | llm_graph_input_rs * inp, |
1096 | | ggml_tensor * s, |
1097 | | int32_t state_size, |
1098 | | int32_t n_seqs, |
1099 | | const llm_graph_get_rows_fn & get_state_rows = ggml_get_rows) const; |
1100 | | |
1101 | | ggml_tensor * build_rwkv_token_shift_load( |
1102 | | llm_graph_input_rs * inp, |
1103 | | const llama_ubatch & ubatch, |
1104 | | int il) const; |
1105 | | |
1106 | | ggml_tensor * build_rwkv_token_shift_store( |
1107 | | ggml_tensor * token_shift, |
1108 | | const llama_ubatch & ubatch, |
1109 | | int il) const; |
1110 | | // |
1111 | | // hybrid |
1112 | | // |
1113 | | |
1114 | | llm_graph_input_mem_hybrid * build_inp_mem_hybrid() const; |
1115 | | llm_graph_input_mem_hybrid_k * build_inp_mem_hybrid_k() const; |
1116 | | |
1117 | | llm_graph_input_mem_hybrid_iswa * build_inp_mem_hybrid_iswa() const; |
1118 | | |
1119 | | // |
1120 | | // pooling |
1121 | | // |
1122 | | |
1123 | | void build_pooling( |
1124 | | ggml_tensor * cls, |
1125 | | ggml_tensor * cls_b, |
1126 | | ggml_tensor * cls_out, |
1127 | | ggml_tensor * cls_out_b, |
1128 | | ggml_tensor * cls_norm) const; |
1129 | | |
1130 | | // |
1131 | | // sampling (backend sampling) |
1132 | | // |
1133 | | |
1134 | | void build_sampling() const; |
1135 | | |
1136 | | // |
1137 | | // dense (out) |
1138 | | // |
1139 | | |
1140 | | void build_dense_out( |
1141 | | ggml_tensor * dense_2, |
1142 | | ggml_tensor * dense_2_b, |
1143 | | ggml_tensor * dense_3) const; |
1144 | | }; |
1145 | | |
1146 | | // TODO: better name. NOTE(review): presumably maps the position pair (x, y) to one of n_buckets relative-position buckets - confirm against the definition |
1147 | | int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional); |