/src/llama.cpp/src/llama-memory-hybrid-iswa.h
Line | Count | Source |
1 | | #pragma once |
2 | | |
3 | | #include "llama-batch.h" |
4 | | #include "llama-graph.h" |
5 | | #include "llama-kv-cache-iswa.h" |
6 | | #include "llama-memory.h" |
7 | | #include "llama-memory-recurrent.h" |
8 | | |
9 | | #include <memory> |
10 | | #include <vector> |
11 | | |
12 | | // |
13 | | // llama_memory_hybrid_iswa |
14 | | // |
15 | | |
16 | | // utilizes instances of llama_memory_recurrent and llama_kv_cache_iswa to |
17 | | // support models where each layer may be either attention-based (with SWA support) or recurrent |
18 | | |
19 | | class llama_memory_hybrid_iswa : public llama_memory_i { |
20 | | public: |
21 | | llama_memory_hybrid_iswa( |
22 | | const llama_model & model, |
23 | | /* attn */ |
24 | | ggml_type type_k, |
25 | | ggml_type type_v, |
26 | | bool v_trans, |
27 | | bool swa_full, |
28 | | uint32_t kv_size, |
29 | | uint32_t n_ubatch, |
30 | | uint32_t n_pad, |
31 | | /* recurrent */ |
32 | | ggml_type type_r, |
33 | | ggml_type type_s, |
34 | | uint32_t rs_size, |
35 | | /* common */ |
36 | | uint32_t n_seq_max, |
37 | | uint32_t n_rs_seq, |
38 | | bool offload, |
39 | | bool unified, |
40 | | /* layer filters */ |
41 | | const layer_filter_cb & filter_attn = nullptr, |
42 | | const layer_filter_cb & filter_recr = nullptr); |
43 | | |
44 | 0 | ~llama_memory_hybrid_iswa() = default; |
45 | | |
46 | | // |
47 | | // llama_memory_i |
48 | | // |
49 | | |
50 | | llama_memory_context_ptr init_batch( |
51 | | llama_batch_allocr & balloc, |
52 | | uint32_t n_ubatch, |
53 | | bool embd_all) override; |
54 | | |
55 | | llama_memory_context_ptr init_full() override; |
56 | | |
57 | | llama_memory_context_ptr init_update(llama_context * lctx, bool optimize) override; |
58 | | |
59 | | bool get_can_shift() const override; |
60 | | |
61 | | void clear(bool data) override; |
62 | | |
63 | | bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override; |
64 | | void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override; |
65 | | void seq_keep(llama_seq_id seq_id) override; |
66 | | void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) override; |
67 | | void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override; |
68 | | |
69 | | llama_pos seq_pos_min(llama_seq_id seq_id) const override; |
70 | | llama_pos seq_pos_max(llama_seq_id seq_id) const override; |
71 | | |
72 | | std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override; |
73 | | |
74 | | // state write/load |
75 | | |
76 | | void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override; |
77 | | void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override; |
78 | | |
79 | | // |
80 | | // llama_memory_hybrid_iswa specific API |
81 | | // |
82 | | |
83 | | llama_kv_cache_iswa * get_mem_attn() const; |
84 | | llama_memory_recurrent * get_mem_recr() const; |
85 | | |
86 | | private: |
87 | | const llama_hparams & hparams; |
88 | | |
89 | | const std::unique_ptr<llama_kv_cache_iswa> mem_attn; |
90 | | const std::unique_ptr<llama_memory_recurrent> mem_recr; |
91 | | }; |
92 | | |
93 | | class llama_memory_hybrid_iswa_context : public llama_memory_context_i { |
94 | | public: |
95 | | using slot_info_vec_t = llama_kv_cache::slot_info_vec_t; |
96 | | |
97 | | // init failure |
98 | | explicit llama_memory_hybrid_iswa_context(llama_memory_status status); |
99 | | |
100 | | // init full |
101 | | explicit llama_memory_hybrid_iswa_context(llama_memory_hybrid_iswa * mem); |
102 | | |
103 | | // init update |
104 | | explicit llama_memory_hybrid_iswa_context( |
105 | | llama_memory_hybrid_iswa * mem, |
106 | | llama_context * lctx, |
107 | | bool optimize); |
108 | | |
109 | | // init success |
110 | | llama_memory_hybrid_iswa_context( |
111 | | llama_memory_hybrid_iswa * mem, |
112 | | slot_info_vec_t sinfos_base, |
113 | | slot_info_vec_t sinfos_swa, |
114 | | std::vector<llama_ubatch> ubatches); |
115 | | |
116 | 0 | ~llama_memory_hybrid_iswa_context() = default; |
117 | | |
118 | | bool next() override; |
119 | | bool apply() override; |
120 | | |
121 | | llama_memory_status get_status() const override; |
122 | | const llama_ubatch & get_ubatch() const override; |
123 | | |
124 | | // |
125 | | // llama_memory_hybrid_iswa_context |
126 | | // |
127 | | |
128 | | const llama_kv_cache_iswa_context * get_attn() const; |
129 | | const llama_memory_recurrent_context * get_recr() const; |
130 | | |
131 | | private: |
132 | | // the index of the next ubatch to process |
133 | | size_t i_next = 0; |
134 | | |
135 | | std::vector<llama_ubatch> ubatches; |
136 | | |
137 | | const llama_memory_context_ptr ctx_attn; |
138 | | const llama_memory_context_ptr ctx_recr; |
139 | | |
140 | | const llama_memory_status status; |
141 | | }; |