/src/llama.cpp/src/llama-kv-cache-iswa.h
Line | Count | Source |
1 | | #pragma once |
2 | | |
3 | | #include "llama-kv-cache.h" |
4 | | |
5 | | #include <vector> |
6 | | |
7 | | // |
8 | | // llama_kv_cache_iswa |
9 | | // |
10 | | |
11 | | // llama_memory_i implementation that owns two llama_kv_cache instances (the kv_base and kv_swa members below) |
12 | | // the first instance serves the non-SWA layers of the model and the second instance serves the SWA layers (presumably kv_base and kv_swa respectively, judging by the names - confirm in the constructor) |
13 | | |
14 | | class llama_kv_cache_iswa : public llama_memory_i { |
15 | | public: |
16 | | llama_kv_cache_iswa( |
17 | | const llama_model & model, |
18 | | ggml_type type_k, |
19 | | ggml_type type_v, |
20 | | bool v_trans, |
21 | | bool offload, |
22 | | bool swa_full, |
23 | | bool unified, |
24 | | uint32_t kv_size, |
25 | | uint32_t n_seq_max, |
26 | | uint32_t n_ubatch, |
27 | | uint32_t n_pad, |
28 | | llama_memory_t mem_other, |
29 | | const layer_filter_cb & filter, |
30 | | const layer_reuse_cb & reuse, |
31 | | const layer_share_cb & share); |
32 | | |
33 | 0 | ~llama_kv_cache_iswa() = default; |
34 | | |
35 | | // |
36 | | // llama_memory_i interface (every member function in this section is declared with override) |
37 | | // |
38 | | |
39 | | llama_memory_context_ptr init_batch( |
40 | | llama_batch_allocr & balloc, |
41 | | uint32_t n_ubatch, |
42 | | bool embd_all) override; |
43 | | |
44 | | llama_memory_context_ptr init_full() override; |
45 | | |
46 | | llama_memory_context_ptr init_update(llama_context * lctx, bool optimize) override; |
47 | | |
48 | | bool get_can_shift() const override; |
49 | | |
50 | | void clear(bool data) override; |
51 | | |
52 | | bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override; |
53 | | void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override; |
54 | | void seq_keep(llama_seq_id seq_id) override; |
55 | | void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) override; |
56 | | void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override; |
57 | | |
58 | | llama_pos seq_pos_min(llama_seq_id seq_id) const override; |
59 | | llama_pos seq_pos_max(llama_seq_id seq_id) const override; |
60 | | |
61 | | std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override; |
62 | | |
63 | | // state write/read: seq_id defaults to -1 and flags to 0 (presumably -1 means "all sequences" - confirm in the implementation) |
64 | | |
65 | | void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override; |
66 | | void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override; |
67 | | |
68 | | // |
69 | | // llama_kv_cache_iswa specific API: raw-pointer accessors, presumably to kv_base / kv_swa (defined out of line - confirm) |
70 | | // |
71 | | |
72 | | llama_kv_cache * get_base() const; |
73 | | llama_kv_cache * get_swa () const; |
74 | | |
75 | | private: |
76 | | const llama_hparams & hparams; |
77 | | |
78 | | const bool unified; |
79 | | |
80 | | std::unique_ptr<llama_kv_cache> kv_base; |
81 | | std::unique_ptr<llama_kv_cache> kv_swa; |
82 | | }; |
83 | | |
84 | | class llama_kv_cache_iswa_context : public llama_memory_context_i { |
85 | | public: |
86 | | using slot_info_vec_t = llama_kv_cache::slot_info_vec_t; |
87 | | |
88 | | // used for errors: the only argument is the status to report |
89 | | llama_kv_cache_iswa_context(llama_memory_status status); |
90 | | |
91 | | // used to create a full-cache context from the iSWA cache alone |
92 | | llama_kv_cache_iswa_context( |
93 | | llama_kv_cache_iswa * kv); |
94 | | |
95 | | // used to create an update context (lctx and optimize match the parameters of llama_kv_cache_iswa::init_update) |
96 | | llama_kv_cache_iswa_context( |
97 | | llama_kv_cache_iswa * kv, |
98 | | llama_context * lctx, |
99 | | bool optimize); |
100 | | |
101 | | // used to create a batch processing context from a batch: takes one slot-info vector per cache (base, SWA) plus the ubatches, all by value |
102 | | llama_kv_cache_iswa_context( |
103 | | llama_kv_cache_iswa * kv, |
104 | | slot_info_vec_t sinfos_base, |
105 | | slot_info_vec_t sinfos_swa, |
106 | | std::vector<llama_ubatch> ubatches); |
107 | | |
108 | | virtual ~llama_kv_cache_iswa_context(); |
109 | | |
110 | | // |
111 | | // llama_memory_context_i interface (every member function in this section is declared with override) |
112 | | // |
113 | | |
114 | | bool next() override; |
115 | | bool apply() override; |
116 | | |
117 | | llama_memory_status get_status() const override; |
118 | | const llama_ubatch & get_ubatch() const override; |
119 | | |
120 | | // |
121 | | // llama_kv_cache_iswa_context specific API: const accessors to the per-cache contexts, presumably backed by ctx_base / ctx_swa (confirm in the implementation) |
122 | | // |
123 | | |
124 | | const llama_kv_cache_context * get_base() const; |
125 | | const llama_kv_cache_context * get_swa() const; |
126 | | |
127 | | private: |
128 | | //llama_kv_cache_iswa * kv; // NOTE(review): disabled member - no pointer back to the owning cache is declared in this class |
129 | | |
130 | | // the index of the next ubatch to process (presumably an index into ubatches); starts at 0 |
131 | | size_t i_next = 0; |
132 | | |
133 | | std::vector<llama_ubatch> ubatches; |
134 | | |
135 | | const llama_memory_context_ptr ctx_base; |
136 | | const llama_memory_context_ptr ctx_swa; |
137 | | |
138 | | const llama_memory_status status; |
139 | | }; |