/src/llama.cpp/src/models/eagle3.cpp
Line | Count | Source |
1 | | #include "models.h" |
2 | | |
3 | 0 | void llama_model_eagle3::load_arch_hparams(llama_model_loader & ml) { |
4 | 0 | ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); |
5 | |
|
6 | 0 | if (!ml.get_arr(LLM_KV_TARGET_LAYERS, target_layer_ids, false)) { |
7 | 0 | throw std::runtime_error("EAGLE3 model requires 'extract_layers' in GGUF metadata"); |
8 | 0 | } |
9 | 0 | if (target_layer_ids.size() != 3) { |
10 | 0 | throw std::runtime_error("EAGLE3 requires exactly 3 entries in 'extract_layers'"); |
11 | 0 | } |
12 | 0 | LLAMA_LOG_INFO("%s: EAGLE3 extract_layers = [%d, %d, %d]\n", __func__, |
13 | 0 | target_layer_ids[0], |
14 | 0 | target_layer_ids[1], |
15 | 0 | target_layer_ids[2]); |
16 | |
|
17 | 0 | uint32_t n_embd_tgt = 0; |
18 | |
|
19 | 0 | ml.get_key(LLM_KV_TARGET_HIDDEN_SIZE, n_embd_tgt); |
20 | 0 | LLAMA_LOG_INFO("%s: EAGLE3 n_embd_tgt = %u (draft n_embd = %u)\n", __func__, n_embd_tgt, hparams.n_embd); |
21 | |
|
22 | 0 | hparams.n_embd_inp_enc_impl = (uint32_t) target_layer_ids.size() * n_embd_tgt; |
23 | | |
24 | | // eagle3 norm_before_residual (optional, default false) |
25 | | // compatible with Readhat eagle3 speculator model |
26 | 0 | ml.get_key(LLM_KV_NORM_BEFORE_RESIDUAL, hparams.norm_before_residual, false); |
27 | 0 | if (hparams.norm_before_residual) { |
28 | 0 | LLAMA_LOG_INFO("%s: EAGLE3gnorm_before_residual = true\n", __func__); |
29 | 0 | } |
30 | |
|
31 | 0 | type = LLM_TYPE_UNKNOWN; |
32 | 0 | } |
33 | | |
34 | 0 | void llama_model_eagle3::load_arch_tensors(llama_model_loader &) { |
35 | 0 | LLAMA_LOAD_LOCALS; |
36 | |
|
37 | 0 | const int64_t n_embd_inp = hparams.n_embd_inp_enc(); |
38 | 0 | const int64_t n_embd_attn_input = 2 * n_embd; |
39 | | |
40 | | // Get vocab size from the d2t tensor in the GGUF file (optional - only needed if eagle3 has different vocab_size than target) |
41 | | // d2t: draft to target vocabulary mapping |
42 | 0 | int64_t n_draft_vocab = n_vocab; // Default: same as target vocab |
43 | 0 | const struct ggml_tensor * d2t_meta = ml->get_tensor_meta("d2t"); |
44 | 0 | if (d2t_meta) { |
45 | 0 | n_draft_vocab = d2t_meta->ne[0]; // update draft vocab size |
46 | 0 | d2t = create_tensor(tn(LLM_TENSOR_D2T), {n_draft_vocab}, 0); |
47 | 0 | LLAMA_LOG_INFO("%s: EAGLE3 using d2t mapping (draft_vocab_size = %lld)\n", __func__, (long long)n_draft_vocab); |
48 | 0 | } else { |
49 | 0 | d2t = nullptr; // no d2t, use default vocab size |
50 | 0 | LLAMA_LOG_INFO("%s: EAGLE3 without d2t - sharing same vocab_size with target (vocab_size = %lld)\n", __func__, (long long)n_draft_vocab); |
51 | 0 | } |
52 | | |
53 | | // Feature fusion layer: projects 3 target layers to draft hidden size |
54 | 0 | fc = create_tensor(tn(LLM_TENSOR_FC, "weight"), {n_embd_inp, n_embd}, 0); |
55 | | |
56 | | // Output layer (uses draft vocab size) |
57 | 0 | output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); |
58 | 0 | output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_draft_vocab}, TENSOR_NOT_REQUIRED); |
59 | | |
60 | | // Token embeddings (optional - Llama 3.3 70B EAGLE3 has its own) |
61 | 0 | const struct ggml_tensor * tok_embd_meta = ml->get_tensor_meta(tn(LLM_TENSOR_TOKEN_EMBD, "weight").str().c_str()); |
62 | 0 | if (tok_embd_meta) { |
63 | 0 | const int64_t n_target_vocab = tok_embd_meta->ne[1]; |
64 | 0 | tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_target_vocab}, 0); |
65 | 0 | LLAMA_LOG_INFO("%s: EAGLE3 using its own token_embd (vocab = %lld)\n", __func__, (long long)n_target_vocab); |
66 | 0 | } |
67 | | |
68 | | // Single decoder layer |
69 | 0 | for (int i = 0; i < n_layer; ++i) { |
70 | 0 | auto & layer = layers[i]; |
71 | | |
72 | | // input_layernorm: applied to token embeddings |
73 | 0 | layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); |
74 | | |
75 | | // eagle3 specific: hidden_norm applied to fused target features |
76 | 0 | layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0); |
77 | | |
78 | | // Attention takes input_embeds_normed + fused_target_normed as input |
79 | 0 | layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd_attn_input, n_embd_head_k * n_head}, 0); |
80 | 0 | layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd_attn_input, n_embd_k_gqa}, 0); |
81 | 0 | layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd_attn_input, n_embd_v_gqa}, 0); |
82 | 0 | layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); |
83 | |
|
84 | 0 | layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); |
85 | 0 | layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); |
86 | 0 | layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); |
87 | 0 | layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); |
88 | | |
89 | | // rope_freqs for llama3 rope scaling (optional - only if eagle3 config has rope_scaling) |
90 | 0 | layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED); |
91 | 0 | } |
92 | 0 | } |
93 | | |
94 | 0 | std::unique_ptr<llm_graph_context> llama_model_eagle3::build_arch_graph(const llm_graph_params & params) const { |
95 | 0 | switch (params.gtype) { |
96 | 0 | case LLM_GRAPH_TYPE_ENCODER: |
97 | 0 | return std::make_unique<graph<true>>(*this, params); |
98 | 0 | case LLM_GRAPH_TYPE_DEFAULT: |
99 | 0 | case LLM_GRAPH_TYPE_DECODER: |
100 | 0 | return std::make_unique<graph<false>>(*this, params); |
101 | 0 | default: |
102 | 0 | GGML_ABORT("invalid graph type"); |
103 | 0 | }; |
104 | 0 | } |
105 | | |
106 | | template <> |
107 | 0 | ggml_tensor * llama_model_eagle3::graph<true>::build_inp_embd_enc() const { |
108 | 0 | ggml_tensor * cur = nullptr; |
109 | | |
110 | | // Input: Target model features (3 layers concatenated: low, mid, high) |
111 | | // Data will be provided via ubatch->embd in encode_eagle3_features() |
112 | 0 | auto inp_target = std::make_unique<llm_graph_input_embd>(hparams.n_embd_inp_enc()); |
113 | 0 | inp_target->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd_inp_enc(), n_tokens); |
114 | 0 | ggml_set_input(inp_target->embd); |
115 | |
|
116 | 0 | cur = inp_target->embd; |
117 | 0 | cb(cur, "inp_embd", -1); |
118 | |
|
119 | 0 | res->add_input(std::move(inp_target)); |
120 | |
|
121 | 0 | return cur; |
122 | 0 | } |
123 | | |
124 | | // eagle3 Encoder: processes target model features through feature fusion layer |
125 | | // Input: target_features e.g. [12288, n_tokens] from target model layers low, middle, high |
126 | | // Output: g_embeddings e.g. [4096, n_tokens] stored in context |
127 | | template <> |
128 | 0 | llama_model_eagle3::graph<true>::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { |
129 | 0 | ggml_tensor * cur = nullptr; |
130 | |
|
131 | 0 | cur = build_inp_embd_enc(); |
132 | | |
133 | | // Feature fusion layer |
134 | 0 | cur = build_lora_mm(model.fc, cur); |
135 | 0 | cb(cur, "fc_out", -1); |
136 | | |
137 | | // Output: g_embeddings e.g. [4096, n_tokens] |
138 | | // store in t_h_nextn (same as MTP) so can be read via llama_get_embeddings_nextn(ctx_dft) |
139 | 0 | ggml_set_output(cur); |
140 | 0 | res->t_h_nextn = cur; |
141 | |
|
142 | 0 | ggml_build_forward_expand(gf, cur); |
143 | 0 | } |
144 | | |
145 | | // eagle3 Decoder: processes draft tokens using g_embeddings from encoder |
146 | | // Input: draft tokens + g_embeddings from encoder |
147 | | // Output: draft logits |
148 | | template <> |
149 | 0 | llama_model_eagle3::graph<false>::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { |
150 | 0 | const int64_t n_embd_head = hparams.n_embd_head_v(); |
151 | |
|
152 | 0 | GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); |
153 | 0 | GGML_ASSERT(n_layer == 1); // eagle3 has only one decoder layer |
154 | |
|
155 | 0 | ggml_tensor * cur; |
156 | 0 | ggml_tensor * inpL; |
157 | | |
158 | | // eagle3 Decoder receives: |
159 | | // 1. Token embeddings (e.g.from eagle3's own tok_embd for Llama 3.3 70B, or target model for Llama 3.1 8B) |
160 | | // 2. g_embeddings from encoder |
161 | 0 | auto * tok_embd = model.tok_embd; |
162 | 0 | if (model.tok_embd == nullptr) { |
163 | 0 | GGML_ASSERT(cparams.ctx_other != nullptr); |
164 | 0 | const auto * model_other = llama_get_model(cparams.ctx_other); |
165 | |
|
166 | 0 | GGML_ASSERT(model_other->tok_embd != nullptr && "EAGLE3 decoder requires token embeddings (own or from target model)"); |
167 | 0 | tok_embd = model_other->tok_embd; |
168 | 0 | } |
169 | |
|
170 | 0 | auto inp = std::make_unique<llm_graph_input_embd>(n_embd); |
171 | |
|
172 | 0 | inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); |
173 | 0 | ggml_set_input(inp->tokens); |
174 | |
|
175 | 0 | inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); |
176 | 0 | ggml_set_input(inp->embd); |
177 | |
|
178 | 0 | ggml_tensor * inp_embd = ggml_get_rows(ctx0, tok_embd, inp->tokens); |
179 | 0 | cb(inp_embd, "inp_embd", -1); |
180 | |
|
181 | 0 | ggml_tensor * inp_g = inp->embd; |
182 | 0 | cb(inp_g, "inp_g_embeddings", -1); |
183 | |
|
184 | 0 | res->add_input(std::move(inp)); |
185 | |
|
186 | 0 | inpL = inp_g; |
187 | | |
188 | | // inp_pos - contains the positions |
189 | 0 | ggml_tensor * inp_pos = build_inp_pos(); |
190 | |
|
191 | 0 | auto * inp_attn = build_attn_inp_kv(); |
192 | |
|
193 | 0 | const float kq_scale = 1.0f/sqrtf(float(n_embd_head)); |
194 | | |
195 | | // Single decoder layer (il = 0) |
196 | 0 | const int il = 0; |
197 | 0 | { |
198 | | // Apply input_layernorm to the token embeddings |
199 | 0 | ggml_tensor * embd_norm = build_norm(inp_embd, |
200 | 0 | model.layers[il].attn_norm, NULL, |
201 | 0 | LLM_NORM_RMS, il); |
202 | 0 | cb(embd_norm, "embd_norm", il); |
203 | | |
204 | | // Apply hidden_norm to inp_g |
205 | 0 | ggml_tensor * g_norm = build_norm(inp_g, |
206 | 0 | model.layers[il].attn_norm_2, NULL, |
207 | 0 | LLM_NORM_RMS, -1); |
208 | 0 | cb(g_norm, "g_norm", il); |
209 | | |
210 | | // norm_before_residual: determines what goes into the residual connection (compatible with Readhat eagle3 speculator model) |
211 | | // - false (default): use raw inp_g for residual |
212 | | // - true: use normalized g_norm for residual |
213 | | // inpL is the concatenated input (normalized inp_embd + normalized inp_g) |
214 | 0 | ggml_tensor * inpSA = hparams.norm_before_residual ? g_norm : inpL; |
215 | | |
216 | | // Concatenate normalized inp_embd and normalized inp_g |
217 | 0 | cur = ggml_concat(ctx0, embd_norm, g_norm, il); |
218 | 0 | cb(cur, "concat_embd", il); |
219 | | |
220 | | // Self-attention with concatenated input |
221 | 0 | ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); |
222 | 0 | cb(Qcur, "Qcur", il); |
223 | |
|
224 | 0 | ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); |
225 | 0 | cb(Kcur, "Kcur", il); |
226 | |
|
227 | 0 | ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); |
228 | 0 | cb(Vcur, "Vcur", il); |
229 | |
|
230 | 0 | Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); |
231 | 0 | Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); |
232 | 0 | Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); |
233 | | |
234 | | // rope freq factors, returns nullptr if not available |
235 | 0 | ggml_tensor * rope_factors = model.get_rope_factors(cparams, il); |
236 | | |
237 | | // RoPE |
238 | 0 | Qcur = ggml_rope_ext( |
239 | 0 | ctx0, Qcur, inp_pos, rope_factors, |
240 | 0 | n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
241 | 0 | ext_factor, attn_factor, beta_fast, beta_slow |
242 | 0 | ); |
243 | 0 | Kcur = ggml_rope_ext( |
244 | 0 | ctx0, Kcur, inp_pos, rope_factors, |
245 | 0 | n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
246 | 0 | ext_factor, attn_factor, beta_fast, beta_slow |
247 | 0 | ); |
248 | |
|
249 | 0 | cb(Qcur, "Qcur_rope", il); |
250 | 0 | cb(Kcur, "Kcur_rope", il); |
251 | |
|
252 | 0 | cur = build_attn(inp_attn, |
253 | 0 | model.layers[il].wo, NULL, nullptr, |
254 | 0 | Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); |
255 | | |
256 | | // Add residual and update it |
257 | 0 | ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); |
258 | 0 | cb(ffn_inp, "ffn_inp", il); |
259 | | |
260 | | // Apply FFN norm to the sum |
261 | 0 | cur = build_norm(ffn_inp, |
262 | 0 | model.layers[il].ffn_norm, NULL, |
263 | 0 | LLM_NORM_RMS, il); |
264 | 0 | cb(cur, "post_attn_norm", il); |
265 | |
|
266 | 0 | cur = build_ffn(cur, |
267 | 0 | model.layers[il].ffn_up, NULL, NULL, |
268 | 0 | model.layers[il].ffn_gate, NULL, NULL, |
269 | 0 | model.layers[il].ffn_down, NULL, NULL, |
270 | 0 | NULL, |
271 | 0 | LLM_FFN_SILU, LLM_FFN_PAR, il); |
272 | 0 | cb(cur, "ffn_out", il); |
273 | | |
274 | | // Output norm with residual |
275 | 0 | cur = ggml_add(ctx0, cur, ffn_inp); |
276 | 0 | cb(cur, "eagle3_prenorm", il); |
277 | |
|
278 | 0 | inpL = cur; |
279 | 0 | } |
280 | |
|
281 | 0 | cur = inpL; |
282 | | |
283 | | // Output prenorm state (for next token's g_embeddings in autoregressive generation) |
284 | 0 | ggml_set_output(cur); |
285 | 0 | res->t_h_nextn = cur; |
286 | |
|
287 | 0 | cur = build_norm(cur, |
288 | 0 | model.output_norm, NULL, |
289 | 0 | LLM_NORM_RMS, -1); |
290 | 0 | cb(cur, "result_norm", -1); |
291 | | |
292 | | // lm_head - projects to draft vocabulary |
293 | | // if the draft has no own output projection, inherit the target model's lm_head |
294 | 0 | auto * output = model.output; |
295 | 0 | if (output == nullptr) { |
296 | 0 | GGML_ASSERT(cparams.ctx_other != nullptr); |
297 | 0 | const auto * model_other = llama_get_model(cparams.ctx_other); |
298 | |
|
299 | 0 | GGML_ASSERT(model_other->output != nullptr && "EAGLE3 decoder requires an output projection (own or from target model)"); |
300 | 0 | output = model_other->output; |
301 | 0 | } |
302 | 0 | cur = build_lora_mm(output, cur); |
303 | |
|
304 | 0 | if (model.d2t) { |
305 | 0 | const int64_t n_draft_vocab = cur->ne[0]; |
306 | 0 | const int64_t n_outputs = cur->ne[1]; |
307 | 0 | const int64_t n_vocab = (int64_t) model.vocab.n_tokens(); |
308 | |
|
309 | 0 | GGML_ASSERT(model.d2t->type == GGML_TYPE_I64); |
310 | 0 | GGML_ASSERT(model.d2t->ne[0] == n_draft_vocab); |
311 | |
|
312 | 0 | ggml_tensor * logits = ggml_fill(ctx0, ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, n_vocab, n_outputs), -INFINITY); |
313 | 0 | cur = ggml_set_rows(ctx0, logits, |
314 | 0 | ggml_reshape_3d(ctx0, cur, 1, n_draft_vocab, n_outputs), |
315 | 0 | ggml_reshape_3d(ctx0, model.d2t, n_draft_vocab, 1, 1)); |
316 | 0 | cur = ggml_reshape_2d(ctx0, cur, n_vocab, n_outputs); |
317 | 0 | } |
318 | |
|
319 | 0 | cb(cur, "result_output", -1); |
320 | 0 | res->t_logits = cur; |
321 | |
|
322 | 0 | ggml_build_forward_expand(gf, cur); |
323 | 0 | } |