/src/llama.cpp/src/models/eagle3.cpp

Source
#include "models.h"

// Reads the EAGLE3-specific hyperparameters from the GGUF metadata via the model loader.
//
// Values read:
//   - LLM_KV_ATTENTION_LAYERNORM_RMS_EPS -> hparams.f_norm_rms_eps
//   - LLM_KV_TARGET_LAYERS               -> target_layer_ids (must hold exactly 3 entries)
//   - LLM_KV_TARGET_HIDDEN_SIZE          -> hidden size of the target model (n_embd_tgt)
//   - LLM_KV_NORM_BEFORE_RESIDUAL        -> hparams.norm_before_residual (read as optional)
//
// The encoder input width (hparams.n_embd_inp_enc_impl) is derived as
// <number of target layers> * <target hidden size>.
//
// Throws std::runtime_error when the target layer list is missing or does not contain
// exactly 3 entries.
void llama_model_eagle3::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

    // the array is requested as non-required so that a more specific error can be raised here
    if (!ml.get_arr(LLM_KV_TARGET_LAYERS, target_layer_ids, false)) {
        throw std::runtime_error("EAGLE3 model requires 'extract_layers' in GGUF metadata");
    }
    // the size check must precede the indexed accesses in the log statement below
    if (target_layer_ids.size() != 3) {
        throw std::runtime_error("EAGLE3 requires exactly 3 entries in 'extract_layers'");
    }
    LLAMA_LOG_INFO("%s: EAGLE3 extract_layers = [%d, %d, %d]\n", __func__,
            target_layer_ids[0],
            target_layer_ids[1],
            target_layer_ids[2]);

    uint32_t n_embd_tgt = 0;

    ml.get_key(LLM_KV_TARGET_HIDDEN_SIZE, n_embd_tgt);
    LLAMA_LOG_INFO("%s: EAGLE3 n_embd_tgt = %u (draft n_embd = %u)\n", __func__, n_embd_tgt, hparams.n_embd);

    // the encoder consumes the hidden states of all target layers concatenated together
    hparams.n_embd_inp_enc_impl = (uint32_t) target_layer_ids.size() * n_embd_tgt;

    // eagle3 norm_before_residual (optional, default false)
    // compatible with the Red Hat eagle3 speculator model
    ml.get_key(LLM_KV_NORM_BEFORE_RESIDUAL, hparams.norm_before_residual, false);
    if (hparams.norm_before_residual) {
        LLAMA_LOG_INFO("%s: EAGLE3 norm_before_residual = true\n", __func__);
    }

    // no size-specific model type is assigned for this architecture
    type = LLM_TYPE_UNKNOWN;
}

// Creates the tensors of the EAGLE3 draft model, in this order:
//   1. optional d2t (draft -> target vocabulary map), which also fixes the draft vocab size
//   2. fc, the feature fusion projection (encoder input width -> n_embd)
//   3. output_norm and the optional output projection (sized by the draft vocab)
//   4. optional own token embeddings
//   5. the per-layer norms, attention weights, FFN weights and optional rope_freqs
void llama_model_eagle3::load_arch_tensors(llama_model_loader &) {
    LLAMA_LOAD_LOCALS;

    const int64_t fused_in_dim = hparams.n_embd_inp_enc();
    // attention consumes two n_embd-wide vectors side by side
    const int64_t attn_in_dim  = 2 * n_embd;

    // d2t is optional: it is only present when the draft vocab differs from the target vocab.
    // When it exists, its length defines the draft vocab size; otherwise n_vocab is used.
    int64_t draft_vocab = n_vocab;
    const struct ggml_tensor * d2t_info = ml->get_tensor_meta("d2t");
    if (d2t_info == nullptr) {
        d2t = nullptr;
        LLAMA_LOG_INFO("%s: EAGLE3 without d2t - sharing same vocab_size with target (vocab_size = %lld)\n", __func__, static_cast<long long>(draft_vocab));
    } else {
        draft_vocab = d2t_info->ne[0];
        d2t = create_tensor(tn(LLM_TENSOR_D2T), {draft_vocab}, 0);
        LLAMA_LOG_INFO("%s: EAGLE3 using d2t mapping (draft_vocab_size = %lld)\n", __func__, static_cast<long long>(draft_vocab));
    }

    // fusion of the concatenated target features down to the draft hidden size
    fc = create_tensor(tn(LLM_TENSOR_FC, "weight"), {fused_in_dim, n_embd}, 0);

    // final norm + lm_head; the lm_head may be absent from the file
    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, draft_vocab}, TENSOR_NOT_REQUIRED);

    // own token embeddings are only created when the file actually carries them;
    // the vocab dimension is taken from the stored tensor rather than from n_vocab
    const auto own_embd_name = tn(LLM_TENSOR_TOKEN_EMBD, "weight").str();
    const struct ggml_tensor * own_embd_info = ml->get_tensor_meta(own_embd_name.c_str());
    if (own_embd_info != nullptr) {
        const int64_t own_vocab = own_embd_info->ne[1];
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, own_vocab}, 0);
        LLAMA_LOG_INFO("%s: EAGLE3 using its own token_embd (vocab = %lld)\n", __func__, static_cast<long long>(own_vocab));
    }

    // total width of the query projection (all heads)
    const int64_t q_out_dim = n_embd_head_k * n_head;

    for (int idx = 0; idx < n_layer; ++idx) {
        auto & lyr = layers[idx];

        // norm for the token embeddings
        lyr.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM,   "weight", idx), {n_embd}, 0);
        // second norm, used for the fused target features
        lyr.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", idx), {n_embd}, 0);

        // Q/K/V read the doubled-width attention input, the output projection maps back to n_embd
        lyr.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", idx), {attn_in_dim, q_out_dim}, 0);
        lyr.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", idx), {attn_in_dim, n_embd_k_gqa}, 0);
        lyr.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", idx), {attn_in_dim, n_embd_v_gqa}, 0);
        lyr.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", idx), {q_out_dim, n_embd}, 0);

        lyr.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", idx), {n_embd}, 0);
        lyr.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", idx), {n_embd, n_ff}, 0);
        lyr.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", idx), {n_ff, n_embd}, 0);
        lyr.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", idx), {n_embd, n_ff}, 0);

        // optional rope frequency factors
        lyr.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", idx), {n_rot/2}, TENSOR_NOT_REQUIRED);
    }
}

// Selects the graph builder matching the requested graph type:
// the encoder graph for LLM_GRAPH_TYPE_ENCODER, the decoder graph for
// LLM_GRAPH_TYPE_DEFAULT / LLM_GRAPH_TYPE_DECODER. Any other type aborts.
std::unique_ptr<llm_graph_context> llama_model_eagle3::build_arch_graph(const llm_graph_params & params) const {
    const auto kind = params.gtype;

    if (kind == LLM_GRAPH_TYPE_ENCODER) {
        return std::make_unique<graph<true>>(*this, params);
    }

    if (kind == LLM_GRAPH_TYPE_DEFAULT || kind == LLM_GRAPH_TYPE_DECODER) {
        return std::make_unique<graph<false>>(*this, params);
    }

    GGML_ABORT("invalid graph type");
}

// Creates the encoder input: an F32 tensor of shape [n_embd_inp_enc, n_tokens] that
// receives the concatenated target-model features, registers it as a graph input
// and returns it.
template <>
ggml_tensor * llama_model_eagle3::graph<true>::build_inp_embd_enc() const {
    // width of the concatenated target features
    const auto n_feat = hparams.n_embd_inp_enc();

    auto feat_inp = std::make_unique<llm_graph_input_embd>(n_feat);
    feat_inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_feat, n_tokens);
    ggml_set_input(feat_inp->embd);

    // grab the raw tensor pointer before ownership of the input object moves into the result
    ggml_tensor * feats = feat_inp->embd;
    cb(feats, "inp_embd", -1);

    res->add_input(std::move(feat_inp));

    return feats;
}

// EAGLE3 encoder graph: runs the target-model features through the feature fusion layer.
//   input : target features, e.g. [12288, n_tokens] (low / middle / high target layers)
//   output: g_embeddings,    e.g. [4096, n_tokens], exposed through res->t_h_nextn
template <>
llama_model_eagle3::graph<true>::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
    ggml_tensor * feats = build_inp_embd_enc();

    // fuse the concatenated features with the fc projection
    ggml_tensor * fused = build_lora_mm(model.fc, feats);
    cb(fused, "fc_out", -1);

    // the result is published in t_h_nextn (same slot as MTP) so that it can be read
    // via llama_get_embeddings_nextn(ctx_dft)
    ggml_set_output(fused);
    res->t_h_nextn = fused;

    ggml_build_forward_expand(gf, fused);
}

// EAGLE3 decoder graph: a single transformer layer that turns draft tokens plus the
// g_embeddings produced by the encoder into draft logits.
//
// Inputs (registered on `res`):
//   - tokens: I32 [n_tokens], looked up in the token embedding table
//   - embd:   F32 [n_embd, n_tokens], the g_embeddings
// Outputs:
//   - res->t_h_nextn: the pre-norm hidden state of the layer
//   - res->t_logits:  the logits; when a d2t map is present they are scattered into a
//                     target-vocab sized tensor
//
// Token embeddings and the output projection fall back to the model behind
// cparams.ctx_other when the draft model does not carry its own (asserted non-null).
template <>
llama_model_eagle3::graph<false>::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
    const int64_t n_embd_head = hparams.n_embd_head_v();

    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
    GGML_ASSERT(n_layer == 1);  // eagle3 has only one decoder layer

    ggml_tensor * cur;
    ggml_tensor * inpL;

    // eagle3 Decoder receives:
    // 1. Token embeddings (e.g. from eagle3's own tok_embd for Llama 3.3 70B, or target model for Llama 3.1 8B)
    // 2. g_embeddings from encoder
    auto * tok_embd = model.tok_embd;
    if (model.tok_embd == nullptr) {
        GGML_ASSERT(cparams.ctx_other != nullptr);
        const auto * model_other = llama_get_model(cparams.ctx_other);

        GGML_ASSERT(model_other->tok_embd != nullptr && "EAGLE3 decoder requires token embeddings (own or from target model)");
        tok_embd = model_other->tok_embd;
    }

    // a single input object carries both the token ids and the g_embeddings
    auto inp = std::make_unique<llm_graph_input_embd>(n_embd);

    inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
    ggml_set_input(inp->tokens);

    inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
    ggml_set_input(inp->embd);

    ggml_tensor * inp_embd = ggml_get_rows(ctx0, tok_embd, inp->tokens);
    cb(inp_embd, "inp_embd", -1);

    ggml_tensor * inp_g = inp->embd;
    cb(inp_g, "inp_g_embeddings", -1);

    res->add_input(std::move(inp));

    // the residual stream starts from the raw (un-normalized) g_embeddings
    inpL = inp_g;

    // inp_pos - contains the positions
    ggml_tensor * inp_pos = build_inp_pos();

    auto * inp_attn = build_attn_inp_kv();

    const float kq_scale = 1.0f/sqrtf(float(n_embd_head));

    // Single decoder layer (il = 0)
    const int il = 0;
    {
        // Apply input_layernorm to the token embeddings
        ggml_tensor * embd_norm = build_norm(inp_embd,
                model.layers[il].attn_norm, NULL,
                LLM_NORM_RMS, il);
        cb(embd_norm, "embd_norm", il);

        // Apply hidden_norm to inp_g
        // NOTE(review): the layer index passed to build_norm is -1 here while the other
        // per-layer norms pass il - confirm whether this is intentional
        ggml_tensor * g_norm = build_norm(inp_g,
                model.layers[il].attn_norm_2, NULL,
                LLM_NORM_RMS, -1);
        cb(g_norm, "g_norm", il);

        // norm_before_residual: determines what goes into the residual connection (compatible with the Red Hat eagle3 speculator model)
        // - false (default): use raw inp_g for residual
        // - true: use normalized g_norm for residual
        // at this point inpL still refers to the raw inp_g
        ggml_tensor * inpSA = hparams.norm_before_residual ? g_norm : inpL;

        // Concatenate normalized inp_embd and normalized inp_g along dimension 0.
        // The last argument of ggml_concat is the concatenation dimension, not a layer
        // index, so it is spelled out explicitly instead of reusing il.
        const int concat_dim = 0;
        cur = ggml_concat(ctx0, embd_norm, g_norm, concat_dim);
        cb(cur, "concat_embd", il);

        // Self-attention with concatenated input
        ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
        cb(Qcur, "Qcur", il);

        ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
        cb(Kcur, "Kcur", il);

        ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
        cb(Vcur, "Vcur", il);

        // split the projections into heads
        Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
        Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
        Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

        // rope freq factors, returns nullptr if not available
        ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);

        // RoPE
        Qcur = ggml_rope_ext(
                ctx0, Qcur, inp_pos, rope_factors,
                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                ext_factor, attn_factor, beta_fast, beta_slow
                );
        Kcur = ggml_rope_ext(
                ctx0, Kcur, inp_pos, rope_factors,
                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                ext_factor, attn_factor, beta_fast, beta_slow
                );

        cb(Qcur, "Qcur_rope", il);
        cb(Kcur, "Kcur_rope", il);

        cur = build_attn(inp_attn,
                model.layers[il].wo, NULL, nullptr,
                Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);

        // Add residual and update it
        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
        cb(ffn_inp, "ffn_inp", il);

        // Apply FFN norm to the sum
        cur = build_norm(ffn_inp,
                model.layers[il].ffn_norm, NULL,
                LLM_NORM_RMS, il);
        cb(cur, "post_attn_norm", il);

        cur = build_ffn(cur,
                model.layers[il].ffn_up,   NULL, NULL,
                model.layers[il].ffn_gate, NULL, NULL,
                model.layers[il].ffn_down, NULL, NULL,
                NULL,
                LLM_FFN_SILU, LLM_FFN_PAR, il);
        cb(cur, "ffn_out", il);

        // Second residual: FFN output + FFN input (still un-normalized)
        cur = ggml_add(ctx0, cur, ffn_inp);
        cb(cur, "eagle3_prenorm", il);

        inpL = cur;
    }

    cur = inpL;

    // Output prenorm state (for next token's g_embeddings in autoregressive generation)
    ggml_set_output(cur);
    res->t_h_nextn = cur;

    cur = build_norm(cur,
            model.output_norm, NULL,
            LLM_NORM_RMS, -1);
    cb(cur, "result_norm", -1);

    // lm_head - projects to draft vocabulary
    // if the draft has no own output projection, inherit the target model's lm_head
    auto * output = model.output;
    if (output == nullptr) {
        GGML_ASSERT(cparams.ctx_other != nullptr);
        const auto * model_other = llama_get_model(cparams.ctx_other);

        GGML_ASSERT(model_other->output != nullptr && "EAGLE3 decoder requires an output projection (own or from target model)");
        output = model_other->output;
    }
    cur = build_lora_mm(output, cur);

    if (model.d2t) {
        // Map draft-vocab logits onto the vocabulary of model.vocab: start from a tensor filled
        // with -INFINITY and write each draft logit to the row given by its d2t entry.
        const int64_t n_draft_vocab = cur->ne[0];
        const int64_t n_outputs     = cur->ne[1];
        const int64_t n_vocab       = (int64_t) model.vocab.n_tokens();

        GGML_ASSERT(model.d2t->type == GGML_TYPE_I64);
        GGML_ASSERT(model.d2t->ne[0] == n_draft_vocab);

        // the leading dimension of size 1 makes every logit its own "row" for ggml_set_rows
        ggml_tensor * logits = ggml_fill(ctx0, ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, n_vocab, n_outputs), -INFINITY);
        cur = ggml_set_rows(ctx0, logits,
                ggml_reshape_3d(ctx0, cur,       1,             n_draft_vocab, n_outputs),
                ggml_reshape_3d(ctx0, model.d2t, n_draft_vocab, 1,             1));
        cur = ggml_reshape_2d(ctx0, cur, n_vocab, n_outputs);
    }

    cb(cur, "result_output", -1);
    res->t_logits = cur;

    ggml_build_forward_expand(gf, cur);
}

Coverage Report

Created: 2026-06-22 06:47

Line	Count	Source
1		#include "models.h"
2
3	0	void llama_model_eagle3::load_arch_hparams(llama_model_loader & ml) {
4	0	ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
5
6	0	if (!ml.get_arr(LLM_KV_TARGET_LAYERS, target_layer_ids, false)) {
7	0	throw std::runtime_error("EAGLE3 model requires 'extract_layers' in GGUF metadata");
8	0	}
9	0	if (target_layer_ids.size() != 3) {
10	0	throw std::runtime_error("EAGLE3 requires exactly 3 entries in 'extract_layers'");
11	0	}
12	0	LLAMA_LOG_INFO("%s: EAGLE3 extract_layers = [%d, %d, %d]\n", __func__,
13	0	target_layer_ids[0],
14	0	target_layer_ids[1],
15	0	target_layer_ids[2]);
16
17	0	uint32_t n_embd_tgt = 0;
18
19	0	ml.get_key(LLM_KV_TARGET_HIDDEN_SIZE, n_embd_tgt);
20	0	LLAMA_LOG_INFO("%s: EAGLE3 n_embd_tgt = %u (draft n_embd = %u)\n", __func__, n_embd_tgt, hparams.n_embd);
21
22	0	hparams.n_embd_inp_enc_impl = (uint32_t) target_layer_ids.size() * n_embd_tgt;
23
24		// eagle3 norm_before_residual (optional, default false)
25		// compatible with Readhat eagle3 speculator model
26	0	ml.get_key(LLM_KV_NORM_BEFORE_RESIDUAL, hparams.norm_before_residual, false);
27	0	if (hparams.norm_before_residual) {
28	0	LLAMA_LOG_INFO("%s: EAGLE3gnorm_before_residual = true\n", __func__);
29	0	}
30
31	0	type = LLM_TYPE_UNKNOWN;
32	0	}
33
34	0	void llama_model_eagle3::load_arch_tensors(llama_model_loader &) {
35	0	LLAMA_LOAD_LOCALS;
36
37	0	const int64_t n_embd_inp = hparams.n_embd_inp_enc();
38	0	const int64_t n_embd_attn_input = 2 * n_embd;
39
40		// Get vocab size from the d2t tensor in the GGUF file (optional - only needed if eagle3 has different vocab_size than target)
41		// d2t: draft to target vocabulary mapping
42	0	int64_t n_draft_vocab = n_vocab; // Default: same as target vocab
43	0	const struct ggml_tensor * d2t_meta = ml->get_tensor_meta("d2t");
44	0	if (d2t_meta) {
45	0	n_draft_vocab = d2t_meta->ne[0]; // update draft vocab size
46	0	d2t = create_tensor(tn(LLM_TENSOR_D2T), {n_draft_vocab}, 0);
47	0	LLAMA_LOG_INFO("%s: EAGLE3 using d2t mapping (draft_vocab_size = %lld)\n", __func__, (long long)n_draft_vocab);
48	0	} else {
49	0	d2t = nullptr; // no d2t, use default vocab size
50	0	LLAMA_LOG_INFO("%s: EAGLE3 without d2t - sharing same vocab_size with target (vocab_size = %lld)\n", __func__, (long long)n_draft_vocab);
51	0	}
52
53		// Feature fusion layer: projects 3 target layers to draft hidden size
54	0	fc = create_tensor(tn(LLM_TENSOR_FC, "weight"), {n_embd_inp, n_embd}, 0);
55
56		// Output layer (uses draft vocab size)
57	0	output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
58	0	output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_draft_vocab}, TENSOR_NOT_REQUIRED);
59
60		// Token embeddings (optional - Llama 3.3 70B EAGLE3 has its own)
61	0	const struct ggml_tensor * tok_embd_meta = ml->get_tensor_meta(tn(LLM_TENSOR_TOKEN_EMBD, "weight").str().c_str());
62	0	if (tok_embd_meta) {
63	0	const int64_t n_target_vocab = tok_embd_meta->ne[1];
64	0	tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_target_vocab}, 0);
65	0	LLAMA_LOG_INFO("%s: EAGLE3 using its own token_embd (vocab = %lld)\n", __func__, (long long)n_target_vocab);
66	0	}
67
68		// Single decoder layer
69	0	for (int i = 0; i < n_layer; ++i) {
70	0	auto & layer = layers[i];
71
72		// input_layernorm: applied to token embeddings
73	0	layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
74
75		// eagle3 specific: hidden_norm applied to fused target features
76	0	layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
77
78		// Attention takes input_embeds_normed + fused_target_normed as input
79	0	layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd_attn_input, n_embd_head_k * n_head}, 0);
80	0	layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd_attn_input, n_embd_k_gqa}, 0);
81	0	layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd_attn_input, n_embd_v_gqa}, 0);
82	0	layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
83
84	0	layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
85	0	layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
86	0	layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
87	0	layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
88
89		// rope_freqs for llama3 rope scaling (optional - only if eagle3 config has rope_scaling)
90	0	layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED);
91	0	}
92	0	}
93
94	0	std::unique_ptr<llm_graph_context> llama_model_eagle3::build_arch_graph(const llm_graph_params & params) const {
95	0	switch (params.gtype) {
96	0	case LLM_GRAPH_TYPE_ENCODER:
97	0	return std::make_unique<graph<true>>(*this, params);
98	0	case LLM_GRAPH_TYPE_DEFAULT:
99	0	case LLM_GRAPH_TYPE_DECODER:
100	0	return std::make_unique<graph<false>>(*this, params);
101	0	default:
102	0	GGML_ABORT("invalid graph type");
103	0	};
104	0	}
105
106		template <>
107	0	ggml_tensor * llama_model_eagle3::graph<true>::build_inp_embd_enc() const {
108	0	ggml_tensor * cur = nullptr;
109
110		// Input: Target model features (3 layers concatenated: low, mid, high)
111		// Data will be provided via ubatch->embd in encode_eagle3_features()
112	0	auto inp_target = std::make_unique<llm_graph_input_embd>(hparams.n_embd_inp_enc());
113	0	inp_target->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd_inp_enc(), n_tokens);
114	0	ggml_set_input(inp_target->embd);
115
116	0	cur = inp_target->embd;
117	0	cb(cur, "inp_embd", -1);
118
119	0	res->add_input(std::move(inp_target));
120
121	0	return cur;
122	0	}
123
124		// eagle3 Encoder: processes target model features through feature fusion layer
125		// Input: target_features e.g. [12288, n_tokens] from target model layers low, middle, high
126		// Output: g_embeddings e.g. [4096, n_tokens] stored in context
127		template <>
128	0	llama_model_eagle3::graph<true>::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
129	0	ggml_tensor * cur = nullptr;
130
131	0	cur = build_inp_embd_enc();
132
133		// Feature fusion layer
134	0	cur = build_lora_mm(model.fc, cur);
135	0	cb(cur, "fc_out", -1);
136
137		// Output: g_embeddings e.g. [4096, n_tokens]
138		// store in t_h_nextn (same as MTP) so can be read via llama_get_embeddings_nextn(ctx_dft)
139	0	ggml_set_output(cur);
140	0	res->t_h_nextn = cur;
141
142	0	ggml_build_forward_expand(gf, cur);
143	0	}
144
145		// eagle3 Decoder: processes draft tokens using g_embeddings from encoder
146		// Input: draft tokens + g_embeddings from encoder
147		// Output: draft logits
148		template <>
149	0	llama_model_eagle3::graph<false>::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
150	0	const int64_t n_embd_head = hparams.n_embd_head_v();
151
152	0	GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
153	0	GGML_ASSERT(n_layer == 1); // eagle3 has only one decoder layer
154
155	0	ggml_tensor * cur;
156	0	ggml_tensor * inpL;
157
158		// eagle3 Decoder receives:
159		// 1. Token embeddings (e.g.from eagle3's own tok_embd for Llama 3.3 70B, or target model for Llama 3.1 8B)
160		// 2. g_embeddings from encoder
161	0	auto * tok_embd = model.tok_embd;
162	0	if (model.tok_embd == nullptr) {
163	0	GGML_ASSERT(cparams.ctx_other != nullptr);
164	0	const auto * model_other = llama_get_model(cparams.ctx_other);
165
166	0	GGML_ASSERT(model_other->tok_embd != nullptr && "EAGLE3 decoder requires token embeddings (own or from target model)");
167	0	tok_embd = model_other->tok_embd;
168	0	}
169
170	0	auto inp = std::make_unique<llm_graph_input_embd>(n_embd);
171
172	0	inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
173	0	ggml_set_input(inp->tokens);
174
175	0	inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
176	0	ggml_set_input(inp->embd);
177
178	0	ggml_tensor * inp_embd = ggml_get_rows(ctx0, tok_embd, inp->tokens);
179	0	cb(inp_embd, "inp_embd", -1);
180
181	0	ggml_tensor * inp_g = inp->embd;
182	0	cb(inp_g, "inp_g_embeddings", -1);
183
184	0	res->add_input(std::move(inp));
185
186	0	inpL = inp_g;
187
188		// inp_pos - contains the positions
189	0	ggml_tensor * inp_pos = build_inp_pos();
190
191	0	auto * inp_attn = build_attn_inp_kv();
192
193	0	const float kq_scale = 1.0f/sqrtf(float(n_embd_head));
194
195		// Single decoder layer (il = 0)
196	0	const int il = 0;
197	0	{
198		// Apply input_layernorm to the token embeddings
199	0	ggml_tensor * embd_norm = build_norm(inp_embd,
200	0	model.layers[il].attn_norm, NULL,
201	0	LLM_NORM_RMS, il);
202	0	cb(embd_norm, "embd_norm", il);
203
204		// Apply hidden_norm to inp_g
205	0	ggml_tensor * g_norm = build_norm(inp_g,
206	0	model.layers[il].attn_norm_2, NULL,
207	0	LLM_NORM_RMS, -1);
208	0	cb(g_norm, "g_norm", il);
209
210		// norm_before_residual: determines what goes into the residual connection (compatible with Readhat eagle3 speculator model)
211		// - false (default): use raw inp_g for residual
212		// - true: use normalized g_norm for residual
213		// inpL is the concatenated input (normalized inp_embd + normalized inp_g)
214	0	ggml_tensor * inpSA = hparams.norm_before_residual ? g_norm : inpL;
215
216		// Concatenate normalized inp_embd and normalized inp_g
217	0	cur = ggml_concat(ctx0, embd_norm, g_norm, il);
218	0	cb(cur, "concat_embd", il);
219
220		// Self-attention with concatenated input
221	0	ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
222	0	cb(Qcur, "Qcur", il);
223
224	0	ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
225	0	cb(Kcur, "Kcur", il);
226
227	0	ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
228	0	cb(Vcur, "Vcur", il);
229
230	0	Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
231	0	Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
232	0	Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
233
234		// rope freq factors, returns nullptr if not available
235	0	ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
236
237		// RoPE
238	0	Qcur = ggml_rope_ext(
239	0	ctx0, Qcur, inp_pos, rope_factors,
240	0	n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
241	0	ext_factor, attn_factor, beta_fast, beta_slow
242	0	);
243	0	Kcur = ggml_rope_ext(
244	0	ctx0, Kcur, inp_pos, rope_factors,
245	0	n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
246	0	ext_factor, attn_factor, beta_fast, beta_slow
247	0	);
248
249	0	cb(Qcur, "Qcur_rope", il);
250	0	cb(Kcur, "Kcur_rope", il);
251
252	0	cur = build_attn(inp_attn,
253	0	model.layers[il].wo, NULL, nullptr,
254	0	Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
255
256		// Add residual and update it
257	0	ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
258	0	cb(ffn_inp, "ffn_inp", il);
259
260		// Apply FFN norm to the sum
261	0	cur = build_norm(ffn_inp,
262	0	model.layers[il].ffn_norm, NULL,
263	0	LLM_NORM_RMS, il);
264	0	cb(cur, "post_attn_norm", il);
265
266	0	cur = build_ffn(cur,
267	0	model.layers[il].ffn_up, NULL, NULL,
268	0	model.layers[il].ffn_gate, NULL, NULL,
269	0	model.layers[il].ffn_down, NULL, NULL,
270	0	NULL,
271	0	LLM_FFN_SILU, LLM_FFN_PAR, il);
272	0	cb(cur, "ffn_out", il);
273
274		// Output norm with residual
275	0	cur = ggml_add(ctx0, cur, ffn_inp);
276	0	cb(cur, "eagle3_prenorm", il);
277
278	0	inpL = cur;
279	0	}
280
281	0	cur = inpL;
282
283		// Output prenorm state (for next token's g_embeddings in autoregressive generation)
284	0	ggml_set_output(cur);
285	0	res->t_h_nextn = cur;
286
287	0	cur = build_norm(cur,
288	0	model.output_norm, NULL,
289	0	LLM_NORM_RMS, -1);
290	0	cb(cur, "result_norm", -1);
291
292		// lm_head - projects to draft vocabulary
293		// if the draft has no own output projection, inherit the target model's lm_head
294	0	auto * output = model.output;
295	0	if (output == nullptr) {
296	0	GGML_ASSERT(cparams.ctx_other != nullptr);
297	0	const auto * model_other = llama_get_model(cparams.ctx_other);
298
299	0	GGML_ASSERT(model_other->output != nullptr && "EAGLE3 decoder requires an output projection (own or from target model)");
300	0	output = model_other->output;
301	0	}
302	0	cur = build_lora_mm(output, cur);
303
304	0	if (model.d2t) {
305	0	const int64_t n_draft_vocab = cur->ne[0];
306	0	const int64_t n_outputs = cur->ne[1];
307	0	const int64_t n_vocab = (int64_t) model.vocab.n_tokens();
308
309	0	GGML_ASSERT(model.d2t->type == GGML_TYPE_I64);
310	0	GGML_ASSERT(model.d2t->ne[0] == n_draft_vocab);
311
312	0	ggml_tensor * logits = ggml_fill(ctx0, ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, n_vocab, n_outputs), -INFINITY);
313	0	cur = ggml_set_rows(ctx0, logits,
314	0	ggml_reshape_3d(ctx0, cur, 1, n_draft_vocab, n_outputs),
315	0	ggml_reshape_3d(ctx0, model.d2t, n_draft_vocab, 1, 1));
316	0	cur = ggml_reshape_2d(ctx0, cur, n_vocab, n_outputs);
317	0	}
318
319	0	cb(cur, "result_output", -1);
320	0	res->t_logits = cur;
321
322	0	ggml_build_forward_expand(gf, cur);
323	0	}