/src/llama.cpp/src/models/granite.cpp
Line | Count | Source |
1 | | #include "models.h" |
2 | | |
3 | | #include <sstream> |
4 | | |
5 | 0 | void llama_model_granite::load_arch_hparams(llama_model_loader & ml) { |
6 | 0 | ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); |
7 | 0 | ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale); |
8 | 0 | ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale, false); |
9 | 0 | ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, false); |
10 | 0 | ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale, false); |
11 | | |
12 | | // Granite4 Vision uses array deepstack_mapping |
13 | 0 | ml.get_arr(LLM_KV_DEEPSTACK_MAPPING, hparams.deepstack_mapping_arr, false); |
14 | | |
15 | | // Count the unique deepstack input indices |
16 | 0 | std::unordered_set<uint32_t> unique_deepstack_idxs; |
17 | 0 | for (const auto val : hparams.deepstack_mapping_arr) { |
18 | 0 | if (val >= 0) { |
19 | 0 | unique_deepstack_idxs.insert(val); |
20 | 0 | } |
21 | 0 | } |
22 | 0 | hparams.n_deepstack_layers = unique_deepstack_idxs.size(); |
23 | | |
24 | | // Ensure all values are valid (avoid overflow attacks) |
25 | 0 | for (const auto val : unique_deepstack_idxs) { |
26 | 0 | if (val > hparams.n_deepstack_layers) { |
27 | 0 | std::stringstream ss; |
28 | 0 | ss << "Invalid deepstack index: " << val << " > " << hparams.n_deepstack_layers; |
29 | 0 | throw std::runtime_error(ss.str()); |
30 | 0 | } |
31 | 0 | } |
32 | | |
33 | | // Granite uses rope_finetuned as a switch for rope, so default to true |
34 | 0 | bool rope_finetuned = true; |
35 | 0 | ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false); |
36 | 0 | hparams.rope_finetuned = rope_finetuned; |
37 | |
|
38 | 0 | switch (hparams.n_layer()) { |
39 | 0 | case 32: type = LLM_TYPE_3B; break; |
40 | 0 | case 40: type = LLM_TYPE_3B; break; |
41 | | // Add additional layer/vocab/etc checks here for other model sizes |
42 | 0 | default: type = LLM_TYPE_UNKNOWN; |
43 | 0 | } |
44 | | |
45 | | // For Granite MoE Shared |
46 | 0 | ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false); |
47 | 0 | } |
48 | | |
49 | 0 | void llama_model_granite::load_arch_tensors(llama_model_loader &) { |
50 | 0 | LLAMA_LOAD_LOCALS; |
51 | |
|
52 | 0 | tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); |
53 | | |
54 | | // output |
55 | 0 | output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); |
56 | 0 | output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); |
57 | | |
58 | | // if output is NULL, init from the input tok embed |
59 | 0 | if (output == NULL) { |
60 | 0 | output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); |
61 | 0 | } |
62 | |
|
63 | 0 | for (int i = 0; i < n_layer; ++i) { |
64 | 0 | auto & layer = layers[i]; |
65 | |
|
66 | 0 | layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); |
67 | |
|
68 | 0 | create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, 0); |
69 | 0 | layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); |
70 | | |
71 | | // optional bias tensors |
72 | 0 | layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED); |
73 | |
|
74 | 0 | layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); |
75 | |
|
76 | 0 | if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) { |
77 | 0 | layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0)); |
78 | 0 | layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0)); |
79 | 0 | } |
80 | 0 | else { |
81 | 0 | layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0)); |
82 | 0 | } |
83 | |
|
84 | 0 | if (n_expert == 0) { |
85 | 0 | layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); |
86 | 0 | layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); |
87 | 0 | layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); |
88 | | |
89 | | // optional MLP bias |
90 | 0 | layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED); |
91 | 0 | layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED); |
92 | 0 | layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED); |
93 | 0 | } else { |
94 | 0 | layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0); |
95 | 0 | layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED); |
96 | 0 | layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0); |
97 | 0 | layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0); |
98 | | |
99 | | // For Granite MoE Shared |
100 | 0 | if (hparams.n_ff_shexp > 0) { |
101 | 0 | layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0); |
102 | 0 | layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0); |
103 | 0 | layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0); |
104 | 0 | } |
105 | 0 | } |
106 | 0 | } |
107 | 0 | } |
108 | | |
109 | 0 | std::unique_ptr<llm_graph_context> llama_model_granite::build_arch_graph(const llm_graph_params & params) const { |
110 | 0 | return std::make_unique<graph>(*this, params); |
111 | 0 | } |
112 | | |
113 | | llama_model_granite::graph::graph( |
114 | | const llama_model & model, |
115 | | const llm_graph_params & params) |
116 | 0 | : llm_graph_context(params) { |
117 | |
|
118 | 0 | const int64_t n_embd_head = hparams.n_embd_head_v(); |
119 | |
|
120 | 0 | GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); |
121 | 0 | GGML_ASSERT(n_embd_head == n_rot); |
122 | |
|
123 | 0 | ggml_tensor * cur; |
124 | 0 | ggml_tensor * inpL; |
125 | |
|
126 | 0 | inpL = build_inp_embd(model.tok_embd); |
127 | | |
128 | | // inp_pos - built only if rope enabled |
129 | 0 | ggml_tensor * inp_pos = nullptr; |
130 | 0 | if (hparams.rope_finetuned) { |
131 | 0 | inp_pos = build_inp_pos(); |
132 | 0 | } |
133 | 0 | auto * inp_attn = build_attn_inp_kv(); |
134 | |
|
135 | 0 | ggml_tensor * inp_out_ids = build_inp_out_ids(); |
136 | |
|
137 | 0 | for (int il = 0; il < n_layer; ++il) { |
138 | | |
139 | | // Granite Vision 4.1 deepstack: inject the projector stream that |
140 | | // targets decoder layer `il` before the decoder runs. |
141 | | // NOTE: skip the first deepstack layer since that's inpL |
142 | 0 | const auto & deepstack_emb_idx = hparams.deepstack_mapping_arr[il]; |
143 | 0 | if (il > 0 && deepstack_emb_idx >= 0) { |
144 | 0 | ggml_tensor * ds = ggml_view_2d(ctx0, |
145 | 0 | res->t_inp_embd, n_embd, n_tokens, |
146 | 0 | res->t_inp_embd->nb[1], |
147 | 0 | deepstack_emb_idx * n_embd * sizeof(float)); |
148 | 0 | inpL = ggml_add(ctx0, inpL, ds); |
149 | 0 | cb(inpL, "deepstack_in", il); |
150 | 0 | } |
151 | |
|
152 | 0 | ggml_tensor * inpSA = inpL; |
153 | | |
154 | | // norm |
155 | 0 | cur = build_norm(inpL, |
156 | 0 | model.layers[il].attn_norm, NULL, |
157 | 0 | LLM_NORM_RMS, il); |
158 | 0 | cb(cur, "attn_norm", il); |
159 | | |
160 | | // self-attention |
161 | 0 | cur = build_attention_layer( |
162 | 0 | cur, inp_pos, inp_attn, |
163 | 0 | model, n_embd_head, il); |
164 | |
|
165 | 0 | if (il == n_layer - 1 && inp_out_ids) { |
166 | 0 | cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
167 | 0 | inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); |
168 | 0 | } |
169 | | // ffn |
170 | 0 | cur = build_layer_ffn(cur, inpSA, model, il); |
171 | | |
172 | | // input for next layer |
173 | 0 | inpL = cur; |
174 | 0 | } |
175 | 0 | cur = inpL; |
176 | |
|
177 | 0 | cur = build_norm(cur, |
178 | 0 | model.output_norm, NULL, |
179 | 0 | LLM_NORM_RMS, -1); |
180 | |
|
181 | 0 | cb(cur, "result_norm", -1); |
182 | 0 | res->t_embd = cur; |
183 | | |
184 | | // lm_head |
185 | 0 | cur = build_lora_mm(model.output, cur, model.output_s); |
186 | | |
187 | | // For Granite architectures - scale logits |
188 | 0 | cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale); |
189 | 0 | cb(cur, "result_output", -1); |
190 | 0 | res->t_logits = cur; |
191 | |
|
192 | 0 | ggml_build_forward_expand(gf, cur); |
193 | 0 | } |
194 | | |
195 | | ggml_tensor * llama_model_granite::graph::build_attention_layer( |
196 | | ggml_tensor * cur, |
197 | | ggml_tensor * inp_pos, |
198 | | llm_graph_input_attn_kv * inp_attn, |
199 | | const llama_model & model, |
200 | | const int64_t n_embd_head, |
201 | 0 | const int il) { |
202 | |
|
203 | 0 | auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur, |
204 | 0 | n_embd_head, hparams.n_head(il), hparams.n_head_kv(il), il); |
205 | |
|
206 | 0 | const bool use_rope = hparams.rope_finetuned; |
207 | 0 | if (use_rope) { |
208 | 0 | ggml_tensor * rope_factors = model.get_rope_factors(cparams, il); |
209 | 0 | Qcur = ggml_rope_ext( |
210 | 0 | ctx0, Qcur, inp_pos, rope_factors, |
211 | 0 | n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
212 | 0 | ext_factor, attn_factor, beta_fast, beta_slow |
213 | 0 | ); |
214 | |
|
215 | 0 | Kcur = ggml_rope_ext( |
216 | 0 | ctx0, Kcur, inp_pos, rope_factors, |
217 | 0 | n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
218 | 0 | ext_factor, attn_factor, beta_fast, beta_slow |
219 | 0 | ); |
220 | 0 | } |
221 | |
|
222 | 0 | cb(Qcur, "Qcur", il); |
223 | 0 | cb(Kcur, "Kcur", il); |
224 | 0 | cb(Vcur, "Vcur", il); |
225 | |
|
226 | 0 | const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; |
227 | 0 | cur = build_attn(inp_attn, |
228 | 0 | model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s, |
229 | 0 | Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); |
230 | 0 | cb(cur, "attn_out", il); |
231 | 0 | return cur; |
232 | 0 | } |
233 | | |
234 | | ggml_tensor * llama_model_granite::graph::build_layer_ffn( |
235 | | ggml_tensor * cur, |
236 | | ggml_tensor * inpSA, |
237 | | const llama_model & model, |
238 | 0 | const int il) { |
239 | | |
240 | | // For Granite architectures - scale residual |
241 | 0 | if (hparams.f_residual_scale) { |
242 | 0 | cur = ggml_scale(ctx0, cur, hparams.f_residual_scale); |
243 | 0 | } |
244 | 0 | ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); |
245 | 0 | cb(ffn_inp, "ffn_inp", il); |
246 | | |
247 | | // feed-forward network (non-MoE) |
248 | 0 | if (model.layers[il].ffn_gate_inp == nullptr) { |
249 | |
|
250 | 0 | cur = build_norm(ffn_inp, |
251 | 0 | model.layers[il].ffn_norm, NULL, |
252 | 0 | LLM_NORM_RMS, il); |
253 | 0 | cb(cur, "ffn_norm", il); |
254 | |
|
255 | 0 | cur = build_ffn(cur, |
256 | 0 | model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, |
257 | 0 | model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, |
258 | 0 | model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, |
259 | 0 | NULL, |
260 | 0 | LLM_FFN_SILU, LLM_FFN_PAR, il); |
261 | 0 | cb(cur, "ffn_out", il); |
262 | |
|
263 | 0 | } else { |
264 | | // MoE branch |
265 | 0 | cur = build_norm(ffn_inp, |
266 | 0 | model.layers[il].ffn_norm, NULL, |
267 | 0 | LLM_NORM_RMS, il); |
268 | 0 | cb(cur, "ffn_norm", il); |
269 | |
|
270 | 0 | ggml_tensor * moe_out = build_moe_ffn(cur, |
271 | 0 | model.layers[il].ffn_gate_inp, |
272 | 0 | model.layers[il].ffn_up_exps, |
273 | 0 | model.layers[il].ffn_gate_exps, |
274 | 0 | model.layers[il].ffn_down_exps, |
275 | 0 | nullptr, |
276 | 0 | n_expert, n_expert_used, |
277 | 0 | LLM_FFN_SILU, true, |
278 | 0 | hparams.expert_weights_scale, |
279 | 0 | LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, |
280 | 0 | il); |
281 | 0 | cb(moe_out, "ffn_moe_out", il); |
282 | | |
283 | | // For Granite MoE Shared |
284 | 0 | if (hparams.n_ff_shexp > 0) { |
285 | 0 | ggml_tensor * ffn_shexp = build_ffn(cur, |
286 | 0 | model.layers[il].ffn_up_shexp, NULL, NULL, |
287 | 0 | model.layers[il].ffn_gate_shexp, NULL, NULL, |
288 | 0 | model.layers[il].ffn_down_shexp, NULL, NULL, |
289 | 0 | NULL, |
290 | 0 | LLM_FFN_SILU, LLM_FFN_PAR, il); |
291 | 0 | cb(ffn_shexp, "ffn_shexp", il); |
292 | |
|
293 | 0 | cur = ggml_add(ctx0, moe_out, ffn_shexp); |
294 | 0 | cb(cur, "ffn_out", il); |
295 | 0 | } else { |
296 | 0 | cur = moe_out; |
297 | 0 | } |
298 | 0 | } |
299 | | |
300 | | // For Granite architectures - scale residual |
301 | 0 | if (hparams.f_residual_scale) { |
302 | 0 | cur = ggml_scale(ctx0, cur, hparams.f_residual_scale); |
303 | 0 | } |
304 | 0 | cur = ggml_add(ctx0, cur, ffn_inp); |
305 | 0 | cb(cur, "ffn_out", il); |
306 | |
|
307 | 0 | cur = build_cvec(cur, il); |
308 | 0 | cb(cur, "l_out", il); |
309 | |
|
310 | 0 | return cur; |
311 | 0 | } |