/src/llama.cpp/src/models/glm-dsa.cpp
Line | Count | Source |
1 | | #include "models.h" |
2 | | |
3 | 0 | void llama_model_glm_dsa::load_arch_hparams(llama_model_loader & ml) { |
4 | 0 | ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); |
5 | 0 | ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); |
6 | 0 | ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false); |
7 | | |
8 | | // MoE parameters |
9 | 0 | ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert); |
10 | 0 | ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used); |
11 | 0 | ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared); |
12 | 0 | ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false); |
13 | 0 | ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false); |
14 | 0 | ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false); |
15 | | |
16 | | // deepseek MLA parameters |
17 | 0 | ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q); |
18 | 0 | ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv); |
19 | 0 | ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla_impl, false); |
20 | 0 | ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla_impl, false); |
21 | 0 | ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); |
22 | 0 | ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared); |
23 | | |
24 | | // DSA parameters |
25 | 0 | ml.get_key(LLM_KV_ATTENTION_INDEXER_HEAD_COUNT, hparams.indexer_n_head); |
26 | 0 | ml.get_key(LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, hparams.indexer_head_size); |
27 | 0 | ml.get_key(LLM_KV_ATTENTION_INDEXER_TOP_K, hparams.indexer_top_k); |
28 | | |
29 | | // Expert gating function (GLM-4.5 uses sigmoid) |
30 | 0 | ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false); |
31 | 0 | if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) { |
32 | 0 | hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID; |
33 | 0 | } |
34 | | |
35 | | // NextN/MTP parameters |
36 | 0 | ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false); |
37 | 0 | GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_impl"); |
38 | |
|
39 | 0 | switch (hparams.n_layer()) { |
40 | 0 | case 79: type = LLM_TYPE_744B_A40B; break; |
41 | 0 | default: type = LLM_TYPE_UNKNOWN; |
42 | 0 | } |
43 | 0 | } |
44 | | |
45 | 0 | void llama_model_glm_dsa::load_arch_tensors(llama_model_loader &) { |
46 | 0 | LLAMA_LOAD_LOCALS; |
47 | 0 | const int64_t n_expert_shared = hparams.n_expert_shared; |
48 | |
|
49 | 0 | const bool is_mla = hparams.is_mla(); |
50 | 0 | if (!is_mla) { |
51 | 0 | throw std::runtime_error("GLM_DSA architecture requires MLA"); |
52 | 0 | } |
53 | | |
54 | | // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA |
55 | 0 | const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla(); |
56 | 0 | const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla(); |
57 | |
|
58 | 0 | const int64_t n_embd_head_qk_rope = hparams.n_rot(); |
59 | 0 | const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope; |
60 | |
|
61 | 0 | const int64_t q_lora_rank = hparams.n_lora_q; |
62 | 0 | const int64_t kv_lora_rank = hparams.n_lora_kv; |
63 | |
|
64 | 0 | const int64_t n_ff_exp = hparams.n_ff_exp; |
65 | |
|
66 | 0 | tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); |
67 | | |
68 | | // output |
69 | 0 | output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); |
70 | | // try to load output.weight, if not found, use token_embd (tied embeddings) |
71 | 0 | output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); |
72 | 0 | if (!output) { |
73 | 0 | output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); |
74 | 0 | } |
75 | |
|
76 | 0 | for (int i = 0; i < n_layer_all; ++i) { |
77 | 0 | int flags = 0; |
78 | 0 | if (i >= n_layer) { |
79 | | // skip all tensors in the NextN layers |
80 | | // TODO @ngxson : TENSOR_NOT_REQUIRED was a hack, need to remove it later |
81 | 0 | flags |= TENSOR_SKIP | TENSOR_NOT_REQUIRED; |
82 | 0 | } |
83 | |
|
84 | 0 | auto & layer = layers[i]; |
85 | |
|
86 | 0 | layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags); |
87 | 0 | layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, flags); |
88 | 0 | layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, flags); |
89 | |
|
90 | 0 | layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, flags); |
91 | 0 | layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, flags); |
92 | |
|
93 | 0 | layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope}, flags); |
94 | | |
95 | | // note: only old legacy GGUF files will have the unsplit wkv_b tensor in |
96 | 0 | layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, kv_lora_rank, n_head}, flags); |
97 | 0 | layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, flags); |
98 | |
|
99 | 0 | layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, flags); |
100 | |
|
101 | 0 | layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags); |
102 | | |
103 | | // DSA indexer |
104 | 0 | layer.indexer_k_norm = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM, "weight", i), {hparams.indexer_head_size}, flags); |
105 | 0 | layer.indexer_k_norm_b = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM, "bias", i), {hparams.indexer_head_size}, flags); |
106 | 0 | layer.indexer_proj = create_tensor(tn(LLM_TENSOR_INDEXER_PROJ, "weight", i), {n_embd, hparams.indexer_n_head}, flags); |
107 | 0 | layer.indexer_attn_k = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_K, "weight", i), {n_embd, hparams.indexer_head_size}, flags); |
108 | 0 | layer.indexer_attn_q_b = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_Q_B, "weight", i), {q_lora_rank, hparams.indexer_n_head * hparams.indexer_head_size}, flags); |
109 | 0 | if (i < (int) hparams.n_layer_dense_lead) { |
110 | 0 | layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, flags); |
111 | 0 | layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, flags); |
112 | 0 | layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, flags); |
113 | 0 | } else { |
114 | 0 | layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, flags); |
115 | 0 | layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED); |
116 | |
|
117 | 0 | if (n_expert == 0) { |
118 | 0 | throw std::runtime_error("n_expert must be > 0"); |
119 | 0 | } |
120 | 0 | if (n_expert_used == 0) { |
121 | 0 | throw std::runtime_error("n_expert_used must be > 0"); |
122 | 0 | } |
123 | | |
124 | | // MoE branch |
125 | 0 | layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, flags); |
126 | 0 | layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, flags); |
127 | 0 | layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, flags); |
128 | | |
129 | | // Shared expert branch |
130 | 0 | layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, flags); |
131 | 0 | layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, flags); |
132 | 0 | layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, flags); |
133 | 0 | } |
134 | | |
135 | | // NextN/MTP tensors (preserved but unused) - conditionally load for last n_layer_nextn |
136 | 0 | if (i >= n_layer) { |
137 | 0 | layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags); |
138 | 0 | layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags); |
139 | 0 | layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags); |
140 | | |
141 | | // Optional tensors |
142 | 0 | layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED); |
143 | 0 | layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED); |
144 | 0 | layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags | TENSOR_NOT_REQUIRED); |
145 | 0 | } |
146 | 0 | } |
147 | 0 | } |
148 | | |
149 | 0 | std::unique_ptr<llm_graph_context> llama_model_glm_dsa::build_arch_graph(const llm_graph_params & params) const { |
150 | 0 | return std::make_unique<graph>(*this, params); |
151 | 0 | } |
152 | | |