/src/llama.cpp/src/models/mamba2.cpp
Line | Count | Source |
1 | | #include "models.h" |
2 | | |
3 | 0 | void llama_model_mamba2::load_arch_hparams(llama_model_loader & ml) { |
4 | 0 | ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv); |
5 | 0 | ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner); |
6 | 0 | ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state); |
7 | 0 | ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank); |
8 | 0 | ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group); |
9 | |
|
10 | 0 | ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); |
11 | |
|
12 | 0 | switch (hparams.n_layer()) { |
13 | 0 | case 24: |
14 | 0 | switch (hparams.n_embd) { |
15 | 0 | case 768: type = LLM_TYPE_SMALL; break; |
16 | 0 | default: type = LLM_TYPE_UNKNOWN; |
17 | 0 | } break; |
18 | 0 | case 48: |
19 | 0 | switch (hparams.n_embd) { |
20 | 0 | case 1024: type = LLM_TYPE_MEDIUM; break; |
21 | 0 | case 1536: type = LLM_TYPE_LARGE; break; |
22 | 0 | case 2048: type = LLM_TYPE_XL; break; |
23 | 0 | default: type = LLM_TYPE_UNKNOWN; |
24 | 0 | } break; |
25 | 0 | case 64: |
26 | 0 | switch (hparams.n_embd) { |
27 | 0 | case 2560: type = LLM_TYPE_3B; break; |
28 | 0 | case 4096: type = LLM_TYPE_7B; break; |
29 | 0 | default: type = LLM_TYPE_UNKNOWN; |
30 | 0 | } break; |
31 | 0 | default: type = LLM_TYPE_UNKNOWN; |
32 | 0 | } |
33 | 0 | } |
34 | | |
35 | 0 | void llama_model_mamba2::load_arch_tensors(llama_model_loader &) { |
36 | 0 | LLAMA_LOAD_LOCALS; |
37 | |
|
38 | 0 | const int64_t d_conv = hparams.ssm_d_conv; |
39 | 0 | const int64_t d_inner = hparams.ssm_d_inner; |
40 | 0 | const int64_t d_state = hparams.ssm_d_state; |
41 | 0 | const int64_t n_group = hparams.ssm_n_group; |
42 | 0 | const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_head; |
43 | | |
44 | | // only an expansion factor of 2 is supported for now |
45 | 0 | GGML_ASSERT(2 * n_embd == d_inner); |
46 | |
|
47 | 0 | tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); |
48 | | |
49 | | // output |
50 | 0 | { |
51 | 0 | output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); |
52 | |
|
53 | 0 | output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); |
54 | | // if output is NULL, init from the input tok embed, duplicated to allow offloading |
55 | 0 | if (output == NULL) { |
56 | 0 | output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); |
57 | 0 | } |
58 | 0 | } |
59 | |
|
60 | 0 | for (int i = 0; i < n_layer; ++i) { |
61 | 0 | auto & layer = layers[i]; |
62 | | |
63 | | // norm |
64 | 0 | layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); |
65 | |
|
66 | 0 | layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0); |
67 | |
|
68 | 0 | layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0); |
69 | 0 | layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, 0); |
70 | |
|
71 | 0 | layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_head}, 0); |
72 | | |
73 | | // no "weight" suffix for these |
74 | 0 | layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_head}, 0); |
75 | 0 | layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_head}, 0); |
76 | |
|
77 | 0 | layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0); |
78 | | |
79 | | // out_proj |
80 | 0 | layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0); |
81 | 0 | } |
82 | 0 | } |
83 | | |
84 | 0 | std::unique_ptr<llm_graph_context> llama_model_mamba2::build_arch_graph(const llm_graph_params & params) const { |
85 | 0 | return std::make_unique<graph>(*this, params); |
86 | 0 | } |
87 | | |