/src/llama.cpp/ggml/src/ggml-cpu/repack.h
Line | Count | Source |
1 | | #pragma once |
2 | | |
3 | | #define GGML_COMMON_DECL_CPP |
4 | | #include "ggml-common.h" |
5 | | |
6 | | #include "traits.h" |
7 | | #include "ggml.h" |
8 | | |
9 | | // GGML internal header |
10 | | |
11 | | ggml_backend_buffer_type_t ggml_backend_cpu_repack_buffer_type(void); |
12 | | |
13 | 0 | template <int K> constexpr int QK_0() { |
14 | 0 | if constexpr (K == 4) { |
15 | 0 | return QK4_0; |
16 | 0 | } |
17 | 0 | if constexpr (K == 8) { |
18 | 0 | return QK8_0; |
19 | 0 | } |
20 | 0 | return -1; |
21 | 0 | } Unexecuted instantiation: int QK_0<4>() Unexecuted instantiation: int QK_0<8>() |
22 | | |
23 | | template <int K, int N> struct block { |
24 | | ggml_half d[N]; // deltas for N qK_0 blocks |
25 | | int8_t qs[(QK_0<K>() * N * K) / 8]; // quants for N qK_0 blocks |
26 | | }; |
27 | | |
28 | | // control size |
29 | | static_assert(sizeof(block<4, 4>) == 4 * sizeof(ggml_half) + QK8_0 * 2, "wrong block<4,4> size/padding"); |
30 | | static_assert(sizeof(block<4, 8>) == 8 * sizeof(ggml_half) + QK8_0 * 4, "wrong block<4,8> size/padding"); |
31 | | static_assert(sizeof(block<8, 4>) == 4 * sizeof(ggml_half) + QK8_0 * 4, "wrong block<8,4> size/padding"); |
32 | | static_assert(sizeof(block<8, 8>) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong block<8,8> size/padding"); |
33 | | |
34 | | using block_q4_0x4 = block<4, 4>; |
35 | | using block_q4_0x8 = block<4, 8>; |
36 | | using block_q8_0x4 = block<8, 4>; |
37 | | using block_q8_0x8 = block<8, 8>; |
38 | | |
39 | | struct block_q4_Kx8 { |
40 | | ggml_half d[8]; // super-block scale for quantized scales |
41 | | ggml_half dmin[8]; // super-block scale for quantized mins |
42 | | uint8_t scales[96]; // scales and mins, quantized with 6 bits |
43 | | uint8_t qs[1024]; // 4--bit quants |
44 | | }; |
45 | | |
46 | | static_assert(sizeof(block_q4_Kx8) == sizeof(ggml_half) * 16 + K_SCALE_SIZE * 8 + QK_K * 4, "wrong q4_K block size/padding"); |
47 | | struct block_q2_Kx8 { |
48 | | ggml_half d[8]; // super-block scale for quantized scales |
49 | | ggml_half dmin[8]; // super-block scale for quantized mins |
50 | | uint8_t scales[128]; // scales and mins, quantized with 4 bits |
51 | | uint8_t qs[512]; // 2--bit quants |
52 | | }; |
53 | | |
54 | | static_assert(sizeof(block_q2_Kx8) == sizeof(ggml_half) * 16 + QK_K/2 + QK_K * 2, "wrong q2_K block size/padding"); |
55 | | struct block_q8_Kx4 { |
56 | | float d[4]; // delta |
57 | | int8_t qs[QK_K * 4]; // quants |
58 | | int16_t bsums[QK_K / 4]; // sum of quants in groups of 16 |
59 | | }; |
60 | | |
61 | | static_assert(sizeof(block_q8_Kx4) == sizeof(float) * 4 + QK_K * 4 + (QK_K / 4) * sizeof(int16_t), "wrong q8_K block size/padding"); |
62 | | |
63 | | struct block_iq4_nlx4 { |
64 | | ggml_half d[4]; // deltas for 4 iq4_nl blocks |
65 | | uint8_t qs[QK4_NL * 2]; // nibbles / quants for 4 iq4_nl blocks |
66 | | }; |
67 | | |
68 | | static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding"); |
69 | | |
70 | | struct block_iq4_nlx8 { |
71 | | ggml_half d[8]; // deltas for 8 iq4_nl blocks |
72 | | uint8_t qs[QK4_NL * 4]; // nibbles / quants for 8 iq4_nl blocks |
73 | | }; |
74 | | |
75 | | static_assert(sizeof(block_iq4_nlx8) == 8 * sizeof(ggml_half) + QK4_NL * 4, "wrong iq4_nlx8 block size/padding"); |
76 | | |
77 | | #if defined(__cplusplus) |
78 | | extern "C" { |
79 | | #endif |
80 | | |
81 | | void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); |
82 | | void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); |
83 | | void ggml_quantize_mat_q8_K_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); |
84 | | void ggml_quantize_mat_q8_K_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); |
85 | | void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
86 | | void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
87 | | void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
88 | | void ggml_gemv_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
89 | | void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
90 | | void ggml_gemv_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
91 | | void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
92 | | void ggml_gemv_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
93 | | void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
94 | | void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
95 | | void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
96 | | void ggml_gemm_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
97 | | void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
98 | | void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
99 | | void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
100 | | void ggml_gemm_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
101 | | |
102 | | // Native implementations |
103 | | void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); |
104 | | void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); |
105 | | void ggml_quantize_mat_q8_K_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); |
106 | | void ggml_quantize_mat_q8_K_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); |
107 | | void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
108 | | void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
109 | | void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
110 | | void ggml_gemv_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
111 | | void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
112 | | void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
113 | | void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
114 | | void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
115 | | void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
116 | | void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
117 | | void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
118 | | void ggml_gemm_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
119 | | void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
120 | | void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
121 | | void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
122 | | void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
123 | | |
124 | | #if defined(__cplusplus) |
125 | | } // extern "C" |
126 | | #endif |