/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp
Line | Count | Source |
1 | | #define GGML_COMMON_IMPL_CPP |
2 | | #define GGML_COMMON_DECL_CPP |
3 | | #include "ggml-common.h" |
4 | | #include "ggml-backend-impl.h" |
5 | | |
6 | | #include "ggml-impl.h" |
7 | | #include "ggml-cpu.h" |
8 | | #include "ggml-cpu-impl.h" |
9 | | #include "simd-mappings.h" |
10 | | #include "traits.h" |
11 | | |
12 | | #include "arch-fallback.h" |
13 | | |
14 | | #include <cmath> |
15 | | #include <cstring> |
16 | | #include <cassert> |
17 | | #include <cstdio> // for GGML_ASSERT |
18 | | |
19 | | #include "repack.h" |
20 | | |
21 | | #if defined(__GNUC__) |
22 | | #pragma GCC diagnostic ignored "-Woverlength-strings" |
23 | | #endif |
24 | | |
25 | 0 | #define UNUSED GGML_UNUSED |
26 | | |
27 | 0 | static inline int nearest_int(float fval) { |
28 | 0 | assert(fabsf(fval) <= 4194303.f); |
29 | 0 | float val = fval + 12582912.f; |
30 | 0 | int i; memcpy(&i, &val, sizeof(int)); |
31 | 0 | return (i & 0x007fffff) - 0x00400000; |
32 | 0 | } |
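
The magic constant 12582912.f is 1.5 * 2^23: adding it to any |fval| <= 2^22 lands in a binade where the float's ulp is 1, so the hardware's round-to-nearest happens during the addition and the low mantissa bits then hold the rounded integer plus a fixed 0x00400000 (2^22) offset. A minimal sketch of the same trick in isolation, assuming the default rounding mode (the helper name is illustrative, not part of this file):

    #include <cassert>
    #include <cstring>

    static int round_nearest_via_bias(float fval) {
        assert(fval >= -4194303.f && fval <= 4194303.f);   // |fval| < 2^22
        float biased = fval + 12582912.f;                   // 1.5f * 2^23
        int bits;
        memcpy(&bits, &biased, sizeof(bits));               // reinterpret without UB
        return (bits & 0x007fffff) - 0x00400000;            // mantissa minus the 2^22 offset
    }

    // round_nearest_via_bias(2.5f) == 2, round_nearest_via_bias(3.5f) == 4 (ties to even),
    // round_nearest_via_bias(-2.3f) == -2
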
33 | | |
34 | | // Functions to create the interleaved data layout formats |
35 | | |
36 | | // interleave 4 block_q4_0s in blocks of blck_size_interleave |
37 | | // returns an interleaved block_q4_0x4 |
38 | | // in the interleaved block_q4_0x4, place deltas for 4 block_q4_0 blocks |
39 | | // first, then interleave quants from 4 block_q4_0s in blocks of blck_size_interleave |
40 | | // |
41 | | // - in : an array of block_q4_0 pointers |
42 | | // - blck_size_interleave : the block_q4_0 quants bytes are interleaved in blocks of |
43 | | // blck_size_interleave bytes |
44 | | // - xor_mask : the mask to convert the nibbles in block_q4_0 quants bytes |
45 | | // from bias offset form to pure sign form (this saves subtract |
46 | | //               operations during unpacking) |
47 | | // |
48 | | |
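
A rough sketch of the mapping this comment describes, assuming the block_q4_0x4 layout from repack.h (four fp16 deltas followed by the 64 interleaved quant bytes) and a per-byte xor_mask of 0x88 (0x8 per nibble) for the offset-to-sign conversion; the make_block_* helpers later in this listing (see make_block_q8_0x4 near the end) use the same src_id / src_offset pattern:

    // illustrative only: pack 4 block_q4_0 into one block_q4_0x4,
    // copying B = blck_size_interleave bytes from each source block in turn
    static block_q4_0x4 interleave_q4_0x4_sketch(const block_q4_0 * in, int B, uint8_t xor_mask) {
        block_q4_0x4 out;
        for (int i = 0; i < 4; i++) {
            out.d[i] = in[i].d;                         // the 4 deltas come first
        }
        const int nchunks = QK4_0 * 2 / B;              // 64 quant bytes in total
        for (int i = 0; i < nchunks; i++) {
            const int src_id     = i % 4;               // round-robin over the 4 source blocks
            const int src_offset = (i / 4) * B;         // next B-byte chunk of that block
            for (int b = 0; b < B; b++) {
                out.qs[i * B + b] = in[src_id].qs[src_offset + b] ^ xor_mask;
            }
        }
        return out;
    }
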
49 | | extern "C" { |
50 | | |
51 | 0 | void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { |
52 | 0 | assert(QK8_0 == 32); |
53 | 0 | assert(k % QK8_0 == 0); |
54 | 0 | const int nb = k / QK8_0; |
55 | |
|
56 | 0 | block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy; |
57 | | |
58 | | // scalar |
59 | 0 | const int blck_size_interleave = 4; |
60 | 0 | float srcv[4][QK8_0]; |
61 | 0 | float id[4]; |
62 | |
|
63 | 0 | for (int i = 0; i < nb; i++) { |
64 | 0 | for (int row_iter = 0; row_iter < 4; row_iter++) { |
65 | 0 | float amax = 0.0f; // absolute max |
66 | |
|
67 | 0 | for (int j = 0; j < QK8_0; j++) { |
68 | 0 | srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j]; |
69 | 0 | amax = MAX(amax, fabsf(srcv[row_iter][j])); |
70 | 0 | } |
71 | |
|
72 | 0 | const float d = amax / ((1 << 7) - 1); |
73 | 0 | id[row_iter] = d ? 1.0f / d : 0.0f; |
74 | |
|
75 | 0 | y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d); |
76 | 0 | } |
77 | |
|
78 | 0 | for (int j = 0; j < QK8_0 * 4; j++) { |
79 | 0 | int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave; |
80 | 0 | int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave; |
81 | 0 | src_offset += (j % blck_size_interleave); |
82 | |
|
83 | 0 | float x0 = srcv[src_id][src_offset] * id[src_id]; |
84 | 0 | y[i].qs[j] = roundf(x0); |
85 | 0 | } |
86 | 0 | } |
87 | 0 | } |
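
With blck_size_interleave = 4 the inner loop above is a plain row-major to interleaved permutation: output byte j is taken from row (j / 4) % 4 at element (j / 16) * 4 + (j % 4), so bytes 0-3 come from row 0's first four values, bytes 4-7 from row 1's, and the 4-value chunk advances every 16 output bytes. A tiny self-check of that reading (illustrative, not part of the file):

    static void check_q8_0_4x4_mapping(void) {
        const int B = 4;                                     // blck_size_interleave
        for (int j = 0; j < QK8_0 * 4; j++) {
            const int src_offset = (j / (4 * B)) * B + (j % B);
            const int src_id     = (j % (4 * B)) / B;
            assert(src_id     == (j / 4) % 4);               // rows cycle every 4 bytes
            assert(src_offset == (j / 16) * 4 + (j % 4));    // chunks advance every 16 bytes
        }
    }
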
88 | | |
89 | 0 | void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { |
90 | 0 | assert(QK8_0 == 32); |
91 | 0 | assert(k % QK8_0 == 0); |
92 | 0 | const int nb = k / QK8_0; |
93 | |
|
94 | 0 | block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy; |
95 | | |
96 | | // scalar |
97 | 0 | const int blck_size_interleave = 8; |
98 | 0 | float srcv[4][QK8_0]; |
99 | 0 | float id[4]; |
100 | |
|
101 | 0 | for (int i = 0; i < nb; i++) { |
102 | 0 | for (int row_iter = 0; row_iter < 4; row_iter++) { |
103 | 0 | float amax = 0.0f; // absolute max |
104 | |
|
105 | 0 | for (int j = 0; j < QK8_0; j++) { |
106 | 0 | srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j]; |
107 | 0 | amax = MAX(amax, fabsf(srcv[row_iter][j])); |
108 | 0 | } |
109 | |
|
110 | 0 | const float d = amax / ((1 << 7) - 1); |
111 | 0 | id[row_iter] = d ? 1.0f / d : 0.0f; |
112 | |
|
113 | 0 | y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d); |
114 | 0 | } |
115 | |
|
116 | 0 | for (int j = 0; j < QK8_0 * 4; j++) { |
117 | 0 | int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave; |
118 | 0 | int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave; |
119 | 0 | src_offset += (j % blck_size_interleave); |
120 | |
|
121 | 0 | float x0 = srcv[src_id][src_offset] * id[src_id]; |
122 | 0 | y[i].qs[j] = roundf(x0); |
123 | 0 | } |
124 | 0 | } |
125 | 0 | } |
126 | | |
127 | | |
128 | 0 | void ggml_quantize_mat_q8_K_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { |
129 | 0 | assert(QK_K == 256); |
130 | 0 | assert(k % QK_K == 0); |
131 | 0 | const int nb = k / QK_K; |
132 | |
|
133 | 0 | block_q8_Kx4 * GGML_RESTRICT y = (block_q8_Kx4 *) vy; |
134 | | |
135 | | // scalar |
136 | 0 | const int blck_size_interleave = 4; |
137 | 0 | float srcv[4][QK_K]; |
138 | 0 | float iscale[4]; |
139 | |
|
140 | 0 | for (int i = 0; i < nb; i++) { |
141 | 0 | for (int row_iter = 0; row_iter < 4; row_iter++) { |
142 | 0 | float amax = 0.0f; // absolute max |
143 | 0 | float max = 0; |
144 | |
|
145 | 0 | for (int j = 0; j < QK_K; j++) { |
146 | 0 | srcv[row_iter][j] = x[row_iter * k + i * QK_K + j]; |
147 | | // Track the value with the largest magnitude in the corresponding super block |
148 | 0 | if(amax < fabsf(srcv[row_iter][j])) { |
149 | 0 | amax = fabsf(srcv[row_iter][j]); |
150 | 0 | max = srcv[row_iter][j]; |
151 | 0 | } |
152 | 0 | } |
153 | |
|
154 | 0 | iscale[row_iter] = amax ? -127.f/max : 0; |
155 | |
|
156 | 0 | y[i].d[row_iter] = amax ? 1/iscale[row_iter] : 0; |
157 | 0 | } |
158 | |
|
159 | 0 | for (int j = 0; j < QK_K / 4; j++) { |
160 | 0 | y[i].bsums[j] = 0; |
161 | 0 | } |
162 | | |
163 | | // Quant values are interleaved in sequences of four bytes from the corresponding super blocks |
164 | | // Bsums values are interleaved in sequences of four bsums from each super block taken for interleaving, |
165 | | // i.e. the first four bsums from the first super block, followed by the first four bsums from the second super block, and so on |
166 | 0 | for (int j = 0; j < QK_K * 4; j++) { |
167 | 0 | int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave; |
168 | 0 | int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave; |
169 | 0 | src_offset += (j % blck_size_interleave); |
170 | 0 | int index = (((j & 15) >> 2) << 2) + ((j >> 8) << 4) + ((j >> 6) & 3); |
171 | |
|
172 | 0 | float x0 = srcv[src_id][src_offset] * iscale[src_id]; |
173 | 0 | y[i].qs[j] = nearest_int(x0); |
174 | 0 | y[i].bsums[index] += y[i].qs[j]; |
175 | 0 | } |
176 | 0 | } |
177 | 0 | } |
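
Each bsums entry is the sum of 16 consecutive quants of one row, and the index expression above realizes the layout stated in the comment: writing r for src_id, s for src_offset and g = s / 64 for the group of four row-bsums that s falls in, the target slot is g * 16 + r * 4 + (s / 16) % 4. A hedged self-check of that equivalence (the helper is illustrative):

    static void check_q8_K_4x4_bsums_index(void) {
        const int B = 4;                                              // blck_size_interleave
        for (int j = 0; j < QK_K * 4; j++) {
            const int s     = (j / (4 * B)) * B + (j % B);            // src_offset
            const int r     = (j % (4 * B)) / B;                      // src_id
            const int index = (((j & 15) >> 2) << 2) + ((j >> 8) << 4) + ((j >> 6) & 3);
            assert(index == (s / 64) * 16 + r * 4 + (s / 16) % 4);
        }
    }
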
178 | | |
179 | 0 | void ggml_quantize_mat_q8_K_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { |
180 | 0 | assert(QK_K == 256); |
181 | 0 | assert(k % QK_K == 0); |
182 | 0 | const int nb = k / QK_K; |
183 | |
|
184 | 0 | block_q8_Kx4 * GGML_RESTRICT y = (block_q8_Kx4 *) vy; |
185 | | |
186 | | // scalar |
187 | 0 | const int blck_size_interleave = 8; |
188 | 0 | float srcv[4][QK_K]; |
189 | 0 | float iscale[4]; |
190 | |
|
191 | 0 | for (int i = 0; i < nb; i++) { |
192 | 0 | for (int row_iter = 0; row_iter < 4; row_iter++) { |
193 | 0 | float amax = 0.0f; // absolute max |
194 | 0 | float max = 0; |
195 | |
|
196 | 0 | for (int j = 0; j < QK_K; j++) { |
197 | 0 | srcv[row_iter][j] = x[row_iter * k + i * QK_K + j]; |
198 | | // Track the value with the largest magnitude in the corresponding super block |
199 | 0 | if(amax < fabsf(srcv[row_iter][j])) { |
200 | 0 | amax = fabsf(srcv[row_iter][j]); |
201 | 0 | max = srcv[row_iter][j]; |
202 | 0 | } |
203 | 0 | } |
204 | |
|
205 | 0 | iscale[row_iter] = amax ? -127.f/max : 0; |
206 | |
|
207 | 0 | y[i].d[row_iter] = amax ? 1/iscale[row_iter] : 0; |
208 | 0 | } |
209 | |
|
210 | 0 | for (int j = 0; j < QK_K / 4; j++) { |
211 | 0 | y[i].bsums[j] = 0; |
212 | 0 | } |
213 | | |
214 | | // Quant values are interleaved in sequences of eight bytes from the corresponding super blocks |
215 | | // Bsums values are interleaved in sequences of four bsums from each super block taken for interleaving, |
216 | | // i.e. the first four bsums from the first super block, followed by the first four bsums from the second super block, and so on |
217 | 0 | for (int j = 0; j < QK_K * 4; j++) { |
218 | 0 | int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave; |
219 | 0 | int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave; |
220 | 0 | src_offset += (j % blck_size_interleave); |
221 | 0 | int index = (((j & 31) >> 3) << 2) + ((j >> 8) << 4) + ((j >> 6) & 3); |
222 | |
|
223 | 0 | float x0 = srcv[src_id][src_offset] * iscale[src_id]; |
224 | 0 | y[i].qs[j] = nearest_int(x0); |
225 | 0 | y[i].bsums[index] += y[i].qs[j]; |
226 | 0 | } |
227 | 0 | } |
228 | 0 | } |
229 | | |
230 | | } // extern "C" |
231 | | |
232 | | template <int64_t INTER_SIZE, ggml_type PARAM_TYPE> |
233 | | void ggml_quantize_mat_t(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row); |
234 | | |
235 | 0 | template <> void ggml_quantize_mat_t<4, GGML_TYPE_Q8_0>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) { |
236 | 0 | assert(nrow == 4); |
237 | 0 | UNUSED(nrow); |
238 | 0 | ggml_quantize_mat_q8_0_4x4(x, vy, n_per_row); |
239 | 0 | } |
240 | | |
241 | 0 | template <> void ggml_quantize_mat_t<8, GGML_TYPE_Q8_0>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) { |
242 | 0 | assert(nrow == 4); |
243 | 0 | UNUSED(nrow); |
244 | 0 | ggml_quantize_mat_q8_0_4x8(x, vy, n_per_row); |
245 | 0 | } |
246 | | |
247 | 0 | template <> void ggml_quantize_mat_t<4, GGML_TYPE_Q8_K>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) { |
248 | 0 | assert(nrow == 4); |
249 | 0 | UNUSED(nrow); |
250 | 0 | ggml_quantize_mat_q8_K_4x4(x, vy, n_per_row); |
251 | 0 | } |
252 | | |
253 | 0 | template <> void ggml_quantize_mat_t<8, GGML_TYPE_Q8_K>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) { |
254 | 0 | assert(nrow == 4); |
255 | 0 | UNUSED(nrow); |
256 | 0 | ggml_quantize_mat_q8_K_4x8(x, vy, n_per_row); |
257 | 0 | } |
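
These specializations let callers pick the activation-quantization kernel at compile time from the interleave width and the quant type of the packed weights. A minimal usage sketch, assuming src points at 4 rows of n_per_row floats and dst is large enough for the interleaved blocks:

    static void quantize_four_rows_sketch(const float * src, void * dst, int64_t n_per_row) {
        // INTER_SIZE = 8 pairs with the *_4x8 kernels, Q8_0 activations
        ggml_quantize_mat_t<8, GGML_TYPE_Q8_0>(src, dst, /*nrow=*/4, n_per_row);
    }
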
258 | | |
259 | | extern "C" { |
260 | | |
261 | 0 | void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { |
262 | 0 | const int qk = QK8_0; |
263 | 0 | const int nb = n / qk; |
264 | 0 | const int ncols_interleaved = 4; |
265 | 0 | const int blocklen = 4; |
266 | |
|
267 | 0 | assert(nr == 1); |
268 | 0 | assert(n % qk == 0); |
269 | 0 | assert(nc % ncols_interleaved == 0); |
270 | |
|
271 | 0 | UNUSED(s); |
272 | 0 | UNUSED(bs); |
273 | 0 | UNUSED(vx); |
274 | 0 | UNUSED(vy); |
275 | 0 | UNUSED(nr); |
276 | 0 | UNUSED(nc); |
277 | 0 | UNUSED(nb); |
278 | 0 | UNUSED(ncols_interleaved); |
279 | 0 | UNUSED(blocklen); |
280 | |
|
281 | 0 | float sumf[4]; |
282 | 0 | int sumi; |
283 | |
|
284 | 0 | const block_q8_0 * a_ptr = (const block_q8_0 *) vy; |
285 | 0 | for (int x = 0; x < nc / ncols_interleaved; x++) { |
286 | 0 | const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); |
287 | |
|
288 | 0 | for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0; |
289 | 0 | for (int l = 0; l < nb; l++) { |
290 | 0 | for (int k = 0; k < (qk / (2 * blocklen)); k++) { |
291 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
292 | 0 | sumi = 0; |
293 | 0 | for (int i = 0; i < blocklen; ++i) { |
294 | 0 | const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); |
295 | 0 | const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); |
296 | 0 | sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; |
297 | 0 | } |
298 | 0 | sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); |
299 | 0 | } |
300 | 0 | } |
301 | 0 | } |
302 | 0 | for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j]; |
303 | 0 | } |
304 | 0 | } |
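
The repacked q4_0 nibbles were already xor'ed into two's-complement form when the block_q4_0x4 was built, so (int8_t)(qs << 4) and (int8_t)(qs & 0xF0) recover the low and high 4-bit values scaled by 16, and the final >> 4 strips that common factor after the multiply-accumulate (both products carry it, so the shift is exact). A small sketch of the per-byte decode, assuming arithmetic right shift on signed ints (the helper name is illustrative):

    // decode one repacked q4_0 byte into its two signed 4-bit values in [-8, 7]
    static inline void decode_repacked_q4_0_byte(uint8_t qs, int * lo, int * hi) {
        const int v0 = (int8_t) (qs << 4);     // low nibble, sign-extended, times 16
        const int v1 = (int8_t) (qs & 0xF0);   // high nibble, sign-extended, times 16
        *lo = v0 >> 4;
        *hi = v1 >> 4;
    }
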
305 | | |
306 | 0 | void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { |
307 | 0 | const int qk = QK8_0; |
308 | 0 | const int nb = n / qk; |
309 | 0 | const int ncols_interleaved = 4; |
310 | 0 | const int blocklen = 8; |
311 | |
|
312 | 0 | assert (n % qk == 0); |
313 | 0 | assert (nc % ncols_interleaved == 0); |
314 | |
|
315 | 0 | UNUSED(s); |
316 | 0 | UNUSED(bs); |
317 | 0 | UNUSED(vx); |
318 | 0 | UNUSED(vy); |
319 | 0 | UNUSED(nr); |
320 | 0 | UNUSED(nc); |
321 | 0 | UNUSED(nb); |
322 | 0 | UNUSED(ncols_interleaved); |
323 | 0 | UNUSED(blocklen); |
324 | |
|
325 | 0 | float sumf[4]; |
326 | 0 | int sumi; |
327 | |
|
328 | 0 | const block_q8_0 * a_ptr = (const block_q8_0 *) vy; |
329 | 0 | for (int x = 0; x < nc / ncols_interleaved; x++) { |
330 | 0 | const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); |
331 | |
|
332 | 0 | for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0; |
333 | 0 | for (int l = 0; l < nb; l++) { |
334 | 0 | for (int k = 0; k < (qk / (2 * blocklen)); k++) { |
335 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
336 | 0 | sumi = 0; |
337 | 0 | for (int i = 0; i < blocklen; ++i) { |
338 | 0 | const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); |
339 | 0 | const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); |
340 | 0 | sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; |
341 | 0 | } |
342 | 0 | sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); |
343 | 0 | } |
344 | 0 | } |
345 | 0 | } |
346 | 0 | for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j]; |
347 | 0 | } |
348 | 0 | } |
349 | | |
350 | 0 | void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { |
351 | 0 | const int qk = QK8_0; |
352 | 0 | const int nb = n / qk; |
353 | 0 | const int ncols_interleaved = 8; |
354 | 0 | const int blocklen = 8; |
355 | |
|
356 | 0 | assert (n % qk == 0); |
357 | 0 | assert (nc % ncols_interleaved == 0); |
358 | |
|
359 | 0 | UNUSED(s); |
360 | 0 | UNUSED(bs); |
361 | 0 | UNUSED(vx); |
362 | 0 | UNUSED(vy); |
363 | 0 | UNUSED(nr); |
364 | 0 | UNUSED(nc); |
365 | 0 | UNUSED(nb); |
366 | 0 | UNUSED(ncols_interleaved); |
367 | 0 | UNUSED(blocklen); |
368 | |
|
369 | 0 | float sumf[8]; |
370 | 0 | int sumi; |
371 | |
|
372 | 0 | const block_q8_0 * a_ptr = (const block_q8_0 *) vy; |
373 | 0 | for (int x = 0; x < nc / ncols_interleaved; x++) { |
374 | 0 | const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb); |
375 | |
|
376 | 0 | for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0; |
377 | 0 | for (int l = 0; l < nb; l++) { |
378 | 0 | for (int k = 0; k < (qk / (2 * blocklen)); k++) { |
379 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
380 | 0 | sumi = 0; |
381 | 0 | for (int i = 0; i < blocklen; ++i) { |
382 | 0 | const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); |
383 | 0 | const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); |
384 | 0 | sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; |
385 | 0 | } |
386 | 0 | sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); |
387 | 0 | } |
388 | 0 | } |
389 | 0 | } |
390 | 0 | for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j]; |
391 | 0 | } |
392 | 0 | } |
393 | | |
394 | 0 | void ggml_gemv_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { |
395 | 0 | const int qk = QK_K; |
396 | 0 | const int nb = n / qk; |
397 | 0 | const int ncols_interleaved = 8; |
398 | 0 | const int blocklen = 4; |
399 | 0 | static const uint32_t kmask1 = 0x3f3f3f3f; |
400 | 0 | static const uint32_t kmask2 = 0x0f0f0f0f; |
401 | 0 | static const uint32_t kmask3 = 0x03030303; |
402 | |
|
403 | 0 | assert (n % qk == 0); |
404 | 0 | assert (nc % ncols_interleaved == 0); |
405 | |
|
406 | 0 | UNUSED(bs); |
407 | 0 | UNUSED(nr); |
408 | |
|
409 | 0 | float sumf[8]; |
410 | 0 | float sum_minf[8]; |
411 | 0 | uint32_t utmp[32]; |
412 | 0 | int sumi1; |
413 | 0 | int sumi2; |
414 | 0 | int sumi; |
415 | |
|
416 | 0 | const block_q8_K * a_ptr = (const block_q8_K *) vy; |
417 | 0 | for (int x = 0; x < nc / ncols_interleaved; x++) { |
418 | 0 | const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb); |
419 | |
|
420 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
421 | 0 | sumf[j] = 0.0; |
422 | 0 | sum_minf[j] = 0.0; |
423 | 0 | } |
424 | 0 | for (int l = 0; l < nb; l++) { |
425 | 0 | for (int sb = 0; sb < 8; sb++) { |
426 | 0 | memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12); |
427 | 0 | utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4); |
428 | 0 | const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1; |
429 | 0 | utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4); |
430 | 0 | utmp[sb * 4 + 2] = uaux_0; |
431 | 0 | utmp[sb * 4 + 0] &= kmask1; |
432 | 0 | } |
433 | 0 | for (int k = 0; k < (qk / (2 * blocklen)); k++) { |
434 | 0 | uint8_t * scales_0 = (uint8_t *) utmp + (k / 8) * 32; |
435 | 0 | uint8_t * scales_1 = (uint8_t *) utmp + (k / 8) * 32 + 16; |
436 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
437 | 0 | sumi1 = 0; |
438 | 0 | sumi2 = 0; |
439 | 0 | sumi = 0; |
440 | 0 | for (int i = 0; i < blocklen; ++i) { |
441 | 0 | const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF); |
442 | 0 | const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4); |
443 | 0 | sumi1 = (v0 * a_ptr[l].qs[(k / 8) * 64 + (k % 8) * blocklen + i]); |
444 | 0 | sumi2 = (v1 * a_ptr[l].qs[(k / 8) * 64 + (k % 8) * blocklen + i + 32]); |
445 | 0 | sumi1 = sumi1 * scales_0[j]; |
446 | 0 | sumi2 = sumi2 * scales_1[j]; |
447 | 0 | sumi += sumi1 + sumi2; |
448 | 0 | } |
449 | 0 | sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d; |
450 | 0 | } |
451 | 0 | } |
452 | 0 | for (int sb = 0; sb < 8; sb++) { |
453 | 0 | uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16; |
454 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
455 | 0 | sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d; |
456 | 0 | } |
457 | 0 | } |
458 | 0 | } |
459 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
460 | 0 | s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j]; |
461 | 0 | } |
462 | 0 | } |
463 | 0 | } |
464 | | |
465 | 0 | void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { |
466 | 0 | const int qk = QK_K; |
467 | 0 | const int nb = n / qk; |
468 | 0 | const int ncols_interleaved = 8; |
469 | 0 | const int blocklen = 8; |
470 | 0 | static const uint32_t kmask1 = 0x3f3f3f3f; |
471 | 0 | static const uint32_t kmask2 = 0x0f0f0f0f; |
472 | 0 | static const uint32_t kmask3 = 0x03030303; |
473 | |
|
474 | 0 | assert (n % qk == 0); |
475 | 0 | assert (nc % ncols_interleaved == 0); |
476 | |
|
477 | 0 | UNUSED(s); |
478 | 0 | UNUSED(bs); |
479 | 0 | UNUSED(vx); |
480 | 0 | UNUSED(vy); |
481 | 0 | UNUSED(nr); |
482 | 0 | UNUSED(nc); |
483 | 0 | UNUSED(nb); |
484 | 0 | UNUSED(ncols_interleaved); |
485 | 0 | UNUSED(blocklen); |
486 | |
|
487 | 0 | float sumf[8]; |
488 | 0 | float sum_minf[8]; |
489 | 0 | uint32_t utmp[32]; |
490 | 0 | int sumi1; |
491 | 0 | int sumi2; |
492 | 0 | int sumi; |
493 | |
|
494 | 0 | const block_q8_K * a_ptr = (const block_q8_K *) vy; |
495 | 0 | for (int x = 0; x < nc / ncols_interleaved; x++) { |
496 | 0 | const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb); |
497 | |
|
498 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
499 | 0 | sumf[j] = 0.0; |
500 | 0 | sum_minf[j] = 0.0; |
501 | 0 | } |
502 | 0 | for (int l = 0; l < nb; l++) { |
503 | 0 | for (int sb = 0; sb < 8; sb++) { |
504 | 0 | memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12); |
505 | 0 | utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4); |
506 | 0 | const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1; |
507 | 0 | utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4); |
508 | 0 | utmp[sb * 4 + 2] = uaux_0; |
509 | 0 | utmp[sb * 4 + 0] &= kmask1; |
510 | 0 | } |
511 | 0 | for (int k = 0; k < (qk / (2 * blocklen)); k++) { |
512 | 0 | uint8_t *scales_0 = (uint8_t*) utmp + (k / 4) * 32; |
513 | 0 | uint8_t *scales_1 = (uint8_t*) utmp + (k / 4) * 32 + 16; |
514 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
515 | 0 | sumi1 = 0; |
516 | 0 | sumi2 = 0; |
517 | 0 | sumi = 0; |
518 | 0 | for (int i = 0; i < blocklen; ++i) { |
519 | 0 | const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF); |
520 | 0 | const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4); |
521 | 0 | sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 64 + (k % 4) * blocklen + i]); |
522 | 0 | sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 64 + (k % 4) * blocklen + i + 32]); |
523 | 0 | sumi1 = sumi1 * scales_0[j]; |
524 | 0 | sumi2 = sumi2 * scales_1[j]; |
525 | 0 | sumi += sumi1 + sumi2; |
526 | 0 | } |
527 | 0 | sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d; |
528 | 0 | } |
529 | 0 | } |
530 | 0 | for (int sb = 0; sb < 8; sb++) { |
531 | 0 | uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16; |
532 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
533 | 0 | sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d; |
534 | 0 | } |
535 | 0 | } |
536 | 0 | } |
537 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
538 | 0 | s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j]; |
539 | 0 | } |
540 | 0 | } |
541 | 0 | } |
542 | | |
543 | 0 | void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { |
544 | 0 | const int qk = QK_K; |
545 | 0 | const int nb = n / qk; |
546 | 0 | const int ncols_interleaved = 8; |
547 | 0 | const int blocklen = 8; |
548 | |
|
549 | 0 | assert (n % qk == 0); |
550 | 0 | assert (nc % ncols_interleaved == 0); |
551 | |
|
552 | 0 | UNUSED(s); |
553 | 0 | UNUSED(bs); |
554 | 0 | UNUSED(vx); |
555 | 0 | UNUSED(vy); |
556 | 0 | UNUSED(nr); |
557 | 0 | UNUSED(nc); |
558 | 0 | UNUSED(nb); |
559 | 0 | UNUSED(ncols_interleaved); |
560 | 0 | UNUSED(blocklen); |
561 | |
|
562 | 0 | float sumf[8]; |
563 | 0 | float sum_minf[8]; |
564 | 0 | int sumi1,sumi2,sumi3,sumi4; |
565 | 0 | int sumi; |
566 | |
|
567 | 0 | const block_q8_K * a_ptr = (const block_q8_K *)vy; |
568 | 0 | for(int x = 0; x < nc / ncols_interleaved; x++) { |
569 | 0 | const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb); |
570 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
571 | 0 | sumf[j] = 0.0; |
572 | 0 | sum_minf[j] = 0.0; |
573 | 0 | } |
574 | 0 | for (int l = 0; l < nb; l++) { |
575 | 0 | for (int k = 0; k < (qk / (4 * blocklen)); k++) { |
576 | 0 | const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64 ; |
577 | 0 | const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16; |
578 | 0 | const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32; |
579 | 0 | const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48; |
580 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
581 | 0 | sumi1 = 0; |
582 | 0 | sumi2 = 0; |
583 | 0 | sumi3 = 0; |
584 | 0 | sumi4 = 0; |
585 | 0 | sumi = 0; |
586 | 0 | int offset = ((k / 2) % 2) + j * 2; |
587 | 0 | for (int i = 0; i < blocklen; ++i){ |
588 | 0 | const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 3); |
589 | 0 | const int v1 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 2 ) & 3); |
590 | 0 | const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4 ) & 3); |
591 | 0 | const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6 ) & 3); |
592 | 0 | sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i]); |
593 | 0 | sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 32]); |
594 | 0 | sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 64]); |
595 | 0 | sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 96]); |
596 | |
|
597 | 0 | sumi1 = sumi1 * (scales_0[offset] & 0xF); |
598 | 0 | sumi2 = sumi2 * (scales_1[offset] & 0xF); |
599 | 0 | sumi3 = sumi3 * (scales_2[offset] & 0xF); |
600 | 0 | sumi4 = sumi4 * (scales_3[offset] & 0xF); |
601 | 0 | sumi += sumi1 + sumi2 + sumi3 + sumi4; |
602 | 0 | } |
603 | 0 | sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d; |
604 | 0 | } |
605 | 0 | } |
606 | 0 | for(int sb = 0; sb < 8; sb++) { |
607 | 0 | const uint8_t *mins = b_ptr[l].scales + sb * 16; |
608 | 0 | for(int j = 0; j < ncols_interleaved; j++){ |
609 | 0 | sum_minf[j] += ((mins[j * 2] >> 4) * a_ptr[l].bsums[sb * 2] + (mins[(j * 2)+ 1] >> 4) * a_ptr[l].bsums[sb * 2 + 1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d; |
610 | 0 | } |
611 | 0 | } |
612 | 0 | } |
613 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
614 | 0 | s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j]; |
615 | 0 | } |
616 | 0 | } |
617 | 0 | } |
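
As the bit twiddling above shows, q2_K packs four 2-bit quants per byte and stores, for every 16-value sub-block, a 4-bit scale in the low nibble and a 4-bit min in the high nibble of the corresponding scales byte, which is why the main loop masks with & 0xF and the mins pass shifts with >> 4. A per-byte decode sketch (illustrative only):

    // split one q2_K quant byte into its four 2-bit values (0..3)
    static inline void decode_q2_K_byte(uint8_t qs, int v[4]) {
        for (int b = 0; b < 4; b++) {
            v[b] = (qs >> (2 * b)) & 3;
        }
    }
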
618 | | |
619 | 0 | void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { |
620 | 0 | const int qk = QK8_0; |
621 | 0 | const int nb = n / qk; |
622 | 0 | const int ncols_interleaved = 4; |
623 | 0 | const int blocklen = 4; |
624 | |
|
625 | 0 | assert(nr == 1); |
626 | 0 | assert(n % qk == 0); |
627 | 0 | assert(nc % ncols_interleaved == 0); |
628 | |
|
629 | 0 | UNUSED(bs); |
630 | 0 | UNUSED(nr); |
631 | |
|
632 | 0 | float sumf[4]; |
633 | 0 | int sumi; |
634 | |
|
635 | 0 | const block_q8_0 * a_ptr = (const block_q8_0 *) vy; |
636 | 0 | for (int x = 0; x < nc / ncols_interleaved; x++) { |
637 | 0 | const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb); |
638 | |
|
639 | 0 | for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0; |
640 | 0 | for (int l = 0; l < nb; l++) { |
641 | 0 | for (int k = 0; k < (qk / (2 * blocklen)); k++) { |
642 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
643 | 0 | sumi = 0; |
644 | 0 | for (int i = 0; i < blocklen; ++i) { |
645 | 0 | const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F]; |
646 | 0 | const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4]; |
647 | 0 | sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])); |
648 | 0 | } |
649 | 0 | sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); |
650 | 0 | } |
651 | 0 | } |
652 | 0 | } |
653 | 0 | for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j]; |
654 | 0 | } |
655 | 0 | } |
656 | | |
657 | 0 | void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { |
658 | 0 | const int qk = QK8_0; |
659 | 0 | const int nb = n / qk; |
660 | 0 | const int ncols_interleaved = 8; |
661 | 0 | const int blocklen = 8; |
662 | |
|
663 | 0 | assert(nr == 1); |
664 | 0 | assert(n % qk == 0); |
665 | 0 | assert(nc % ncols_interleaved == 0); |
666 | |
|
667 | 0 | UNUSED(bs); |
668 | 0 | UNUSED(nr); |
669 | |
|
670 | 0 | float sumf[8]; |
671 | 0 | int sumi; |
672 | |
|
673 | 0 | const block_q8_0 * a_ptr = (const block_q8_0 *) vy; |
674 | 0 | for (int x = 0; x < nc / ncols_interleaved; x++) { |
675 | 0 | const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb); |
676 | |
|
677 | 0 | for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0; |
678 | 0 | for (int l = 0; l < nb; l++) { |
679 | 0 | for (int k = 0; k < (qk / (2 * blocklen)); k++) { |
680 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
681 | 0 | sumi = 0; |
682 | 0 | for (int i = 0; i < blocklen; ++i) { |
683 | 0 | const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F]; |
684 | 0 | const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4]; |
685 | 0 | sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])); |
686 | 0 | } |
687 | 0 | sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); |
688 | 0 | } |
689 | 0 | } |
690 | 0 | } |
691 | 0 | for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j]; |
692 | 0 | } |
693 | 0 | } |
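
iq4_nl differs from q4_0 only in how a nibble maps to a value: instead of the linear n - 8 (or the xor/shift trick above), each 4-bit code indexes the non-linear 16-entry table kvalues_iq4nl from ggml-common.h, so there is no trailing >> 4 here. A per-byte decode sketch (illustrative):

    // decode one iq4_nl byte through the lookup table
    static inline void decode_iq4_nl_byte(uint8_t qs, int * lo, int * hi) {
        *lo = kvalues_iq4nl[qs & 0x0F];
        *hi = kvalues_iq4nl[qs >> 4];
    }
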
694 | | |
695 | | void ggml_gemv_q8_0_4x4_q8_0_generic(int n, |
696 | | float * GGML_RESTRICT s, |
697 | | size_t bs, |
698 | | const void * GGML_RESTRICT vx, |
699 | | const void * GGML_RESTRICT vy, |
700 | | int nr, |
701 | 0 | int nc) { |
702 | 0 | const int qk = QK8_0; |
703 | 0 | const int nb = n / qk; |
704 | 0 | const int ncols_interleaved = 4; |
705 | 0 | const int blocklen = 4; |
706 | |
|
707 | 0 | assert(nr == 1); |
708 | 0 | assert(n % qk == 0); |
709 | 0 | assert(nc % ncols_interleaved == 0); |
710 | |
|
711 | 0 | UNUSED(bs); |
712 | 0 | UNUSED(nr); |
713 | |
|
714 | 0 | float sumf[4]; |
715 | 0 | int sumi; |
716 | |
|
717 | 0 | const block_q8_0 * a_ptr = (const block_q8_0 *) vy; |
718 | 0 | for (int x = 0; x < nc / ncols_interleaved; x++) { |
719 | 0 | const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb); |
720 | |
|
721 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
722 | 0 | sumf[j] = 0.0; |
723 | 0 | } |
724 | 0 | for (int l = 0; l < nb; l++) { |
725 | 0 | for (int k = 0; k < (qk / blocklen); k++) { |
726 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
727 | 0 | sumi = 0; |
728 | 0 | for (int i = 0; i < blocklen; ++i) { |
729 | 0 | const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i]; |
730 | 0 | sumi += v0 * a_ptr[l].qs[k * blocklen + i]; |
731 | 0 | } |
732 | 0 | sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); |
733 | 0 | } |
734 | 0 | } |
735 | 0 | } |
736 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
737 | 0 | s[x * ncols_interleaved + j] = sumf[j]; |
738 | 0 | } |
739 | 0 | } |
740 | 0 | } |
741 | | |
742 | | void ggml_gemv_q8_0_4x8_q8_0_generic(int n, |
743 | | float * GGML_RESTRICT s, |
744 | | size_t bs, |
745 | | const void * GGML_RESTRICT vx, |
746 | | const void * GGML_RESTRICT vy, |
747 | | int nr, |
748 | 0 | int nc) { |
749 | 0 | const int qk = QK8_0; |
750 | 0 | const int nb = n / qk; |
751 | 0 | const int ncols_interleaved = 4; |
752 | 0 | const int blocklen = 8; |
753 | |
|
754 | 0 | assert(nr == 1); |
755 | 0 | assert(n % qk == 0); |
756 | 0 | assert(nc % ncols_interleaved == 0); |
757 | |
|
758 | 0 | UNUSED(bs); |
759 | 0 | UNUSED(nr); |
760 | |
|
761 | 0 | float sumf[4]; |
762 | 0 | int sumi; |
763 | |
|
764 | 0 | const block_q8_0 * a_ptr = (const block_q8_0 *) vy; |
765 | 0 | for (int x = 0; x < nc / ncols_interleaved; x++) { |
766 | 0 | const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb); |
767 | |
|
768 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
769 | 0 | sumf[j] = 0.0; |
770 | 0 | } |
771 | 0 | for (int l = 0; l < nb; l++) { |
772 | 0 | for (int k = 0; k < (qk / blocklen); k++) { |
773 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
774 | 0 | sumi = 0; |
775 | 0 | for (int i = 0; i < blocklen; ++i) { |
776 | 0 | const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i]; |
777 | 0 | sumi += v0 * a_ptr[l].qs[k * blocklen + i]; |
778 | 0 | } |
779 | 0 | sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); |
780 | 0 | } |
781 | 0 | } |
782 | 0 | } |
783 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
784 | 0 | s[x * ncols_interleaved + j] = sumf[j]; |
785 | 0 | } |
786 | 0 | } |
787 | 0 | } |
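
With q8_0 weights there is no nibble unpacking at all: each block contributes a plain int8 dot product scaled by the two fp16 deltas, and the 4x4 / 4x8 variants only change the order in which the interleaved bytes are walked. Conceptually, per block (a hedged sketch that ignores the interleaved traversal):

    static inline float q8_0_block_dot_sketch(const int8_t * w, const int8_t * a,
                                              ggml_half dw, ggml_half da) {
        int sumi = 0;
        for (int i = 0; i < QK8_0; i++) {
            sumi += w[i] * a[i];                 // int8 x int8 accumulate
        }
        return sumi * GGML_CPU_FP16_TO_FP32(dw) * GGML_CPU_FP16_TO_FP32(da);
    }
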
788 | | |
789 | 0 | void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { |
790 | 0 | const int qk = QK8_0; |
791 | 0 | const int nb = n / qk; |
792 | 0 | const int ncols_interleaved = 4; |
793 | 0 | const int blocklen = 4; |
794 | |
|
795 | 0 | assert (n % qk == 0); |
796 | 0 | assert (nr % 4 == 0); |
797 | 0 | assert (nc % ncols_interleaved == 0); |
798 | |
|
799 | 0 | UNUSED(s); |
800 | 0 | UNUSED(bs); |
801 | 0 | UNUSED(vx); |
802 | 0 | UNUSED(vy); |
803 | 0 | UNUSED(nr); |
804 | 0 | UNUSED(nc); |
805 | 0 | UNUSED(nb); |
806 | 0 | UNUSED(ncols_interleaved); |
807 | 0 | UNUSED(blocklen); |
808 | |
|
809 | 0 | { |
810 | 0 | float sumf[4][4]; |
811 | 0 | int sumi; |
812 | |
|
813 | 0 | for (int y = 0; y < nr / 4; y++) { |
814 | 0 | const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); |
815 | 0 | for (int x = 0; x < nc / ncols_interleaved; x++) { |
816 | 0 | const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); |
817 | 0 | for (int m = 0; m < 4; m++) { |
818 | 0 | for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0; |
819 | 0 | } |
820 | 0 | for (int l = 0; l < nb; l++) { |
821 | 0 | for (int k = 0; k < (qk / (2 * blocklen)); k++) { |
822 | 0 | for (int m = 0; m < 4; m++) { |
823 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
824 | 0 | sumi = 0; |
825 | 0 | for (int i = 0; i < blocklen; ++i) { |
826 | 0 | const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); |
827 | 0 | const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); |
828 | 0 | sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + |
829 | 0 | (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; |
830 | 0 | } |
831 | 0 | sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); |
832 | 0 | } |
833 | 0 | } |
834 | 0 | } |
835 | 0 | } |
836 | 0 | for (int m = 0; m < 4; m++) { |
837 | 0 | for (int j = 0; j < ncols_interleaved; j++) |
838 | 0 | s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]; |
839 | 0 | } |
840 | 0 | } |
841 | 0 | } |
842 | 0 | } |
843 | 0 | } |
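
The gemm variants differ from the gemv ones only in that the activations arrive pre-interleaved four rows at a time (block_q8_0x4), so each (y, x) iteration fills a 4 x ncols_interleaved tile of the output; bs is the row stride of s in floats. Destination addressing for one tile, as a sketch:

    // row    y*4 + m (m = 0..3, the interleaved activation rows)
    // column x*ncols_interleaved + j (the interleaved weight columns)
    static inline float * gemm_tile_out_sketch(float * s, size_t bs, int y, int m,
                                               int x, int ncols_interleaved) {
        return s + (y * 4 + m) * bs + x * ncols_interleaved;
    }
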
844 | | |
845 | 0 | void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { |
846 | 0 | const int qk = QK8_0; |
847 | 0 | const int nb = n / qk; |
848 | 0 | const int ncols_interleaved = 4; |
849 | 0 | const int blocklen = 8; |
850 | |
|
851 | 0 | assert (n % qk == 0); |
852 | 0 | assert (nr % 4 == 0); |
853 | 0 | assert (nc % ncols_interleaved == 0); |
854 | |
|
855 | 0 | UNUSED(s); |
856 | 0 | UNUSED(bs); |
857 | 0 | UNUSED(vx); |
858 | 0 | UNUSED(vy); |
859 | 0 | UNUSED(nr); |
860 | 0 | UNUSED(nc); |
861 | 0 | UNUSED(nb); |
862 | 0 | UNUSED(ncols_interleaved); |
863 | 0 | UNUSED(blocklen); |
864 | |
|
865 | 0 | float sumf[4][4]; |
866 | 0 | int sumi; |
867 | |
|
868 | 0 | for (int y = 0; y < nr / 4; y++) { |
869 | 0 | const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); |
870 | 0 | for (int x = 0; x < nc / ncols_interleaved; x++) { |
871 | 0 | const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); |
872 | 0 | for (int m = 0; m < 4; m++) { |
873 | 0 | for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0; |
874 | 0 | } |
875 | 0 | for (int l = 0; l < nb; l++) { |
876 | 0 | for (int k = 0; k < (qk / (2 * blocklen)); k++) { |
877 | 0 | for (int m = 0; m < 4; m++) { |
878 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
879 | 0 | sumi = 0; |
880 | 0 | for (int i = 0; i < blocklen; ++i) { |
881 | 0 | const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); |
882 | 0 | const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); |
883 | 0 | sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + |
884 | 0 | (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; |
885 | 0 | } |
886 | 0 | sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); |
887 | 0 | } |
888 | 0 | } |
889 | 0 | } |
890 | 0 | } |
891 | 0 | for (int m = 0; m < 4; m++) { |
892 | 0 | for (int j = 0; j < ncols_interleaved; j++) |
893 | 0 | s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]; |
894 | 0 | } |
895 | 0 | } |
896 | 0 | } |
897 | 0 | } |
898 | | |
899 | 0 | void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { |
900 | 0 | const int qk = QK8_0; |
901 | 0 | const int nb = n / qk; |
902 | 0 | const int ncols_interleaved = 8; |
903 | 0 | const int blocklen = 8; |
904 | |
|
905 | 0 | assert (n % qk == 0); |
906 | 0 | assert (nr % 4 == 0); |
907 | 0 | assert (nc % ncols_interleaved == 0); |
908 | |
|
909 | 0 | UNUSED(s); |
910 | 0 | UNUSED(bs); |
911 | 0 | UNUSED(vx); |
912 | 0 | UNUSED(vy); |
913 | 0 | UNUSED(nr); |
914 | 0 | UNUSED(nc); |
915 | 0 | UNUSED(nb); |
916 | 0 | UNUSED(ncols_interleaved); |
917 | 0 | UNUSED(blocklen); |
918 | |
|
919 | 0 | float sumf[4][8]; |
920 | 0 | int sumi; |
921 | |
|
922 | 0 | for (int y = 0; y < nr / 4; y++) { |
923 | 0 | const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); |
924 | 0 | for (int x = 0; x < nc / ncols_interleaved; x++) { |
925 | 0 | const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb); |
926 | 0 | for (int m = 0; m < 4; m++) { |
927 | 0 | for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0; |
928 | 0 | } |
929 | 0 | for (int l = 0; l < nb; l++) { |
930 | 0 | for (int k = 0; k < (qk / (2 * blocklen)); k++) { |
931 | 0 | for (int m = 0; m < 4; m++) { |
932 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
933 | 0 | sumi = 0; |
934 | 0 | for (int i = 0; i < blocklen; ++i) { |
935 | 0 | const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); |
936 | 0 | const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); |
937 | 0 | sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + |
938 | 0 | (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; |
939 | 0 | } |
940 | 0 | sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); |
941 | 0 | } |
942 | 0 | } |
943 | 0 | } |
944 | 0 | } |
945 | 0 | for (int m = 0; m < 4; m++) { |
946 | 0 | for (int j = 0; j < ncols_interleaved; j++) |
947 | 0 | s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]; |
948 | 0 | } |
949 | 0 | } |
950 | 0 | } |
951 | 0 | } |
952 | | |
953 | 0 | void ggml_gemm_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { |
954 | 0 | const int qk = QK_K; |
955 | 0 | const int nb = n / qk; |
956 | 0 | const int ncols_interleaved = 8; |
957 | 0 | const int blocklen = 4; |
958 | 0 | static const uint32_t kmask1 = 0x3f3f3f3f; |
959 | 0 | static const uint32_t kmask2 = 0x0f0f0f0f; |
960 | 0 | static const uint32_t kmask3 = 0x03030303; |
961 | |
|
962 | 0 | assert (n % qk == 0); |
963 | 0 | assert (nr % 4 == 0); |
964 | 0 | assert (nc % ncols_interleaved == 0); |
965 | |
|
966 | 0 | UNUSED(nb); |
967 | 0 | UNUSED(ncols_interleaved); |
968 | 0 | UNUSED(blocklen); |
969 | |
|
970 | 0 | float sumf[4][8]; |
971 | 0 | float sum_minf[4][8]; |
972 | 0 | uint32_t utmp[32]; |
973 | 0 | int sumi1; |
974 | 0 | int sumi2; |
975 | 0 | int sumi; |
976 | |
|
977 | 0 | for (int y = 0; y < nr / 4; y++) { |
978 | 0 | const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb); |
979 | 0 | for (int x = 0; x < nc / ncols_interleaved; x++) { |
980 | 0 | const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb); |
981 | 0 | for (int m = 0; m < 4; m++) { |
982 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
983 | 0 | sumf[m][j] = 0.0; |
984 | 0 | sum_minf[m][j] = 0.0; |
985 | 0 | } |
986 | 0 | } |
987 | 0 | for (int l = 0; l < nb; l++) { |
988 | 0 | for (int sb = 0; sb < 8; sb++) { |
989 | 0 | memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12); |
990 | 0 | utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4); |
991 | 0 | const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1; |
992 | 0 | utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4); |
993 | 0 | utmp[sb * 4 + 2] = uaux_0; |
994 | 0 | utmp[sb * 4 + 0] &= kmask1; |
995 | 0 | } |
996 | 0 | for (int k = 0; k < (qk / (2 * blocklen)); k++) { |
997 | 0 | uint8_t * scales_0 = (uint8_t *) utmp + (k / 8) * 32; |
998 | 0 | uint8_t * scales_1 = (uint8_t *) utmp + (k / 8) * 32 + 16; |
999 | 0 | for (int m = 0; m < 4; m++) { |
1000 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
1001 | 0 | sumi1 = 0; |
1002 | 0 | sumi2 = 0; |
1003 | 0 | sumi = 0; |
1004 | 0 | for (int i = 0; i < blocklen; ++i) { |
1005 | 0 | const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF); |
1006 | 0 | const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4); |
1007 | 0 | sumi1 = (v0 * a_ptr[l].qs[(k / 8) * 256 + (k % 8) * 4 * blocklen + m * blocklen + i]); |
1008 | 0 | sumi2 = (v1 * a_ptr[l].qs[(k / 8) * 256 + (k % 8) * 4 * blocklen + m * blocklen + i + 128]); |
1009 | 0 | sumi1 = sumi1 * scales_0[j]; |
1010 | 0 | sumi2 = sumi2 * scales_1[j]; |
1011 | 0 | sumi += sumi1 + sumi2; |
1012 | 0 | } |
1013 | 0 | sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m]; |
1014 | 0 | } |
1015 | 0 | } |
1016 | 0 | } |
1017 | 0 | for (int sb = 0; sb < 8; sb++) { |
1018 | 0 | uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16; |
1019 | 0 | for(int m = 0; m < 4; m++) { |
1020 | 0 | const int16_t * bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6); |
1021 | 0 | for(int j = 0; j < ncols_interleaved; j++) { |
1022 | 0 | sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m]; |
1023 | 0 | } |
1024 | 0 | } |
1025 | 0 | } |
1026 | 0 | } |
1027 | 0 | for (int m = 0; m < 4; m++) { |
1028 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
1029 | 0 | s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j]; |
1030 | 0 | } |
1031 | 0 | } |
1032 | 0 | } |
1033 | 0 | } |
1034 | 0 | } |
1035 | | |
1036 | 0 | void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { |
1037 | 0 | const int qk = QK_K; |
1038 | 0 | const int nb = n / qk; |
1039 | 0 | const int ncols_interleaved = 8; |
1040 | 0 | const int blocklen = 8; |
1041 | 0 | static const uint32_t kmask1 = 0x3f3f3f3f; |
1042 | 0 | static const uint32_t kmask2 = 0x0f0f0f0f; |
1043 | 0 | static const uint32_t kmask3 = 0x03030303; |
1044 | |
|
1045 | 0 | assert (n % qk == 0); |
1046 | 0 | assert (nr % 4 == 0); |
1047 | 0 | assert (nc % ncols_interleaved == 0); |
1048 | |
|
1049 | 0 | UNUSED(s); |
1050 | 0 | UNUSED(bs); |
1051 | 0 | UNUSED(vx); |
1052 | 0 | UNUSED(vy); |
1053 | 0 | UNUSED(nr); |
1054 | 0 | UNUSED(nc); |
1055 | 0 | UNUSED(nb); |
1056 | 0 | UNUSED(ncols_interleaved); |
1057 | 0 | UNUSED(blocklen); |
1058 | |
|
1059 | 0 | float sumf[4][8]; |
1060 | 0 | float sum_minf[4][8]; |
1061 | 0 | uint32_t utmp[32]; |
1062 | 0 | int sumi1; |
1063 | 0 | int sumi2; |
1064 | 0 | int sumi; |
1065 | |
|
1066 | 0 | for (int y = 0; y < nr / 4; y++) { |
1067 | 0 | const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb); |
1068 | 0 | for (int x = 0; x < nc / ncols_interleaved; x++) { |
1069 | 0 | const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb); |
1070 | 0 | for (int m = 0; m < 4; m++) { |
1071 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
1072 | 0 | sumf[m][j] = 0.0; |
1073 | 0 | sum_minf[m][j] = 0.0; |
1074 | 0 | } |
1075 | 0 | } |
1076 | 0 | for (int l = 0; l < nb; l++) { |
1077 | 0 | for (int sb = 0; sb < 8; sb++) { |
1078 | 0 | memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12); |
1079 | 0 | utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4); |
1080 | 0 | const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1; |
1081 | 0 | utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4); |
1082 | 0 | utmp[sb * 4 + 2] = uaux_0; |
1083 | 0 | utmp[sb * 4 + 0] &= kmask1; |
1084 | 0 | } |
1085 | 0 | for (int k = 0; k < (qk / (2 * blocklen)); k++) { |
1086 | 0 | uint8_t *scales_0 = (uint8_t*) utmp + (k / 4) * 32; |
1087 | 0 | uint8_t *scales_1 = (uint8_t*) utmp + (k / 4) * 32 + 16; |
1088 | 0 | for (int m = 0; m < 4; m++) { |
1089 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
1090 | 0 | sumi1 = 0; |
1091 | 0 | sumi2 = 0; |
1092 | 0 | sumi = 0; |
1093 | 0 | for (int i = 0; i < blocklen; ++i) { |
1094 | 0 | const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF); |
1095 | 0 | const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4); |
1096 | 0 | sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 256 + (k % 4) * 4 * blocklen + m * blocklen + i]); |
1097 | 0 | sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 256 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]); |
1098 | 0 | sumi1 = sumi1 * scales_0[j]; |
1099 | 0 | sumi2 = sumi2 * scales_1[j]; |
1100 | 0 | sumi += sumi1 + sumi2; |
1101 | 0 | } |
1102 | 0 | sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m]; |
1103 | 0 | } |
1104 | 0 | } |
1105 | 0 | } |
1106 | 0 | for (int sb = 0; sb < 8; sb++) { |
1107 | 0 | uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16; |
1108 | 0 | for(int m = 0; m < 4; m++) { |
1109 | 0 | const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6); |
1110 | 0 | for(int j = 0; j < ncols_interleaved; j++) { |
1111 | 0 | sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m]; |
1112 | 0 | } |
1113 | 0 | } |
1114 | 0 | } |
1115 | 0 | } |
1116 | 0 | for (int m = 0; m < 4; m++) { |
1117 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
1118 | 0 | s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j]; |
1119 | 0 | } |
1120 | 0 | } |
1121 | 0 | } |
1122 | 0 | } |
1123 | 0 | } |
1124 | | |
1125 | 0 | void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { |
1126 | 0 | const int qk = QK_K; |
1127 | 0 | const int nb = n / qk; |
1128 | 0 | const int ncols_interleaved = 8; |
1129 | 0 | const int blocklen = 8; |
1130 | |
|
1131 | 0 | assert (n % qk == 0); |
1132 | 0 | assert (nr % 4 == 0); |
1133 | 0 | assert (nc % ncols_interleaved == 0); |
1134 | |
|
1135 | 0 | UNUSED(s); |
1136 | 0 | UNUSED(bs); |
1137 | 0 | UNUSED(vx); |
1138 | 0 | UNUSED(vy); |
1139 | 0 | UNUSED(nr); |
1140 | 0 | UNUSED(nc); |
1141 | 0 | UNUSED(nb); |
1142 | 0 | UNUSED(ncols_interleaved); |
1143 | 0 | UNUSED(blocklen); |
1144 | |
|
1145 | 0 | float sumf[4][8]; |
1146 | 0 | float sum_minf[4][8]; |
1147 | 0 | int sumi1, sumi2, sumi3, sumi4; |
1148 | 0 | int sumi; |
1149 | |
|
1150 | 0 | for (int y = 0; y < nr / 4; y++) { |
1151 | 0 | const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb); |
1152 | 0 | for (int x = 0; x < nc / ncols_interleaved; x++) { |
1153 | 0 | const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb); |
1154 | 0 | for (int m = 0; m < 4; m++) { |
1155 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
1156 | 0 | sumf[m][j] = 0.0; |
1157 | 0 | sum_minf[m][j] = 0.0; |
1158 | 0 | } |
1159 | 0 | } |
1160 | 0 | for (int l = 0; l < nb; l++) { |
1161 | 0 | for (int k = 0; k < (qk / (4 * blocklen)); k++) { |
1162 | |
|
1163 | 0 | const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64 ; |
1164 | 0 | const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16; |
1165 | 0 | const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32; |
1166 | 0 | const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48; |
1167 | 0 | for (int m = 0; m < 4; m++) { |
1168 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
1169 | 0 | sumi1 = 0; |
1170 | 0 | sumi2 = 0; |
1171 | 0 | sumi3 = 0; |
1172 | 0 | sumi4 = 0; |
1173 | 0 | sumi = 0; |
1174 | 0 | int offset = ((k / 2) % 2) + j * 2; |
1175 | 0 | for (int i = 0; i < blocklen; ++i){ |
1176 | 0 | const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 3); |
1177 | 0 | const int v1 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 2 ) & 3); |
1178 | 0 | const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4 ) & 3); |
1179 | 0 | const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6 ) & 3); |
1180 | 0 | sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i]); |
1181 | 0 | sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]); |
1182 | 0 | sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 256]); |
1183 | 0 | sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 384]); |
1184 | 0 | sumi1 = sumi1 * (scales_0[offset] & 0xF); |
1185 | 0 | sumi2 = sumi2 * (scales_1[offset] & 0xF); |
1186 | 0 | sumi3 = sumi3 * (scales_2[offset] & 0xF); |
1187 | 0 | sumi4 = sumi4 * (scales_3[offset] & 0xF); |
1188 | 0 | sumi += sumi1 + sumi2 + sumi3 + sumi4; |
1189 | 0 | } |
1190 | 0 | sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m]; |
1191 | 0 | } |
1192 | 0 | } |
1193 | 0 | } |
1194 | 0 | for(int sb = 0; sb < 8; sb++) { |
1195 | 0 | const uint8_t *mins = b_ptr[l].scales + sb * 16; |
1196 | 0 | for(int m = 0; m < 4; m++) { |
1197 | 0 | const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6); |
1198 | 0 | for(int j = 0; j < ncols_interleaved; j++) { |
1199 | 0 | int mins_prod = ((mins[j * 2] >> 4) * bsums[0] + (mins[(j * 2)+ 1] >> 4) * bsums[1]); |
1200 | 0 | sum_minf[m][j] += (mins_prod) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m]; |
1201 | 0 | } |
1202 | 0 | } |
1203 | 0 | } |
1204 | 0 | } |
1205 | |
|
1206 | 0 | for (int m = 0; m < 4; m++) { |
1207 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
1208 | 0 | s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j]; |
1209 | 0 | } |
1210 | 0 | } |
1211 | 0 | } |
1212 | 0 | } |
1213 | 0 | } |
1214 | | |
1215 | | |
1216 | 0 | void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { |
1217 | 0 | const int qk = QK8_0; |
1218 | 0 | const int nb = n / qk; |
1219 | 0 | const int ncols_interleaved = 4; |
1220 | 0 | const int blocklen = 4; |
1221 | |
|
1222 | 0 | assert (n % qk == 0); |
1223 | 0 | assert (nr % 4 == 0); |
1224 | 0 | assert (nc % ncols_interleaved == 0); |
1225 | |
|
1226 | 0 | UNUSED(s); |
1227 | 0 | UNUSED(bs); |
1228 | 0 | UNUSED(vx); |
1229 | 0 | UNUSED(vy); |
1230 | 0 | UNUSED(nr); |
1231 | 0 | UNUSED(nc); |
1232 | 0 | UNUSED(nb); |
1233 | 0 | UNUSED(ncols_interleaved); |
1234 | 0 | UNUSED(blocklen); |
1235 | |
|
1236 | 0 | { |
1237 | 0 | float sumf[4][4]; |
1238 | 0 | int sumi; |
1239 | |
|
1240 | 0 | for (int y = 0; y < nr / 4; y++) { |
1241 | 0 | const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); |
1242 | 0 | for (int x = 0; x < nc / ncols_interleaved; x++) { |
1243 | 0 | const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb); |
1244 | 0 | for (int m = 0; m < 4; m++) { |
1245 | 0 | for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0; |
1246 | 0 | } |
1247 | 0 | for (int l = 0; l < nb; l++) { |
1248 | 0 | for (int k = 0; k < (qk / (2 * blocklen)); k++) { |
1249 | 0 | for (int m = 0; m < 4; m++) { |
1250 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
1251 | 0 | sumi = 0; |
1252 | 0 | for (int i = 0; i < blocklen; ++i) { |
1253 | 0 | const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F]; |
1254 | 0 | const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4]; |
1255 | 0 | sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + |
1256 | 0 | (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])); |
1257 | 0 | } |
1258 | 0 | sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); |
1259 | 0 | } |
1260 | 0 | } |
1261 | 0 | } |
1262 | 0 | } |
1263 | 0 | for (int m = 0; m < 4; m++) { |
1264 | 0 | for (int j = 0; j < ncols_interleaved; j++) |
1265 | 0 | s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]; |
1266 | 0 | } |
1267 | 0 | } |
1268 | 0 | } |
1269 | 0 | } |
1270 | 0 | } |
1271 | | |
1272 | 0 | void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { |
1273 | 0 | const int qk = QK8_0; |
1274 | 0 | const int nb = n / qk; |
1275 | 0 | const int ncols_interleaved = 8; |
1276 | 0 | const int blocklen = 8; |
1277 | |
|
1278 | 0 | assert(n % qk == 0); |
1279 | 0 | assert(nr % 4 == 0); |
1280 | 0 | assert(nc % ncols_interleaved == 0); |
1281 | |
|
1282 | 0 | float sumf[4][8]; |
1283 | 0 | int sumi; |
1284 | |
|
1285 | 0 | for (int y = 0; y < nr / 4; y++) { |
1286 | 0 | const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); |
1287 | 0 | for (int x = 0; x < nc / ncols_interleaved; x++) { |
1288 | 0 | const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb); |
1289 | 0 | for (int m = 0; m < 4; m++) { |
1290 | 0 | for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0; |
1291 | 0 | } |
1292 | 0 | for (int l = 0; l < nb; l++) { |
1293 | 0 | for (int k = 0; k < (qk / (2 * blocklen)); k++) { |
1294 | 0 | for (int m = 0; m < 4; m++) { |
1295 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
1296 | 0 | sumi = 0; |
1297 | 0 | for (int i = 0; i < blocklen; ++i) { |
1298 | 0 | const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F]; |
1299 | 0 | const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4]; |
1300 | 0 | sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + |
1301 | 0 | (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])); |
1302 | 0 | } |
1303 | 0 | sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); |
1304 | 0 | } |
1305 | 0 | } |
1306 | 0 | } |
1307 | 0 | } |
1308 | 0 | for (int m = 0; m < 4; m++) { |
1309 | 0 | for (int j = 0; j < ncols_interleaved; j++) |
1310 | 0 | s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]; |
1311 | 0 | } |
1312 | 0 | } |
1313 | 0 | } |
1314 | 0 | } |
1315 | | |
1316 | | void ggml_gemm_q8_0_4x4_q8_0_generic(int n, |
1317 | | float * GGML_RESTRICT s, |
1318 | | size_t bs, |
1319 | | const void * GGML_RESTRICT vx, |
1320 | | const void * GGML_RESTRICT vy, |
1321 | | int nr, |
1322 | 0 | int nc) { |
1323 | 0 | const int qk = QK8_0; |
1324 | 0 | const int nb = n / qk; |
1325 | 0 | const int ncols_interleaved = 4; |
1326 | 0 | const int blocklen = 4; |
1327 | |
|
1328 | 0 | assert(n % qk == 0); |
1329 | 0 | assert(nr % 4 == 0); |
1330 | 0 | assert(nc % ncols_interleaved == 0); |
1331 | |
|
1332 | 0 | float sumf[4][4]; |
1333 | 0 | int sumi; |
1334 | |
|
1335 | 0 | for (int y = 0; y < nr / 4; y++) { |
1336 | 0 | const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); |
1337 | 0 | for (int x = 0; x < nc / ncols_interleaved; x++) { |
1338 | 0 | const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb); |
1339 | 0 | for (int m = 0; m < 4; m++) { |
1340 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
1341 | 0 | sumf[m][j] = 0.0; |
1342 | 0 | } |
1343 | 0 | } |
1344 | 0 | for (int l = 0; l < nb; l++) { |
1345 | 0 | for (int k = 0; k < (qk / blocklen); k++) { |
1346 | 0 | for (int m = 0; m < 4; m++) { |
1347 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
1348 | 0 | sumi = 0; |
1349 | 0 | for (int i = 0; i < blocklen; ++i) { |
1350 | 0 | const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i]; |
1351 | 0 | sumi += v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]; |
1352 | 0 | } |
1353 | 0 | sumf[m][j] += |
1354 | 0 | sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); |
1355 | 0 | } |
1356 | 0 | } |
1357 | 0 | } |
1358 | 0 | } |
1359 | 0 | for (int m = 0; m < 4; m++) { |
1360 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
1361 | 0 | s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]; |
1362 | 0 | } |
1363 | 0 | } |
1364 | 0 | } |
1365 | 0 | } |
1366 | 0 | } |
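// --- Illustrative sketch (not part of the source above): the per-block scale combination
// the kernels above rely on. An integer dot product of quantized values is accumulated
// once per block and rescaled by the product of the two block deltas, which is exactly
// the dot product of the dequantized values. Plain floats stand in for the FP16 deltas.
#include <cstdio>

int main() {
    const float d_a = 0.05f, d_b = 0.02f;       // hypothetical block deltas
    const int qa[4] = { 10, -3, 7, 1 };         // quantized activation block
    const int qb[4] = { -2, 5, 4, 9 };          // quantized weight block

    int   sumi = 0;
    float ref  = 0.0f;
    for (int i = 0; i < 4; i++) {
        sumi += qa[i] * qb[i];                  // integer accumulation
        ref  += (d_a * qa[i]) * (d_b * qb[i]);  // dequantize-then-multiply reference
    }
    const float sumf = sumi * d_a * d_b;        // one rescale per block
    printf("sumf=%f ref=%f\n", sumf, ref);      // identical up to rounding
    return 0;
}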
1367 | | |
1368 | | void ggml_gemm_q8_0_4x8_q8_0_generic(int n, |
1369 | | float * GGML_RESTRICT s, |
1370 | | size_t bs, |
1371 | | const void * GGML_RESTRICT vx, |
1372 | | const void * GGML_RESTRICT vy, |
1373 | | int nr, |
1374 | 0 | int nc) { |
1375 | 0 | const int qk = QK8_0; |
1376 | 0 | const int nb = n / qk; |
1377 | 0 | const int ncols_interleaved = 4; |
1378 | 0 | const int blocklen = 8; |
1379 | |
|
1380 | 0 | assert(n % qk == 0); |
1381 | 0 | assert(nr % 4 == 0); |
1382 | 0 | assert(nc % ncols_interleaved == 0); |
1383 | |
|
1384 | 0 | float sumf[4][4]; |
1385 | 0 | int sumi; |
1386 | |
|
1387 | 0 | for (int y = 0; y < nr / 4; y++) { |
1388 | 0 | const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); |
1389 | 0 | for (int x = 0; x < nc / ncols_interleaved; x++) { |
1390 | 0 | const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb); |
1391 | 0 | for (int m = 0; m < 4; m++) { |
1392 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
1393 | 0 | sumf[m][j] = 0.0; |
1394 | 0 | } |
1395 | 0 | } |
1396 | 0 | for (int l = 0; l < nb; l++) { |
1397 | 0 | for (int k = 0; k < (qk / blocklen); k++) { |
1398 | 0 | for (int m = 0; m < 4; m++) { |
1399 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
1400 | 0 | sumi = 0; |
1401 | 0 | for (int i = 0; i < blocklen; ++i) { |
1402 | 0 | const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i]; |
1403 | 0 | sumi += v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]; |
1404 | 0 | } |
1405 | 0 | sumf[m][j] += |
1406 | 0 | sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); |
1407 | 0 | } |
1408 | 0 | } |
1409 | 0 | } |
1410 | 0 | } |
1411 | 0 | for (int m = 0; m < 4; m++) { |
1412 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
1413 | 0 | s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]; |
1414 | 0 | } |
1415 | 0 | } |
1416 | 0 | } |
1417 | 0 | } |
1418 | 0 | } |
1419 | | |
1420 | | } // extern "C" |
1421 | | |
1422 | 0 | static block_q8_0x4 make_block_q8_0x4(block_q8_0 * in, unsigned int blck_size_interleave) { |
1423 | 0 | block_q8_0x4 out; |
1424 | |
|
1425 | 0 | for (int i = 0; i < 4; i++) { |
1426 | 0 | out.d[i] = in[i].d; |
1427 | 0 | } |
1428 | |
|
1429 | 0 | const int end = QK8_0 * 4 / blck_size_interleave; |
1430 | 0 | for (int i = 0; i < end; ++i) { |
1431 | 0 | int src_id = i % 4; |
1432 | 0 | int src_offset = (i / 4) * blck_size_interleave; |
1433 | 0 | int dst_offset = i * blck_size_interleave; |
1434 | 0 | memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], blck_size_interleave); |
1435 | 0 | } |
1436 | 0 | return out; |
1437 | 0 | } |
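// --- Illustrative sketch (not part of the source above): a round-trip check of the
// 4-row interleave built by make_block_q8_0x4 above (blocklen = 4), showing that the
// offset k*ncols*blocklen + m*blocklen + i used by the generic GEMM kernels reads back
// row m, element k*blocklen + i. Sizes and the data pattern are arbitrary.
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
    const int QK = 32, blocklen = 4, nrows = 4;
    int8_t rows[4][32];
    for (int m = 0; m < nrows; m++)
        for (int j = 0; j < QK; j++)
            rows[m][j] = (int8_t)(m * 7 + j);   // recognisable per-row pattern

    // Interleave in groups of `blocklen` bytes, cycling through the 4 rows
    // (the same index mapping as make_block_q8_0x4 above).
    int8_t inter[4 * 32];
    for (int g = 0; g < QK * nrows / blocklen; g++) {
        const int src_id     = g % nrows;
        const int src_offset = (g / nrows) * blocklen;
        memcpy(&inter[g * blocklen], &rows[src_id][src_offset], blocklen);
    }

    // A GEMM kernel reads row m, element k*blocklen + i at this offset:
    for (int k = 0; k < QK / blocklen; k++)
        for (int m = 0; m < nrows; m++)
            for (int i = 0; i < blocklen; i++)
                assert(inter[k * nrows * blocklen + m * blocklen + i] == rows[m][k * blocklen + i]);
    return 0;
}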
1438 | | |
1439 | 0 | static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) { |
1440 | 0 | block_q4_0x4 out; |
1441 | |
|
1442 | 0 | for (int i = 0; i < 4; i++) { |
1443 | 0 | out.d[i] = in[i].d; |
1444 | 0 | } |
1445 | |
|
1446 | 0 | const int end = QK4_0 * 2 / blck_size_interleave; |
1447 | |
|
1448 | 0 | if (blck_size_interleave == 8) { |
1449 | 0 | const uint64_t xor_mask = 0x8888888888888888ULL; |
1450 | 0 | for (int i = 0; i < end; ++i) { |
1451 | 0 | int src_id = i % 4; |
1452 | 0 | int src_offset = (i / 4) * blck_size_interleave; |
1453 | 0 | int dst_offset = i * blck_size_interleave; |
1454 | |
|
1455 | 0 | uint64_t elems; |
1456 | | // Using memcpy to avoid unaligned memory accesses |
1457 | 0 | memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t)); |
1458 | 0 | elems ^= xor_mask; |
1459 | 0 | memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t)); |
1460 | 0 | } |
1461 | 0 | } else if (blck_size_interleave == 4) { |
1462 | 0 | const uint32_t xor_mask = 0x88888888; |
1463 | 0 | for (int i = 0; i < end; ++i) { |
1464 | 0 | int src_id = i % 4; |
1465 | 0 | int src_offset = (i / 4) * blck_size_interleave; |
1466 | 0 | int dst_offset = i * blck_size_interleave; |
1467 | |
|
1468 | 0 | uint32_t elems; |
1469 | 0 | memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint32_t)); |
1470 | 0 | elems ^= xor_mask; |
1471 | 0 | memcpy(&out.qs[dst_offset], &elems, sizeof(uint32_t)); |
1472 | 0 | } |
1473 | 0 | } else { |
1474 | 0 | GGML_ASSERT(false); |
1475 | 0 | } |
1476 | |
|
1477 | 0 | return out; |
1478 | 0 | } |
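// --- Illustrative sketch (not part of the source above): what the 0x88... XOR mask in
// make_block_q4_0x4 does per nibble. A Q4_0 nibble v encodes q = v - 8 (bias-offset
// form); v ^ 0x8 is the same q re-expressed as a signed 4-bit value, so unpacking can
// sign-extend instead of subtracting.
#include <cassert>
#include <cstdint>

int main() {
    for (int v = 0; v < 16; v++) {
        const int q_biased = v - 8;                              // subtract-at-unpack form
        const uint8_t x    = (uint8_t)(v ^ 0x8);                 // what the repack stores
        const int q_signed = (x & 0x8) ? (int)x - 16 : (int)x;   // 4-bit sign extension
        assert(q_biased == q_signed);
    }
    return 0;
}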
1479 | | |
1480 | | // interleave 8 block_q4_0s in blocks of blck_size_interleave |
1481 | | // returns an interleaved block_q4_0x8 |
1482 | | // in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks |
1483 | | // first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave |
1484 | 0 | static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave) { |
1485 | 0 | block_q4_0x8 out; |
1486 | |
|
1487 | 0 | for (int i = 0; i < 8; i++) { |
1488 | 0 | out.d[i] = in[i].d; |
1489 | 0 | } |
1490 | |
|
1491 | 0 | const int end = QK4_0 * 4 / blck_size_interleave; |
1492 | 0 | const uint64_t xor_mask = 0x8888888888888888ULL; |
1493 | |
|
1494 | 0 | for (int i = 0; i < end; ++i) { |
1495 | 0 | int src_id = i % 8; |
1496 | 0 | int src_offset = (i / 8) * blck_size_interleave; |
1497 | 0 | int dst_offset = i * blck_size_interleave; |
1498 | |
|
1499 | 0 | uint64_t elems; |
1500 | 0 | memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t)); |
1501 | 0 | elems ^= xor_mask; |
1502 | 0 | memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t)); |
1503 | 0 | } |
1504 | |
|
1505 | 0 | return out; |
1506 | 0 | } |
1507 | | |
1508 | 0 | static block_q4_Kx8 make_block_q4_Kx8(block_q4_K * in, unsigned int blck_size_interleave) { |
1509 | 0 | block_q4_Kx8 out; |
1510 | | // Delta (scale) and dmin values of the eight Q4_K structures are copied onto the output interleaved structure
1511 | 0 | for (int i = 0; i < 8; i++) { |
1512 | 0 | out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d; |
1513 | 0 | } |
1514 | |
|
1515 | 0 | for (int i = 0; i < 8; i++) { |
1516 | 0 | out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin; |
1517 | 0 | } |
1518 | |
|
1519 | 0 | const int end = QK_K * 4 / blck_size_interleave; |
1520 | | |
1521 | | // Interleave Q4_K quants by taking 8 bytes at a time |
1522 | 0 | for (int i = 0; i < end; ++i) { |
1523 | 0 | int src_id = i % 8; |
1524 | 0 | int src_offset = (i / 8) * blck_size_interleave; |
1525 | 0 | int dst_offset = i * blck_size_interleave; |
1526 | |
|
1527 | 0 | uint64_t elems; |
1528 | 0 | memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t)); |
1529 | 0 | memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t)); |
1530 | 0 | } |
1531 | | |
1532 | | // The logic below unpacks and rearranges the scale and min values of Q4_K
1533 | | // Currently the Q4_K structure has 8 scales and 8 mins packed in 12 bytes (6 bits for each value)
1534 | | // The output Q4_Kx8 structure has 96 bytes for storing scales and mins
1535 | | // Every 12-byte group is packed such that it contains the scales and mins of the corresponding sub blocks from the Q4_K structures
1536 | | // For example - the first 12 bytes contain 8 scales and 8 mins - one for the first sub block of each of the eight Q4_K structures
1537 | 0 | uint8_t s[8], m[8]; |
1538 | |
|
1539 | 0 | for (int i = 0; i < 4; i++) { |
1540 | 0 | for (int j = 0; j < 8; j++) { |
1541 | 0 | s[j] = in[j].scales[i] & 63; |
1542 | 0 | m[j] = in[j].scales[i + 4] & 63; |
1543 | 0 | } |
1544 | |
|
1545 | 0 | out.scales[i * 12] = (s[0] & 63) + ((s[4] & 48) << 2); |
1546 | 0 | out.scales[i * 12 + 1] = (s[1] & 63) + ((s[5] & 48) << 2); |
1547 | 0 | out.scales[i * 12 + 2] = (s[2] & 63) + ((s[6] & 48) << 2); |
1548 | 0 | out.scales[i * 12 + 3] = (s[3] & 63) + ((s[7] & 48) << 2); |
1549 | 0 | out.scales[i * 12 + 4] = (m[0] & 63) + ((m[4] & 48) << 2); |
1550 | 0 | out.scales[i * 12 + 5] = (m[1] & 63) + ((m[5] & 48) << 2); |
1551 | 0 | out.scales[i * 12 + 6] = (m[2] & 63) + ((m[6] & 48) << 2); |
1552 | 0 | out.scales[i * 12 + 7] = (m[3] & 63) + ((m[7] & 48) << 2); |
1553 | 0 | out.scales[i * 12 + 8] = (s[4] & 15) + ((m[4] & 15) << 4); |
1554 | 0 | out.scales[i * 12 + 9] = (s[5] & 15) + ((m[5] & 15) << 4); |
1555 | 0 | out.scales[i * 12 + 10] = (s[6] & 15) + ((m[6] & 15) << 4); |
1556 | 0 | out.scales[i * 12 + 11] = (s[7] & 15) + ((m[7] & 15) << 4); |
1557 | |
|
1558 | 0 | } |
1559 | |
|
1560 | 0 | for (int i = 0; i < 4; i++) { |
1561 | 0 | for (int j = 0; j < 8; j++) { |
1562 | 0 | s[j] = ((in[j].scales[i] & 192) >> 2) | (in[j].scales[i+8] & 15); |
1563 | 0 | m[j] = ((in[j].scales[i + 4] & 192) >> 2) | ((in[j].scales[i+8] & 240) >> 4); |
1564 | 0 | } |
1565 | |
|
1566 | 0 | out.scales[i * 12 + 48] = (s[0] & 63) + ((s[4] & 48) << 2); |
1567 | 0 | out.scales[i * 12 + 49] = (s[1] & 63) + ((s[5] & 48) << 2); |
1568 | 0 | out.scales[i * 12 + 50] = (s[2] & 63) + ((s[6] & 48) << 2); |
1569 | 0 | out.scales[i * 12 + 51] = (s[3] & 63) + ((s[7] & 48) << 2); |
1570 | 0 | out.scales[i * 12 + 52] = (m[0] & 63) + ((m[4] & 48) << 2); |
1571 | 0 | out.scales[i * 12 + 53] = (m[1] & 63) + ((m[5] & 48) << 2); |
1572 | 0 | out.scales[i * 12 + 54] = (m[2] & 63) + ((m[6] & 48) << 2); |
1573 | 0 | out.scales[i * 12 + 55] = (m[3] & 63) + ((m[7] & 48) << 2); |
1574 | 0 | out.scales[i * 12 + 56] = (s[4] & 15) + ((m[4] & 15) << 4); |
1575 | 0 | out.scales[i * 12 + 57] = (s[5] & 15) + ((m[5] & 15) << 4); |
1576 | 0 | out.scales[i * 12 + 58] = (s[6] & 15) + ((m[6] & 15) << 4); |
1577 | 0 | out.scales[i * 12 + 59] = (s[7] & 15) + ((m[7] & 15) << 4); |
1578 | |
|
1579 | 0 | } |
1580 | |
|
1581 | 0 | return out; |
1582 | 0 | } |
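// --- Illustrative sketch (not part of the source above): the 6-bit scale/min packing
// that make_block_q4_Kx8 has to preserve. A Q4_K block stores 8 scales and 8 mins in
// 12 bytes: sub-blocks 0..3 keep their low 6 bits in bytes 0..7, while sub-blocks 4..7
// are split between the top two bits of bytes 0..7 and the nibbles of bytes 8..11.
// The decoder below is written against that layout as described in the comments above;
// it is illustrative, not a verified copy of the ggml helpers.
#include <cstdint>
#include <cstdio>

static void decode_scale_min_demo(const uint8_t sc[12], int j, uint8_t * d, uint8_t * m) {
    if (j < 4) {
        *d = sc[j]     & 63;
        *m = sc[j + 4] & 63;
    } else {
        *d = (sc[j + 4] & 0x0F) | ((sc[j - 4] >> 6) << 4);
        *m = (sc[j + 4] >>  4)  | ((sc[j]     >> 6) << 4);
    }
}

int main() {
    uint8_t sc[12] = { 0 };
    sc[0] = 63; sc[4] = 17; sc[8] = 0x2B;   // arbitrary packed example values
    for (int j = 0; j < 8; j++) {
        uint8_t d, m;
        decode_scale_min_demo(sc, j, &d, &m);
        printf("sub-block %d: scale=%d min=%d\n", j, (int) d, (int) m);
    }
    return 0;
}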
1583 | | |
1584 | 0 | static block_q2_Kx8 make_block_q2_Kx8(block_q2_K * in, unsigned int blck_size_interleave) { |
1585 | 0 | block_q2_Kx8 out; |
1586 | | |
1587 | | // Delta(scale) and dmin values of the eight Q2_K structures are copied onto the output interleaved structure |
1588 | 0 | for (int i = 0; i < 8; i++) { |
1589 | 0 | out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d; |
1590 | 0 | } |
1591 | |
|
1592 | 0 | for (int i = 0; i < 8; i++) { |
1593 | 0 | out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin; |
1594 | 0 | } |
1595 | |
|
1596 | 0 | const int end = QK_K * 2 / blck_size_interleave; |
1597 | | |
1598 | | // Interleave Q2_K quants by taking 8 bytes at a time |
1599 | 0 | for (int i = 0; i < end; ++i) { |
1600 | 0 | int src_id = i % 8; |
1601 | 0 | int src_offset = (i / 8) * blck_size_interleave; |
1602 | 0 | int dst_offset = i * blck_size_interleave; |
1603 | |
|
1604 | 0 | uint64_t elems; |
1605 | 0 | memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t)); |
1606 | 0 | memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t)); |
1607 | 0 | } |
1608 | | |
1609 | | // The logic below unpacks and rearranges the scale and min values of Q2_K
1610 | | // Currently the Q2_K structure has 16 scales and 16 mins packed in 16 bytes (4 bits for each value)
1611 | | // The output Q2_Kx8 structure has 128 bytes for storing scales and mins
1612 | | // Every 16-byte group is packed such that it contains the scales and mins of the corresponding sub blocks from the Q2_K structures
1613 | | // For example - the first 16 bytes contain 16 scales and 16 mins - those of the first and second sub blocks of each of the eight Q2_K structures
1614 | |
|
1615 | 0 | for(int i = 0; i < 128; i++){ |
1616 | | |
1617 | | // Index for selecting which q2k super block |
1618 | 0 | int src1 = (i % 16) / 2; |
1619 | | // Index for selecting scale |
1620 | 0 | int src2 = ((i / 16) * 2) + (i % 2); |
1621 | |
|
1622 | 0 | out.scales[i] = in[src1].scales[src2]; |
1623 | 0 | } |
1624 | 0 | return out; |
1625 | |
|
1626 | 0 | } |
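// --- Illustrative sketch (not part of the source above): a check of the Q2_K scale
// interleave used in make_block_q2_Kx8. Output byte i takes scale (i/16)*2 + i%2 from
// source block (i%16)/2, which is equivalent to saying that scale s of source block b
// lands at (s/2)*16 + 2*b + (s%2). Plain byte arrays stand in for the block structs.
#include <cassert>

int main() {
    unsigned char in[8][16], out[128];
    for (int b = 0; b < 8; b++)
        for (int s = 0; s < 16; s++)
            in[b][s] = (unsigned char)(b * 16 + s);

    for (int i = 0; i < 128; i++) {
        const int src1 = (i % 16) / 2;            // which source block
        const int src2 = (i / 16) * 2 + (i % 2);  // which scale within it
        out[i] = in[src1][src2];
    }

    for (int b = 0; b < 8; b++)
        for (int s = 0; s < 16; s++)
            assert(out[(s / 2) * 16 + 2 * b + (s % 2)] == in[b][s]);
    return 0;
}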
1627 | | |
1628 | 0 | static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) { |
1629 | 0 | GGML_ASSERT(t->type == GGML_TYPE_Q4_0); |
1630 | 0 | GGML_ASSERT(interleave_block == 4 || interleave_block == 8); |
1631 | 0 | constexpr int nrows_interleaved = 4; |
1632 | |
|
1633 | 0 | block_q4_0x4 * dst = (block_q4_0x4 *)t->data; |
1634 | 0 | const block_q4_0 * src = (const block_q4_0 *)data; |
1635 | 0 | block_q4_0 dst_tmp[4]; |
1636 | 0 | int nrow = ggml_nrows(t); |
1637 | 0 | int nblocks = t->ne[0] / QK4_0; |
1638 | |
|
1639 | 0 | GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0)); |
1640 | |
|
1641 | 0 | if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) { |
1642 | 0 | return -1; |
1643 | 0 | } |
1644 | | |
1645 | 0 | for (int b = 0; b < nrow; b += nrows_interleaved) { |
1646 | 0 | for (int64_t x = 0; x < nblocks; x++) { |
1647 | 0 | for (int i = 0; i < nrows_interleaved; i++) { |
1648 | 0 | dst_tmp[i] = src[x + i * nblocks]; |
1649 | 0 | } |
1650 | 0 | *dst++ = make_block_q4_0x4(dst_tmp, interleave_block); |
1651 | 0 | } |
1652 | 0 | src += nrows_interleaved * nblocks; |
1653 | 0 | } |
1654 | 0 | return 0; |
1655 | | |
1656 | 0 | GGML_UNUSED(data_size); |
1657 | 0 | } |
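// --- Illustrative sketch (not part of the source above): the row-gather pattern shared
// by the repack_*_bl drivers above. For every group of `nrows_interleaved` rows, the x-th
// block of each row in the group is collected and emitted as one interleaved block.
// Block contents are reduced to a single int here; all names and sizes are illustrative.
#include <cstdio>
#include <vector>

int main() {
    const int nrow = 8, nblocks = 3, nrows_interleaved = 4;
    std::vector<int> src(nrow * nblocks);
    for (int r = 0; r < nrow; r++)
        for (int x = 0; x < nblocks; x++)
            src[r * nblocks + x] = r * 100 + x;   // "block x of row r"

    const int * s = src.data();
    for (int b = 0; b < nrow; b += nrows_interleaved) {
        for (int x = 0; x < nblocks; x++) {
            printf("dst block (%d,%d) packs:", b / nrows_interleaved, x);
            for (int i = 0; i < nrows_interleaved; i++)
                printf(" %d", s[x + i * nblocks]);   // block x of rows b..b+3
            printf("\n");
        }
        s += nrows_interleaved * nblocks;            // advance to the next row group
    }
    return 0;
}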
1658 | | |
1659 | 0 | static int repack_q4_K_to_q4_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) { |
1660 | 0 | GGML_ASSERT(t->type == GGML_TYPE_Q4_K); |
1661 | 0 | GGML_ASSERT(interleave_block == 8 || interleave_block == 4); |
1662 | 0 | constexpr int nrows_interleaved = 8; |
1663 | |
|
1664 | 0 | block_q4_Kx8 * dst = (block_q4_Kx8*)t->data; |
1665 | 0 | const block_q4_K * src = (const block_q4_K*) data; |
1666 | 0 | block_q4_K dst_tmp[8]; |
1667 | 0 | int nrow = ggml_nrows(t); |
1668 | 0 | int nblocks = t->ne[0] / QK_K; |
1669 | |
|
1670 | 0 | GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_K)); |
1671 | |
|
1672 | 0 | if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) { |
1673 | 0 | return -1; |
1674 | 0 | } |
1675 | | |
1676 | 0 | for (int b = 0; b < nrow; b += nrows_interleaved) { |
1677 | 0 | for (int64_t x = 0; x < nblocks; x++) { |
1678 | 0 | for (int i = 0; i < nrows_interleaved; i++ ) { |
1679 | 0 | dst_tmp[i] = src[x + i * nblocks]; |
1680 | 0 | } |
1681 | 0 | *dst++ = make_block_q4_Kx8(dst_tmp, interleave_block); |
1682 | 0 | } |
1683 | 0 | src += nrows_interleaved * nblocks; |
1684 | 0 | } |
1685 | 0 | return 0; |
1686 | | |
1687 | 0 | GGML_UNUSED(data_size); |
1688 | 0 | } |
1689 | | |
1690 | 0 | static int repack_q2_K_to_q2_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) { |
1691 | 0 | GGML_ASSERT(t->type == GGML_TYPE_Q2_K); |
1692 | 0 | GGML_ASSERT(interleave_block == 8); |
1693 | 0 | constexpr int nrows_interleaved = 8; |
1694 | |
|
1695 | 0 | block_q2_Kx8 * dst = (block_q2_Kx8*)t->data; |
1696 | 0 | const block_q2_K * src = (const block_q2_K*) data; |
1697 | 0 | block_q2_K dst_tmp[8]; |
1698 | 0 | int nrow = ggml_nrows(t); |
1699 | 0 | int nblocks = t->ne[0] / QK_K; |
1700 | |
|
1701 | 0 | GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q2_K)); |
1702 | |
|
1703 | 0 | if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) { |
1704 | 0 | return -1; |
1705 | 0 | } |
1706 | | |
1707 | 0 | for (int b = 0; b < nrow; b += nrows_interleaved) { |
1708 | 0 | for (int64_t x = 0; x < nblocks; x++) { |
1709 | 0 | for (int i = 0; i < nrows_interleaved; i++ ) { |
1710 | 0 | dst_tmp[i] = src[x + i * nblocks]; |
1711 | 0 | } |
1712 | 0 | *dst++ = make_block_q2_Kx8(dst_tmp, interleave_block); |
1713 | 0 | } |
1714 | 0 | src += nrows_interleaved * nblocks; |
1715 | 0 | } |
1716 | 0 | return 0; |
1717 | | |
1718 | 0 | GGML_UNUSED(data_size); |
1719 | 0 | } |
1720 | | |
1721 | 0 | static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) { |
1722 | 0 | GGML_ASSERT(t->type == GGML_TYPE_Q4_0); |
1723 | 0 | GGML_ASSERT(interleave_block == 8); |
1724 | 0 | constexpr int nrows_interleaved = 8; |
1725 | |
|
1726 | 0 | block_q4_0x8 * dst = (block_q4_0x8*)t->data; |
1727 | 0 | const block_q4_0 * src = (const block_q4_0*) data; |
1728 | 0 | block_q4_0 dst_tmp[8]; |
1729 | 0 | int nrow = ggml_nrows(t); |
1730 | 0 | int nblocks = t->ne[0] / QK4_0; |
1731 | |
|
1732 | 0 | GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0)); |
1733 | |
|
1734 | 0 | if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) { |
1735 | 0 | return -1; |
1736 | 0 | } |
1737 | | |
1738 | 0 | for (int b = 0; b < nrow; b += nrows_interleaved) { |
1739 | 0 | for (int64_t x = 0; x < nblocks; x++) { |
1740 | 0 | for (int i = 0; i < nrows_interleaved; i++ ) { |
1741 | 0 | dst_tmp[i] = src[x + i * nblocks]; |
1742 | 0 | } |
1743 | 0 | *dst++ = make_block_q4_0x8(dst_tmp, interleave_block); |
1744 | 0 | } |
1745 | 0 | src += nrows_interleaved * nblocks; |
1746 | 0 | } |
1747 | 0 | return 0; |
1748 | | |
1749 | 0 | GGML_UNUSED(data_size); |
1750 | 0 | } |
1751 | | |
1752 | | static int repack_q8_0_to_q8_0_4_bl(struct ggml_tensor * t, |
1753 | | int interleave_block, |
1754 | | const void * GGML_RESTRICT data, |
1755 | 0 | size_t data_size) { |
1756 | 0 | GGML_ASSERT(t->type == GGML_TYPE_Q8_0); |
1757 | 0 | GGML_ASSERT(interleave_block == 4 || interleave_block == 8); |
1758 | 0 | constexpr int nrows_interleaved = 4; |
1759 | |
|
1760 | 0 | block_q8_0x4 * dst = (block_q8_0x4 *) t->data; |
1761 | 0 | const block_q8_0 * src = (const block_q8_0 *) data; |
1762 | 0 | block_q8_0 dst_tmp[4]; |
1763 | 0 | int nrow = ggml_nrows(t); |
1764 | 0 | int nblocks = t->ne[0] / QK8_0; |
1765 | |
|
1766 | 0 | GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q8_0)); |
1767 | |
|
1768 | 0 | if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) { |
1769 | 0 | return -1; |
1770 | 0 | } |
1771 | | |
1772 | 0 | for (int b = 0; b < nrow; b += nrows_interleaved) { |
1773 | 0 | for (int64_t x = 0; x < nblocks; x++) { |
1774 | 0 | for (int i = 0; i < nrows_interleaved; i++) { |
1775 | 0 | dst_tmp[i] = src[x + i * nblocks]; |
1776 | 0 | } |
1777 | 0 | *dst++ = make_block_q8_0x4(dst_tmp, interleave_block); |
1778 | 0 | } |
1779 | 0 | src += nrows_interleaved * nblocks; |
1780 | 0 | } |
1781 | 0 | return 0; |
1782 | 0 | } |
1783 | | |
1784 | 0 | static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_size_interleave) { |
1785 | 0 | block_iq4_nlx4 out; |
1786 | |
|
1787 | 0 | for (int i = 0; i < 4; i++) { |
1788 | 0 | out.d[i] = in[i].d; |
1789 | 0 | } |
1790 | |
|
1791 | 0 | const int end = QK4_NL * 2 / blck_size_interleave; |
1792 | | |
1793 | | // TODO: this branch seems wrong |
1794 | | //if (blck_size_interleave == 8) { |
1795 | | // for (int i = 0; i < end; ++i) { |
1796 | | // int src_id = i % 4; |
1797 | | // int src_offset = (i / 4) * blck_size_interleave; |
1798 | | // int dst_offset = i * blck_size_interleave; |
1799 | | |
1800 | | // // Using memcpy to avoid unaligned memory accesses |
1801 | | // memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t)); |
1802 | | // } |
1803 | | //} else |
1804 | 0 | if (blck_size_interleave == 4) { |
1805 | 0 | for (int i = 0; i < end; ++i) { |
1806 | 0 | int src_id = i % 4; |
1807 | 0 | int src_offset = (i / 4) * blck_size_interleave; |
1808 | 0 | int dst_offset = i * blck_size_interleave; |
1809 | |
|
1810 | 0 | memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint32_t)); |
1811 | 0 | } |
1812 | 0 | } else { |
1813 | 0 | GGML_ASSERT(false); |
1814 | 0 | } |
1815 | |
|
1816 | 0 | return out; |
1817 | 0 | } |
1818 | | |
1819 | 0 | static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) { |
1820 | 0 | GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL); |
1821 | 0 | GGML_ASSERT(interleave_block == 4); |
1822 | |
|
1823 | 0 | const block_iq4_nl * src = (const block_iq4_nl *)data; |
1824 | 0 | block_iq4_nlx4 * dst = ( block_iq4_nlx4 *)t->data; |
1825 | |
|
1826 | 0 | block_iq4_nl dst_tmp[4]; |
1827 | |
|
1828 | 0 | int nrow = ggml_nrows(t); |
1829 | 0 | int nrows_interleaved = 4; |
1830 | 0 | int nblocks = t->ne[0] / QK4_NL; |
1831 | |
|
1832 | 0 | GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl)); |
1833 | |
|
1834 | 0 | if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) { |
1835 | 0 | return -1; |
1836 | 0 | } |
1837 | | |
1838 | 0 | for (int b = 0; b < nrow; b += nrows_interleaved) { |
1839 | 0 | for (int64_t x = 0; x < nblocks; x++) { |
1840 | 0 | for (int i = 0; i < nrows_interleaved; i++) { |
1841 | 0 | dst_tmp[i] = src[x + i * nblocks]; |
1842 | 0 | } |
1843 | 0 | *dst++ = make_block_iq4_nlx4(dst_tmp, interleave_block); |
1844 | 0 | } |
1845 | 0 | src += nrows_interleaved * nblocks; |
1846 | 0 | } |
1847 | 0 | return 0; |
1848 | | |
1849 | 0 | GGML_UNUSED(data_size); |
1850 | 0 | } |
1851 | | |
1852 | 0 | static block_iq4_nlx8 make_block_iq4_nlx8(block_iq4_nl * in, unsigned int blck_size_interleave) { |
1853 | 0 | block_iq4_nlx8 out; |
1854 | |
|
1855 | 0 | for (int i = 0; i < 8; i++) { |
1856 | 0 | out.d[i] = in[i].d; |
1857 | 0 | } |
1858 | |
|
1859 | 0 | const int end = QK4_NL * 4 / blck_size_interleave; |
1860 | |
|
1861 | 0 | if (blck_size_interleave == 8) { |
1862 | 0 | for (int i = 0; i < end; ++i) { |
1863 | 0 | int src_id = i % 8; |
1864 | 0 | int src_offset = (i / 8) * blck_size_interleave; |
1865 | 0 | int dst_offset = i * blck_size_interleave; |
1866 | |
|
1867 | 0 | memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t)); |
1868 | 0 | } |
1869 | 0 | } else { |
1870 | 0 | GGML_ASSERT(false); |
1871 | 0 | } |
1872 | |
|
1873 | 0 | return out; |
1874 | 0 | } |
1875 | | |
1876 | 0 | static int repack_iq4_nl_to_iq4_nl_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) { |
1877 | 0 | GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL); |
1878 | 0 | GGML_ASSERT(interleave_block == 8); |
1879 | |
|
1880 | 0 | const block_iq4_nl * src = (const block_iq4_nl *)data; |
1881 | 0 | block_iq4_nlx8 * dst = ( block_iq4_nlx8 *)t->data; |
1882 | |
|
1883 | 0 | block_iq4_nl dst_tmp[8]; |
1884 | |
|
1885 | 0 | int nrow = ggml_nrows(t); |
1886 | 0 | int nrows_interleaved = 8; |
1887 | 0 | int nblocks = t->ne[0] / QK4_NL; |
1888 | |
|
1889 | 0 | GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl)); |
1890 | |
|
1891 | 0 | if (t->ne[1] % nrows_interleaved != 0) { |
1892 | 0 | return -1; |
1893 | 0 | } |
1894 | | |
1895 | 0 | for (int b = 0; b < nrow; b += nrows_interleaved) { |
1896 | 0 | for (int64_t x = 0; x < nblocks; x++) { |
1897 | 0 | for (int i = 0; i < nrows_interleaved; i++) { |
1898 | 0 | dst_tmp[i] = src[x + i * nblocks]; |
1899 | 0 | } |
1900 | 0 | *dst++ = make_block_iq4_nlx8(dst_tmp, interleave_block); |
1901 | 0 | } |
1902 | 0 | src += nrows_interleaved * nblocks; |
1903 | 0 | } |
1904 | 0 | return 0; |
1905 | | |
1906 | 0 | GGML_UNUSED(data_size); |
1907 | 0 | } |
1908 | | |
1909 | | namespace ggml::cpu::repack { |
1910 | | // repack |
1911 | | template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS> |
1912 | | int repack(struct ggml_tensor *, const void *, size_t); |
1913 | | |
1914 | | // TODO: generalise. |
1915 | 0 | template <> int repack<block_q4_0, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) { |
1916 | 0 | return repack_q4_0_to_q4_0_4_bl(t, 4, data, data_size); |
1917 | 0 | } |
1918 | | |
1919 | 0 | template <> int repack<block_q4_0, 8, 4>(struct ggml_tensor * t, const void * data, size_t data_size) { |
1920 | 0 | return repack_q4_0_to_q4_0_4_bl(t, 8, data, data_size); |
1921 | 0 | } |
1922 | | |
1923 | 0 | template <> int repack<block_q4_0, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) { |
1924 | 0 | return repack_q4_0_to_q4_0_8_bl(t, 8, data, data_size); |
1925 | 0 | } |
1926 | | |
1927 | 0 | template <> int repack<block_q4_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) { |
1928 | 0 | return repack_q4_K_to_q4_K_8_bl(t, 8, data, data_size); |
1929 | 0 | } |
1930 | | |
1931 | 0 | template <> int repack<block_q4_K, 4, 8>(struct ggml_tensor * t, const void * data, size_t data_size) { |
1932 | 0 | return repack_q4_K_to_q4_K_8_bl(t, 4, data, data_size); |
1933 | 0 | } |
1934 | | |
1935 | 0 | template <> int repack<block_q2_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) { |
1936 | 0 | return repack_q2_K_to_q2_K_8_bl(t, 8, data, data_size); |
1937 | 0 | } |
1938 | | |
1939 | 0 | template <> int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) { |
1940 | 0 | return repack_iq4_nl_to_iq4_nl_4_bl(t, 4, data, data_size); |
1941 | 0 | } |
1942 | | |
1943 | | // TODO: needs to be revisited |
1944 | | //template <> int repack<block_iq4_nl, 8, 4>(struct ggml_tensor * t, const void * data, size_t data_size) { |
1945 | | // return repack_iq4_nl_to_iq4_nl_4_bl(t, 8, data, data_size); |
1946 | | //} |
1947 | | |
1948 | 0 | template <> int repack<block_iq4_nl, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) { |
1949 | 0 | return repack_iq4_nl_to_iq4_nl_8_bl(t, 8, data, data_size); |
1950 | 0 | } |
1951 | | |
1952 | 0 | template <> int repack<block_q8_0, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) { |
1953 | 0 | return repack_q8_0_to_q8_0_4_bl(t, 4, data, data_size); |
1954 | 0 | } |
1955 | | |
1956 | 0 | template <> int repack<block_q8_0, 8, 4>(struct ggml_tensor * t, const void * data, size_t data_size) { |
1957 | 0 | return repack_q8_0_to_q8_0_4_bl(t, 8, data, data_size); |
1958 | 0 | } |
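// --- Illustrative sketch (not part of the source above): the dispatch idiom used by the
// repack<> specializations above. A declared-but-undefined primary template plus explicit
// specializations per (block type, interleave size, column count) tuple means an
// unsupported combination still compiles at the call site but fails at link time.
// The types and the repack_demo function below are dummies.
#include <cstdio>

struct block_a {};
struct block_b {};

template <typename BLOCK, int INTER_SIZE, int NB_COLS>
int repack_demo(int n);                         // no definition on purpose

template <> int repack_demo<block_a, 4, 4>(int n) { return n * 44; }
template <> int repack_demo<block_b, 8, 8>(int n) { return n * 88; }

int main() {
    printf("%d %d\n", repack_demo<block_a, 4, 4>(1), repack_demo<block_b, 8, 8>(1));
    // repack_demo<block_a, 8, 8>(1) would compile here but fail to link.
    return 0;
}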
1959 | | |
1960 | | // gemv |
1961 | | template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE> |
1962 | | void gemv(int, float *, size_t, const void *, const void *, int, int); |
1963 | | |
1964 | 0 | template <> void gemv<block_q4_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { |
1965 | 0 | ggml_gemv_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc); |
1966 | 0 | } |
1967 | | |
1968 | 0 | template <> void gemv<block_q4_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { |
1969 | 0 | ggml_gemv_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc); |
1970 | 0 | } |
1971 | | |
1972 | 0 | template <> void gemv<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { |
1973 | 0 | ggml_gemv_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc); |
1974 | 0 | } |
1975 | | |
1976 | 0 | template <> void gemv<block_q4_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { |
1977 | 0 | ggml_gemv_q4_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc); |
1978 | 0 | } |
1979 | | |
1980 | 0 | template <> void gemv<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { |
1981 | 0 | ggml_gemv_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc); |
1982 | 0 | } |
1983 | | |
1984 | 0 | template <> void gemv<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { |
1985 | 0 | ggml_gemv_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc); |
1986 | 0 | } |
1987 | | |
1988 | 0 | template <> void gemv<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { |
1989 | 0 | ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc); |
1990 | 0 | } |
1991 | | |
1992 | 0 | template <> void gemv<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { |
1993 | 0 | ggml_gemv_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc); |
1994 | 0 | } |
1995 | | |
1996 | 0 | template <> void gemv<block_q8_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { |
1997 | 0 | ggml_gemv_q8_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc); |
1998 | 0 | } |
1999 | | |
2000 | 0 | template <> void gemv<block_q8_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { |
2001 | 0 | ggml_gemv_q8_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc); |
2002 | 0 | } |
2003 | | |
2004 | | // gemm |
2005 | | template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE> |
2006 | | void gemm(int, float *, size_t, const void *, const void *, int, int); |
2007 | | |
2008 | 0 | template <> void gemm<block_q4_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { |
2009 | 0 | ggml_gemm_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc); |
2010 | 0 | } |
2011 | | |
2012 | 0 | template <> void gemm<block_q4_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { |
2013 | 0 | ggml_gemm_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc); |
2014 | 0 | } |
2015 | | |
2016 | 0 | template <> void gemm<block_q4_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { |
2017 | 0 | ggml_gemm_q4_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc); |
2018 | 0 | } |
2019 | | |
2020 | 0 | template <> void gemm<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { |
2021 | 0 | ggml_gemm_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc); |
2022 | 0 | } |
2023 | | |
2024 | 0 | template <> void gemm<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { |
2025 | 0 | ggml_gemm_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc); |
2026 | 0 | } |
2027 | | |
2028 | 0 | template <> void gemm<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { |
2029 | 0 | ggml_gemm_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc); |
2030 | 0 | } |
2031 | | |
2032 | 0 | template <> void gemm<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { |
2033 | 0 | ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc); |
2034 | 0 | } |
2035 | | |
2036 | 0 | template <> void gemm<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { |
2037 | 0 | ggml_gemm_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc); |
2038 | 0 | } |
2039 | | |
2040 | 0 | template <> void gemm<block_q8_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { |
2041 | 0 | ggml_gemm_q8_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc); |
2042 | 0 | } |
2043 | | |
2044 | 0 | template <> void gemm<block_q8_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { |
2045 | 0 | ggml_gemm_q8_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc); |
2046 | 0 | } |
2047 | | |
2048 | | class tensor_traits_base : public ggml::cpu::tensor_traits { |
2049 | | public: |
2050 | | virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0; |
2051 | | }; |
2052 | | |
2053 | | template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE> class tensor_traits : public tensor_traits_base { |
2054 | | |
2055 | 0 | bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override { |
2056 | | // not really a GGML_TYPE_Q8_0, but same size.
2057 | 0 | switch (op->op) { |
2058 | 0 | case GGML_OP_MUL_MAT: |
2059 | 0 | { |
2060 | 0 | size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1])); |
2061 | 0 | return true; |
2062 | 0 | } |
2063 | 0 | case GGML_OP_MUL_MAT_ID: |
2064 | 0 | { |
2065 | 0 | size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1])); |
2066 | 0 | size = GGML_PAD(size, sizeof(int64_t)); // + padding for next block.
2067 | |
|
2068 | 0 | const int64_t ne02 = op->src[0]->ne[2]; // n_as, n_expert |
2069 | 0 | const int64_t ne12 = op->src[1]->ne[2]; // n_tokens |
2070 | |
|
2071 | 0 | const size_t sizeof_mmid_row_mapping = sizeof(int64_t); |
2072 | |
|
2073 | 0 | size += sizeof_mmid_row_mapping*ne02*(ne12 + 1); |
2074 | |
|
2075 | 0 | return true; |
2076 | 0 | } |
2077 | 0 | default: |
2078 | | // GGML_ABORT("fatal error"); |
2079 | 0 | break; |
2080 | 0 | } |
2081 | 0 | return false; |
2082 | 0 | } Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 4l, 4l, (ggml_type)8>::work_size(int, ggml_tensor const*, unsigned long&) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 8l, 4l, (ggml_type)8>::work_size(int, ggml_tensor const*, unsigned long&) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 8l, 8l, (ggml_type)8>::work_size(int, ggml_tensor const*, unsigned long&) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_K, 4l, 8l, (ggml_type)15>::work_size(int, ggml_tensor const*, unsigned long&) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_K, 8l, 8l, (ggml_type)15>::work_size(int, ggml_tensor const*, unsigned long&) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q2_K, 8l, 8l, (ggml_type)15>::work_size(int, ggml_tensor const*, unsigned long&) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_iq4_nl, 4l, 4l, (ggml_type)8>::work_size(int, ggml_tensor const*, unsigned long&) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_iq4_nl, 8l, 8l, (ggml_type)8>::work_size(int, ggml_tensor const*, unsigned long&) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q8_0, 4l, 4l, (ggml_type)8>::work_size(int, ggml_tensor const*, unsigned long&) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q8_0, 8l, 4l, (ggml_type)8>::work_size(int, ggml_tensor const*, unsigned long&) |
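// --- Illustrative sketch (not part of the source above): the MUL_MAT_ID work-buffer
// budget computed in work_size() above, with stand-in numbers. It is the quantized copy
// of all of src1, padded to 8 bytes, plus one 8-byte row-mapping slot per
// (expert, token) pair and one spare slot per expert. All values below are assumptions.
#include <cstdint>
#include <cstdio>

static size_t pad_demo(size_t x, size_t a) { return (x + a - 1) / a * a; }

int main() {
    const size_t  quantized_src1_bytes = 150000;  // stand-in for ggml_row_size(PARAM_TYPE, nelements)
    const int64_t n_expert = 8, n_tokens = 7;     // stand-ins for ne02 and ne12

    size_t size = pad_demo(quantized_src1_bytes, sizeof(int64_t));
    size += sizeof(int64_t) * n_expert * (n_tokens + 1);
    printf("work size = %zu bytes\n", size);
    return 0;
}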
2083 | | |
2084 | 0 | bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) override { |
2085 | 0 | switch (op->op) { |
2086 | 0 | case GGML_OP_MUL_MAT: |
2087 | 0 | forward_mul_mat(params, op); |
2088 | 0 | return true; |
2089 | 0 | case GGML_OP_MUL_MAT_ID: |
2090 | 0 | forward_mul_mat_id(params, op); |
2091 | 0 | return true; |
2092 | 0 | default: |
2093 | | // GGML_ABORT("fatal error"); |
2094 | 0 | break; |
2095 | 0 | } |
2096 | 0 | return false; |
2097 | 0 | } Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 4l, 4l, (ggml_type)8>::compute_forward(ggml_compute_params*, ggml_tensor*) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 8l, 4l, (ggml_type)8>::compute_forward(ggml_compute_params*, ggml_tensor*) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 8l, 8l, (ggml_type)8>::compute_forward(ggml_compute_params*, ggml_tensor*) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_K, 4l, 8l, (ggml_type)15>::compute_forward(ggml_compute_params*, ggml_tensor*) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_K, 8l, 8l, (ggml_type)15>::compute_forward(ggml_compute_params*, ggml_tensor*) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q2_K, 8l, 8l, (ggml_type)15>::compute_forward(ggml_compute_params*, ggml_tensor*) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_iq4_nl, 4l, 4l, (ggml_type)8>::compute_forward(ggml_compute_params*, ggml_tensor*) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_iq4_nl, 8l, 8l, (ggml_type)8>::compute_forward(ggml_compute_params*, ggml_tensor*) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q8_0, 4l, 4l, (ggml_type)8>::compute_forward(ggml_compute_params*, ggml_tensor*) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q8_0, 8l, 4l, (ggml_type)8>::compute_forward(ggml_compute_params*, ggml_tensor*) |
2098 | | |
2099 | | void forward_mul_mat_one_chunk(ggml_compute_params * params, |
2100 | | ggml_tensor * op, |
2101 | | int64_t src0_start, |
2102 | | int64_t src0_end, |
2103 | | int64_t src1_start, |
2104 | 0 | int64_t src1_end) { |
2105 | 0 | const ggml_tensor * src0 = op->src[0]; |
2106 | 0 | const ggml_tensor * src1 = op->src[1]; |
2107 | 0 | ggml_tensor * dst = op; |
2108 | |
|
2109 | 0 | GGML_TENSOR_BINARY_OP_LOCALS |
2110 | |
|
2111 | 0 | const size_t src1_col_stride = ggml_row_size(PARAM_TYPE, ne10); |
2112 | |
|
2113 | 0 | GGML_ASSERT(ne03 == 1 && ne13 == 1); |
2114 | 0 | GGML_ASSERT(ne12 % ne02 == 0); |
2115 | 0 | const int64_t r2 = ne12 / ne02; |
2116 | |
|
2117 | 0 | const int64_t i12 = src1_start / ne1; |
2118 | 0 | const int64_t i11 = src1_start - i12 * ne1; |
2119 | | |
2120 | | // Determine batch index |
2121 | 0 | const int64_t i02 = i12 / r2; |
2122 | |
|
2123 | 0 | const int64_t i1 = i11; |
2124 | 0 | const int64_t i2 = i12; |
2125 | |
|
2126 | 0 | const char * src0_ptr = (const char *) src0->data + i02 * nb02; |
2127 | 0 | const char * src1_ptr = (const char *) params->wdata + (i11 + i12 * ne11) * src1_col_stride; |
2128 | 0 | char * dst_ptr = ((char *) dst->data + (i1 * nb1 + i2 * nb2)); |
2129 | |
|
2130 | 0 | const int64_t nrows = src1_end - src1_start; |
2131 | 0 | const int64_t ncols = src0_end - src0_start; |
2132 | |
|
2133 | 0 | GGML_ASSERT(src1_ptr + src1_col_stride * nrows <= (const char *) params->wdata + params->wsize); |
2134 | | |
2135 | | // If there are more than three rows in src1, use gemm; otherwise, use gemv. |
2136 | 0 | if (nrows > 3) { |
2137 | 0 | gemm<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00, (float *) (dst_ptr) + src0_start, nb1 / nb0, |
2138 | 0 | src0_ptr + src0_start * nb01, src1_ptr, |
2139 | 0 | nrows - (nrows % 4), ncols); |
2140 | 0 | } |
2141 | 0 | for (int iter = nrows - (nrows % 4); iter < nrows; iter++) { |
2142 | 0 | gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00, (float *) (dst_ptr + (iter * nb1)) + src0_start, |
2143 | 0 | ne01, src0_ptr + src0_start * nb01, |
2144 | 0 | src1_ptr + (src1_col_stride * iter), 1 /* nrows */, ncols); |
2145 | 0 | } |
2146 | 0 | } Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 4l, 4l, (ggml_type)8>::forward_mul_mat_one_chunk(ggml_compute_params*, ggml_tensor*, long, long, long, long) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 8l, 4l, (ggml_type)8>::forward_mul_mat_one_chunk(ggml_compute_params*, ggml_tensor*, long, long, long, long) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 8l, 8l, (ggml_type)8>::forward_mul_mat_one_chunk(ggml_compute_params*, ggml_tensor*, long, long, long, long) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_K, 4l, 8l, (ggml_type)15>::forward_mul_mat_one_chunk(ggml_compute_params*, ggml_tensor*, long, long, long, long) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_K, 8l, 8l, (ggml_type)15>::forward_mul_mat_one_chunk(ggml_compute_params*, ggml_tensor*, long, long, long, long) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q2_K, 8l, 8l, (ggml_type)15>::forward_mul_mat_one_chunk(ggml_compute_params*, ggml_tensor*, long, long, long, long) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_iq4_nl, 4l, 4l, (ggml_type)8>::forward_mul_mat_one_chunk(ggml_compute_params*, ggml_tensor*, long, long, long, long) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_iq4_nl, 8l, 8l, (ggml_type)8>::forward_mul_mat_one_chunk(ggml_compute_params*, ggml_tensor*, long, long, long, long) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q8_0, 4l, 4l, (ggml_type)8>::forward_mul_mat_one_chunk(ggml_compute_params*, ggml_tensor*, long, long, long, long) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q8_0, 8l, 4l, (ggml_type)8>::forward_mul_mat_one_chunk(ggml_compute_params*, ggml_tensor*, long, long, long, long) |
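// --- Illustrative sketch (not part of the source above): the row split performed by
// forward_mul_mat_one_chunk above. Rows are handed to a GEMM path four at a time and
// any remainder (nrows % 4) falls back to per-row GEMV calls. The two lambdas are
// placeholders for the real kernels.
#include <cstdio>

int main() {
    const int nrows = 11;
    auto gemm = [](int first, int count) { printf("gemm rows %d..%d\n", first, first + count - 1); };
    auto gemv = [](int row)              { printf("gemv row  %d\n", row); };

    const int bulk = nrows - (nrows % 4);
    if (nrows > 3) {
        gemm(0, bulk);                    // multiples of four go through GEMM
    }
    for (int r = bulk; r < nrows; r++) {  // leftover rows, one GEMV call each
        gemv(r);
    }
    return 0;
}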
2147 | | |
2148 | 0 | void forward_mul_mat(ggml_compute_params * params, ggml_tensor * op) { |
2149 | 0 | const ggml_tensor * src0 = op->src[0]; |
2150 | 0 | const ggml_tensor * src1 = op->src[1]; |
2151 | 0 | ggml_tensor * dst = op; |
2152 | |
|
2153 | 0 | GGML_TENSOR_BINARY_OP_LOCALS |
2154 | |
|
2155 | 0 | const int ith = params->ith; |
2156 | 0 | const int nth = params->nth; |
2157 | |
|
2158 | 0 | GGML_ASSERT(ne0 == ne01); |
2159 | 0 | GGML_ASSERT(ne1 == ne11); |
2160 | 0 | GGML_ASSERT(ne2 == ne12); |
2161 | 0 | GGML_ASSERT(ne3 == ne13); |
2162 | | |
2163 | | // dst cannot be transposed or permuted |
2164 | 0 | GGML_ASSERT(nb0 == sizeof(float)); |
2165 | 0 | GGML_ASSERT(nb0 <= nb1); |
2166 | 0 | GGML_ASSERT(nb1 <= nb2); |
2167 | 0 | GGML_ASSERT(nb2 <= nb3); |
2168 | | |
2169 | | // TODO: General batched mul mat for 4D tensors |
2170 | | // Currently only supports 3D tensors |
2171 | 0 | GGML_ASSERT(ne03 == 1); |
2172 | 0 | GGML_ASSERT(ne13 == 1); |
2173 | 0 | GGML_ASSERT(ne3 == 1); |
2174 | |
|
2175 | 0 | GGML_ASSERT(src1->type == GGML_TYPE_F32); |
2176 | |
|
2177 | 0 | GGML_ASSERT(ggml_n_dims(op->src[0]) == 2); |
2178 | | // GGML_ASSERT(ggml_n_dims(op->src[1]) == 2); |
2179 | |
|
2180 | 0 | char * wdata = static_cast<char *>(params->wdata); |
2181 | 0 | const size_t nbw1 = ggml_row_size(PARAM_TYPE, ne10); |
2182 | 0 | const size_t nbw2 = nbw1 * ne11; |
2183 | |
|
2184 | 0 | assert(params->wsize >= nbw2 * ne12); |
2185 | |
|
2186 | 0 | const ggml_from_float_t from_float = ggml_get_type_traits_cpu(PARAM_TYPE)->from_float; |
2187 | | |
2188 | | // INFO: Quantization is done plane by plane to avoid extra complexity in the chunking.
2189 | | // Flattening dimensions that are not a multiple of INTER_SIZE would require extra handling depending on how
2190 | | // the planes are broadcast.
2191 | 0 | for (int64_t i12 = 0; i12 < ne12; i12++) { |
2192 | 0 | char * data_ptr = (char *) src1->data + i12 * nb12; |
2193 | 0 | char * wdata_ptr = wdata + i12 * nbw2; |
2194 | |
|
2195 | 0 | for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) { |
2196 | 0 | ggml_quantize_mat_t<INTER_SIZE, PARAM_TYPE>((float *) (data_ptr + i11 * nb11), |
2197 | 0 | (void *) (wdata_ptr + i11 * nbw1), 4, ne10); |
2198 | 0 | } |
2199 | |
|
2200 | 0 | const int64_t i11_processed = ne11 - ne11 % 4; |
2201 | 0 | for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) { |
2202 | 0 | from_float((float *) (data_ptr + i11 * nb11), (void *) (wdata_ptr + i11 * nbw1), ne10); |
2203 | 0 | } |
2204 | 0 | } |
2205 | | |
2206 | | // disable for NUMA |
2207 | 0 | const bool disable_chunking = ggml_is_numa(); |
2208 | | |
2209 | | // 4x chunks per thread |
2210 | 0 | const int64_t nr0 = ggml_nrows(op->src[0]); |
2211 | |
|
2212 | 0 | int nth_scaled = nth * 4; |
2213 | 0 | int64_t chunk_size0 = (nr0 + nth_scaled - 1) / nth_scaled; |
2214 | 0 | int64_t nchunk0 = (nr0 + chunk_size0 - 1) / chunk_size0; |
2215 | | |
2216 | | // src1 is chunked only by full planes. |
2217 | | // When we flatten, dimensions that are not a multiple of the q8 INTER_SIZE need to be
2218 | | // routed through GEMV.
2219 | | // nchunk1 = ne12 also leaves the chunking untouched for models with no 3D tensors,
2220 | | // so their performance is unaffected.
2221 | 0 | int64_t nchunk1 = ne12; |
2222 | | |
2223 | | // Ensure minimum chunk size to avoid alignment issues with high thread counts |
2224 | | // Minimum chunk size should be at least NB_COLS to prevent overlapping chunks after alignment |
2225 | 0 | const int64_t min_chunk_size = NB_COLS; |
2226 | 0 | if (nchunk0 > 0 && (nr0 / nchunk0) < min_chunk_size && nr0 >= min_chunk_size) { |
2227 | 0 | nchunk0 = (nr0 + min_chunk_size - 1) / min_chunk_size; |
2228 | 0 | } |
2229 | |
|
2230 | 0 | int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0; |
2231 | | // Only increase nchunk0 to nth if it won't make chunks too small |
2232 | 0 | if (nth == 1 || ((nchunk0 < nth || disable_chunking) && (nr0 + nth - 1) / nth >= min_chunk_size)) { |
2233 | 0 | nchunk0 = nth; |
2234 | 0 | dr0 = (nr0 + nchunk0 - 1) / nchunk0; |
2235 | 0 | } |
2236 | | |
2237 | | // Ensure nchunk doesn't exceed the number of rows divided by minimum chunk size |
2238 | | // This prevents creating too many tiny chunks that could overlap after alignment |
2239 | 0 | const int64_t max_nchunk = (nr0 + min_chunk_size - 1) / min_chunk_size; |
2240 | 0 | nchunk0 = MIN(nchunk0, max_nchunk); |
2241 | |
|
2242 | 0 | if (ith == 0) { |
2243 | | // Every thread starts at ith, so the first unprocessed chunk is nth. This saves a bit of coordination right at the start.
2244 | 0 | ggml_threadpool_chunk_set(params->threadpool, nth); |
2245 | 0 | } |
2246 | |
|
2247 | 0 | ggml_barrier(params->threadpool); |
2248 | | |
2249 | | // The first chunk comes from our thread_id, the rest will get auto-assigned. |
2250 | 0 | int current_chunk = ith; |
2251 | |
|
2252 | 0 | while (current_chunk < nchunk0 * nchunk1) { |
2253 | 0 | const int64_t ith0 = current_chunk % nchunk0; |
2254 | 0 | const int64_t ith1 = current_chunk / nchunk0; |
2255 | |
|
2256 | 0 | int64_t src0_start = dr0 * ith0; |
2257 | 0 | int64_t src0_end = MIN(src0_start + dr0, nr0); |
2258 | | |
2259 | | // full-plane range for src1 |
2260 | 0 | int64_t src1_start = ith1 * ne11; |
2261 | 0 | int64_t src1_end = (ith1 + 1) * ne11; |
2262 | | |
2263 | | // Align boundaries to NB_COLS - round up to ensure all data is included |
2264 | | // The chunk size limiting above ensures chunks are large enough to prevent overlaps |
2265 | 0 | src0_start = (src0_start % NB_COLS) ? src0_start + NB_COLS - (src0_start % NB_COLS) : src0_start; |
2266 | 0 | src0_end = (src0_end % NB_COLS) ? src0_end + NB_COLS - (src0_end % NB_COLS) : src0_end; |
2267 | 0 | src0_end = MIN(src0_end, ne01); |
2268 | | |
2269 | | // Make sure current plane is the last one before exiting |
2270 | 0 | if (src0_start >= src0_end) { |
2271 | 0 | current_chunk = ggml_threadpool_chunk_add(params->threadpool, 1); |
2272 | 0 | continue; |
2273 | 0 | } |
2274 | | |
2275 | 0 | forward_mul_mat_one_chunk(params, dst, src0_start, src0_end, src1_start, src1_end); |
2276 | |
|
2277 | 0 | current_chunk = ggml_threadpool_chunk_add(params->threadpool, 1); |
2278 | 0 | } |
2279 | 0 | } Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 4l, 4l, (ggml_type)8>::forward_mul_mat(ggml_compute_params*, ggml_tensor*) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 8l, 4l, (ggml_type)8>::forward_mul_mat(ggml_compute_params*, ggml_tensor*) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 8l, 8l, (ggml_type)8>::forward_mul_mat(ggml_compute_params*, ggml_tensor*) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_K, 4l, 8l, (ggml_type)15>::forward_mul_mat(ggml_compute_params*, ggml_tensor*) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_K, 8l, 8l, (ggml_type)15>::forward_mul_mat(ggml_compute_params*, ggml_tensor*) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q2_K, 8l, 8l, (ggml_type)15>::forward_mul_mat(ggml_compute_params*, ggml_tensor*) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_iq4_nl, 4l, 4l, (ggml_type)8>::forward_mul_mat(ggml_compute_params*, ggml_tensor*) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_iq4_nl, 8l, 8l, (ggml_type)8>::forward_mul_mat(ggml_compute_params*, ggml_tensor*) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q8_0, 4l, 4l, (ggml_type)8>::forward_mul_mat(ggml_compute_params*, ggml_tensor*) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q8_0, 8l, 4l, (ggml_type)8>::forward_mul_mat(ggml_compute_params*, ggml_tensor*) |
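// --- Illustrative sketch (not part of the source above): the chunk-boundary alignment
// used in forward_mul_mat above. Each chunk's [start, end) range over src0 rows is
// rounded up to a multiple of NB_COLS and clamped, so every chunk hands whole
// interleaved column groups to the kernels; chunks that become empty are skipped.
// The sizes below are arbitrary.
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t NB_COLS = 8, nr0 = 100, nchunk0 = 6;
    const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;

    for (int64_t c = 0; c < nchunk0; c++) {
        int64_t start = dr0 * c;
        int64_t end   = std::min(start + dr0, nr0);
        start = (start % NB_COLS) ? start + NB_COLS - (start % NB_COLS) : start;
        end   = (end   % NB_COLS) ? end   + NB_COLS - (end   % NB_COLS) : end;
        end   = std::min(end, nr0);
        if (start >= end) continue;   // empty after alignment, skip
        printf("chunk %lld -> rows [%lld, %lld)\n", (long long) c, (long long) start, (long long) end);
    }
    return 0;
}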
2280 | | |
2281 | 0 | void forward_mul_mat_id(ggml_compute_params * params, ggml_tensor * op) { |
2282 | 0 | const ggml_tensor * src0 = op->src[0]; |
2283 | 0 | const ggml_tensor * src1 = op->src[1]; |
2284 | 0 | const ggml_tensor * ids = op->src[2]; |
2285 | 0 | ggml_tensor * dst = op; |
2286 | |
|
2287 | 0 | GGML_TENSOR_BINARY_OP_LOCALS |
2288 | |
|
2289 | 0 | const int ith = params->ith; |
2290 | 0 | const int nth = params->nth; |
2291 | |
|
2292 | 0 | const ggml_from_float_t from_float = ggml_get_type_traits_cpu(PARAM_TYPE)->from_float; |
2293 | | |
2294 | | // we don't support permuted src0 or src1 |
2295 | 0 | GGML_ASSERT(nb00 == ggml_type_size(src0->type)); |
2296 | 0 | GGML_ASSERT(nb10 == ggml_type_size(src1->type)); |
2297 | | |
2298 | | // dst cannot be transposed or permuted |
2299 | 0 | GGML_ASSERT(nb0 == sizeof(float)); |
2300 | 0 | GGML_ASSERT(nb0 <= nb1); |
2301 | 0 | GGML_ASSERT(nb1 <= nb2); |
2302 | 0 | GGML_ASSERT(nb2 <= nb3); |
2303 | |
|
2304 | 0 | GGML_ASSERT(ne03 == 1); |
2305 | 0 | GGML_ASSERT(ne13 == 1); |
2306 | 0 | GGML_ASSERT(ne3 == 1); |
2307 | |
|
2308 | 0 | GGML_ASSERT(src1->type == GGML_TYPE_F32); |
2309 | | |
2310 | | // row groups |
2311 | 0 | const int n_ids = ids->ne[0]; // n_expert_used |
2312 | 0 | const int n_as = ne02; // n_expert |
2313 | |
|
2314 | 0 | const size_t nbw1 = ggml_row_size(PARAM_TYPE, ne10); |
2315 | 0 | const size_t nbw2 = nbw1*ne11; |
2316 | 0 | const size_t nbw3 = nbw2*ne12; |
2317 | |
|
2318 | 0 | struct mmid_row_mapping { |
2319 | 0 | int32_t i1; |
2320 | 0 | int32_t i2; |
2321 | 0 | }; |
2322 | |
|
2323 | 0 | GGML_ASSERT(params->wsize >= |
2324 | 0 | (GGML_PAD(nbw3, sizeof(int64_t)) + |
2325 | 0 | n_as*(ne12 + 1)*sizeof(mmid_row_mapping)) |
2326 | 0 | ); |
2327 | |
|
2328 | 0 | auto * wdata = (char *)params->wdata; |
2329 | 0 | auto * wdata_src1_end = (char *)wdata + GGML_PAD(nbw3, sizeof(int64_t)); |
2330 | | |
2331 | | // total of [n_as][ne12 + 1] elements of type mmid_row_mapping (2*int32_t = int64_t)
2332 | 0 | auto * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as] |
2333 | 0 | struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *) (matrix_row_counts + n_as); // [n_as][ne12] |
2334 | | |
2335 | | // src1: float32 => param type |
2336 | 0 | for (int64_t i12 = 0; i12 < ne12; ++i12) { |
2337 | 0 | for (int64_t i11 = ith; i11 < ne11; i11 += nth) { |
2338 | 0 | from_float((float *)((char *) src1->data + i12 * nb12 + i11 * nb11), |
2339 | 0 | (void *) (wdata + i12 * nbw2 + i11 * nbw1), |
2340 | 0 | ne10); |
2341 | 0 | } |
2342 | 0 | } |
2343 | |
|
2344 | 0 | #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id) * ne12 + (i1)] |
2345 | |
|
2346 | 0 | if (ith == 0) { |
2347 | | // initialize matrix_row_counts |
2348 | 0 | memset(matrix_row_counts, 0, n_as * sizeof(int64_t)); |
2349 | | |
2350 | | // group rows by src0 matrix |
2351 | 0 | for (int32_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) { |
2352 | 0 | for (int32_t id = 0; id < n_ids; ++id) { |
2353 | 0 | const int32_t i02 = |
2354 | 0 | *(const int32_t *) ((const char *) ids->data + iid1 * ids->nb[1] + id * ids->nb[0]); |
2355 | |
|
2356 | 0 | GGML_ASSERT(i02 >= 0 && i02 < n_as); |
2357 | |
|
2358 | 0 | MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = { id, iid1 }; |
2359 | 0 | matrix_row_counts[i02] += 1; |
2360 | 0 | } |
2361 | 0 | } |
2362 | 0 | } |
2363 | |
|
2364 | 0 | ggml_barrier(params->threadpool); |
2365 | | |
2366 | | // compute each matrix multiplication in sequence |
2367 | 0 | for (int cur_a = 0; cur_a < n_as; ++cur_a) { |
2368 | 0 | const int64_t cne1 = matrix_row_counts[cur_a]; |
2369 | |
|
2370 | 0 | if (cne1 == 0) { |
2371 | 0 | continue; |
2372 | 0 | } |
2373 | | |
2374 | 0 | const auto * src0_cur = (const char *) src0->data + cur_a*nb02; |
2375 | | |
2376 | | //const int64_t nr0 = ne01; // src0 rows |
2377 | 0 | const int64_t nr1 = cne1; // src1 rows |
2378 | |
|
2379 | 0 | int64_t src0_cur_start = (ith * ne01) / nth; |
2380 | 0 | int64_t src0_cur_end = ((ith + 1) * ne01) / nth; |
2381 | | |
2382 | | // Align boundaries to NB_COLS - round up to ensure all data is included |
2383 | 0 | src0_cur_start = (src0_cur_start % NB_COLS) ? src0_cur_start + NB_COLS - (src0_cur_start % NB_COLS) : src0_cur_start; |
2384 | 0 | src0_cur_end = (src0_cur_end % NB_COLS) ? src0_cur_end + NB_COLS - (src0_cur_end % NB_COLS) : src0_cur_end; |
2385 | 0 | if (src0_cur_end > ne01) { |
2386 | 0 | src0_cur_end = ne01; |
2387 | 0 | } |
2388 | |
|
2389 | 0 | if (src0_cur_start >= src0_cur_end) { |
2390 | 0 | return; |
2391 | 0 | } |
2392 | | |
2393 | 0 | for (int ir1 = 0; ir1 < nr1; ir1++) { |
2394 | 0 | struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, ir1); |
2395 | |
|
2396 | 0 | const int id = row_mapping.i1; // selected expert index |
2397 | |
|
2398 | 0 | const int64_t i11 = id % ne11; |
2399 | 0 | const int64_t i12 = row_mapping.i2; // row index in src1 |
2400 | |
|
2401 | 0 | const int64_t i1 = id; // selected expert index |
2402 | 0 | const int64_t i2 = i12; // row |
2403 | |
|
2404 | 0 | const auto * src1_col = (const char *) wdata + (i11 * nbw1 + i12 * nbw2); |
2405 | |
|
2406 | 0 | gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00, |
2407 | 0 | (float *)((char *) dst->data + (i1 * nb1 + i2 * nb2)) + src0_cur_start, ne01, |
2408 | 0 | src0_cur + src0_cur_start * nb01, |
2409 | 0 | src1_col, 1, src0_cur_end - src0_cur_start); |
2410 | 0 | } |
2411 | 0 | } |
2412 | 0 | #undef MMID_MATRIX_ROW |
2413 | 0 | } Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 4l, 4l, (ggml_type)8>::forward_mul_mat_id(ggml_compute_params*, ggml_tensor*) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 8l, 4l, (ggml_type)8>::forward_mul_mat_id(ggml_compute_params*, ggml_tensor*) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 8l, 8l, (ggml_type)8>::forward_mul_mat_id(ggml_compute_params*, ggml_tensor*) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_K, 4l, 8l, (ggml_type)15>::forward_mul_mat_id(ggml_compute_params*, ggml_tensor*) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_K, 8l, 8l, (ggml_type)15>::forward_mul_mat_id(ggml_compute_params*, ggml_tensor*) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q2_K, 8l, 8l, (ggml_type)15>::forward_mul_mat_id(ggml_compute_params*, ggml_tensor*) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_iq4_nl, 4l, 4l, (ggml_type)8>::forward_mul_mat_id(ggml_compute_params*, ggml_tensor*) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_iq4_nl, 8l, 8l, (ggml_type)8>::forward_mul_mat_id(ggml_compute_params*, ggml_tensor*) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q8_0, 4l, 4l, (ggml_type)8>::forward_mul_mat_id(ggml_compute_params*, ggml_tensor*) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q8_0, 8l, 4l, (ggml_type)8>::forward_mul_mat_id(ggml_compute_params*, ggml_tensor*) |
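// --- Illustrative sketch (not part of the source above): the row-grouping step in
// forward_mul_mat_id above. (expert slot, token) pairs taken from the ids tensor are
// bucketed per expert so each expert's rows can be processed together. The routing data
// below is made up; the real code stores {expert_slot, token} mappings in the work buffer.
#include <cstdio>
#include <utility>
#include <vector>

int main() {
    const int n_expert = 4, n_tokens = 5, n_expert_used = 2;
    // ids[token][slot] = selected expert (hypothetical routing result)
    const int ids[5][2] = { {0, 2}, {1, 2}, {3, 0}, {2, 2}, {1, 3} };

    std::vector<int> counts(n_expert, 0);
    std::vector<std::vector<std::pair<int, int>>> rows(n_expert);  // {slot, token} per expert

    for (int t = 0; t < n_tokens; t++) {
        for (int s = 0; s < n_expert_used; s++) {
            const int e = ids[t][s];
            rows[e].push_back({s, t});
            counts[e]++;
        }
    }
    for (int e = 0; e < n_expert; e++) {
        printf("expert %d: %d rows\n", e, counts[e]);
    }
    return 0;
}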
2414 | | |
2415 | 0 | int repack(struct ggml_tensor * t, const void * data, size_t data_size) override { |
2416 | 0 | GGML_LOG_DEBUG("%s: repack tensor %s with %s_%dx%d\n", __func__, t->name, ggml_type_name(t->type), |
2417 | 0 | (int) NB_COLS, (int) INTER_SIZE); |
2418 | 0 | return ggml::cpu::repack::repack<BLOC_TYPE, INTER_SIZE, NB_COLS>(t, data, data_size); |
2419 | 0 | } Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 4l, 4l, (ggml_type)8>::repack(ggml_tensor*, void const*, unsigned long) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 8l, 4l, (ggml_type)8>::repack(ggml_tensor*, void const*, unsigned long) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 8l, 8l, (ggml_type)8>::repack(ggml_tensor*, void const*, unsigned long) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_K, 4l, 8l, (ggml_type)15>::repack(ggml_tensor*, void const*, unsigned long) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_K, 8l, 8l, (ggml_type)15>::repack(ggml_tensor*, void const*, unsigned long) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q2_K, 8l, 8l, (ggml_type)15>::repack(ggml_tensor*, void const*, unsigned long) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_iq4_nl, 4l, 4l, (ggml_type)8>::repack(ggml_tensor*, void const*, unsigned long) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_iq4_nl, 8l, 8l, (ggml_type)8>::repack(ggml_tensor*, void const*, unsigned long) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q8_0, 4l, 4l, (ggml_type)8>::repack(ggml_tensor*, void const*, unsigned long) Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q8_0, 8l, 4l, (ggml_type)8>::repack(ggml_tensor*, void const*, unsigned long) |
2420 | | }; |
2421 | | |
2422 | | } // namespace ggml::cpu::repack |
2423 | | |
2424 | 0 | static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(const struct ggml_tensor * cur) { |
2425 | | |
2426 | | // instance for Q4 |
2427 | 0 | static const ggml::cpu::repack::tensor_traits<block_q4_0, 4, 4, GGML_TYPE_Q8_0> q4_0_4x4_q8_0; |
2428 | 0 | static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 4, GGML_TYPE_Q8_0> q4_0_4x8_q8_0; |
2429 | 0 | static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 8, GGML_TYPE_Q8_0> q4_0_8x8_q8_0; |
2430 | | |
2431 | | // instance for Q4_K |
2432 | 0 | static const ggml::cpu::repack::tensor_traits<block_q4_K, 4, 8, GGML_TYPE_Q8_K> q4_K_8x4_q8_K; |
2433 | 0 | static const ggml::cpu::repack::tensor_traits<block_q4_K, 8, 8, GGML_TYPE_Q8_K> q4_K_8x8_q8_K; |
2434 | | |
2435 | | // instance for Q2 |
2436 | 0 | static const ggml::cpu::repack::tensor_traits<block_q2_K, 8, 8, GGML_TYPE_Q8_K> q2_K_8x8_q8_K; |
2437 | | |
2438 | | // instance for IQ4 |
2439 | 0 | static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0; |
2440 | 0 | static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0> iq4_nl_8x8_q8_0; |
2441 | | |
2442 | | // instance for Q8_0 |
2443 | 0 | static const ggml::cpu::repack::tensor_traits<block_q8_0, 4, 4, GGML_TYPE_Q8_0> q8_0_4x4_q8_0; |
2444 | 0 | static const ggml::cpu::repack::tensor_traits<block_q8_0, 8, 4, GGML_TYPE_Q8_0> q8_0_4x8_q8_0; |
2445 | |
2446 | 0 | if (cur->type == GGML_TYPE_Q4_0) { |
2447 | 0 | if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0) |
2448 | 0 | || (ggml_cpu_has_riscv_v() && (ggml_cpu_get_rvv_vlen() >= QK4_0))) { |
2449 | 0 | if (cur->ne[1] % 8 == 0) { |
2450 | 0 | return &q4_0_8x8_q8_0; |
2451 | 0 | } |
2452 | 0 | } |
2453 | 0 | if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) { |
2454 | 0 | if (cur->ne[1] % 4 == 0) { |
2455 | 0 | return &q4_0_4x8_q8_0; |
2456 | 0 | } |
2457 | 0 | } |
2458 | 0 | if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) { |
2459 | 0 | if (cur->ne[1] % 4 == 0) { |
2460 | 0 | return &q4_0_4x4_q8_0; |
2461 | 0 | } |
2462 | 0 | } |
2463 | 0 | } else if (cur->type == GGML_TYPE_Q4_K) { |
2464 | 0 | if (ggml_cpu_has_avx2()) { |
2465 | 0 | if (cur->ne[1] % 8 == 0) { |
2466 | 0 | return &q4_K_8x8_q8_K; |
2467 | 0 | } |
2468 | 0 | } |
2469 | 0 | if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) { |
2470 | 0 | if (cur->ne[1] % 8 == 0) { |
2471 | 0 | return &q4_K_8x8_q8_K; |
2472 | 0 | } |
2473 | 0 | } |
2474 | 0 | if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) { |
2475 | 0 | if (cur->ne[1] % 8 == 0) { |
2476 | 0 | return &q4_K_8x4_q8_K; |
2477 | 0 | } |
2478 | 0 | } |
2479 | 0 | } else if (cur->type == GGML_TYPE_Q2_K) { |
2480 | 0 | if (ggml_cpu_has_avx512()) { |
2481 | 0 | if (cur->ne[1] % 8 == 0) { |
2482 | 0 | return &q2_K_8x8_q8_K; |
2483 | 0 | } |
2484 | 0 | } |
2485 | 0 | } else if (cur->type == GGML_TYPE_IQ4_NL) { |
2486 | 0 | if (ggml_cpu_has_avx2()) { |
2487 | 0 | if (cur->ne[1] % 8 == 0) { |
2488 | 0 | return &iq4_nl_8x8_q8_0; |
2489 | 0 | } |
2490 | 0 | } |
2491 | 0 | if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) { |
2492 | 0 | if (cur->ne[1] % 4 == 0) { |
2493 | 0 | return &iq4_nl_4x4_q8_0; |
2494 | 0 | } |
2495 | 0 | } |
2496 | 0 | } else if (cur->type == GGML_TYPE_Q8_0) { |
2497 | 0 | if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) { |
2498 | 0 | if (cur->ne[1] % 4 == 0) { |
2499 | 0 | return &q8_0_4x8_q8_0; |
2500 | 0 | } |
2501 | 0 | } |
2502 | 0 | if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) { |
2503 | 0 | if (cur->ne[1] % 4 == 0) { |
2504 | 0 | return &q8_0_4x4_q8_0; |
2505 | 0 | } |
2506 | 0 | } |
2507 | 0 | } |
2508 | | |
2509 | 0 | return nullptr; |
2510 | 0 | } |
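
The selector above keys the choice of layout off two things: which SIMD features the host CPU reports, and whether the weight's row count (ne[1]) is a multiple of the layout's column count. A minimal, self-contained sketch of that rule for the Q4_0 case follows; the boolean flags stand in for the ggml_cpu_has_*() queries, the SVE/RISC-V vector-length checks are folded into a single flag for brevity, and none of the names below are ggml API:

    #include <cstdint>
    #include <cstdio>

    struct cpu_features {
        bool avx2_or_wide_vec; // AVX2, or SVE/RVV with a suitable vector length
        bool neon_i8mm;        // NEON + int8 matmul extension
        bool neon_dotprod;     // NEON + dot-product extension
    };

    // Returns the name of the static instance the heuristic would pick for a
    // Q4_0 weight with n_rows rows, or nullptr when the tensor is left unpacked.
    static const char * pick_q4_0_layout(const cpu_features & f, int64_t n_rows) {
        if (f.avx2_or_wide_vec && n_rows % 8 == 0) return "q4_0_8x8_q8_0";
        if (f.neon_i8mm        && n_rows % 4 == 0) return "q4_0_4x8_q8_0";
        if (f.neon_dotprod     && n_rows % 4 == 0) return "q4_0_4x4_q8_0";
        return nullptr;
    }

    int main() {
        const cpu_features f = { true, false, false };
        const char * layout = pick_q4_0_layout(f, 4096);
        std::printf("%s\n", layout ? layout : "(no repack)");
    }
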
2511 | | |
2512 | 0 | static enum ggml_status ggml_backend_cpu_repack_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { |
2513 | 0 | tensor->extra = (void *) const_cast<ggml::cpu::tensor_traits *>(ggml_repack_get_optimal_repack_type(tensor)); |
2514 | |
2515 | 0 | GGML_UNUSED(buffer); |
2516 | 0 | return GGML_STATUS_SUCCESS; |
2517 | 0 | } |
2518 | | |
2519 | | static void ggml_backend_cpu_repack_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, |
2520 | 0 | const void * data, size_t offset, size_t size) { |
2521 | 0 | GGML_ASSERT(offset == 0); |
2522 | 0 | GGML_ASSERT(size == ggml_nbytes(tensor)); |
2523 | |
2524 | 0 | auto tensor_traits = (ggml::cpu::repack::tensor_traits_base *) tensor->extra; |
2525 | 0 | auto OK = tensor_traits->repack(tensor, data, size); |
2526 | |
2527 | 0 | GGML_ASSERT(OK == 0); |
2528 | 0 | GGML_UNUSED(buffer); |
2529 | 0 | } |
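
Together, the two hooks above replace the normal upload path for weights placed in this buffer type: init_tensor records the chosen traits in tensor->extra, and set_tensor converts the incoming quantized data into the interleaved layout instead of copying it verbatim. Note the contract: only whole-tensor writes are accepted, and there is no get_tensor path back to the original layout. A small compilable sketch of that contract, assuming the tensor already lives in a CPU_REPACK buffer; the helper name is illustrative, while ggml_backend_tensor_set and ggml_nbytes are the real API:

    #include "ggml.h"
    #include "ggml-backend.h"
    #include <cassert>

    // Upload a full weight tensor that lives in a CPU_REPACK buffer. Partial
    // writes are rejected by the buffer's set_tensor hook, so check up front.
    static void upload_repacked_weight(struct ggml_tensor * t, const void * data, size_t size) {
        assert(size == ggml_nbytes(t));            // whole-tensor writes only
        ggml_backend_tensor_set(t, data, 0, size); // repacks via the hook above
    }
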
2530 | | |
2531 | 0 | static const char * ggml_backend_cpu_repack_buffer_type_get_name(ggml_backend_buffer_type_t buft) { |
2532 | 0 | return "CPU_REPACK"; |
2533 | | |
2534 | 0 | GGML_UNUSED(buft); |
2535 | 0 | } |
2536 | | |
2537 | 0 | static ggml_backend_buffer_t ggml_backend_cpu_repack_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { |
2538 | 0 | ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size); |
2539 | |
2540 | 0 | if (buffer == nullptr) { |
2541 | 0 | return nullptr; |
2542 | 0 | } |
2543 | | |
2544 | 0 | buffer->buft = buft; |
2545 | 0 | buffer->iface.init_tensor = ggml_backend_cpu_repack_buffer_init_tensor; |
2546 | 0 | buffer->iface.set_tensor = ggml_backend_cpu_repack_buffer_set_tensor; |
2547 | 0 | buffer->iface.get_tensor = nullptr; |
2548 | 0 | buffer->iface.cpy_tensor = nullptr; |
2549 | 0 | return buffer; |
2550 | 0 | } |
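
alloc_buffer() shows that CPU_REPACK is a thin decorator over the plain CPU buffer type: the storage is an ordinary host allocation, only the init_tensor/set_tensor entries of the interface are swapped out, and get_tensor/cpy_tensor are disabled because the repacked bytes can no longer be read back as the original quantized format. A stripped-down sketch of that decorator pattern on a function-pointer interface; all names are illustrative:

    #include <cstddef>
    #include <cstdio>

    struct buffer_iface {
        void (*set_data)(void * dst, const void * src, size_t n);
        void (*get_data)(void * dst, const void * src, size_t n); // may be null
    };
    struct buffer {
        buffer_iface iface;
    };

    static void set_plain(void *, const void *, size_t n)    { std::printf("memcpy %zu bytes\n", n); }
    static void set_repacked(void *, const void *, size_t n) { std::printf("repack %zu bytes\n", n); }

    int main() {
        buffer buf = { { set_plain, nullptr } };
        buf.iface.set_data = set_repacked; // same storage, interleaving write path
        buf.iface.get_data = nullptr;      // reads of the original layout are not supported
        buf.iface.set_data(nullptr, nullptr, 64);
    }
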
2551 | | |
2552 | 0 | static size_t ggml_backend_cpu_repack_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { |
2553 | 0 | return TENSOR_ALIGNMENT; |
2554 | | |
2555 | 0 | GGML_UNUSED(buft); |
2556 | 0 | } |
2557 | | |
2558 | | namespace ggml::cpu::repack { |
2559 | | class extra_buffer_type : ggml::cpu::extra_buffer_type { |
2560 | 0 | bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override { |
2561 | 0 | if ( op->op == GGML_OP_MUL_MAT && |
2562 | 0 | op->src[0]->buffer && |
2563 | 0 | (ggml_n_dims(op->src[0]) == 2) && |
2564 | 0 | op->src[0]->buffer->buft == ggml_backend_cpu_repack_buffer_type() && |
2565 | 0 | ggml_repack_get_optimal_repack_type(op->src[0]) |
2566 | 0 | ) { |
2567 | 0 | if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) { |
2568 | 0 | return false; |
2569 | 0 | } |
2570 | 0 | if (op->src[1]->type == GGML_TYPE_F32) { |
2571 | 0 | return true; |
2572 | 0 | } |
2573 | | //if (op->src[1]->type == GGML_TYPE_Q8_0) { |
2574 | | // return true; |
2575 | | //} |
2576 | | // may be possible if Q8_0 packed... |
2577 | 0 | } else if (op->op == GGML_OP_MUL_MAT_ID |
2578 | 0 | && op->src[0]->buffer |
2579 | 0 | && (ggml_n_dims(op->src[0]) == 3) |
2580 | 0 | && op->src[0]->buffer->buft == ggml_backend_cpu_repack_buffer_type() |
2581 | 0 | && ggml_repack_get_optimal_repack_type(op->src[0]) |
2582 | 0 | ) { |
2583 | 0 | if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) { |
2584 | 0 | return false; |
2585 | 0 | } |
2586 | 0 | if (op->src[1]->type == GGML_TYPE_F32) { |
2587 | 0 | return true; |
2588 | 0 | } |
2589 | | //if (op->src[1]->type == GGML_TYPE_Q8_0) { |
2590 | | // return true; |
2591 | | //} |
2592 | 0 | } |
2593 | 0 | return false; |
2594 | 0 | } |
2595 | | |
2596 | 0 | ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override { |
2597 | 0 | if (op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_MUL_MAT_ID) { |
2598 | 0 | if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_repack_buffer_type()) { |
2599 | 0 | return (ggml::cpu::tensor_traits *) op->src[0]->extra; |
2600 | 0 | } |
2601 | 0 | } |
2602 | 0 | return nullptr; |
2603 | 0 | } |
2604 | | }; |
2605 | | } // namespace ggml::cpu::repack |
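
supports_op() above gates the repack path fairly narrowly: the weight must be 2-D for GGML_OP_MUL_MAT (or 3-D for GGML_OP_MUL_MAT_ID), it must live in the CPU_REPACK buffer type with a known layout, and the activations must be F32 in host memory (the commented-out branches hint that pre-quantized Q8_0 activations are not handled yet). A compact restatement of those conditions using a stub descriptor rather than ggml_tensor; nothing below is ggml API:

    struct op_desc {
        bool is_mul_mat_id;          // GGML_OP_MUL_MAT_ID instead of GGML_OP_MUL_MAT
        bool src0_in_repack_buffer;  // weight allocated in the CPU_REPACK buffer type
        int  src0_n_dims;            // 2 for plain weights, 3 for stacked experts
        bool src1_is_f32;            // activations are F32
        bool src1_on_host;           // activations are host-accessible
    };

    static bool repack_supports(const op_desc & op) {
        const int want_dims = op.is_mul_mat_id ? 3 : 2;
        return op.src0_in_repack_buffer
            && op.src0_n_dims == want_dims
            && op.src1_on_host
            && op.src1_is_f32;
    }

    int main() {
        const op_desc mm = { false, true, 2, true, true };
        return repack_supports(mm) ? 0 : 1; // 0: handled by the repack kernels
    }
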
2606 | | |
2607 | 0 | ggml_backend_buffer_type_t ggml_backend_cpu_repack_buffer_type(void) { |
2608 | 0 | static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_repack = { |
2609 | 0 | /* .iface = */ { |
2610 | 0 | /* .get_name = */ ggml_backend_cpu_repack_buffer_type_get_name, |
2611 | 0 | /* .alloc_buffer = */ ggml_backend_cpu_repack_buffer_type_alloc_buffer, |
2612 | 0 | /* .get_alignment = */ ggml_backend_cpu_repack_buffer_type_get_alignment, |
2613 | 0 | /* .get_max_size = */ nullptr, // defaults to SIZE_MAX |
2614 | 0 | /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes |
2615 | 0 | /* .is_host = */ nullptr, |
2616 | 0 | }, |
2617 | 0 | /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), |
2618 | 0 | /* .context = */ new ggml::cpu::repack::extra_buffer_type(), |
2619 | 0 | }; |
2620 | |
2621 | 0 | return &ggml_backend_cpu_buffer_type_repack; |
2622 | 0 | } |
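
Putting the pieces together, a weight only has to be allocated in this buffer type; the conversion to the interleaved layout then happens transparently when the quantized bytes are uploaded. The sketch below assumes the host CPU supports one of the layouts selected above (otherwise tensor->extra stays null and this path does not apply), that ggml_backend_cpu_repack_buffer_type() is visible to the caller through the CPU backend headers, and that q4_data would hold real block_q4_0 data in practice; the rest is standard ggml API:

    #include "ggml.h"
    #include "ggml-alloc.h"
    #include "ggml-backend.h"
    #include "ggml-cpu.h"
    #include <cstdint>
    #include <vector>

    int main() {
        ggml_init_params params = {
            /*.mem_size   =*/ ggml_tensor_overhead() * 8,
            /*.mem_buffer =*/ nullptr,
            /*.no_alloc   =*/ true, // tensor data lives in the backend buffer below
        };
        ggml_context * ctx = ggml_init(params);

        // 4096 x 4096 Q4_0 weight; ne[1] % 8 == 0, so e.g. the 8x8 layout is eligible.
        ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 4096, 4096);

        ggml_backend_buffer_t buf =
            ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_repack_buffer_type());

        std::vector<uint8_t> q4_data(ggml_nbytes(w));                   // placeholder contents
        ggml_backend_tensor_set(w, q4_data.data(), 0, q4_data.size());  // repacked on upload

        ggml_backend_buffer_free(buf);
        ggml_free(ctx);
        return 0;
    }
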