/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp
Line | Count | Source |
1 | | #define GGML_COMMON_IMPL_CPP |
2 | | #define GGML_COMMON_DECL_CPP |
3 | | #include "ggml-common.h" |
4 | | #include "ggml-backend-impl.h" |
5 | | |
6 | | #include "ggml-impl.h" |
7 | | #include "ggml-cpu.h" |
8 | | #include "ggml-cpu-impl.h" |
9 | | #include "simd-mappings.h" |
10 | | #include "traits.h" |
11 | | |
12 | | #include "arch-fallback.h" |
13 | | |
14 | | #include <cmath> |
15 | | #include <cstring> |
16 | | #include <cassert> |
17 | | #include <cstdio> // for GGML_ASSERT |
18 | | |
19 | | #include "repack.h" |
20 | | |
21 | | #if defined(__GNUC__) |
22 | | #pragma GCC diagnostic ignored "-Woverlength-strings" |
23 | | #endif |
24 | | |
25 | 0 | #define UNUSED GGML_UNUSED |
26 | | |
27 | 0 | static inline int nearest_int(float fval) { |
28 | 0 | assert(fabsf(fval) <= 4194303.f); |
29 | 0 | float val = fval + 12582912.f; |
30 | 0 | int i; memcpy(&i, &val, sizeof(int)); |
31 | 0 | return (i & 0x007fffff) - 0x00400000; |
32 | 0 | } |
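// A brief note on the rounding trick above (sketch only; the helper name below is
// illustrative and not part of this file): 12582912.f is 1.5 * 2^23, so for
// |fval| <= 2^22 - 1 the addition lands in a binade where the low 23 mantissa
// bits hold the rounded integer plus a bias of 0x00400000; masking and subtracting
// the bias therefore yields round-to-nearest-even, matching lrintf in the default
// rounding mode.
//
//     static inline int nearest_int_sketch(float fval) {
//         float val = fval + 12582912.f;            // 1.5 * 2^23 forces a fixed exponent
//         int i; memcpy(&i, &val, sizeof(int));     // reinterpret the float bits
//         return (i & 0x007fffff) - 0x00400000;     // mantissa bits minus the bias
//     }
//     // nearest_int_sketch(2.5f) == 2, nearest_int_sketch(3.5f) == 4 (ties to even)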
33 | | |
34 | | // Functions to create the interleaved data layout formats |
35 | | |
36 | | // interleave 4 block_q4_0s in blocks of blck_size_interleave |
37 | | // returns an interleaved block_q4_0x4 |
38 | | // in the interleaved block_q4_0x4, place deltas for 4 block_q4_0 blocks |
39 | | // first, then interleave quants from 4 block_q4_0s in blocks of blck_size_interleave |
40 | | // |
41 | | // - in : a pointer to an array of block_q4_0 blocks |
42 | | // - blck_size_interleave : the block_q4_0 quants bytes are interleaved in blocks of |
43 | | // blck_size_interleave bytes |
44 | | // - xor_mask : the mask to convert the nibbles in block_q4_0 quants bytes |
45 | | // from bias offset form to pure sign form (this saves subtract |
46 | | // operations during unpacking) |
47 | | // |
48 | | |
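// A compact illustration of the xor_mask trick (helper and values below are
// illustrative only): Q4_0 stores each weight as an unsigned nibble q in [0, 15]
// with real value d * (q - 8). XOR-ing a nibble with 0x8 flips its top bit,
// producing exactly (q - 8) as a 4-bit two's-complement value, so the kernels
// can sign-extend with shifts instead of subtracting 8 from every nibble.
//
//     static inline int8_t low_nibble_signed(uint8_t packed) {
//         // 'packed' already had 0x88 xor-ed in during repacking
//         return (int8_t)(packed << 4) >> 4;        // arithmetic shift recovers q - 8
//     }
//     // q = 3  -> stored nibble 3 ^ 8 = 0xB -> low_nibble_signed(0x0B) == -5 == 3 - 8
//     // q = 12 -> stored nibble 12 ^ 8 = 0x4 -> low_nibble_signed(0x04) ==  4 == 12 - 8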
49 | | extern "C" { |
50 | | |
51 | 0 | void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { |
52 | 0 | assert(QK8_0 == 32); |
53 | 0 | assert(k % QK8_0 == 0); |
54 | 0 | const int nb = k / QK8_0; |
55 | |
|
56 | 0 | block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy; |
57 | | |
58 | | // scalar |
59 | 0 | const int blck_size_interleave = 4; |
60 | 0 | float srcv[4][QK8_0]; |
61 | 0 | float id[4]; |
62 | |
|
63 | 0 | for (int i = 0; i < nb; i++) { |
64 | 0 | for (int row_iter = 0; row_iter < 4; row_iter++) { |
65 | 0 | float amax = 0.0f; // absolute max |
66 | |
|
67 | 0 | for (int j = 0; j < QK8_0; j++) { |
68 | 0 | srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j]; |
69 | 0 | amax = MAX(amax, fabsf(srcv[row_iter][j])); |
70 | 0 | } |
71 | |
|
72 | 0 | const float d = amax / ((1 << 7) - 1); |
73 | 0 | id[row_iter] = d ? 1.0f / d : 0.0f; |
74 | |
|
75 | 0 | y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d); |
76 | 0 | } |
77 | |
|
78 | 0 | for (int j = 0; j < QK8_0 * 4; j++) { |
79 | 0 | int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave; |
80 | 0 | int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave; |
81 | 0 | src_offset += (j % blck_size_interleave); |
82 | |
|
83 | 0 | float x0 = srcv[src_id][src_offset] * id[src_id]; |
84 | 0 | y[i].qs[j] = roundf(x0); |
85 | 0 | } |
86 | 0 | } |
87 | 0 | } |
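// The interleave index math above, restated (illustrative only): with
// blck_size_interleave = 4, destination byte j is taken from source row
// (j % 16) / 4 at column (j / 16) * 4 + (j % 4), i.e. the four rows contribute
// 4-byte groups in round-robin order.
//
//     for (int j = 0; j < QK8_0 * 4; j++) {
//         int src_id     = (j % 16) / 4;            // which of the 4 rows
//         int src_offset = (j / 16) * 4 + (j % 4);  // byte within that row
//         // j =  0..15 -> rows 0..3, cols 0..3 each; j = 16..31 -> rows 0..3, cols 4..7; ...
//     }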
88 | | |
89 | 0 | void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { |
90 | 0 | assert(QK8_0 == 32); |
91 | 0 | assert(k % QK8_0 == 0); |
92 | 0 | const int nb = k / QK8_0; |
93 | |
|
94 | 0 | block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy; |
95 | | |
96 | | // scalar |
97 | 0 | const int blck_size_interleave = 8; |
98 | 0 | float srcv[4][QK8_0]; |
99 | 0 | float id[4]; |
100 | |
|
101 | 0 | for (int i = 0; i < nb; i++) { |
102 | 0 | for (int row_iter = 0; row_iter < 4; row_iter++) { |
103 | 0 | float amax = 0.0f; // absolute max |
104 | |
|
105 | 0 | for (int j = 0; j < QK8_0; j++) { |
106 | 0 | srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j]; |
107 | 0 | amax = MAX(amax, fabsf(srcv[row_iter][j])); |
108 | 0 | } |
109 | |
|
110 | 0 | const float d = amax / ((1 << 7) - 1); |
111 | 0 | id[row_iter] = d ? 1.0f / d : 0.0f; |
112 | |
|
113 | 0 | y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d); |
114 | 0 | } |
115 | |
|
116 | 0 | for (int j = 0; j < QK8_0 * 4; j++) { |
117 | 0 | int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave; |
118 | 0 | int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave; |
119 | 0 | src_offset += (j % blck_size_interleave); |
120 | |
|
121 | 0 | float x0 = srcv[src_id][src_offset] * id[src_id]; |
122 | 0 | y[i].qs[j] = roundf(x0); |
123 | 0 | } |
124 | 0 | } |
125 | 0 | } |
126 | | |
127 | 0 | void ggml_quantize_mat_q8_K_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { |
128 | 0 | assert(QK_K == 256); |
129 | 0 | assert(k % QK_K == 0); |
130 | 0 | const int nb = k / QK_K; |
131 | |
|
132 | 0 | block_q8_Kx4 * GGML_RESTRICT y = (block_q8_Kx4 *) vy; |
133 | | |
134 | | // scalar |
135 | 0 | const int blck_size_interleave = 8; |
136 | 0 | float srcv[4][QK_K]; |
137 | 0 | float iscale[4]; |
138 | |
|
139 | 0 | for (int i = 0; i < nb; i++) { |
140 | 0 | for (int row_iter = 0; row_iter < 4; row_iter++) { |
141 | 0 | float amax = 0.0f; // absolute max |
142 | 0 | float max = 0; |
143 | |
|
144 | 0 | for (int j = 0; j < QK_K; j++) { |
145 | 0 | srcv[row_iter][j] = x[row_iter * k + i * QK_K + j]; |
146 | | // Update the maximum value of the corresponding super block |
147 | 0 | if(amax < fabsf(srcv[row_iter][j])) { |
148 | 0 | amax = fabsf(srcv[row_iter][j]); |
149 | 0 | max = srcv[row_iter][j]; |
150 | 0 | } |
151 | 0 | } |
152 | |
|
153 | 0 | iscale[row_iter] = amax ? -127.f/max : 0; |
154 | |
|
155 | 0 | y[i].d[row_iter] = amax ? 1/iscale[row_iter] : 0; |
156 | 0 | } |
157 | |
|
158 | 0 | for (int j = 0; j < QK_K / 4; j++) { |
159 | 0 | y[i].bsums[j] = 0; |
160 | 0 | } |
161 | | |
162 | | // Quant values are interleaved in sequences of eight bytes from the corresponding super blocks |
163 | | // Bsums values are interleaved in sequences of four bsums from each super block taken for interleaving, |
164 | | // i.e. the first four bsums from the first super block, followed by the first four bsums from the second super block, and so on |
165 | 0 | for (int j = 0; j < QK_K * 4; j++) { |
166 | 0 | int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave; |
167 | 0 | int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave; |
168 | 0 | src_offset += (j % blck_size_interleave); |
169 | 0 | int index = (((j & 31) >> 3) << 2) + ((j >> 8) << 4) + ((j >> 6) & 3); |
170 | |
|
171 | 0 | float x0 = srcv[src_id][src_offset] * iscale[src_id]; |
172 | 0 | y[i].qs[j] = nearest_int(x0); |
173 | 0 | y[i].bsums[index] += y[i].qs[j]; |
174 | 0 | } |
175 | 0 | } |
176 | 0 | } |
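// Decoding the bsums index expression above (illustrative restatement): with
// j in [0, 1024), row = (j & 31) >> 3 is the source row of the current 8-byte
// group, chunk = (j >> 6) & 3 selects which 16-value slice of that row inside
// the current 64-quant quarter, and quarter = j >> 8. The target slot
// quarter * 16 + row * 4 + chunk therefore stores four consecutive bsums per
// row, row-major within each quarter, matching the comment above.
//
//     int row     = (j & 31) >> 3;                   // 0..3
//     int chunk   = (j >> 6) & 3;                    // 0..3
//     int quarter =  j >> 8;                         // 0..3
//     int index   = quarter * 16 + row * 4 + chunk;  // == (((j & 31) >> 3) << 2) + ((j >> 8) << 4) + ((j >> 6) & 3)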
177 | | |
178 | | } // extern "C" |
179 | | |
180 | | template <int64_t INTER_SIZE, ggml_type PARAM_TYPE> |
181 | | void ggml_quantize_mat_t(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row); |
182 | | |
183 | 0 | template <> void ggml_quantize_mat_t<4, GGML_TYPE_Q8_0>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) { |
184 | 0 | assert(nrow == 4); |
185 | 0 | UNUSED(nrow); |
186 | 0 | ggml_quantize_mat_q8_0_4x4(x, vy, n_per_row); |
187 | 0 | } |
188 | | |
189 | 0 | template <> void ggml_quantize_mat_t<8, GGML_TYPE_Q8_0>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) { |
190 | 0 | assert(nrow == 4); |
191 | 0 | UNUSED(nrow); |
192 | 0 | ggml_quantize_mat_q8_0_4x8(x, vy, n_per_row); |
193 | 0 | } |
194 | | |
195 | 0 | template <> void ggml_quantize_mat_t<8, GGML_TYPE_Q8_K>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) { |
196 | 0 | assert(nrow == 4); |
197 | 0 | UNUSED(nrow); |
198 | 0 | ggml_quantize_mat_q8_K_4x8(x, vy, n_per_row); |
199 | 0 | } |
200 | | |
201 | | extern "C" { |
202 | | |
203 | 0 | void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { |
204 | 0 | const int qk = QK8_0; |
205 | 0 | const int nb = n / qk; |
206 | 0 | const int ncols_interleaved = 4; |
207 | 0 | const int blocklen = 4; |
208 | |
|
209 | 0 | assert(nr == 1); |
210 | 0 | assert(n % qk == 0); |
211 | 0 | assert(nc % ncols_interleaved == 0); |
212 | |
|
213 | 0 | UNUSED(s); |
214 | 0 | UNUSED(bs); |
215 | 0 | UNUSED(vx); |
216 | 0 | UNUSED(vy); |
217 | 0 | UNUSED(nr); |
218 | 0 | UNUSED(nc); |
219 | 0 | UNUSED(nb); |
220 | 0 | UNUSED(ncols_interleaved); |
221 | 0 | UNUSED(blocklen); |
222 | |
|
223 | 0 | float sumf[4]; |
224 | 0 | int sumi; |
225 | |
|
226 | 0 | const block_q8_0 * a_ptr = (const block_q8_0 *) vy; |
227 | 0 | for (int x = 0; x < nc / ncols_interleaved; x++) { |
228 | 0 | const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); |
229 | |
|
230 | 0 | for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0; |
231 | 0 | for (int l = 0; l < nb; l++) { |
232 | 0 | for (int k = 0; k < (qk / (2 * blocklen)); k++) { |
233 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
234 | 0 | sumi = 0; |
235 | 0 | for (int i = 0; i < blocklen; ++i) { |
236 | 0 | const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); |
237 | 0 | const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); |
238 | 0 | sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; |
239 | 0 | } |
240 | 0 | sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); |
241 | 0 | } |
242 | 0 | } |
243 | 0 | } |
244 | 0 | for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j]; |
245 | 0 | } |
246 | 0 | } |
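// Why the ">> 4" above is exact (sketch, names illustrative): after the repack
// xor, v0 = (int8_t)(byte << 4) and v1 = (int8_t)(byte & 0xF0) both carry the
// signed nibble value multiplied by 16, so the sum of the two products is a
// multiple of 16 and the arithmetic right shift removes that factor without loss.
//
//     int v0 = (int8_t)(byte << 4);                  // (q_lo - 8) * 16
//     int v1 = (int8_t)(byte & 0xF0);                // (q_hi - 8) * 16
//     int contrib = (v0 * a_lo + v1 * a_hi) >> 4;    // == (q_lo - 8) * a_lo + (q_hi - 8) * a_hi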
247 | | |
248 | 0 | void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { |
249 | 0 | const int qk = QK8_0; |
250 | 0 | const int nb = n / qk; |
251 | 0 | const int ncols_interleaved = 4; |
252 | 0 | const int blocklen = 8; |
253 | |
|
254 | 0 | assert (n % qk == 0); |
255 | 0 | assert (nc % ncols_interleaved == 0); |
256 | |
|
257 | 0 | UNUSED(s); |
258 | 0 | UNUSED(bs); |
259 | 0 | UNUSED(vx); |
260 | 0 | UNUSED(vy); |
261 | 0 | UNUSED(nr); |
262 | 0 | UNUSED(nc); |
263 | 0 | UNUSED(nb); |
264 | 0 | UNUSED(ncols_interleaved); |
265 | 0 | UNUSED(blocklen); |
266 | |
|
267 | 0 | float sumf[4]; |
268 | 0 | int sumi; |
269 | |
|
270 | 0 | const block_q8_0 * a_ptr = (const block_q8_0 *) vy; |
271 | 0 | for (int x = 0; x < nc / ncols_interleaved; x++) { |
272 | 0 | const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); |
273 | |
|
274 | 0 | for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0; |
275 | 0 | for (int l = 0; l < nb; l++) { |
276 | 0 | for (int k = 0; k < (qk / (2 * blocklen)); k++) { |
277 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
278 | 0 | sumi = 0; |
279 | 0 | for (int i = 0; i < blocklen; ++i) { |
280 | 0 | const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); |
281 | 0 | const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); |
282 | 0 | sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; |
283 | 0 | } |
284 | 0 | sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); |
285 | 0 | } |
286 | 0 | } |
287 | 0 | } |
288 | 0 | for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j]; |
289 | 0 | } |
290 | 0 | } |
291 | | |
292 | 0 | void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { |
293 | 0 | const int qk = QK8_0; |
294 | 0 | const int nb = n / qk; |
295 | 0 | const int ncols_interleaved = 8; |
296 | 0 | const int blocklen = 8; |
297 | |
|
298 | 0 | assert (n % qk == 0); |
299 | 0 | assert (nc % ncols_interleaved == 0); |
300 | |
|
301 | 0 | UNUSED(s); |
302 | 0 | UNUSED(bs); |
303 | 0 | UNUSED(vx); |
304 | 0 | UNUSED(vy); |
305 | 0 | UNUSED(nr); |
306 | 0 | UNUSED(nc); |
307 | 0 | UNUSED(nb); |
308 | 0 | UNUSED(ncols_interleaved); |
309 | 0 | UNUSED(blocklen); |
310 | |
|
311 | 0 | float sumf[8]; |
312 | 0 | int sumi; |
313 | |
|
314 | 0 | const block_q8_0 * a_ptr = (const block_q8_0 *) vy; |
315 | 0 | for (int x = 0; x < nc / ncols_interleaved; x++) { |
316 | 0 | const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb); |
317 | |
|
318 | 0 | for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0; |
319 | 0 | for (int l = 0; l < nb; l++) { |
320 | 0 | for (int k = 0; k < (qk / (2 * blocklen)); k++) { |
321 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
322 | 0 | sumi = 0; |
323 | 0 | for (int i = 0; i < blocklen; ++i) { |
324 | 0 | const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); |
325 | 0 | const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); |
326 | 0 | sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; |
327 | 0 | } |
328 | 0 | sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); |
329 | 0 | } |
330 | 0 | } |
331 | 0 | } |
332 | 0 | for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j]; |
333 | 0 | } |
334 | 0 | } |
335 | | |
336 | 0 | void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { |
337 | 0 | const int qk = QK_K; |
338 | 0 | const int nb = n / qk; |
339 | 0 | const int ncols_interleaved = 8; |
340 | 0 | const int blocklen = 8; |
341 | 0 | static const uint32_t kmask1 = 0x3f3f3f3f; |
342 | 0 | static const uint32_t kmask2 = 0x0f0f0f0f; |
343 | 0 | static const uint32_t kmask3 = 0x03030303; |
344 | |
|
345 | 0 | assert (n % qk == 0); |
346 | 0 | assert (nc % ncols_interleaved == 0); |
347 | |
|
348 | 0 | UNUSED(s); |
349 | 0 | UNUSED(bs); |
350 | 0 | UNUSED(vx); |
351 | 0 | UNUSED(vy); |
352 | 0 | UNUSED(nr); |
353 | 0 | UNUSED(nc); |
354 | 0 | UNUSED(nb); |
355 | 0 | UNUSED(ncols_interleaved); |
356 | 0 | UNUSED(blocklen); |
357 | |
|
358 | 0 | float sumf[8]; |
359 | 0 | float sum_minf[8]; |
360 | 0 | uint32_t utmp[32]; |
361 | 0 | int sumi1; |
362 | 0 | int sumi2; |
363 | 0 | int sumi; |
364 | |
|
365 | 0 | const block_q8_K * a_ptr = (const block_q8_K *) vy; |
366 | 0 | for (int x = 0; x < nc / ncols_interleaved; x++) { |
367 | 0 | const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb); |
368 | |
|
369 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
370 | 0 | sumf[j] = 0.0; |
371 | 0 | sum_minf[j] = 0.0; |
372 | 0 | } |
373 | 0 | for (int l = 0; l < nb; l++) { |
374 | 0 | for (int sb = 0; sb < 8; sb++) { |
375 | 0 | memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12); |
376 | 0 | utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4); |
377 | 0 | const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1; |
378 | 0 | utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4); |
379 | 0 | utmp[sb * 4 + 2] = uaux_0; |
380 | 0 | utmp[sb * 4 + 0] &= kmask1; |
381 | 0 | } |
382 | 0 | for (int k = 0; k < (qk / (2 * blocklen)); k++) { |
383 | 0 | uint8_t *scales_0 = (uint8_t*) utmp + (k / 4) * 32; |
384 | 0 | uint8_t *scales_1 = (uint8_t*) utmp + (k / 4) * 32 + 16; |
385 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
386 | 0 | sumi1 = 0; |
387 | 0 | sumi2 = 0; |
388 | 0 | sumi = 0; |
389 | 0 | for (int i = 0; i < blocklen; ++i) { |
390 | 0 | const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF); |
391 | 0 | const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4); |
392 | 0 | sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 64 + (k % 4) * blocklen + i]); |
393 | 0 | sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 64 + (k % 4) * blocklen + i + 32]); |
394 | 0 | sumi1 = sumi1 * scales_0[j]; |
395 | 0 | sumi2 = sumi2 * scales_1[j]; |
396 | 0 | sumi += sumi1 + sumi2; |
397 | 0 | } |
398 | 0 | sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d; |
399 | 0 | } |
400 | 0 | } |
401 | 0 | for (int sb = 0; sb < 8; sb++) { |
402 | 0 | uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16; |
403 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
404 | 0 | sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d; |
405 | 0 | } |
406 | 0 | } |
407 | 0 | } |
408 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
409 | 0 | s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j]; |
410 | 0 | } |
411 | 0 | } |
412 | 0 | } |
413 | | |
414 | 0 | void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { |
415 | 0 | const int qk = QK_K; |
416 | 0 | const int nb = n / qk; |
417 | 0 | const int ncols_interleaved = 8; |
418 | 0 | const int blocklen = 8; |
419 | |
|
420 | 0 | assert (n % qk == 0); |
421 | 0 | assert (nc % ncols_interleaved == 0); |
422 | |
|
423 | 0 | UNUSED(s); |
424 | 0 | UNUSED(bs); |
425 | 0 | UNUSED(vx); |
426 | 0 | UNUSED(vy); |
427 | 0 | UNUSED(nr); |
428 | 0 | UNUSED(nc); |
429 | 0 | UNUSED(nb); |
430 | 0 | UNUSED(ncols_interleaved); |
431 | 0 | UNUSED(blocklen); |
432 | |
|
433 | 0 | float sumf[8]; |
434 | 0 | float sum_minf[8]; |
435 | 0 | int sumi1,sumi2,sumi3,sumi4; |
436 | 0 | int sumi; |
437 | |
|
438 | 0 | const block_q8_K * a_ptr = (const block_q8_K *)vy; |
439 | 0 | for(int x = 0; x < nc / ncols_interleaved; x++) { |
440 | 0 | const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb); |
441 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
442 | 0 | sumf[j] = 0.0; |
443 | 0 | sum_minf[j] = 0.0; |
444 | 0 | } |
445 | 0 | for (int l = 0; l < nb; l++) { |
446 | 0 | for (int k = 0; k < (qk / (4 * blocklen)); k++) { |
447 | 0 | const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64 ; |
448 | 0 | const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16; |
449 | 0 | const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32; |
450 | 0 | const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48; |
451 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
452 | 0 | sumi1 = 0; |
453 | 0 | sumi2 = 0; |
454 | 0 | sumi3 = 0; |
455 | 0 | sumi4 = 0; |
456 | 0 | sumi = 0; |
457 | 0 | int offset = ((k / 2) % 2) + j * 2; |
458 | 0 | for (int i = 0; i < blocklen; ++i){ |
459 | 0 | const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 3); |
460 | 0 | const int v1 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 2 ) & 3); |
461 | 0 | const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4 ) & 3); |
462 | 0 | const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6 ) & 3); |
463 | 0 | sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i]); |
464 | 0 | sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 32]); |
465 | 0 | sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 64]); |
466 | 0 | sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 96]); |
467 | |
|
468 | 0 | sumi1 = sumi1 * (scales_0[offset] & 0xF); |
469 | 0 | sumi2 = sumi2 * (scales_1[offset] & 0xF); |
470 | 0 | sumi3 = sumi3 * (scales_2[offset] & 0xF); |
471 | 0 | sumi4 = sumi4 * (scales_3[offset] & 0xF); |
472 | 0 | sumi += sumi1 + sumi2 + sumi3 + sumi4; |
473 | 0 | } |
474 | 0 | sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d; |
475 | 0 | } |
476 | 0 | } |
477 | 0 | for(int sb = 0; sb < 8; sb++) { |
478 | 0 | const uint8_t *mins = b_ptr[l].scales + sb * 16; |
479 | 0 | for(int j = 0; j < ncols_interleaved; j++){ |
480 | 0 | sum_minf[j] += ((mins[j * 2] >> 4) * a_ptr[l].bsums[sb * 2] + (mins[(j * 2)+ 1] >> 4) * a_ptr[l].bsums[sb * 2 + 1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d; |
481 | 0 | } |
482 | 0 | } |
483 | 0 | } |
484 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
485 | 0 | s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j]; |
486 | 0 | } |
487 | 0 | } |
488 | 0 | } |
489 | | |
490 | 0 | void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { |
491 | 0 | const int qk = QK8_0; |
492 | 0 | const int nb = n / qk; |
493 | 0 | const int ncols_interleaved = 4; |
494 | 0 | const int blocklen = 4; |
495 | |
|
496 | 0 | assert(nr == 1); |
497 | 0 | assert(n % qk == 0); |
498 | 0 | assert(nc % ncols_interleaved == 0); |
499 | |
|
500 | 0 | UNUSED(bs); |
501 | 0 | UNUSED(nr); |
502 | |
|
503 | 0 | float sumf[4]; |
504 | 0 | int sumi; |
505 | |
|
506 | 0 | const block_q8_0 * a_ptr = (const block_q8_0 *) vy; |
507 | 0 | for (int x = 0; x < nc / ncols_interleaved; x++) { |
508 | 0 | const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb); |
509 | |
|
510 | 0 | for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0; |
511 | 0 | for (int l = 0; l < nb; l++) { |
512 | 0 | for (int k = 0; k < (qk / (2 * blocklen)); k++) { |
513 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
514 | 0 | sumi = 0; |
515 | 0 | for (int i = 0; i < blocklen; ++i) { |
516 | 0 | const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F]; |
517 | 0 | const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4]; |
518 | 0 | sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])); |
519 | 0 | } |
520 | 0 | sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); |
521 | 0 | } |
522 | 0 | } |
523 | 0 | } |
524 | 0 | for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j]; |
525 | 0 | } |
526 | 0 | } |
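// IQ4_NL differs from Q4_0 here in that the 4-bit value is not an offset code
// but an index into the fixed 16-entry signed codebook kvalues_iq4nl, so the
// nibble is used directly as a table lookup and no xor_mask or ">> 4" rescale
// is needed afterwards (sketch only):
//
//     int v = kvalues_iq4nl[nibble & 0x0F];          // nibble -> dequantized int8 level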
527 | | |
528 | 0 | void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { |
529 | 0 | const int qk = QK8_0; |
530 | 0 | const int nb = n / qk; |
531 | 0 | const int ncols_interleaved = 8; |
532 | 0 | const int blocklen = 8; |
533 | |
|
534 | 0 | assert(nr == 1); |
535 | 0 | assert(n % qk == 0); |
536 | 0 | assert(nc % ncols_interleaved == 0); |
537 | |
|
538 | 0 | UNUSED(bs); |
539 | 0 | UNUSED(nr); |
540 | |
|
541 | 0 | float sumf[8]; |
542 | 0 | int sumi; |
543 | |
|
544 | 0 | const block_q8_0 * a_ptr = (const block_q8_0 *) vy; |
545 | 0 | for (int x = 0; x < nc / ncols_interleaved; x++) { |
546 | 0 | const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb); |
547 | |
|
548 | 0 | for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0; |
549 | 0 | for (int l = 0; l < nb; l++) { |
550 | 0 | for (int k = 0; k < (qk / (2 * blocklen)); k++) { |
551 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
552 | 0 | sumi = 0; |
553 | 0 | for (int i = 0; i < blocklen; ++i) { |
554 | 0 | const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F]; |
555 | 0 | const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4]; |
556 | 0 | sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])); |
557 | 0 | } |
558 | 0 | sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); |
559 | 0 | } |
560 | 0 | } |
561 | 0 | } |
562 | 0 | for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j]; |
563 | 0 | } |
564 | 0 | } |
565 | | |
566 | 0 | void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { |
567 | 0 | const int qk = QK8_0; |
568 | 0 | const int nb = n / qk; |
569 | 0 | const int ncols_interleaved = 4; |
570 | 0 | const int blocklen = 4; |
571 | |
|
572 | 0 | assert (n % qk == 0); |
573 | 0 | assert (nr % 4 == 0); |
574 | 0 | assert (nc % ncols_interleaved == 0); |
575 | |
|
576 | 0 | UNUSED(s); |
577 | 0 | UNUSED(bs); |
578 | 0 | UNUSED(vx); |
579 | 0 | UNUSED(vy); |
580 | 0 | UNUSED(nr); |
581 | 0 | UNUSED(nc); |
582 | 0 | UNUSED(nb); |
583 | 0 | UNUSED(ncols_interleaved); |
584 | 0 | UNUSED(blocklen); |
585 | |
|
586 | 0 | { |
587 | 0 | float sumf[4][4]; |
588 | 0 | int sumi; |
589 | |
|
590 | 0 | for (int y = 0; y < nr / 4; y++) { |
591 | 0 | const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); |
592 | 0 | for (int x = 0; x < nc / ncols_interleaved; x++) { |
593 | 0 | const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); |
594 | 0 | for (int m = 0; m < 4; m++) { |
595 | 0 | for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0; |
596 | 0 | } |
597 | 0 | for (int l = 0; l < nb; l++) { |
598 | 0 | for (int k = 0; k < (qk / (2 * blocklen)); k++) { |
599 | 0 | for (int m = 0; m < 4; m++) { |
600 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
601 | 0 | sumi = 0; |
602 | 0 | for (int i = 0; i < blocklen; ++i) { |
603 | 0 | const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); |
604 | 0 | const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); |
605 | 0 | sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + |
606 | 0 | (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; |
607 | 0 | } |
608 | 0 | sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); |
609 | 0 | } |
610 | 0 | } |
611 | 0 | } |
612 | 0 | } |
613 | 0 | for (int m = 0; m < 4; m++) { |
614 | 0 | for (int j = 0; j < ncols_interleaved; j++) |
615 | 0 | s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]; |
616 | 0 | } |
617 | 0 | } |
618 | 0 | } |
619 | 0 | } |
620 | 0 | } |
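// Output addressing used by the GEMM kernels in this file (restated,
// illustrative): rows are processed in tiles of 4 (matching block_q8_0x4) and
// columns in tiles of ncols_interleaved, and bs is the row stride of the
// destination in floats.
//
//     float * dst_row = s + (y * 4 + m) * bs;              // start of output row y*4 + m
//     dst_row[x * ncols_interleaved + j] = sumf[m][j];     // column x*ncols_interleaved + j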
621 | | |
622 | 0 | void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { |
623 | 0 | const int qk = QK8_0; |
624 | 0 | const int nb = n / qk; |
625 | 0 | const int ncols_interleaved = 4; |
626 | 0 | const int blocklen = 8; |
627 | |
|
628 | 0 | assert (n % qk == 0); |
629 | 0 | assert (nr % 4 == 0); |
630 | 0 | assert (nc % ncols_interleaved == 0); |
631 | |
|
632 | 0 | UNUSED(s); |
633 | 0 | UNUSED(bs); |
634 | 0 | UNUSED(vx); |
635 | 0 | UNUSED(vy); |
636 | 0 | UNUSED(nr); |
637 | 0 | UNUSED(nc); |
638 | 0 | UNUSED(nb); |
639 | 0 | UNUSED(ncols_interleaved); |
640 | 0 | UNUSED(blocklen); |
641 | |
|
642 | 0 | float sumf[4][4]; |
643 | 0 | int sumi; |
644 | |
|
645 | 0 | for (int y = 0; y < nr / 4; y++) { |
646 | 0 | const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); |
647 | 0 | for (int x = 0; x < nc / ncols_interleaved; x++) { |
648 | 0 | const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); |
649 | 0 | for (int m = 0; m < 4; m++) { |
650 | 0 | for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0; |
651 | 0 | } |
652 | 0 | for (int l = 0; l < nb; l++) { |
653 | 0 | for (int k = 0; k < (qk / (2 * blocklen)); k++) { |
654 | 0 | for (int m = 0; m < 4; m++) { |
655 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
656 | 0 | sumi = 0; |
657 | 0 | for (int i = 0; i < blocklen; ++i) { |
658 | 0 | const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); |
659 | 0 | const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); |
660 | 0 | sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + |
661 | 0 | (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; |
662 | 0 | } |
663 | 0 | sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); |
664 | 0 | } |
665 | 0 | } |
666 | 0 | } |
667 | 0 | } |
668 | 0 | for (int m = 0; m < 4; m++) { |
669 | 0 | for (int j = 0; j < ncols_interleaved; j++) |
670 | 0 | s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]; |
671 | 0 | } |
672 | 0 | } |
673 | 0 | } |
674 | 0 | } |
675 | | |
676 | 0 | void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { |
677 | 0 | const int qk = QK8_0; |
678 | 0 | const int nb = n / qk; |
679 | 0 | const int ncols_interleaved = 8; |
680 | 0 | const int blocklen = 8; |
681 | |
|
682 | 0 | assert (n % qk == 0); |
683 | 0 | assert (nr % 4 == 0); |
684 | 0 | assert (nc % ncols_interleaved == 0); |
685 | |
|
686 | 0 | UNUSED(s); |
687 | 0 | UNUSED(bs); |
688 | 0 | UNUSED(vx); |
689 | 0 | UNUSED(vy); |
690 | 0 | UNUSED(nr); |
691 | 0 | UNUSED(nc); |
692 | 0 | UNUSED(nb); |
693 | 0 | UNUSED(ncols_interleaved); |
694 | 0 | UNUSED(blocklen); |
695 | |
|
696 | 0 | float sumf[4][8]; |
697 | 0 | int sumi; |
698 | |
|
699 | 0 | for (int y = 0; y < nr / 4; y++) { |
700 | 0 | const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); |
701 | 0 | for (int x = 0; x < nc / ncols_interleaved; x++) { |
702 | 0 | const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb); |
703 | 0 | for (int m = 0; m < 4; m++) { |
704 | 0 | for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0; |
705 | 0 | } |
706 | 0 | for (int l = 0; l < nb; l++) { |
707 | 0 | for (int k = 0; k < (qk / (2 * blocklen)); k++) { |
708 | 0 | for (int m = 0; m < 4; m++) { |
709 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
710 | 0 | sumi = 0; |
711 | 0 | for (int i = 0; i < blocklen; ++i) { |
712 | 0 | const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); |
713 | 0 | const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); |
714 | 0 | sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + |
715 | 0 | (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; |
716 | 0 | } |
717 | 0 | sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); |
718 | 0 | } |
719 | 0 | } |
720 | 0 | } |
721 | 0 | } |
722 | 0 | for (int m = 0; m < 4; m++) { |
723 | 0 | for (int j = 0; j < ncols_interleaved; j++) |
724 | 0 | s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]; |
725 | 0 | } |
726 | 0 | } |
727 | 0 | } |
728 | 0 | } |
729 | | |
730 | 0 | void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { |
731 | 0 | const int qk = QK_K; |
732 | 0 | const int nb = n / qk; |
733 | 0 | const int ncols_interleaved = 8; |
734 | 0 | const int blocklen = 8; |
735 | 0 | static const uint32_t kmask1 = 0x3f3f3f3f; |
736 | 0 | static const uint32_t kmask2 = 0x0f0f0f0f; |
737 | 0 | static const uint32_t kmask3 = 0x03030303; |
738 | |
|
739 | 0 | assert (n % qk == 0); |
740 | 0 | assert (nr % 4 == 0); |
741 | 0 | assert (nc % ncols_interleaved == 0); |
742 | |
|
743 | 0 | UNUSED(s); |
744 | 0 | UNUSED(bs); |
745 | 0 | UNUSED(vx); |
746 | 0 | UNUSED(vy); |
747 | 0 | UNUSED(nr); |
748 | 0 | UNUSED(nc); |
749 | 0 | UNUSED(nb); |
750 | 0 | UNUSED(ncols_interleaved); |
751 | 0 | UNUSED(blocklen); |
752 | |
|
753 | 0 | float sumf[4][8]; |
754 | 0 | float sum_minf[4][8]; |
755 | 0 | uint32_t utmp[32]; |
756 | 0 | int sumi1; |
757 | 0 | int sumi2; |
758 | 0 | int sumi; |
759 | |
|
760 | 0 | for (int y = 0; y < nr / 4; y++) { |
761 | 0 | const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb); |
762 | 0 | for (int x = 0; x < nc / ncols_interleaved; x++) { |
763 | 0 | const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb); |
764 | 0 | for (int m = 0; m < 4; m++) { |
765 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
766 | 0 | sumf[m][j] = 0.0; |
767 | 0 | sum_minf[m][j] = 0.0; |
768 | 0 | } |
769 | 0 | } |
770 | 0 | for (int l = 0; l < nb; l++) { |
771 | 0 | for (int sb = 0; sb < 8; sb++) { |
772 | 0 | memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12); |
773 | 0 | utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4); |
774 | 0 | const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1; |
775 | 0 | utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4); |
776 | 0 | utmp[sb * 4 + 2] = uaux_0; |
777 | 0 | utmp[sb * 4 + 0] &= kmask1; |
778 | 0 | } |
779 | 0 | for (int k = 0; k < (qk / (2 * blocklen)); k++) { |
780 | 0 | uint8_t *scales_0 = (uint8_t*) utmp + (k / 4) * 32; |
781 | 0 | uint8_t *scales_1 = (uint8_t*) utmp + (k / 4) * 32 + 16; |
782 | 0 | for (int m = 0; m < 4; m++) { |
783 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
784 | 0 | sumi1 = 0; |
785 | 0 | sumi2 = 0; |
786 | 0 | sumi = 0; |
787 | 0 | for (int i = 0; i < blocklen; ++i) { |
788 | 0 | const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF); |
789 | 0 | const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4); |
790 | 0 | sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 256 + (k % 4) * 4 * blocklen + m * blocklen + i]); |
791 | 0 | sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 256 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]); |
792 | 0 | sumi1 = sumi1 * scales_0[j]; |
793 | 0 | sumi2 = sumi2 * scales_1[j]; |
794 | 0 | sumi += sumi1 + sumi2; |
795 | 0 | } |
796 | 0 | sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m]; |
797 | 0 | } |
798 | 0 | } |
799 | 0 | } |
800 | 0 | for (int sb = 0; sb < 8; sb++) { |
801 | 0 | uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16; |
802 | 0 | for(int m = 0; m < 4; m++) { |
803 | 0 | const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6); |
804 | 0 | for(int j = 0; j < ncols_interleaved; j++) { |
805 | 0 | sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m]; |
806 | 0 | } |
807 | 0 | } |
808 | 0 | } |
809 | 0 | } |
810 | 0 | for (int m = 0; m < 4; m++) { |
811 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
812 | 0 | s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j]; |
813 | 0 | } |
814 | 0 | } |
815 | 0 | } |
816 | 0 | } |
817 | 0 | } |
818 | | |
819 | 0 | void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { |
820 | 0 | const int qk = QK_K; |
821 | 0 | const int nb = n / qk; |
822 | 0 | const int ncols_interleaved = 8; |
823 | 0 | const int blocklen = 8; |
824 | |
|
825 | 0 | assert (n % qk == 0); |
826 | 0 | assert (nr % 4 == 0); |
827 | 0 | assert (nc % ncols_interleaved == 0); |
828 | |
|
829 | 0 | UNUSED(s); |
830 | 0 | UNUSED(bs); |
831 | 0 | UNUSED(vx); |
832 | 0 | UNUSED(vy); |
833 | 0 | UNUSED(nr); |
834 | 0 | UNUSED(nc); |
835 | 0 | UNUSED(nb); |
836 | 0 | UNUSED(ncols_interleaved); |
837 | 0 | UNUSED(blocklen); |
838 | |
|
839 | 0 | float sumf[4][8]; |
840 | 0 | float sum_minf[4][8]; |
841 | 0 | int sumi1, sumi2, sumi3, sumi4; |
842 | 0 | int sumi; |
843 | |
|
844 | 0 | for (int y = 0; y < nr / 4; y++) { |
845 | 0 | const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb); |
846 | 0 | for (int x = 0; x < nc / ncols_interleaved; x++) { |
847 | 0 | const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb); |
848 | 0 | for (int m = 0; m < 4; m++) { |
849 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
850 | 0 | sumf[m][j] = 0.0; |
851 | 0 | sum_minf[m][j] = 0.0; |
852 | 0 | } |
853 | 0 | } |
854 | 0 | for (int l = 0; l < nb; l++) { |
855 | 0 | for (int k = 0; k < (qk / (4 * blocklen)); k++) { |
856 | |
|
857 | 0 | const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64 ; |
858 | 0 | const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16; |
859 | 0 | const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32; |
860 | 0 | const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48; |
861 | 0 | for (int m = 0; m < 4; m++) { |
862 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
863 | 0 | sumi1 = 0; |
864 | 0 | sumi2 = 0; |
865 | 0 | sumi3 = 0; |
866 | 0 | sumi4 = 0; |
867 | 0 | sumi = 0; |
868 | 0 | int offset = ((k / 2) % 2) + j * 2; |
869 | 0 | for (int i = 0; i < blocklen; ++i){ |
870 | 0 | const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 3); |
871 | 0 | const int v1 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 2 ) & 3); |
872 | 0 | const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4 ) & 3); |
873 | 0 | const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6 ) & 3); |
874 | 0 | sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i]); |
875 | 0 | sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]); |
876 | 0 | sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 256]); |
877 | 0 | sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 384]); |
878 | 0 | sumi1 = sumi1 * (scales_0[offset] & 0xF); |
879 | 0 | sumi2 = sumi2 * (scales_1[offset] & 0xF); |
880 | 0 | sumi3 = sumi3 * (scales_2[offset] & 0xF); |
881 | 0 | sumi4 = sumi4 * (scales_3[offset] & 0xF); |
882 | 0 | sumi += sumi1 + sumi2 + sumi3 + sumi4; |
883 | 0 | } |
884 | 0 | sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m]; |
885 | 0 | } |
886 | 0 | } |
887 | 0 | } |
888 | 0 | for(int sb = 0; sb < 8; sb++) { |
889 | 0 | const uint8_t *mins = b_ptr[l].scales + sb * 16; |
890 | 0 | for(int m = 0; m < 4; m++) { |
891 | 0 | const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6); |
892 | 0 | for(int j = 0; j < ncols_interleaved; j++) { |
893 | 0 | int mins_prod = ((mins[j * 2] >> 4) * bsums[0] + (mins[(j * 2)+ 1] >> 4) * bsums[1]); |
894 | 0 | sum_minf[m][j] += (mins_prod) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m]; |
895 | 0 | } |
896 | 0 | } |
897 | 0 | } |
898 | 0 | } |
899 | |
|
900 | 0 | for (int m = 0; m < 4; m++) { |
901 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
902 | 0 | s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j]; |
903 | 0 | } |
904 | 0 | } |
905 | 0 | } |
906 | 0 | } |
907 | 0 | } |
908 | | |
909 | | |
910 | 0 | void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { |
911 | 0 | const int qk = QK8_0; |
912 | 0 | const int nb = n / qk; |
913 | 0 | const int ncols_interleaved = 4; |
914 | 0 | const int blocklen = 4; |
915 | |
|
916 | 0 | assert (n % qk == 0); |
917 | 0 | assert (nr % 4 == 0); |
918 | 0 | assert (nc % ncols_interleaved == 0); |
919 | |
|
920 | 0 | UNUSED(s); |
921 | 0 | UNUSED(bs); |
922 | 0 | UNUSED(vx); |
923 | 0 | UNUSED(vy); |
924 | 0 | UNUSED(nr); |
925 | 0 | UNUSED(nc); |
926 | 0 | UNUSED(nb); |
927 | 0 | UNUSED(ncols_interleaved); |
928 | 0 | UNUSED(blocklen); |
929 | |
|
930 | 0 | { |
931 | 0 | float sumf[4][4]; |
932 | 0 | int sumi; |
933 | |
|
934 | 0 | for (int y = 0; y < nr / 4; y++) { |
935 | 0 | const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); |
936 | 0 | for (int x = 0; x < nc / ncols_interleaved; x++) { |
937 | 0 | const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb); |
938 | 0 | for (int m = 0; m < 4; m++) { |
939 | 0 | for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0; |
940 | 0 | } |
941 | 0 | for (int l = 0; l < nb; l++) { |
942 | 0 | for (int k = 0; k < (qk / (2 * blocklen)); k++) { |
943 | 0 | for (int m = 0; m < 4; m++) { |
944 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
945 | 0 | sumi = 0; |
946 | 0 | for (int i = 0; i < blocklen; ++i) { |
947 | 0 | const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F]; |
948 | 0 | const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4]; |
949 | 0 | sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + |
950 | 0 | (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])); |
951 | 0 | } |
952 | 0 | sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); |
953 | 0 | } |
954 | 0 | } |
955 | 0 | } |
956 | 0 | } |
957 | 0 | for (int m = 0; m < 4; m++) { |
958 | 0 | for (int j = 0; j < ncols_interleaved; j++) |
959 | 0 | s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]; |
960 | 0 | } |
961 | 0 | } |
962 | 0 | } |
963 | 0 | } |
964 | 0 | } |
965 | | |
966 | 0 | void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { |
967 | 0 | const int qk = QK8_0; |
968 | 0 | const int nb = n / qk; |
969 | 0 | const int ncols_interleaved = 8; |
970 | 0 | const int blocklen = 8; |
971 | |
|
972 | 0 | assert(n % qk == 0); |
973 | 0 | assert(nr % 4 == 0); |
974 | 0 | assert(nc % ncols_interleaved == 0); |
975 | |
|
976 | 0 | float sumf[4][8]; |
977 | 0 | int sumi; |
978 | |
|
979 | 0 | for (int y = 0; y < nr / 4; y++) { |
980 | 0 | const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); |
981 | 0 | for (int x = 0; x < nc / ncols_interleaved; x++) { |
982 | 0 | const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb); |
983 | 0 | for (int m = 0; m < 4; m++) { |
984 | 0 | for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0; |
985 | 0 | } |
986 | 0 | for (int l = 0; l < nb; l++) { |
987 | 0 | for (int k = 0; k < (qk / (2 * blocklen)); k++) { |
988 | 0 | for (int m = 0; m < 4; m++) { |
989 | 0 | for (int j = 0; j < ncols_interleaved; j++) { |
990 | 0 | sumi = 0; |
991 | 0 | for (int i = 0; i < blocklen; ++i) { |
992 | 0 | const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F]; |
993 | 0 | const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4]; |
994 | 0 | sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + |
995 | 0 | (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])); |
996 | 0 | } |
997 | 0 | sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); |
998 | 0 | } |
999 | 0 | } |
1000 | 0 | } |
1001 | 0 | } |
1002 | 0 | for (int m = 0; m < 4; m++) { |
1003 | 0 | for (int j = 0; j < ncols_interleaved; j++) |
1004 | 0 | s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]; |
1005 | 0 | } |
1006 | 0 | } |
1007 | 0 | } |
1008 | 0 | } |
1009 | | |
1010 | | } // extern "C" |
1011 | | |
1012 | 0 | static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) { |
1013 | 0 | block_q4_0x4 out; |
1014 | |
|
1015 | 0 | for (int i = 0; i < 4; i++) { |
1016 | 0 | out.d[i] = in[i].d; |
1017 | 0 | } |
1018 | |
|
1019 | 0 | const int end = QK4_0 * 2 / blck_size_interleave; |
1020 | |
|
1021 | 0 | if (blck_size_interleave == 8) { |
1022 | 0 | const uint64_t xor_mask = 0x8888888888888888ULL; |
1023 | 0 | for (int i = 0; i < end; ++i) { |
1024 | 0 | int src_id = i % 4; |
1025 | 0 | int src_offset = (i / 4) * blck_size_interleave; |
1026 | 0 | int dst_offset = i * blck_size_interleave; |
1027 | |
|
1028 | 0 | uint64_t elems; |
1029 | | // Using memcpy to avoid unaligned memory accesses |
1030 | 0 | memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t)); |
1031 | 0 | elems ^= xor_mask; |
1032 | 0 | memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t)); |
1033 | 0 | } |
1034 | 0 | } else if (blck_size_interleave == 4) { |
1035 | 0 | const uint32_t xor_mask = 0x88888888; |
1036 | 0 | for (int i = 0; i < end; ++i) { |
1037 | 0 | int src_id = i % 4; |
1038 | 0 | int src_offset = (i / 4) * blck_size_interleave; |
1039 | 0 | int dst_offset = i * blck_size_interleave; |
1040 | |
|
1041 | 0 | uint32_t elems; |
1042 | 0 | memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint32_t)); |
1043 | 0 | elems ^= xor_mask; |
1044 | 0 | memcpy(&out.qs[dst_offset], &elems, sizeof(uint32_t)); |
1045 | 0 | } |
1046 | 0 | } else { |
1047 | 0 | GGML_ASSERT(false); |
1048 | 0 | } |
1049 | |
|
1050 | 0 | return out; |
1051 | 0 | } |
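// Resulting qs layout of block_q4_0x4 (sketch for blck_size_interleave = 4; the
// 8-byte case is analogous): output chunk i copies bytes (i/4)*4 .. (i/4)*4+3 of
// source row i % 4, so the 64 output bytes read
// row0[0..3] row1[0..3] row2[0..3] row3[0..3] row0[4..7] row1[4..7] ... with
// every nibble already xor-ed with 0x8 (see the xor_mask note above).
//
//     for (int i = 0; i < QK4_0 * 2 / 4; ++i) {      // 16 chunks of 4 bytes
//         int src_id     = i % 4;                    // which of the 4 rows
//         int src_offset = (i / 4) * 4;              // byte offset within that row
//         // out.qs[i*4 .. i*4+3] <- in[src_id].qs[src_offset .. src_offset+3] ^ 0x88888888
//     }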
1052 | | |
1053 | | // interleave 8 block_q4_0s in blocks of blck_size_interleave |
1054 | | // returns an interleaved block_q4_0x8 |
1055 | | // in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks |
1056 | | // first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave |
1057 | 0 | static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave) { |
1058 | 0 | block_q4_0x8 out; |
1059 | |
|
1060 | 0 | for (int i = 0; i < 8; i++) { |
1061 | 0 | out.d[i] = in[i].d; |
1062 | 0 | } |
1063 | |
|
1064 | 0 | const int end = QK4_0 * 4 / blck_size_interleave; |
1065 | 0 | const uint64_t xor_mask = 0x8888888888888888ULL; |
1066 | |
|
1067 | 0 | for (int i = 0; i < end; ++i) { |
1068 | 0 | int src_id = i % 8; |
1069 | 0 | int src_offset = (i / 8) * blck_size_interleave; |
1070 | 0 | int dst_offset = i * blck_size_interleave; |
1071 | |
|
1072 | 0 | uint64_t elems; |
1073 | 0 | memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t)); |
1074 | 0 | elems ^= xor_mask; |
1075 | 0 | memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t)); |
1076 | 0 | } |
1077 | |
|
1078 | 0 | return out; |
1079 | 0 | } |
1080 | | |
1081 | 0 | static block_q4_Kx8 make_block_q4_Kx8(block_q4_K * in, unsigned int blck_size_interleave) { |
1082 | 0 | block_q4_Kx8 out; |
1083 | | // Delta (scale) and dmin values of the eight Q4_K structures are copied onto the output interleaved structure |
1084 | 0 | for (int i = 0; i < 8; i++) { |
1085 | 0 | out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d; |
1086 | 0 | } |
1087 | |
|
1088 | 0 | for (int i = 0; i < 8; i++) { |
1089 | 0 | out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin; |
1090 | 0 | } |
1091 | |
|
1092 | 0 | const int end = QK_K * 4 / blck_size_interleave; |
1093 | | |
1094 | | // Interleave Q4_K quants by taking 8 bytes at a time |
1095 | 0 | for (int i = 0; i < end; ++i) { |
1096 | 0 | int src_id = i % 8; |
1097 | 0 | int src_offset = (i / 8) * blck_size_interleave; |
1098 | 0 | int dst_offset = i * blck_size_interleave; |
1099 | |
|
1100 | 0 | uint64_t elems; |
1101 | 0 | memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t)); |
1102 | 0 | memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t)); |
1103 | 0 | } |
1104 | | |
1105 | | // The logic below unpacks and rearranges the scales and mins values of Q4_K |
1106 | | // The Q4_K structure packs 8 scales and 8 mins into 12 bytes (6 bits for each value) |
1107 | | // The output Q4_Kx8 structure has 96 bytes for storing scales and mins |
1108 | | // Each 12-byte group packs the scales and mins of one sub block position across the eight Q4_K structures |
1109 | | // For example, the first 12 bytes contain the 8 scales and 8 mins of the first sub block of each Q4_K structure |
1110 | 0 | uint8_t s[8], m[8]; |
1111 | |
|
1112 | 0 | for (int i = 0; i < 4; i++) { |
1113 | 0 | for (int j = 0; j < 8; j++) { |
1114 | 0 | s[j] = in[j].scales[i] & 63; |
1115 | 0 | m[j] = in[j].scales[i + 4] & 63; |
1116 | 0 | } |
1117 | |
|
1118 | 0 | out.scales[i * 12] = (s[0] & 63) + ((s[4] & 48) << 2); |
1119 | 0 | out.scales[i * 12 + 1] = (s[1] & 63) + ((s[5] & 48) << 2); |
1120 | 0 | out.scales[i * 12 + 2] = (s[2] & 63) + ((s[6] & 48) << 2); |
1121 | 0 | out.scales[i * 12 + 3] = (s[3] & 63) + ((s[7] & 48) << 2); |
1122 | 0 | out.scales[i * 12 + 4] = (m[0] & 63) + ((m[4] & 48) << 2); |
1123 | 0 | out.scales[i * 12 + 5] = (m[1] & 63) + ((m[5] & 48) << 2); |
1124 | 0 | out.scales[i * 12 + 6] = (m[2] & 63) + ((m[6] & 48) << 2); |
1125 | 0 | out.scales[i * 12 + 7] = (m[3] & 63) + ((m[7] & 48) << 2); |
1126 | 0 | out.scales[i * 12 + 8] = (s[4] & 15) + ((m[4] & 15) << 4); |
1127 | 0 | out.scales[i * 12 + 9] = (s[5] & 15) + ((m[5] & 15) << 4); |
1128 | 0 | out.scales[i * 12 + 10] = (s[6] & 15) + ((m[6] & 15) << 4); |
1129 | 0 | out.scales[i * 12 + 11] = (s[7] & 15) + ((m[7] & 15) << 4); |
1130 | |
|
1131 | 0 | } |
1132 | |
|
1133 | 0 | for (int i = 0; i < 4; i++) { |
1134 | 0 | for (int j = 0; j < 8; j++) { |
1135 | 0 | s[j] = ((in[j].scales[i] & 192) >> 2) | (in[j].scales[i+8] & 15); |
1136 | 0 | m[j] = ((in[j].scales[i + 4] & 192) >> 2) | ((in[j].scales[i+8] & 240) >> 4); |
1137 | 0 | } |
1138 | |
|
1139 | 0 | out.scales[i * 12 + 48] = (s[0] & 63) + ((s[4] & 48) << 2); |
1140 | 0 | out.scales[i * 12 + 49] = (s[1] & 63) + ((s[5] & 48) << 2); |
1141 | 0 | out.scales[i * 12 + 50] = (s[2] & 63) + ((s[6] & 48) << 2); |
1142 | 0 | out.scales[i * 12 + 51] = (s[3] & 63) + ((s[7] & 48) << 2); |
1143 | 0 | out.scales[i * 12 + 52] = (m[0] & 63) + ((m[4] & 48) << 2); |
1144 | 0 | out.scales[i * 12 + 53] = (m[1] & 63) + ((m[5] & 48) << 2); |
1145 | 0 | out.scales[i * 12 + 54] = (m[2] & 63) + ((m[6] & 48) << 2); |
1146 | 0 | out.scales[i * 12 + 55] = (m[3] & 63) + ((m[7] & 48) << 2); |
1147 | 0 | out.scales[i * 12 + 56] = (s[4] & 15) + ((m[4] & 15) << 4); |
1148 | 0 | out.scales[i * 12 + 57] = (s[5] & 15) + ((m[5] & 15) << 4); |
1149 | 0 | out.scales[i * 12 + 58] = (s[6] & 15) + ((m[6] & 15) << 4); |
1150 | 0 | out.scales[i * 12 + 59] = (s[7] & 15) + ((m[7] & 15) << 4); |
1151 | |
|
1152 | 0 | } |
1153 | |
|
1154 | 0 | return out; |
1155 | 0 | } |
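// For reference, the per-block 6-bit packing that the 12-byte groups above
// redistribute is the usual Q4_K layout (cf. get_scale_min_k4 in the ggml
// quantization code). A sketch of how one (scale, min) pair is recovered from a
// single block's 12 scale bytes (illustrative helper, not used in this file):
//
//     static void get_scale_min_sketch(int j, const uint8_t * q, uint8_t * d, uint8_t * m) {
//         if (j < 4) {                               // sub blocks 0..3: plain low 6 bits
//             *d = q[j] & 63;
//             *m = q[j + 4] & 63;
//         } else {                                   // sub blocks 4..7: split across bytes
//             *d = (q[j + 4] & 0xF) | ((q[j - 4] >> 6) << 4);
//             *m = (q[j + 4] >>  4) | ((q[j    ] >> 6) << 4);
//         }
//     }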
1156 | | |
1157 | 0 | static block_q2_Kx8 make_block_q2_Kx8(block_q2_K * in, unsigned int blck_size_interleave) { |
1158 | 0 | block_q2_Kx8 out; |
1159 | | |
1160 | | // Delta(scale) and dmin values of the eight Q2_K structures are copied onto the output interleaved structure |
1161 | 0 | for (int i = 0; i < 8; i++) { |
1162 | 0 | out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d; |
1163 | 0 | } |
1164 | |
|
1165 | 0 | for (int i = 0; i < 8; i++) { |
1166 | 0 | out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin; |
1167 | 0 | } |
1168 | |
|
1169 | 0 | const int end = QK_K * 2 / blck_size_interleave; |
1170 | | |
1171 | | // Interleave Q2_K quants by taking 8 bytes at a time |
1172 | 0 | for (int i = 0; i < end; ++i) { |
1173 | 0 | int src_id = i % 8; |
1174 | 0 | int src_offset = (i / 8) * blck_size_interleave; |
1175 | 0 | int dst_offset = i * blck_size_interleave; |
1176 | |
|
1177 | 0 | uint64_t elems; |
1178 | 0 | memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t)); |
1179 | 0 | memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t)); |
1180 | 0 | } |
1181 | | |
1182 | | // The logic below unpacks and rearranges the scales and mins values of Q2_K |
1183 | | // The Q2_K structure packs 16 scales and 16 mins into 16 bytes (4 bits for each value) |
1184 | | // The output Q2_Kx8 structure has 128 bytes for storing scales and mins |
1185 | | // Each 16-byte group packs the scales and mins of one pair of sub block positions across the eight Q2_K structures |
1186 | | // For example, the first 16 bytes contain the 16 scales and 16 mins of the first and second sub blocks of the different Q2_K structures |
1187 | |
|
1188 | 0 | for(int i = 0; i < 128; i++){ |
1189 | | |
1190 | | // Index for selecting which q2k super block |
1191 | 0 | int src1 = (i % 16) / 2; |
1192 | | // Index for selecting scale |
1193 | 0 | int src2 = ((i / 16) * 2) + (i % 2); |
1194 | |
|
1195 | 0 | out.scales[i] = in[src1].scales[src2]; |
1196 | 0 | } |
1197 | 0 | return out; |
1198 | |
|
1199 | 0 | } |
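// The scale/min mapping above, spelled out (illustrative restatement): output
// byte i takes scale byte (i / 16) * 2 + (i % 2) from source block (i % 16) / 2,
// so each 16-byte group holds the packed bytes (4-bit scale in the low nibble,
// 4-bit min in the high nibble) of one pair of sub blocks across all eight Q2_K
// structures.
//
//     for (int i = 0; i < 128; i++) {
//         int block = (i % 16) / 2;                  // which of the 8 source super blocks
//         int sub   = (i / 16) * 2 + (i % 2);        // which of its 16 scale bytes
//         // out.scales[i] = in[block].scales[sub];
//     }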
1200 | | |
1201 | 0 | static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) { |
1202 | 0 | GGML_ASSERT(t->type == GGML_TYPE_Q4_0); |
1203 | 0 | GGML_ASSERT(interleave_block == 4 || interleave_block == 8); |
1204 | 0 | constexpr int nrows_interleaved = 4; |
1205 | |
|
1206 | 0 | block_q4_0x4 * dst = (block_q4_0x4 *)t->data; |
1207 | 0 | const block_q4_0 * src = (const block_q4_0 *)data; |
1208 | 0 | block_q4_0 dst_tmp[4]; |
1209 | 0 | int nrow = ggml_nrows(t); |
1210 | 0 | int nblocks = t->ne[0] / QK4_0; |
1211 | |
|
1212 | 0 | GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0)); |
1213 | |
|
1214 | 0 | if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) { |
1215 | 0 | return -1; |
1216 | 0 | } |
1217 | | |
1218 | 0 | for (int b = 0; b < nrow; b += nrows_interleaved) { |
1219 | 0 | for (int64_t x = 0; x < nblocks; x++) { |
1220 | 0 | for (int i = 0; i < nrows_interleaved; i++) { |
1221 | 0 | dst_tmp[i] = src[x + i * nblocks]; |
1222 | 0 | } |
1223 | 0 | *dst++ = make_block_q4_0x4(dst_tmp, interleave_block); |
1224 | 0 | } |
1225 | 0 | src += nrows_interleaved * nblocks; |
1226 | 0 | } |
1227 | 0 | return 0; |
1228 | | |
1229 | 0 | GGML_UNUSED(data_size); |
1230 | 0 | } |
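// Gather pattern shared by the repack_* helpers in this file (addressing sketch
// only, names match the loop above): the source is row-major in blocks, so
// column block x of row (b + i) sits at src[i * nblocks + x]; each iteration
// collects that block from nrows_interleaved consecutive rows into dst_tmp and
// emits one interleaved output block.
//
//     // row b+0:                     src[0 * nblocks + x]
//     // row b+1:                     src[1 * nblocks + x]
//     // row b+nrows_interleaved-1:   src[(nrows_interleaved - 1) * nblocks + x]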
1231 | 0 | static int repack_q4_K_to_q4_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) { |
1232 | 0 | GGML_ASSERT(t->type == GGML_TYPE_Q4_K); |
1233 | 0 | GGML_ASSERT(interleave_block == 8); |
1234 | 0 | constexpr int nrows_interleaved = 8; |
1235 | |
|
1236 | 0 | block_q4_Kx8 * dst = (block_q4_Kx8*)t->data; |
1237 | 0 | const block_q4_K * src = (const block_q4_K*) data; |
1238 | 0 | block_q4_K dst_tmp[8]; |
1239 | 0 | int nrow = ggml_nrows(t); |
1240 | 0 | int nblocks = t->ne[0] / QK_K; |
1241 | |
|
1242 | 0 | GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_K)); |
1243 | |
|
1244 | 0 | if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) { |
1245 | 0 | return -1; |
1246 | 0 | } |
1247 | | |
1248 | 0 | for (int b = 0; b < nrow; b += nrows_interleaved) { |
1249 | 0 | for (int64_t x = 0; x < nblocks; x++) { |
1250 | 0 | for (int i = 0; i < nrows_interleaved; i++ ) { |
1251 | 0 | dst_tmp[i] = src[x + i * nblocks]; |
1252 | 0 | } |
1253 | 0 | *dst++ = make_block_q4_Kx8(dst_tmp, interleave_block); |
1254 | 0 | } |
1255 | 0 | src += nrows_interleaved * nblocks; |
1256 | 0 | } |
1257 | 0 | return 0; |
1258 | | |
1259 | 0 | GGML_UNUSED(data_size); |
1260 | 0 | } |
1261 | | |
1262 | 0 | static int repack_q2_K_to_q2_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) { |
1263 | 0 | GGML_ASSERT(t->type == GGML_TYPE_Q2_K); |
1264 | 0 | GGML_ASSERT(interleave_block == 8); |
1265 | 0 | constexpr int nrows_interleaved = 8; |
1266 | |
|
1267 | 0 | block_q2_Kx8 * dst = (block_q2_Kx8*)t->data; |
1268 | 0 | const block_q2_K * src = (const block_q2_K*) data; |
1269 | 0 | block_q2_K dst_tmp[8]; |
1270 | 0 | int nrow = ggml_nrows(t); |
1271 | 0 | int nblocks = t->ne[0] / QK_K; |
1272 | |
|
1273 | 0 | GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q2_K)); |
1274 | |
|
1275 | 0 | if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) { |
1276 | 0 | return -1; |
1277 | 0 | } |
1278 | | |
1279 | 0 | for (int b = 0; b < nrow; b += nrows_interleaved) { |
1280 | 0 | for (int64_t x = 0; x < nblocks; x++) { |
1281 | 0 | for (int i = 0; i < nrows_interleaved; i++ ) { |
1282 | 0 | dst_tmp[i] = src[x + i * nblocks]; |
1283 | 0 | } |
1284 | 0 | *dst++ = make_block_q2_Kx8(dst_tmp, interleave_block); |
1285 | 0 | } |
1286 | 0 | src += nrows_interleaved * nblocks; |
1287 | 0 | } |
1288 | 0 | return 0; |
1289 | | |
1290 | 0 | GGML_UNUSED(data_size); |
1291 | 0 | } |
1292 | | |
1293 | 0 | static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) { |
1294 | 0 | GGML_ASSERT(t->type == GGML_TYPE_Q4_0); |
1295 | 0 | GGML_ASSERT(interleave_block == 8); |
1296 | 0 | constexpr int nrows_interleaved = 8; |
1297 | |
|
1298 | 0 | block_q4_0x8 * dst = (block_q4_0x8*)t->data; |
1299 | 0 | const block_q4_0 * src = (const block_q4_0*) data; |
1300 | 0 | block_q4_0 dst_tmp[8]; |
1301 | 0 | int nrow = ggml_nrows(t); |
1302 | 0 | int nblocks = t->ne[0] / QK4_0; |
1303 | |
|
1304 | 0 | GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0)); |
1305 | |
|
1306 | 0 | if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) { |
1307 | 0 | return -1; |
1308 | 0 | } |
1309 | | |
1310 | 0 | for (int b = 0; b < nrow; b += nrows_interleaved) { |
1311 | 0 | for (int64_t x = 0; x < nblocks; x++) { |
1312 | 0 | for (int i = 0; i < nrows_interleaved; i++ ) { |
1313 | 0 | dst_tmp[i] = src[x + i * nblocks]; |
1314 | 0 | } |
1315 | 0 | *dst++ = make_block_q4_0x8(dst_tmp, interleave_block); |
1316 | 0 | } |
1317 | 0 | src += nrows_interleaved * nblocks; |
1318 | 0 | } |
1319 | 0 | return 0; |
1320 | | |
1321 | 0 | GGML_UNUSED(data_size); |
1322 | 0 | } |
1323 | | |
1324 | 0 | static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_size_interleave) { |
1325 | 0 | block_iq4_nlx4 out; |
1326 | |
|
1327 | 0 | for (int i = 0; i < 4; i++) { |
1328 | 0 | out.d[i] = in[i].d; |
1329 | 0 | } |
1330 | |
|
1331 | 0 | const int end = QK4_NL * 2 / blck_size_interleave; |
1332 | | |
1333 | | // TODO: this branch seems wrong |
1334 | | //if (blck_size_interleave == 8) { |
1335 | | // for (int i = 0; i < end; ++i) { |
1336 | | // int src_id = i % 4; |
1337 | | // int src_offset = (i / 4) * blck_size_interleave; |
1338 | | // int dst_offset = i * blck_size_interleave; |
1339 | | |
1340 | | // // Using memcpy to avoid unaligned memory accesses |
1341 | | // memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t)); |
1342 | | // } |
1343 | | //} else |
1344 | 0 | if (blck_size_interleave == 4) { |
1345 | 0 | for (int i = 0; i < end; ++i) { |
1346 | 0 | int src_id = i % 4; |
1347 | 0 | int src_offset = (i / 4) * blck_size_interleave; |
1348 | 0 | int dst_offset = i * blck_size_interleave; |
1349 | |
|
1350 | 0 | memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint32_t)); |
1351 | 0 | } |
1352 | 0 | } else { |
1353 | 0 | GGML_ASSERT(false); |
1354 | 0 | } |
1355 | |
|
1356 | 0 | return out; |
1357 | 0 | } |
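// Illustrative sketch (not ggml code) of the qs interleaving above: output chunk i is copied from
// source row (i % 4) at byte offset (i / 4) * blck_size_interleave. With QK4_NL == 32 (16 packed
// bytes per block) and blck_size_interleave == 4, this weaves 16 four-byte chunks round-robin
// across the 4 rows. All names below are illustrative.
#include <cstdio>

static void sketch_iq4_nlx4_qs_mapping(void) {
    const int blck_size_interleave = 4;
    const int qs_bytes_per_block   = 32 / 2;                        // QK4_NL nibbles, two per byte
    const int end = qs_bytes_per_block * 4 / blck_size_interleave;  // == QK4_NL * 2 / blck_size_interleave
    for (int i = 0; i < end; ++i) {
        const int src_id     = i % 4;
        const int src_offset = (i / 4) * blck_size_interleave;
        const int dst_offset = i * blck_size_interleave;
        std::printf("out.qs[%2d..%2d] <- in[%d].qs[%2d..%2d]\n",
                    dst_offset, dst_offset + blck_size_interleave - 1,
                    src_id, src_offset, src_offset + blck_size_interleave - 1);
    }
}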
1358 | | |
1359 | 0 | static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) { |
1360 | 0 | GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL); |
1361 | 0 | GGML_ASSERT(interleave_block == 4); |
1362 | |
|
1363 | 0 | const block_iq4_nl * src = (const block_iq4_nl *)data; |
1364 | 0 | block_iq4_nlx4 * dst = ( block_iq4_nlx4 *)t->data; |
1365 | |
|
1366 | 0 | block_iq4_nl dst_tmp[4]; |
1367 | |
|
1368 | 0 | int nrow = ggml_nrows(t); |
1369 | 0 | int nrows_interleaved = 4; |
1370 | 0 | int nblocks = t->ne[0] / QK4_NL; |
1371 | |
|
1372 | 0 | GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl)); |
1373 | |
|
1374 | 0 | if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) { |
1375 | 0 | return -1; |
1376 | 0 | } |
1377 | | |
1378 | 0 | for (int b = 0; b < nrow; b += nrows_interleaved) { |
1379 | 0 | for (int64_t x = 0; x < nblocks; x++) { |
1380 | 0 | for (int i = 0; i < nrows_interleaved; i++) { |
1381 | 0 | dst_tmp[i] = src[x + i * nblocks]; |
1382 | 0 | } |
1383 | 0 | *dst++ = make_block_iq4_nlx4(dst_tmp, interleave_block); |
1384 | 0 | } |
1385 | 0 | src += nrows_interleaved * nblocks; |
1386 | 0 | } |
1387 | 0 | return 0; |
1388 | | |
1389 | 0 | GGML_UNUSED(data_size); |
1390 | 0 | } |
1391 | | |
1392 | 0 | static block_iq4_nlx8 make_block_iq4_nlx8(block_iq4_nl * in, unsigned int blck_size_interleave) { |
1393 | 0 | block_iq4_nlx8 out; |
1394 | |
|
1395 | 0 | for (int i = 0; i < 8; i++) { |
1396 | 0 | out.d[i] = in[i].d; |
1397 | 0 | } |
1398 | |
|
1399 | 0 | const int end = QK4_NL * 4 / blck_size_interleave; |
1400 | |
|
1401 | 0 | if (blck_size_interleave == 8) { |
1402 | 0 | for (int i = 0; i < end; ++i) { |
1403 | 0 | int src_id = i % 8; |
1404 | 0 | int src_offset = (i / 8) * blck_size_interleave; |
1405 | 0 | int dst_offset = i * blck_size_interleave; |
1406 | |
|
1407 | 0 | memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t)); |
1408 | 0 | } |
1409 | 0 | } else { |
1410 | 0 | GGML_ASSERT(false); |
1411 | 0 | } |
1412 | |
|
1413 | 0 | return out; |
1414 | 0 | } |
1415 | | |
1416 | 0 | static int repack_iq4_nl_to_iq4_nl_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) { |
1417 | 0 | GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL); |
1418 | 0 | GGML_ASSERT(interleave_block == 8); |
1419 | |
|
1420 | 0 | const block_iq4_nl * src = (const block_iq4_nl *)data; |
1421 | 0 | block_iq4_nlx8 * dst = ( block_iq4_nlx8 *)t->data; |
1422 | |
|
1423 | 0 | block_iq4_nl dst_tmp[8]; |
1424 | |
|
1425 | 0 | int nrow = ggml_nrows(t); |
1426 | 0 | int nrows_interleaved = 8; |
1427 | 0 | int nblocks = t->ne[0] / QK4_NL; |
1428 | |
|
1429 | 0 | GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl)); |
1430 | |
|
1431 | 0 | if (t->ne[1] % nrows_interleaved != 0) { |
1432 | 0 | return -1; |
1433 | 0 | } |
1434 | | |
1435 | 0 | for (int b = 0; b < nrow; b += nrows_interleaved) { |
1436 | 0 | for (int64_t x = 0; x < nblocks; x++) { |
1437 | 0 | for (int i = 0; i < nrows_interleaved; i++) { |
1438 | 0 | dst_tmp[i] = src[x + i * nblocks]; |
1439 | 0 | } |
1440 | 0 | *dst++ = make_block_iq4_nlx8(dst_tmp, interleave_block); |
1441 | 0 | } |
1442 | 0 | src += nrows_interleaved * nblocks; |
1443 | 0 | } |
1444 | 0 | return 0; |
1445 | | |
1446 | 0 | GGML_UNUSED(data_size); |
1447 | 0 | } |
1448 | | |
1449 | | namespace ggml::cpu::repack { |
1450 | | // repack |
1451 | | template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS> |
1452 | | int repack(struct ggml_tensor *, const void *, size_t); |
1453 | | |
1454 | | // TODO: generalise. |
1455 | 0 | template <> int repack<block_q4_0, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) { |
1456 | 0 | return repack_q4_0_to_q4_0_4_bl(t, 4, data, data_size); |
1457 | 0 | } |
1458 | | |
1459 | 0 | template <> int repack<block_q4_0, 8, 4>(struct ggml_tensor * t, const void * data, size_t data_size) { |
1460 | 0 | return repack_q4_0_to_q4_0_4_bl(t, 8, data, data_size); |
1461 | 0 | } |
1462 | | |
1463 | 0 | template <> int repack<block_q4_0, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) { |
1464 | 0 | return repack_q4_0_to_q4_0_8_bl(t, 8, data, data_size); |
1465 | 0 | } |
1466 | | |
1467 | 0 | template <> int repack<block_q4_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) { |
1468 | 0 | return repack_q4_K_to_q4_K_8_bl(t, 8, data, data_size); |
1469 | 0 | } |
1470 | | |
1471 | 0 | template <> int repack<block_q2_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) { |
1472 | 0 | return repack_q2_K_to_q2_K_8_bl(t, 8, data, data_size); |
1473 | 0 | } |
1474 | | |
1475 | 0 | template <> int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) { |
1476 | 0 | return repack_iq4_nl_to_iq4_nl_4_bl(t, 4, data, data_size); |
1477 | 0 | } |
1478 | | |
1479 | | // TODO: needs to be revisited |
1480 | | //template <> int repack<block_iq4_nl, 8, 4>(struct ggml_tensor * t, const void * data, size_t data_size) { |
1481 | | // return repack_iq4_nl_to_iq4_nl_4_bl(t, 8, data, data_size); |
1482 | | //} |
1483 | | |
1484 | 0 | template <> int repack<block_iq4_nl, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) { |
1485 | 0 | return repack_iq4_nl_to_iq4_nl_8_bl(t, 8, data, data_size); |
1486 | 0 | } |
1487 | | |
1488 | | // gemv |
1489 | | template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE> |
1490 | | void gemv(int, float *, size_t, const void *, const void *, int, int); |
1491 | | |
1492 | 0 | template <> void gemv<block_q4_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { |
1493 | 0 | ggml_gemv_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc); |
1494 | 0 | } |
1495 | | |
1496 | 0 | template <> void gemv<block_q4_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { |
1497 | 0 | ggml_gemv_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc); |
1498 | 0 | } |
1499 | | |
1500 | 0 | template <> void gemv<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { |
1501 | 0 | ggml_gemv_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc); |
1502 | 0 | } |
1503 | | |
1504 | 0 | template <> void gemv<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { |
1505 | 0 | ggml_gemv_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc); |
1506 | 0 | } |
1507 | | |
1508 | 0 | template <> void gemv<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { |
1509 | 0 | ggml_gemv_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc); |
1510 | 0 | } |
1511 | | |
1512 | 0 | template <> void gemv<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { |
1513 | 0 | ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc); |
1514 | 0 | } |
1515 | | |
1516 | 0 | template <> void gemv<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { |
1517 | 0 | ggml_gemv_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc); |
1518 | 0 | } |
1519 | | |
1520 | | // gemm |
1521 | | template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE> |
1522 | | void gemm(int, float *, size_t, const void *, const void *, int, int); |
1523 | | |
1524 | 0 | template <> void gemm<block_q4_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { |
1525 | 0 | ggml_gemm_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc); |
1526 | 0 | } |
1527 | | |
1528 | 0 | template <> void gemm<block_q4_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { |
1529 | 0 | ggml_gemm_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc); |
1530 | 0 | } |
1531 | | |
1532 | 0 | template <> void gemm<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { |
1533 | 0 | ggml_gemm_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc); |
1534 | 0 | } |
1535 | | |
1536 | 0 | template <> void gemm<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { |
1537 | 0 | ggml_gemm_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc); |
1538 | 0 | } |
1539 | | |
1540 | 0 | template <> void gemm<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { |
1541 | 0 | ggml_gemm_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc); |
1542 | 0 | } |
1543 | | |
1544 | 0 | template <> void gemm<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { |
1545 | 0 | ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc); |
1546 | 0 | } |
1547 | | |
1548 | 0 | template <> void gemm<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { |
1549 | 0 | ggml_gemm_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc); |
1550 | 0 | } |
1551 | | |
1552 | | class tensor_traits_base : public ggml::cpu::tensor_traits { |
1553 | | public: |
1554 | | virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0; |
1555 | | }; |
1556 | | |
1557 | | template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE> class tensor_traits : public tensor_traits_base { |
1558 | | |
1559 | 0 | bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override { |
1560 | | // not really a GGML_TYPE_Q8_0, but it has the same size.
1561 | 0 | switch (op->op) { |
1562 | 0 | case GGML_OP_MUL_MAT: |
1563 | 0 | { |
1564 | 0 | size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1])); |
1565 | 0 | return true; |
1566 | 0 | } |
1567 | 0 | case GGML_OP_MUL_MAT_ID: |
1568 | 0 | { |
1569 | 0 | size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1])); |
1570 | 0 | size = GGML_PAD(size, sizeof(int64_t)); // + padding for the next block.
1571 | |
|
1572 | 0 | const int64_t ne02 = op->src[0]->ne[2]; // n_as, n_expert |
1573 | 0 | const int64_t ne12 = op->src[1]->ne[2]; // n_tokens |
1574 | |
|
1575 | 0 | const size_t sizeof_mmid_row_mapping = sizeof(int64_t); |
1576 | |
|
1577 | 0 | size += sizeof_mmid_row_mapping*ne02*(ne12 + 1); |
1578 | |
|
1579 | 0 | return true; |
1580 | 0 | } |
1581 | 0 | default: |
1582 | | // GGML_ABORT("fatal error"); |
1583 | 0 | break; |
1584 | 0 | } |
1585 | 0 | return false; |
1586 | 0 | }
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 4l, 4l, (ggml_type)8>::work_size(int, ggml_tensor const*, unsigned long&)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 8l, 4l, (ggml_type)8>::work_size(int, ggml_tensor const*, unsigned long&)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 8l, 8l, (ggml_type)8>::work_size(int, ggml_tensor const*, unsigned long&)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_K, 8l, 8l, (ggml_type)15>::work_size(int, ggml_tensor const*, unsigned long&)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q2_K, 8l, 8l, (ggml_type)15>::work_size(int, ggml_tensor const*, unsigned long&)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_iq4_nl, 4l, 4l, (ggml_type)8>::work_size(int, ggml_tensor const*, unsigned long&)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_iq4_nl, 8l, 8l, (ggml_type)8>::work_size(int, ggml_tensor const*, unsigned long&)
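// Hedged sketch of the MUL_MAT_ID scratch sizing above, written against plain integers instead of
// the ggml tensor fields: the work buffer holds the quantized copy of src1, padding up to int64_t
// alignment, then one counter plus up to n_tokens row mappings per expert (each mapping is two
// int32_t, i.e. the size of one int64_t). The function name and parameters are illustrative, not
// ggml API.
#include <cstddef>
#include <cstdint>

static size_t sketch_mul_mat_id_work_size(size_t quantized_src1_bytes, int64_t n_expert, int64_t n_tokens) {
    size_t size = quantized_src1_bytes;
    size = (size + sizeof(int64_t) - 1) & ~(sizeof(int64_t) - 1);    // same effect as GGML_PAD(size, sizeof(int64_t))
    size += sizeof(int64_t) * (size_t)(n_expert * (n_tokens + 1));   // per-expert counter + row mappings
    return size;
}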
1587 | | |
1588 | 0 | bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) override { |
1589 | 0 | switch (op->op) { |
1590 | 0 | case GGML_OP_MUL_MAT: |
1591 | 0 | forward_mul_mat(params, op); |
1592 | 0 | return true; |
1593 | 0 | case GGML_OP_MUL_MAT_ID: |
1594 | 0 | forward_mul_mat_id(params, op); |
1595 | 0 | return true; |
1596 | 0 | default: |
1597 | | // GGML_ABORT("fatal error"); |
1598 | 0 | break; |
1599 | 0 | } |
1600 | 0 | return false; |
1601 | 0 | }
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 4l, 4l, (ggml_type)8>::compute_forward(ggml_compute_params*, ggml_tensor*)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 8l, 4l, (ggml_type)8>::compute_forward(ggml_compute_params*, ggml_tensor*)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 8l, 8l, (ggml_type)8>::compute_forward(ggml_compute_params*, ggml_tensor*)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_K, 8l, 8l, (ggml_type)15>::compute_forward(ggml_compute_params*, ggml_tensor*)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q2_K, 8l, 8l, (ggml_type)15>::compute_forward(ggml_compute_params*, ggml_tensor*)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_iq4_nl, 4l, 4l, (ggml_type)8>::compute_forward(ggml_compute_params*, ggml_tensor*)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_iq4_nl, 8l, 8l, (ggml_type)8>::compute_forward(ggml_compute_params*, ggml_tensor*)
1602 | | |
1603 | | void forward_mul_mat_one_chunk(ggml_compute_params * params, |
1604 | | ggml_tensor * op, |
1605 | | int64_t src0_start, |
1606 | | int64_t src0_end, |
1607 | | int64_t src1_start, |
1608 | 0 | int64_t src1_end) { |
1609 | 0 | const ggml_tensor * src0 = op->src[0]; |
1610 | 0 | const ggml_tensor * src1 = op->src[1]; |
1611 | 0 | ggml_tensor * dst = op; |
1612 | |
|
1613 | 0 | GGML_TENSOR_BINARY_OP_LOCALS |
1614 | |
|
1615 | 0 | const size_t src1_col_stride = ggml_row_size(PARAM_TYPE, ne10); |
1616 | |
|
1617 | 0 | GGML_ASSERT(ne03 == 1 && ne13 == 1); |
1618 | 0 | GGML_ASSERT(ne12 % ne02 == 0); |
1619 | 0 | const int64_t r2 = ne12 / ne02; |
1620 | |
|
1621 | 0 | const int64_t i12 = src1_start / ne1; |
1622 | 0 | const int64_t i11 = src1_start - i12 * ne1; |
1623 | | |
1624 | | // Determine batch index |
1625 | 0 | const int64_t i02 = i12 / r2; |
1626 | |
|
1627 | 0 | const int64_t i1 = i11; |
1628 | 0 | const int64_t i2 = i12; |
1629 | |
|
1630 | 0 | const char * src0_ptr = (const char *) src0->data + i02 * nb02; |
1631 | 0 | const char * src1_ptr = (const char *) params->wdata + (i11 + i12 * ne11) * src1_col_stride; |
1632 | 0 | char * dst_ptr = ((char *) dst->data + (i1 * nb1 + i2 * nb2)); |
1633 | |
|
1634 | 0 | const int64_t nrows = src1_end - src1_start; |
1635 | 0 | const int64_t ncols = src0_end - src0_start; |
1636 | |
|
1637 | 0 | GGML_ASSERT(src1_ptr + src1_col_stride * nrows <= (const char *) params->wdata + params->wsize); |
1638 | | |
1639 | | // If there are more than three rows in src1, use gemm on the full groups of 4 rows; the remaining nrows % 4 rows go through gemv.
1640 | 0 | if (nrows > 3) { |
1641 | 0 | gemm<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00, (float *) (dst_ptr) + src0_start, nb1 / nb0, |
1642 | 0 | src0_ptr + src0_start * nb01, src1_ptr, |
1643 | 0 | nrows - (nrows % 4), ncols); |
1644 | 0 | } |
1645 | 0 | for (int iter = nrows - (nrows % 4); iter < nrows; iter++) { |
1646 | 0 | gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00, (float *) (dst_ptr + (iter * nb1)) + src0_start, |
1647 | 0 | ne01, src0_ptr + src0_start * nb01, |
1648 | 0 | src1_ptr + (src1_col_stride * iter), 1 /* nrows */, ncols); |
1649 | 0 | } |
1650 | 0 | }
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 4l, 4l, (ggml_type)8>::forward_mul_mat_one_chunk(ggml_compute_params*, ggml_tensor*, long, long, long, long)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 8l, 4l, (ggml_type)8>::forward_mul_mat_one_chunk(ggml_compute_params*, ggml_tensor*, long, long, long, long)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 8l, 8l, (ggml_type)8>::forward_mul_mat_one_chunk(ggml_compute_params*, ggml_tensor*, long, long, long, long)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_K, 8l, 8l, (ggml_type)15>::forward_mul_mat_one_chunk(ggml_compute_params*, ggml_tensor*, long, long, long, long)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q2_K, 8l, 8l, (ggml_type)15>::forward_mul_mat_one_chunk(ggml_compute_params*, ggml_tensor*, long, long, long, long)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_iq4_nl, 4l, 4l, (ggml_type)8>::forward_mul_mat_one_chunk(ggml_compute_params*, ggml_tensor*, long, long, long, long)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_iq4_nl, 8l, 8l, (ggml_type)8>::forward_mul_mat_one_chunk(ggml_compute_params*, ggml_tensor*, long, long, long, long)
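// Illustrative sketch (not ggml code) of the GEMM/GEMV split used in forward_mul_mat_one_chunk:
// whole groups of 4 src1 rows go to the 4-row GEMM kernel, and the leftover nrows % 4 rows are
// handled one at a time by GEMV.
#include <cstdio>

static void sketch_gemm_gemv_split(int nrows) {
    const int gemm_rows = nrows - (nrows % 4);
    if (nrows > 3) {
        std::printf("gemm: rows [0, %d)\n", gemm_rows);
    }
    for (int iter = gemm_rows; iter < nrows; ++iter) {
        std::printf("gemv: row %d\n", iter);
    }
}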
1651 | | |
1652 | 0 | void forward_mul_mat(ggml_compute_params * params, ggml_tensor * op) { |
1653 | 0 | const ggml_tensor * src0 = op->src[0]; |
1654 | 0 | const ggml_tensor * src1 = op->src[1]; |
1655 | 0 | ggml_tensor * dst = op; |
1656 | |
|
1657 | 0 | GGML_TENSOR_BINARY_OP_LOCALS |
1658 | |
|
1659 | 0 | const int ith = params->ith; |
1660 | 0 | const int nth = params->nth; |
1661 | |
|
1662 | 0 | GGML_ASSERT(ne0 == ne01); |
1663 | 0 | GGML_ASSERT(ne1 == ne11); |
1664 | 0 | GGML_ASSERT(ne2 == ne12); |
1665 | 0 | GGML_ASSERT(ne3 == ne13); |
1666 | | |
1667 | | // dst cannot be transposed or permuted |
1668 | 0 | GGML_ASSERT(nb0 == sizeof(float)); |
1669 | 0 | GGML_ASSERT(nb0 <= nb1); |
1670 | 0 | GGML_ASSERT(nb1 <= nb2); |
1671 | 0 | GGML_ASSERT(nb2 <= nb3); |
1672 | | |
1673 | | // TODO: General batched mul mat for 4D tensors |
1674 | | // Currently only supports 3D tensors |
1675 | 0 | GGML_ASSERT(ne03 == 1); |
1676 | 0 | GGML_ASSERT(ne13 == 1); |
1677 | 0 | GGML_ASSERT(ne3 == 1); |
1678 | |
|
1679 | 0 | GGML_ASSERT(src1->type == GGML_TYPE_F32); |
1680 | |
|
1681 | 0 | GGML_ASSERT(ggml_n_dims(op->src[0]) == 2); |
1682 | | // GGML_ASSERT(ggml_n_dims(op->src[1]) == 2); |
1683 | |
|
1684 | 0 | char * wdata = static_cast<char *>(params->wdata); |
1685 | 0 | const size_t nbw1 = ggml_row_size(PARAM_TYPE, ne10); |
1686 | 0 | const size_t nbw2 = nbw1 * ne11; |
1687 | |
|
1688 | 0 | assert(params->wsize >= nbw2 * ne12); |
1689 | |
|
1690 | 0 | const ggml_from_float_t from_float = ggml_get_type_traits_cpu(PARAM_TYPE)->from_float; |
1691 | | |
1692 | | // INFO: Quantization is done plane by plane to avoid extra complexity in the chunking.
1693 | | // Flattening the planes instead would require extra handling for dimensions that are not a
1694 | | // multiple of INTER_SIZE, depending on how the planes are broadcast.
1695 | 0 | for (int64_t i12 = 0; i12 < ne12; i12++) { |
1696 | 0 | char * data_ptr = (char *) src1->data + i12 * nb12; |
1697 | 0 | char * wdata_ptr = wdata + i12 * nbw2; |
1698 | |
|
1699 | 0 | for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) { |
1700 | 0 | ggml_quantize_mat_t<INTER_SIZE, PARAM_TYPE>((float *) (data_ptr + i11 * nb11), |
1701 | 0 | (void *) (wdata_ptr + i11 * nbw1), 4, ne10); |
1702 | 0 | } |
1703 | |
|
1704 | 0 | const int64_t i11_processed = ne11 - ne11 % 4; |
1705 | 0 | for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) { |
1706 | 0 | from_float((float *) (data_ptr + i11 * nb11), (void *) (wdata_ptr + i11 * nbw1), ne10); |
1707 | 0 | } |
1708 | 0 | } |
1709 | | |
1710 | | // disable for NUMA |
1711 | 0 | const bool disable_chunking = ggml_is_numa(); |
1712 | | |
1713 | | // 4x chunks per thread |
1714 | 0 | const int64_t nr0 = ggml_nrows(op->src[0]); |
1715 | |
|
1716 | 0 | int nth_scaled = nth * 4; |
1717 | 0 | int64_t chunk_size0 = (nr0 + nth_scaled - 1) / nth_scaled; |
1718 | 0 | int64_t nchunk0 = (nr0 + chunk_size0 - 1) / chunk_size0; |
1719 | | |
1720 | | // src1 is chunked only by full planes.
1721 | | // If we flattened instead, dimensions that are not a multiple of the q8 INTER_SIZE
1722 | | // would have to be routed through GEMV.
1723 | | // Using nchunk1 = ne12 also keeps the chunking unchanged for models with no 3D tensors,
1724 | | // so their performance is not affected.
1725 | 0 | int64_t nchunk1 = ne12; |
1726 | | |
1727 | | // Ensure minimum chunk size to avoid alignment issues with high thread counts |
1728 | | // Minimum chunk size should be at least NB_COLS to prevent overlapping chunks after alignment |
1729 | 0 | const int64_t min_chunk_size = NB_COLS; |
1730 | 0 | if (nchunk0 > 0 && (nr0 / nchunk0) < min_chunk_size && nr0 >= min_chunk_size) { |
1731 | 0 | nchunk0 = (nr0 + min_chunk_size - 1) / min_chunk_size; |
1732 | 0 | } |
1733 | |
|
1734 | 0 | if (nth == 1 || nchunk0 < nth || disable_chunking) { |
1735 | 0 | nchunk0 = nth; |
1736 | 0 | } |
1737 | |
|
1738 | 0 | const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0; |
1739 | | |
1740 | | // Ensure nchunk doesn't exceed the number of rows divided by minimum chunk size |
1741 | | // This prevents creating too many tiny chunks that could overlap after alignment |
1742 | 0 | const int64_t max_nchunk = (nr0 + min_chunk_size - 1) / min_chunk_size; |
1743 | 0 | nchunk0 = MIN(nchunk0, max_nchunk); |
1744 | |
|
1745 | 0 | if (ith == 0) { |
1746 | | // Every thread starts at ith, so the first unprocessed chunk is nth. This saves a bit of coordination right at the start.
1747 | 0 | ggml_threadpool_chunk_set(params->threadpool, nth); |
1748 | 0 | } |
1749 | |
|
1750 | 0 | ggml_barrier(params->threadpool); |
1751 | | |
1752 | | // The first chunk comes from our thread_id, the rest will get auto-assigned. |
1753 | 0 | int current_chunk = ith; |
1754 | |
|
1755 | 0 | while (current_chunk < nchunk0 * nchunk1) { |
1756 | 0 | const int64_t ith0 = current_chunk % nchunk0; |
1757 | 0 | const int64_t ith1 = current_chunk / nchunk0; |
1758 | |
|
1759 | 0 | int64_t src0_start = dr0 * ith0; |
1760 | 0 | int64_t src0_end = MIN(src0_start + dr0, nr0); |
1761 | | |
1762 | | // full-plane range for src1 |
1763 | 0 | int64_t src1_start = ith1 * ne11; |
1764 | 0 | int64_t src1_end = (ith1 + 1) * ne11; |
1765 | | |
1766 | | // Align boundaries to NB_COLS - round up to ensure all data is included |
1767 | | // The chunk size limiting above ensures chunks are large enough to prevent overlaps |
1768 | 0 | src0_start = (src0_start % NB_COLS) ? src0_start + NB_COLS - (src0_start % NB_COLS) : src0_start; |
1769 | 0 | src0_end = (src0_end % NB_COLS) ? src0_end + NB_COLS - (src0_end % NB_COLS) : src0_end; |
1770 | 0 | src0_end = MIN(src0_end, ne01); |
1771 | | |
1772 | | // The aligned range can be empty; keep taking chunks so the remaining planes are still processed.
1773 | 0 | if (src0_start >= src0_end) { |
1774 | 0 | current_chunk = ggml_threadpool_chunk_add(params->threadpool, 1); |
1775 | 0 | continue; |
1776 | 0 | } |
1777 | | |
1778 | 0 | forward_mul_mat_one_chunk(params, dst, src0_start, src0_end, src1_start, src1_end); |
1779 | |
|
1780 | 0 | current_chunk = ggml_threadpool_chunk_add(params->threadpool, 1); |
1781 | 0 | } |
1782 | 0 | }
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 4l, 4l, (ggml_type)8>::forward_mul_mat(ggml_compute_params*, ggml_tensor*)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 8l, 4l, (ggml_type)8>::forward_mul_mat(ggml_compute_params*, ggml_tensor*)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 8l, 8l, (ggml_type)8>::forward_mul_mat(ggml_compute_params*, ggml_tensor*)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_K, 8l, 8l, (ggml_type)15>::forward_mul_mat(ggml_compute_params*, ggml_tensor*)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q2_K, 8l, 8l, (ggml_type)15>::forward_mul_mat(ggml_compute_params*, ggml_tensor*)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_iq4_nl, 4l, 4l, (ggml_type)8>::forward_mul_mat(ggml_compute_params*, ggml_tensor*)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_iq4_nl, 8l, 8l, (ggml_type)8>::forward_mul_mat(ggml_compute_params*, ggml_tensor*)
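// Hedged, standalone sketch of the chunk boundary handling in forward_mul_mat above: both ends of
// a chunk's src0 row range are rounded up to the next multiple of NB_COLS (the number of
// interleaved rows per packed block) and the end is clamped to ne01; an empty range after
// alignment is simply skipped. Parameter names are illustrative.
#include <algorithm>
#include <cstdint>
#include <cstdio>

static void sketch_align_chunk_to_nb_cols(int64_t src0_start, int64_t src0_end, int64_t ne01, int64_t nb_cols) {
    src0_start = (src0_start % nb_cols) ? src0_start + nb_cols - (src0_start % nb_cols) : src0_start;
    src0_end   = (src0_end   % nb_cols) ? src0_end   + nb_cols - (src0_end   % nb_cols) : src0_end;
    src0_end   = std::min(src0_end, ne01);
    if (src0_start >= src0_end) {
        std::printf("empty chunk after alignment -> skip\n");
    } else {
        std::printf("aligned chunk: rows [%lld, %lld)\n", (long long) src0_start, (long long) src0_end);
    }
}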
1783 | | |
1784 | 0 | void forward_mul_mat_id(ggml_compute_params * params, ggml_tensor * op) { |
1785 | 0 | const ggml_tensor * src0 = op->src[0]; |
1786 | 0 | const ggml_tensor * src1 = op->src[1]; |
1787 | 0 | const ggml_tensor * ids = op->src[2]; |
1788 | 0 | ggml_tensor * dst = op; |
1789 | |
|
1790 | 0 | GGML_TENSOR_BINARY_OP_LOCALS |
1791 | |
|
1792 | 0 | const int ith = params->ith; |
1793 | 0 | const int nth = params->nth; |
1794 | |
|
1795 | 0 | const ggml_from_float_t from_float = ggml_get_type_traits_cpu(PARAM_TYPE)->from_float; |
1796 | | |
1797 | | // we don't support permuted src0 or src1 |
1798 | 0 | GGML_ASSERT(nb00 == ggml_type_size(src0->type)); |
1799 | 0 | GGML_ASSERT(nb10 == ggml_type_size(src1->type)); |
1800 | | |
1801 | | // dst cannot be transposed or permuted |
1802 | 0 | GGML_ASSERT(nb0 == sizeof(float)); |
1803 | 0 | GGML_ASSERT(nb0 <= nb1); |
1804 | 0 | GGML_ASSERT(nb1 <= nb2); |
1805 | 0 | GGML_ASSERT(nb2 <= nb3); |
1806 | |
|
1807 | 0 | GGML_ASSERT(ne03 == 1); |
1808 | 0 | GGML_ASSERT(ne13 == 1); |
1809 | 0 | GGML_ASSERT(ne3 == 1); |
1810 | |
|
1811 | 0 | GGML_ASSERT(src1->type == GGML_TYPE_F32); |
1812 | | |
1813 | | // row groups |
1814 | 0 | const int n_ids = ids->ne[0]; // n_expert_used |
1815 | 0 | const int n_as = ne02; // n_expert |
1816 | |
|
1817 | 0 | const size_t nbw1 = ggml_row_size(PARAM_TYPE, ne10); |
1818 | 0 | const size_t nbw2 = nbw1*ne11; |
1819 | 0 | const size_t nbw3 = nbw2*ne12; |
1820 | |
|
1821 | 0 | struct mmid_row_mapping { |
1822 | 0 | int32_t i1; |
1823 | 0 | int32_t i2; |
1824 | 0 | }; |
1825 | |
|
1826 | 0 | GGML_ASSERT(params->wsize >= |
1827 | 0 | (GGML_PAD(nbw3, sizeof(int64_t)) + |
1828 | 0 | n_as*(ne12 + 1)*sizeof(mmid_row_mapping)) |
1829 | 0 | ); |
1830 | |
|
1831 | 0 | auto * wdata = (char *)params->wdata; |
1832 | 0 | auto * wdata_src1_end = (char *)wdata + GGML_PAD(nbw3, sizeof(int64_t)); |
1833 | | |
1834 | | // total of [n_as][ne12 + 1] elements of type mmid_row_mapping (2*int32_t = int64_t)
1835 | 0 | auto * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as] |
1836 | 0 | struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *) (matrix_row_counts + n_as); // [n_as][ne12] |
1837 | | |
1838 | | // src1: float32 => param type |
1839 | 0 | for (int64_t i12 = 0; i12 < ne12; ++i12) { |
1840 | 0 | for (int64_t i11 = ith; i11 < ne11; i11 += nth) { |
1841 | 0 | from_float((float *)((char *) src1->data + i12 * nb12 + i11 * nb11), |
1842 | 0 | (void *) (wdata + i12 * nbw2 + i11 * nbw1), |
1843 | 0 | ne10); |
1844 | 0 | } |
1845 | 0 | } |
1846 | |
|
1847 | 0 | #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id) * ne12 + (i1)] |
1848 | |
|
1849 | 0 | if (ith == 0) { |
1850 | | // initialize matrix_row_counts |
1851 | 0 | memset(matrix_row_counts, 0, n_as * sizeof(int64_t)); |
1852 | | |
1853 | | // group rows by src0 matrix |
1854 | 0 | for (int32_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) { |
1855 | 0 | for (int32_t id = 0; id < n_ids; ++id) { |
1856 | 0 | const int32_t i02 = |
1857 | 0 | *(const int32_t *) ((const char *) ids->data + iid1 * ids->nb[1] + id * ids->nb[0]); |
1858 | |
|
1859 | 0 | GGML_ASSERT(i02 >= 0 && i02 < n_as); |
1860 | |
|
1861 | 0 | MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = { id, iid1 }; |
1862 | 0 | matrix_row_counts[i02] += 1; |
1863 | 0 | } |
1864 | 0 | } |
1865 | 0 | } |
1866 | |
|
1867 | 0 | ggml_barrier(params->threadpool); |
1868 | | |
1869 | | // compute each matrix multiplication in sequence |
1870 | 0 | for (int cur_a = 0; cur_a < n_as; ++cur_a) { |
1871 | 0 | const int64_t cne1 = matrix_row_counts[cur_a]; |
1872 | |
|
1873 | 0 | if (cne1 == 0) { |
1874 | 0 | continue; |
1875 | 0 | } |
1876 | | |
1877 | 0 | const auto * src0_cur = (const char *) src0->data + cur_a*nb02; |
1878 | | |
1879 | | //const int64_t nr0 = ne01; // src0 rows |
1880 | 0 | const int64_t nr1 = cne1; // src1 rows |
1881 | |
|
1882 | 0 | int64_t src0_cur_start = (ith * ne01) / nth; |
1883 | 0 | int64_t src0_cur_end = ((ith + 1) * ne01) / nth; |
1884 | | |
1885 | | // Align boundaries to NB_COLS - round up to ensure all data is included |
1886 | 0 | src0_cur_start = (src0_cur_start % NB_COLS) ? src0_cur_start + NB_COLS - (src0_cur_start % NB_COLS) : src0_cur_start; |
1887 | 0 | src0_cur_end = (src0_cur_end % NB_COLS) ? src0_cur_end + NB_COLS - (src0_cur_end % NB_COLS) : src0_cur_end; |
1888 | 0 | if (src0_cur_end > ne01) { |
1889 | 0 | src0_cur_end = ne01; |
1890 | 0 | } |
1891 | |
|
1892 | 0 | if (src0_cur_start >= src0_cur_end) { |
1893 | 0 | return; |
1894 | 0 | } |
1895 | | |
1896 | 0 | for (int ir1 = 0; ir1 < nr1; ir1++) { |
1897 | 0 | struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, ir1); |
1898 | |
|
1899 | 0 | const int id = row_mapping.i1; // selected expert index |
1900 | |
|
1901 | 0 | const int64_t i11 = id % ne11; |
1902 | 0 | const int64_t i12 = row_mapping.i2; // row index in src1 |
1903 | |
|
1904 | 0 | const int64_t i1 = id; // selected expert index |
1905 | 0 | const int64_t i2 = i12; // row |
1906 | |
|
1907 | 0 | const auto * src1_col = (const char *) wdata + (i11 * nbw1 + i12 * nbw2); |
1908 | |
|
1909 | 0 | gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00, |
1910 | 0 | (float *)((char *) dst->data + (i1 * nb1 + i2 * nb2)) + src0_cur_start, ne01, |
1911 | 0 | src0_cur + src0_cur_start * nb01, |
1912 | 0 | src1_col, 1, src0_cur_end - src0_cur_start); |
1913 | 0 | } |
1914 | 0 | } |
1915 | 0 | #undef MMID_MATRIX_ROW |
1916 | 0 | }
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 4l, 4l, (ggml_type)8>::forward_mul_mat_id(ggml_compute_params*, ggml_tensor*)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 8l, 4l, (ggml_type)8>::forward_mul_mat_id(ggml_compute_params*, ggml_tensor*)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 8l, 8l, (ggml_type)8>::forward_mul_mat_id(ggml_compute_params*, ggml_tensor*)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_K, 8l, 8l, (ggml_type)15>::forward_mul_mat_id(ggml_compute_params*, ggml_tensor*)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q2_K, 8l, 8l, (ggml_type)15>::forward_mul_mat_id(ggml_compute_params*, ggml_tensor*)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_iq4_nl, 4l, 4l, (ggml_type)8>::forward_mul_mat_id(ggml_compute_params*, ggml_tensor*)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_iq4_nl, 8l, 8l, (ggml_type)8>::forward_mul_mat_id(ggml_compute_params*, ggml_tensor*)
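// Hedged sketch of the expert row grouping in forward_mul_mat_id, using std::vector instead of
// the wdata scratch buffer: every (token, slot) entry of ids selects an expert, and the rows are
// bucketed per expert so that each expert's matrix multiplication can walk a contiguous list of
// mappings. Types and names below are illustrative, not ggml API.
#include <cstdint>
#include <vector>

struct sketch_row_mapping { int32_t i1; int32_t i2; };

static void sketch_group_rows_by_expert(const std::vector<std::vector<int32_t>> & ids, // [n_tokens][n_expert_used]
                                        int n_expert,
                                        std::vector<int64_t> & counts,                 // [n_expert]
                                        std::vector<sketch_row_mapping> & rows) {      // [n_expert][n_tokens]
    const int n_tokens = (int) ids.size();
    counts.assign(n_expert, 0);
    rows.assign((size_t) n_expert * (size_t) n_tokens, sketch_row_mapping{0, 0});
    for (int32_t iid1 = 0; iid1 < n_tokens; ++iid1) {
        for (int32_t id = 0; id < (int32_t) ids[iid1].size(); ++id) {
            const int32_t expert = ids[iid1][id];
            rows[(size_t) expert * n_tokens + (size_t) counts[expert]] = { id, iid1 };
            counts[expert] += 1;
        }
    }
}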
1917 | | |
1918 | 0 | int repack(struct ggml_tensor * t, const void * data, size_t data_size) override { |
1919 | 0 | GGML_LOG_DEBUG("%s: repack tensor %s with %s_%dx%d\n", __func__, t->name, ggml_type_name(t->type), |
1920 | 0 | (int) NB_COLS, (int) INTER_SIZE); |
1921 | 0 | return ggml::cpu::repack::repack<BLOC_TYPE, INTER_SIZE, NB_COLS>(t, data, data_size); |
1922 | 0 | }
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 4l, 4l, (ggml_type)8>::repack(ggml_tensor*, void const*, unsigned long)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 8l, 4l, (ggml_type)8>::repack(ggml_tensor*, void const*, unsigned long)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 8l, 8l, (ggml_type)8>::repack(ggml_tensor*, void const*, unsigned long)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_K, 8l, 8l, (ggml_type)15>::repack(ggml_tensor*, void const*, unsigned long)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q2_K, 8l, 8l, (ggml_type)15>::repack(ggml_tensor*, void const*, unsigned long)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_iq4_nl, 4l, 4l, (ggml_type)8>::repack(ggml_tensor*, void const*, unsigned long)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_iq4_nl, 8l, 8l, (ggml_type)8>::repack(ggml_tensor*, void const*, unsigned long)
1923 | | }; |
1924 | | |
1925 | | } // namespace ggml::cpu::repack |
1926 | | |
1927 | 0 | static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(const struct ggml_tensor * cur) { |
1928 | | |
1929 | | // instance for Q4 |
1930 | 0 | static const ggml::cpu::repack::tensor_traits<block_q4_0, 4, 4, GGML_TYPE_Q8_0> q4_0_4x4_q8_0; |
1931 | 0 | static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 4, GGML_TYPE_Q8_0> q4_0_4x8_q8_0; |
1932 | 0 | static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 8, GGML_TYPE_Q8_0> q4_0_8x8_q8_0; |
1933 | 0 | static const ggml::cpu::repack::tensor_traits<block_q4_K, 8, 8, GGML_TYPE_Q8_K> q4_K_8x8_q8_K; |
1934 | | |
1935 | | // instance for Q2 |
1936 | 0 | static const ggml::cpu::repack::tensor_traits<block_q2_K, 8, 8, GGML_TYPE_Q8_K> q2_K_8x8_q8_K; |
1937 | | |
1938 | | // instance for IQ4 |
1939 | 0 | static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0; |
1940 | 0 | static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0> iq4_nl_8x8_q8_0; |
1941 | |
|
1942 | 0 | if (cur->type == GGML_TYPE_Q4_0) { |
1943 | 0 | if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) { |
1944 | 0 | if (cur->ne[1] % 8 == 0) { |
1945 | 0 | return &q4_0_8x8_q8_0; |
1946 | 0 | } |
1947 | 0 | } |
1948 | 0 | if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) { |
1949 | 0 | if (cur->ne[1] % 4 == 0) { |
1950 | 0 | return &q4_0_4x8_q8_0; |
1951 | 0 | } |
1952 | 0 | } |
1953 | 0 | if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) { |
1954 | 0 | if (cur->ne[1] % 4 == 0) { |
1955 | 0 | return &q4_0_4x4_q8_0; |
1956 | 0 | } |
1957 | 0 | } |
1958 | 0 | } else if (cur->type == GGML_TYPE_Q4_K) { |
1959 | 0 | if (ggml_cpu_has_avx2()) { |
1960 | 0 | if (cur->ne[1] % 8 == 0) { |
1961 | 0 | return &q4_K_8x8_q8_K; |
1962 | 0 | } |
1963 | 0 | } |
1964 | 0 | } else if (cur->type == GGML_TYPE_Q2_K) { |
1965 | 0 | if (ggml_cpu_has_avx512()) { |
1966 | 0 | if (cur->ne[1] % 8 == 0) { |
1967 | 0 | return &q2_K_8x8_q8_K; |
1968 | 0 | } |
1969 | 0 | } |
1970 | 0 | } else if (cur->type == GGML_TYPE_IQ4_NL) { |
1971 | 0 | if (ggml_cpu_has_avx2()) { |
1972 | 0 | if (cur->ne[1] % 8 == 0) { |
1973 | 0 | return &iq4_nl_8x8_q8_0; |
1974 | 0 | } |
1975 | 0 | } |
1976 | 0 | if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) { |
1977 | 0 | if (cur->ne[1] % 4 == 0) { |
1978 | 0 | return &iq4_nl_4x4_q8_0; |
1979 | 0 | } |
1980 | 0 | } |
1981 | 0 | } |
1982 | | |
1983 | 0 | return nullptr; |
1984 | 0 | } |
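// Hedged sketch of the Q4_0 branch of the selection above, with the CPU features passed in as
// booleans instead of querying ggml_cpu_has_* at runtime: it returns the packed layout that would
// be chosen, or nullptr when no repacked layout applies. Purely illustrative.
#include <cstdint>

static const char * sketch_pick_q4_0_layout(bool has_avx2_or_sve256_i8mm, bool has_neon_i8mm,
                                            bool has_neon_dotprod, int64_t ne1) {
    if (has_avx2_or_sve256_i8mm && ne1 % 8 == 0) return "q4_0_8x8_q8_0";
    if (has_neon_i8mm           && ne1 % 4 == 0) return "q4_0_4x8_q8_0";
    if (has_neon_dotprod        && ne1 % 4 == 0) return "q4_0_4x4_q8_0";
    return nullptr; // fall back to the generic (non-repacked) path
}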
1985 | | |
1986 | 0 | static enum ggml_status ggml_backend_cpu_repack_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { |
1987 | 0 | tensor->extra = (void *) const_cast<ggml::cpu::tensor_traits *>(ggml_repack_get_optimal_repack_type(tensor)); |
1988 | |
|
1989 | 0 | GGML_UNUSED(buffer); |
1990 | 0 | return GGML_STATUS_SUCCESS; |
1991 | 0 | } |
1992 | | |
1993 | | static void ggml_backend_cpu_repack_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, |
1994 | 0 | const void * data, size_t offset, size_t size) { |
1995 | 0 | GGML_ASSERT(offset == 0); |
1996 | 0 | GGML_ASSERT(size == ggml_nbytes(tensor)); |
1997 | |
|
1998 | 0 | auto tensor_traits = (ggml::cpu::repack::tensor_traits_base *) tensor->extra; |
1999 | 0 | auto OK = tensor_traits->repack(tensor, data, size); |
2000 | |
|
2001 | 0 | GGML_ASSERT(OK == 0); |
2002 | 0 | GGML_UNUSED(buffer); |
2003 | 0 | } |
2004 | | |
2005 | 0 | static const char * ggml_backend_cpu_repack_buffer_type_get_name(ggml_backend_buffer_type_t buft) { |
2006 | 0 | return "CPU_REPACK"; |
2007 | | |
2008 | 0 | GGML_UNUSED(buft); |
2009 | 0 | } |
2010 | | |
2011 | 0 | static ggml_backend_buffer_t ggml_backend_cpu_repack_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { |
2012 | 0 | ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size); |
2013 | |
|
2014 | 0 | if (buffer == nullptr) { |
2015 | 0 | return nullptr; |
2016 | 0 | } |
2017 | | |
2018 | 0 | buffer->buft = buft; |
2019 | 0 | buffer->iface.init_tensor = ggml_backend_cpu_repack_buffer_init_tensor; |
2020 | 0 | buffer->iface.set_tensor = ggml_backend_cpu_repack_buffer_set_tensor; |
2021 | 0 | buffer->iface.get_tensor = nullptr; |
2022 | 0 | buffer->iface.cpy_tensor = nullptr; |
2023 | 0 | return buffer; |
2024 | 0 | } |
2025 | | |
2026 | 0 | static size_t ggml_backend_cpu_repack_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { |
2027 | 0 | return TENSOR_ALIGNMENT; |
2028 | | |
2029 | 0 | GGML_UNUSED(buft); |
2030 | 0 | } |
2031 | | |
2032 | | namespace ggml::cpu::repack { |
2033 | | class extra_buffer_type : ggml::cpu::extra_buffer_type { |
2034 | 0 | bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override { |
2035 | 0 | if ( op->op == GGML_OP_MUL_MAT && |
2036 | 0 | op->src[0]->buffer && |
2037 | 0 | (ggml_n_dims(op->src[0]) == 2) && |
2038 | 0 | op->src[0]->buffer->buft == ggml_backend_cpu_repack_buffer_type() && |
2039 | 0 | ggml_repack_get_optimal_repack_type(op->src[0]) |
2040 | 0 | ) { |
2041 | 0 | if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) { |
2042 | 0 | return false; |
2043 | 0 | } |
2044 | 0 | if (op->src[1]->type == GGML_TYPE_F32) { |
2045 | 0 | return true; |
2046 | 0 | } |
2047 | | //if (op->src[1]->type == GGML_TYPE_Q8_0) { |
2048 | | // return true; |
2049 | | //} |
2050 | | // may be possible if Q8_0 packed... |
2051 | 0 | } else if (op->op == GGML_OP_MUL_MAT_ID |
2052 | 0 | && op->src[0]->buffer |
2053 | 0 | && (ggml_n_dims(op->src[0]) == 3) |
2054 | 0 | && op->src[0]->buffer->buft == ggml_backend_cpu_repack_buffer_type() |
2055 | 0 | && ggml_repack_get_optimal_repack_type(op->src[0]) |
2056 | 0 | ) { |
2057 | 0 | if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) { |
2058 | 0 | return false; |
2059 | 0 | } |
2060 | 0 | if (op->src[1]->type == GGML_TYPE_F32) { |
2061 | 0 | return true; |
2062 | 0 | } |
2063 | | //if (op->src[1]->type == GGML_TYPE_Q8_0) { |
2064 | | // return true; |
2065 | | //} |
2066 | 0 | } |
2067 | 0 | return false; |
2068 | 0 | } |
2069 | | |
2070 | 0 | ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override { |
2071 | 0 | if (op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_MUL_MAT_ID) { |
2072 | 0 | if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_repack_buffer_type()) { |
2073 | 0 | return (ggml::cpu::tensor_traits *) op->src[0]->extra; |
2074 | 0 | } |
2075 | 0 | } |
2076 | 0 | return nullptr; |
2077 | 0 | } |
2078 | | }; |
2079 | | } // namespace ggml::cpu::repack |
2080 | | |
2081 | 0 | ggml_backend_buffer_type_t ggml_backend_cpu_repack_buffer_type(void) { |
2082 | 0 | static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_repack = { |
2083 | 0 | /* .iface = */ { |
2084 | 0 | /* .get_name = */ ggml_backend_cpu_repack_buffer_type_get_name, |
2085 | 0 | /* .alloc_buffer = */ ggml_backend_cpu_repack_buffer_type_alloc_buffer, |
2086 | 0 | /* .get_alignment = */ ggml_backend_cpu_repack_buffer_type_get_alignment, |
2087 | 0 | /* .get_max_size = */ nullptr, // defaults to SIZE_MAX |
2088 | 0 | /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes |
2089 | 0 | /* .is_host = */ nullptr, |
2090 | 0 | }, |
2091 | 0 | /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), |
2092 | 0 | /* .context = */ new ggml::cpu::repack::extra_buffer_type(), |
2093 | 0 | }; |
2094 | |
|
2095 | 0 | return &ggml_backend_cpu_buffer_type_repack; |
2096 | 0 | } |
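// Hedged usage sketch: allocating weights in the CPU_REPACK buffer type makes init_tensor pick the
// packed layout for each tensor and set_tensor repack the incoming row-major data on upload.
// Assumes the public ggml-alloc / ggml-backend API; the context, tensor and data come from the
// caller, and the helper name is illustrative.
#include "ggml-alloc.h"

static ggml_backend_buffer_t sketch_alloc_repacked_weights(struct ggml_context * ctx,
                                                           struct ggml_tensor  * weight,
                                                           const void          * weight_data) {
    ggml_backend_buffer_type_t buft = ggml_backend_cpu_repack_buffer_type();
    ggml_backend_buffer_t      buf  = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
    if (buf != nullptr) {
        // routed through ggml_backend_cpu_repack_buffer_set_tensor() -> tensor_traits::repack()
        ggml_backend_tensor_set(weight, weight_data, 0, ggml_nbytes(weight));
    }
    return buf;
}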