Coverage Report

Created: 2026-01-11 07:13

/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp

All instrumented lines in the listing below have an execution count of 0; the entire region shown here is uncovered.
#define GGML_COMMON_IMPL_CPP
#define GGML_COMMON_DECL_CPP
#include "ggml-common.h"
#include "ggml-backend-impl.h"

#include "ggml-impl.h"
#include "ggml-cpu.h"
#include "ggml-cpu-impl.h"
#include "simd-mappings.h"
#include "traits.h"

#include "arch-fallback.h"

#include <cmath>
#include <cstring>
#include <cassert>
#include <cstdio>  // for GGML_ASSERT

#include "repack.h"

#if defined(__GNUC__)
#pragma GCC diagnostic ignored "-Woverlength-strings"
#endif

#define UNUSED GGML_UNUSED

static inline int nearest_int(float fval) {
    assert(fabsf(fval) <= 4194303.f);
    float val = fval + 12582912.f;
    int i; memcpy(&i, &val, sizeof(int));
    return (i & 0x007fffff) - 0x00400000;
}
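
The magic-number rounding in nearest_int works because adding 12582912.0f (1.5 * 2^23) forces the sum onto the 2^23 binade, so the rounded integer lands in the low 23 mantissa bits and subtracting the 0x00400000 bias recovers it. A standalone sketch (an illustrative check, not part of repack.cpp) comparing it against lrintf under the default round-to-nearest-even mode:

// Illustrative check of the nearest_int() trick (hypothetical test harness).
#include <cassert>
#include <cmath>
#include <cstring>

static int nearest_int_ref(float fval) {
    assert(fabsf(fval) <= 4194303.f);     // 2^22 - 1: keeps the result in range
    float val = fval + 12582912.f;        // 1.5 * 2^23 pins the exponent
    int i; memcpy(&i, &val, sizeof(int));
    return (i & 0x007fffff) - 0x00400000; // mantissa bits minus the 1.5 bias
}

int main() {
    const float cases[] = { 0.49f, 0.5f, -0.5f, 1.5f, -2.5f, 1000.25f };
    for (float f : cases) {
        // both round to nearest, ties to even
        assert(nearest_int_ref(f) == (int) lrintf(f));
    }
    return 0;
}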

// Functions to create the interleaved data layout formats

// interleave 4 block_q4_0s in blocks of blck_size_interleave
// returns an interleaved block_q4_0x4
// in the interleaved block_q4_0x4, place deltas for 4 block_q4_0 blocks
// first, then interleave quants from 4 block_q4_0s in blocks of blck_size_interleave
//
// - in                  : an array of block_q4_0 pointers
// - blck_size_interleave : the block_q4_0 quants bytes are interleaved in blocks of
//                         blck_size_interleave bytes
// - xor_mask            : the mask to convert the nibbles in block_q4_0 quants bytes
//                         from bias offset form to pure sign form (this saves subtract
//                         operations during unpacking)
//

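The xor_mask trick rests on a nibble identity; for the 4-bit formats the mask is presumably 0x88 (an assumption here, based on the bias-offset description above), since XOR-ing 0x8 into an unsigned nibble n re-encodes the biased value n - 8 as a two's-complement 4-bit value. A minimal standalone check of that identity (illustration only, not the repack routine itself):

// Illustration of the xor_mask idea: Q4_0 stores each weight as an unsigned
// nibble n meaning n - 8. XOR-ing each nibble with 0x8 re-encodes it as a
// two's-complement 4-bit value, so unpacking needs no subtraction.
#include <cassert>
#include <cstdint>

int main() {
    for (uint8_t n = 0; n < 16; ++n) {
        int bias_form = (int) n - 8;                  // original Q4_0 meaning
        uint8_t x = n ^ 0x8;                          // apply the per-nibble mask
        int sign_form = (x & 0x8) ? (int) x - 16 : x; // read as signed 4-bit
        assert(bias_form == sign_form);
    }
    return 0;
}
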
extern "C" {

void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
    assert(QK8_0 == 32);
    assert(k % QK8_0 == 0);
    const int nb = k / QK8_0;

    block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy;

    // scalar
    const int blck_size_interleave = 4;
    float srcv[4][QK8_0];
    float id[4];

    for (int i = 0; i < nb; i++) {
        for (int row_iter = 0; row_iter < 4; row_iter++) {
            float amax = 0.0f; // absolute max

            for (int j = 0; j < QK8_0; j++) {
                srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
                amax = MAX(amax, fabsf(srcv[row_iter][j]));
            }

            const float d = amax / ((1 << 7) - 1);
            id[row_iter] = d ? 1.0f / d : 0.0f;

            y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
        }

        for (int j = 0; j < QK8_0 * 4; j++) {
            int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
            int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
            src_offset += (j % blck_size_interleave);

            float x0 = srcv[src_id][src_offset] * id[src_id];
            y[i].qs[j] = roundf(x0);
        }
    }
}
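
For reference, the src_id / src_offset arithmetic above walks the four source rows in groups of blck_size_interleave bytes: row 0 bytes 0-3, row 1 bytes 0-3, ..., row 3 bytes 0-3, then row 0 bytes 4-7, and so on. A small standalone sketch (illustration only) that prints the mapping:

// Sketch of the index mapping used above (blck_size_interleave = 4).
#include <cstdio>

int main() {
    const int blck_size_interleave = 4;
    for (int j = 0; j < 24; j++) {
        int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
        int src_id     = (j % (4 * blck_size_interleave)) / blck_size_interleave;
        src_offset    += (j % blck_size_interleave);
        printf("qs[%2d] <- srcv[%d][%d]\n", j, src_id, src_offset);
    }
    return 0;
}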

void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
    assert(QK8_0 == 32);
    assert(k % QK8_0 == 0);
    const int nb = k / QK8_0;

    block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy;

    // scalar
    const int blck_size_interleave = 8;
    float srcv[4][QK8_0];
    float id[4];

    for (int i = 0; i < nb; i++) {
        for (int row_iter = 0; row_iter < 4; row_iter++) {
            float amax = 0.0f; // absolute max

            for (int j = 0; j < QK8_0; j++) {
                srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
                amax = MAX(amax, fabsf(srcv[row_iter][j]));
            }

            const float d = amax / ((1 << 7) - 1);
            id[row_iter] = d ? 1.0f / d : 0.0f;

            y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
        }

        for (int j = 0; j < QK8_0 * 4; j++) {
            int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
            int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
            src_offset += (j % blck_size_interleave);

            float x0 = srcv[src_id][src_offset] * id[src_id];
            y[i].qs[j] = roundf(x0);
        }
    }
}


void ggml_quantize_mat_q8_K_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
    assert(QK_K == 256);
    assert(k % QK_K == 0);
    const int nb = k / QK_K;

    block_q8_Kx4 * GGML_RESTRICT y = (block_q8_Kx4 *) vy;

    // scalar
    const int blck_size_interleave = 4;
    float srcv[4][QK_K];
    float iscale[4];

    for (int i = 0; i < nb; i++) {
        for (int row_iter = 0; row_iter < 4; row_iter++) {
            float amax = 0.0f; // absolute max
            float max = 0;

            for (int j = 0; j < QK_K; j++) {
                srcv[row_iter][j] = x[row_iter * k + i * QK_K + j];
                // Update the maximum value of the corresponding super block
                if (amax < fabsf(srcv[row_iter][j])) {
                    amax = fabsf(srcv[row_iter][j]);
                    max = srcv[row_iter][j];
                }
            }

            iscale[row_iter] = amax ? -127.f/max : 0;

            y[i].d[row_iter] = amax ? 1/iscale[row_iter] : 0;
        }

        for (int j = 0; j < QK_K / 4; j++) {
            y[i].bsums[j] = 0;
        }

        // Quant values are interleaved in sequences of four bytes from the corresponding super blocks
        // Bsums values are interleaved in sequences of four bsums from each super block taken for interleaving,
        // i.e. the first four bsums from the first super block, then the first four bsums from the second super block, and so on
        for (int j = 0; j < QK_K * 4; j++) {
            int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
            int src_id     = (j % (4 * blck_size_interleave)) / blck_size_interleave;
            src_offset += (j % blck_size_interleave);
            int index = (((j & 15) >> 2) << 2) + ((j >> 8) << 4) + ((j >> 6) & 3);

            float x0 = srcv[src_id][src_offset] * iscale[src_id];
            y[i].qs[j] = nearest_int(x0);
            y[i].bsums[index] += y[i].qs[j];
        }
    }
}
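
The bit-twiddled index above routes each quant into the bsums slot of its 16-value sub-block, with slots laid out four per source row exactly as the comment describes. A standalone sketch (illustration only, not library code) showing where a few quant positions land:

// Sketch of the bsums index arithmetic above (blck_size_interleave = 4):
// output byte j comes from source row src_id, and its quant is accumulated
// into the bsums slot of that row's 16-value sub-block.
#include <cstdio>

int main() {
    const int js[] = { 0, 15, 16, 63, 64, 255, 256, 1023 };
    for (int j : js) {
        int src_id = (j % 16) / 4; // which of the 4 interleaved rows
        int index  = (((j & 15) >> 2) << 2) + ((j >> 8) << 4) + ((j >> 6) & 3);
        printf("qs[%4d]: row %d -> bsums[%2d]\n", j, src_id, index);
    }
    return 0;
}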

void ggml_quantize_mat_q8_K_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
    assert(QK_K == 256);
    assert(k % QK_K == 0);
    const int nb = k / QK_K;

    block_q8_Kx4 * GGML_RESTRICT y = (block_q8_Kx4 *) vy;

    // scalar
    const int blck_size_interleave = 8;
    float srcv[4][QK_K];
    float iscale[4];

    for (int i = 0; i < nb; i++) {
        for (int row_iter = 0; row_iter < 4; row_iter++) {
            float amax = 0.0f; // absolute max
            float max = 0;

            for (int j = 0; j < QK_K; j++) {
                srcv[row_iter][j] = x[row_iter * k + i * QK_K + j];
                // Update the maximum value of the corresponding super block
                if (amax < fabsf(srcv[row_iter][j])) {
                    amax = fabsf(srcv[row_iter][j]);
                    max = srcv[row_iter][j];
                }
            }

            iscale[row_iter] = amax ? -127.f/max : 0;

            y[i].d[row_iter] = amax ? 1/iscale[row_iter] : 0;
        }

        for (int j = 0; j < QK_K / 4; j++) {
            y[i].bsums[j] = 0;
        }

        // Quant values are interleaved in sequences of eight bytes from the corresponding super blocks
        // Bsums values are interleaved in sequences of four bsums from each super block taken for interleaving,
        // i.e. the first four bsums from the first super block, then the first four bsums from the second super block, and so on
        for (int j = 0; j < QK_K * 4; j++) {
            int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
            int src_id     = (j % (4 * blck_size_interleave)) / blck_size_interleave;
            src_offset += (j % blck_size_interleave);
            int index = (((j & 31) >> 3) << 2) + ((j >> 8) << 4) + ((j >> 6) & 3);

            float x0 = srcv[src_id][src_offset] * iscale[src_id];
            y[i].qs[j] = nearest_int(x0);
            y[i].bsums[index] += y[i].qs[j];
        }
    }
}

} // extern "C"

template <int64_t INTER_SIZE, ggml_type PARAM_TYPE>
void ggml_quantize_mat_t(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row);

template <> void ggml_quantize_mat_t<4, GGML_TYPE_Q8_0>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
    assert(nrow == 4);
    UNUSED(nrow);
    ggml_quantize_mat_q8_0_4x4(x, vy, n_per_row);
}

template <> void ggml_quantize_mat_t<8, GGML_TYPE_Q8_0>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
    assert(nrow == 4);
    UNUSED(nrow);
    ggml_quantize_mat_q8_0_4x8(x, vy, n_per_row);
}

template <> void ggml_quantize_mat_t<4, GGML_TYPE_Q8_K>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
    assert(nrow == 4);
    UNUSED(nrow);
    ggml_quantize_mat_q8_K_4x4(x, vy, n_per_row);
}

template <> void ggml_quantize_mat_t<8, GGML_TYPE_Q8_K>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
    assert(nrow == 4);
    UNUSED(nrow);
    ggml_quantize_mat_q8_K_4x8(x, vy, n_per_row);
}
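
A caller selects one of these four-row quantization kernels purely from the two template arguments. A hedged usage sketch (the wrapper is hypothetical, and the destination buffer is assumed to be sized for block_q8_0x4 packing of four rows of k floats):

// Illustrative caller (hypothetical helper, not part of repack.cpp):
// quantize four rows of k floats into interleaved q8_0 blocks with an
// 8-byte interleave, via the <8, GGML_TYPE_Q8_0> specialization above.
static void quantize_4rows_q8_0x8(const float * x, void * vy, int64_t k) {
    ggml_quantize_mat_t<8, GGML_TYPE_Q8_0>(x, vy, /*nrow=*/4, /*n_per_row=*/k);
}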
extern "C" {

void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK8_0;
    const int nb = n / qk;
    const int ncols_interleaved = 4;
    const int blocklen = 4;

    assert(nr == 1);
    assert(n % qk == 0);
    assert(nc % ncols_interleaved == 0);

    UNUSED(s);
    UNUSED(bs);
    UNUSED(vx);
    UNUSED(vy);
    UNUSED(nr);
    UNUSED(nc);
    UNUSED(nb);
    UNUSED(ncols_interleaved);
    UNUSED(blocklen);

    float sumf[4];
    int sumi;

    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
    for (int x = 0; x < nc / ncols_interleaved; x++) {
        const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);

        for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
        for (int l = 0; l < nb; l++) {
            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                for (int j = 0; j < ncols_interleaved; j++) {
                    sumi = 0;
                    for (int i = 0; i < blocklen; ++i) {
                        const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
                        const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
                        sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
                    }
                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
                }
            }
        }
        for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
    }
}

void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK8_0;
    const int nb = n / qk;
    const int ncols_interleaved = 4;
    const int blocklen = 8;

    assert (n % qk == 0);
    assert (nc % ncols_interleaved == 0);

    UNUSED(s);
    UNUSED(bs);
    UNUSED(vx);
    UNUSED(vy);
    UNUSED(nr);
    UNUSED(nc);
    UNUSED(nb);
    UNUSED(ncols_interleaved);
    UNUSED(blocklen);

    float sumf[4];
    int sumi;

    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
    for (int x = 0; x < nc / ncols_interleaved; x++) {
        const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);

        for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
        for (int l = 0; l < nb; l++) {
            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                for (int j = 0; j < ncols_interleaved; j++) {
                    sumi = 0;
                    for (int i = 0; i < blocklen; ++i) {
                        const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
                        const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
                        sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
                    }
                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
                }
            }
        }
        for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
    }
}

void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK8_0;
    const int nb = n / qk;
    const int ncols_interleaved = 8;
    const int blocklen = 8;

    assert (n % qk == 0);
    assert (nc % ncols_interleaved == 0);

    UNUSED(s);
    UNUSED(bs);
    UNUSED(vx);
    UNUSED(vy);
    UNUSED(nr);
    UNUSED(nc);
    UNUSED(nb);
    UNUSED(ncols_interleaved);
    UNUSED(blocklen);

    float sumf[8];
    int sumi;

    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
    for (int x = 0; x < nc / ncols_interleaved; x++) {
        const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);

        for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
        for (int l = 0; l < nb; l++) {
            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                for (int j = 0; j < ncols_interleaved; j++) {
                    sumi = 0;
                    for (int i = 0; i < blocklen; ++i) {
                        const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
                        const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
                        sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
                    }
                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
                }
            }
        }
        for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
    }
}

void ggml_gemv_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK_K;
    const int nb = n / qk;
    const int ncols_interleaved = 8;
    const int blocklen = 4;
    static const uint32_t kmask1 = 0x3f3f3f3f;
    static const uint32_t kmask2 = 0x0f0f0f0f;
    static const uint32_t kmask3 = 0x03030303;

    assert (n % qk == 0);
    assert (nc % ncols_interleaved == 0);

    UNUSED(bs);
    UNUSED(nr);

    float sumf[8];
    float sum_minf[8];
    uint32_t utmp[32];
    int sumi1;
    int sumi2;
    int sumi;

    const block_q8_K * a_ptr = (const block_q8_K *) vy;
    for (int x = 0; x < nc / ncols_interleaved; x++) {
        const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);

        for (int j = 0; j < ncols_interleaved; j++) {
            sumf[j] = 0.0;
            sum_minf[j] = 0.0;
        }
        for (int l = 0; l < nb; l++) {
            for (int sb = 0; sb < 8; sb++) {
                memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
                utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
                const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
                utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
                utmp[sb * 4 + 2] = uaux_0;
                utmp[sb * 4 + 0] &= kmask1;
            }
            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                uint8_t * scales_0 = (uint8_t *) utmp + (k / 8) * 32;
                uint8_t * scales_1 = (uint8_t *) utmp + (k / 8) * 32 + 16;
                for (int j = 0; j < ncols_interleaved; j++) {
                    sumi1 = 0;
                    sumi2 = 0;
                    sumi = 0;
                    for (int i = 0; i < blocklen; ++i) {
                        const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
                        const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
                        sumi1 = (v0 * a_ptr[l].qs[(k / 8) * 64 + (k % 8) * blocklen + i]);
                        sumi2 = (v1 * a_ptr[l].qs[(k / 8) * 64 + (k % 8) * blocklen + i + 32]);
                        sumi1 = sumi1 * scales_0[j];
                        sumi2 = sumi2 * scales_1[j];
                        sumi += sumi1 + sumi2;
                    }
                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
                }
            }
            for (int sb = 0; sb < 8; sb++) {
                uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16;
                for (int j = 0; j < ncols_interleaved; j++) {
                    sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
                }
            }
        }
        for (int j = 0; j < ncols_interleaved; j++) {
            s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
        }
    }
}
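
Each 12-byte scales group in q4_K carries eight 6-bit scales and eight 6-bit mins; the kmask shuffle above expands one group into 16 plain bytes, scales first, then mins. A standalone sketch of the same shuffle on one group (illustration only, with made-up input bytes; it assumes a little-endian host, as the uint32_t reinterpretation does):

// Expand one 12-byte q4_K scale group into 8 scale bytes + 8 min bytes.
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
    const uint8_t packed[12] = { // 8 scales and 8 mins, 6 bits each (example data)
        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c };
    const uint32_t kmask1 = 0x3f3f3f3f, kmask2 = 0x0f0f0f0f, kmask3 = 0x03030303;

    uint32_t utmp[4];
    memcpy(utmp, packed, 12);
    utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
    const uint32_t uaux_0 = utmp[1] & kmask1;
    utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
    utmp[2] = uaux_0;
    utmp[0] &= kmask1;

    const uint8_t * scales = (const uint8_t *) utmp;     // bytes 0..7
    const uint8_t * mins   = (const uint8_t *) utmp + 8; // bytes 8..15
    for (int i = 0; i < 8; i++) {
        printf("scale[%d]=%u  min[%d]=%u\n", i, (unsigned) scales[i], i, (unsigned) mins[i]);
    }
    return 0;
}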

void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK_K;
    const int nb = n / qk;
    const int ncols_interleaved = 8;
    const int blocklen = 8;
    static const uint32_t kmask1 = 0x3f3f3f3f;
    static const uint32_t kmask2 = 0x0f0f0f0f;
    static const uint32_t kmask3 = 0x03030303;

    assert (n % qk == 0);
    assert (nc % ncols_interleaved == 0);

    UNUSED(s);
    UNUSED(bs);
    UNUSED(vx);
    UNUSED(vy);
    UNUSED(nr);
    UNUSED(nc);
    UNUSED(nb);
    UNUSED(ncols_interleaved);
    UNUSED(blocklen);

    float sumf[8];
    float sum_minf[8];
    uint32_t utmp[32];
    int sumi1;
    int sumi2;
    int sumi;

    const block_q8_K * a_ptr = (const block_q8_K *) vy;
    for (int x = 0; x < nc / ncols_interleaved; x++) {
        const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);

        for (int j = 0; j < ncols_interleaved; j++) {
            sumf[j] = 0.0;
            sum_minf[j] = 0.0;
        }
        for (int l = 0; l < nb; l++) {
            for (int sb = 0; sb < 8; sb++) {
                memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
                utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
                const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
                utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
                utmp[sb * 4 + 2] = uaux_0;
                utmp[sb * 4 + 0] &= kmask1;
            }
            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                uint8_t * scales_0 = (uint8_t *) utmp + (k / 4) * 32;
                uint8_t * scales_1 = (uint8_t *) utmp + (k / 4) * 32 + 16;
                for (int j = 0; j < ncols_interleaved; j++) {
                    sumi1 = 0;
                    sumi2 = 0;
                    sumi = 0;
                    for (int i = 0; i < blocklen; ++i) {
                        const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
                        const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
                        sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 64 + (k % 4) * blocklen + i]);
                        sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 64 + (k % 4) * blocklen + i + 32]);
                        sumi1 = sumi1 * scales_0[j];
                        sumi2 = sumi2 * scales_1[j];
                        sumi += sumi1 + sumi2;
                    }
                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
                }
            }
            for (int sb = 0; sb < 8; sb++) {
                uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16;
                for (int j = 0; j < ncols_interleaved; j++) {
                    sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
                }
            }
        }
        for (int j = 0; j < ncols_interleaved; j++) {
            s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
        }
    }
}

void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK_K;
    const int nb = n / qk;
    const int ncols_interleaved = 8;
    const int blocklen = 8;

    assert (n % qk == 0);
    assert (nc % ncols_interleaved == 0);

    UNUSED(s);
    UNUSED(bs);
    UNUSED(vx);
    UNUSED(vy);
    UNUSED(nr);
    UNUSED(nc);
    UNUSED(nb);
    UNUSED(ncols_interleaved);
    UNUSED(blocklen);

    float sumf[8];
    float sum_minf[8];
    int sumi1, sumi2, sumi3, sumi4;
    int sumi;

    const block_q8_K * a_ptr = (const block_q8_K *) vy;
    for (int x = 0; x < nc / ncols_interleaved; x++) {
        const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb);
        for (int j = 0; j < ncols_interleaved; j++) {
            sumf[j] = 0.0;
            sum_minf[j] = 0.0;
        }
        for (int l = 0; l < nb; l++) {
            for (int k = 0; k < (qk / (4 * blocklen)); k++) {
                const uint8_t * scales_0 = b_ptr[l].scales + (k / 4) * 64;
                const uint8_t * scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
                const uint8_t * scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
                const uint8_t * scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
                for (int j = 0; j < ncols_interleaved; j++) {
                    sumi1 = 0;
                    sumi2 = 0;
                    sumi3 = 0;
                    sumi4 = 0;
                    sumi = 0;
                    int offset = ((k / 2) % 2) + j * 2;
                    for (int i = 0; i < blocklen; ++i) {
                        const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 3);
                        const int v1 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 2) & 3);
                        const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4) & 3);
                        const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6) & 3);
                        sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i]);
                        sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 32]);
                        sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 64]);
                        sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 96]);

                        sumi1 = sumi1 * (scales_0[offset] & 0xF);
                        sumi2 = sumi2 * (scales_1[offset] & 0xF);
                        sumi3 = sumi3 * (scales_2[offset] & 0xF);
                        sumi4 = sumi4 * (scales_3[offset] & 0xF);
                        sumi += sumi1 + sumi2 + sumi3 + sumi4;
                    }
                    sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
                }
            }
            for (int sb = 0; sb < 8; sb++) {
                const uint8_t * mins = b_ptr[l].scales + sb * 16;
                for (int j = 0; j < ncols_interleaved; j++) {
                    sum_minf[j] += ((mins[j * 2] >> 4) * a_ptr[l].bsums[sb * 2] + (mins[(j * 2) + 1] >> 4) * a_ptr[l].bsums[sb * 2 + 1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
                }
            }
        }
        for (int j = 0; j < ncols_interleaved; j++) {
            s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
        }
    }
}
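
In q2_K, each packed scale byte holds the sub-block scale in its low nibble and the sub-block min in its high nibble, which is why the dot-product loop above masks with 0xF while the mins pass shifts by 4. A trivial standalone illustration:

// A q2_K scale byte splits into a 4-bit scale and a 4-bit min.
#include <cstdint>
#include <cstdio>

int main() {
    uint8_t sc = 0xA7;     // example packed byte
    int scale = sc & 0xF;  // -> 7, multiplies the quant sum
    int min   = sc >> 4;   // -> 10, weighted by bsums and dmin
    printf("scale=%d min=%d\n", scale, min);
    return 0;
}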

void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK8_0;
    const int nb = n / qk;
    const int ncols_interleaved = 4;
    const int blocklen = 4;

    assert(nr == 1);
    assert(n % qk == 0);
    assert(nc % ncols_interleaved == 0);

    UNUSED(bs);
    UNUSED(nr);

    float sumf[4];
    int sumi;

    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
    for (int x = 0; x < nc / ncols_interleaved; x++) {
        const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);

        for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
        for (int l = 0; l < nb; l++) {
            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                for (int j = 0; j < ncols_interleaved; j++) {
                    sumi = 0;
                    for (int i = 0; i < blocklen; ++i) {
                        const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
                        const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
                        sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
                    }
                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
                }
            }
        }
        for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
    }
}

void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK8_0;
    const int nb = n / qk;
    const int ncols_interleaved = 8;
    const int blocklen = 8;

    assert(nr == 1);
    assert(n % qk == 0);
    assert(nc % ncols_interleaved == 0);

    UNUSED(bs);
    UNUSED(nr);

    float sumf[8];
    int sumi;

    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
    for (int x = 0; x < nc / ncols_interleaved; x++) {
        const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb);

        for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
        for (int l = 0; l < nb; l++) {
            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                for (int j = 0; j < ncols_interleaved; j++) {
                    sumi = 0;
                    for (int i = 0; i < blocklen; ++i) {
                        const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
                        const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
                        sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
                    }
                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
                }
            }
        }
        for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
    }
}

void ggml_gemv_q8_0_4x4_q8_0_generic(int                        n,
                                     float * GGML_RESTRICT      s,
                                     size_t                     bs,
                                     const void * GGML_RESTRICT vx,
                                     const void * GGML_RESTRICT vy,
                                     int                        nr,
                                     int                        nc) {
    const int qk                = QK8_0;
    const int nb                = n / qk;
    const int ncols_interleaved = 4;
    const int blocklen          = 4;

    assert(nr == 1);
    assert(n % qk == 0);
    assert(nc % ncols_interleaved == 0);

    UNUSED(bs);
    UNUSED(nr);

    float sumf[4];
    int   sumi;

    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
    for (int x = 0; x < nc / ncols_interleaved; x++) {
        const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);

        for (int j = 0; j < ncols_interleaved; j++) {
            sumf[j] = 0.0;
        }
        for (int l = 0; l < nb; l++) {
            for (int k = 0; k < (qk / blocklen); k++) {
                for (int j = 0; j < ncols_interleaved; j++) {
                    sumi = 0;
                    for (int i = 0; i < blocklen; ++i) {
                        const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
                        sumi += v0 * a_ptr[l].qs[k * blocklen + i];
                    }
                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
                }
            }
        }
        for (int j = 0; j < ncols_interleaved; j++) {
            s[x * ncols_interleaved + j] = sumf[j];
        }
    }
}

void ggml_gemv_q8_0_4x8_q8_0_generic(int                        n,
                                     float * GGML_RESTRICT      s,
                                     size_t                     bs,
                                     const void * GGML_RESTRICT vx,
                                     const void * GGML_RESTRICT vy,
                                     int                        nr,
                                     int                        nc) {
    const int qk                = QK8_0;
    const int nb                = n / qk;
    const int ncols_interleaved = 4;
    const int blocklen          = 8;

    assert(nr == 1);
    assert(n % qk == 0);
    assert(nc % ncols_interleaved == 0);

    UNUSED(bs);
    UNUSED(nr);

    float sumf[4];
    int   sumi;

    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
    for (int x = 0; x < nc / ncols_interleaved; x++) {
        const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);

        for (int j = 0; j < ncols_interleaved; j++) {
            sumf[j] = 0.0;
        }
        for (int l = 0; l < nb; l++) {
            for (int k = 0; k < (qk / blocklen); k++) {
                for (int j = 0; j < ncols_interleaved; j++) {
                    sumi = 0;
                    for (int i = 0; i < blocklen; ++i) {
                        const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
                        sumi += v0 * a_ptr[l].qs[k * blocklen + i];
                    }
                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
                }
            }
        }
        for (int j = 0; j < ncols_interleaved; j++) {
            s[x * ncols_interleaved + j] = sumf[j];
        }
    }
}

void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK8_0;
    const int nb = n / qk;
    const int ncols_interleaved = 4;
    const int blocklen = 4;

    assert (n % qk == 0);
    assert (nr % 4 == 0);
    assert (nc % ncols_interleaved == 0);

    UNUSED(s);
    UNUSED(bs);
    UNUSED(vx);
    UNUSED(vy);
    UNUSED(nr);
    UNUSED(nc);
    UNUSED(nb);
    UNUSED(ncols_interleaved);
    UNUSED(blocklen);

    {
        float sumf[4][4];
        int sumi;

        for (int y = 0; y < nr / 4; y++) {
            const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
            for (int x = 0; x < nc / ncols_interleaved; x++) {
                const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
                for (int m = 0; m < 4; m++) {
                    for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
                }
                for (int l = 0; l < nb; l++) {
                    for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                        for (int m = 0; m < 4; m++) {
                            for (int j = 0; j < ncols_interleaved; j++) {
                                sumi = 0;
                                for (int i = 0; i < blocklen; ++i) {
                                    const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
                                    const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
                                    sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
                                            (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
                                }
                                sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
                            }
                        }
                    }
                }
                for (int m = 0; m < 4; m++) {
                    for (int j = 0; j < ncols_interleaved; j++)
                        s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
                }
            }
        }
    }
}
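
Each (y, x) iteration above computes a full 4 x ncols_interleaved output tile: element (m, j) goes to row y*4 + m, column x*ncols_interleaved + j, with bs floats per output row. A standalone sketch (illustration only) printing the flattened output index for one small case:

// Output addressing of the 4x4 tile, for bs = 8 and two column groups.
#include <cstdio>

int main() {
    const int bs = 8, ncols_interleaved = 4; // e.g. nc = 8 output columns
    for (int y = 0; y < 1; y++)
        for (int x = 0; x < 2; x++)
            for (int m = 0; m < 4; m++)
                for (int j = 0; j < ncols_interleaved; j++)
                    printf("tile(y=%d,x=%d)[%d][%d] -> s[%d]\n",
                           y, x, m, j, (y * 4 + m) * bs + x * ncols_interleaved + j);
    return 0;
}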

void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK8_0;
    const int nb = n / qk;
    const int ncols_interleaved = 4;
    const int blocklen = 8;

    assert (n % qk == 0);
    assert (nr % 4 == 0);
    assert (nc % ncols_interleaved == 0);

    UNUSED(s);
    UNUSED(bs);
    UNUSED(vx);
    UNUSED(vy);
    UNUSED(nr);
    UNUSED(nc);
    UNUSED(nb);
    UNUSED(ncols_interleaved);
    UNUSED(blocklen);

    float sumf[4][4];
    int sumi;

    for (int y = 0; y < nr / 4; y++) {
        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
        for (int x = 0; x < nc / ncols_interleaved; x++) {
            const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
            for (int m = 0; m < 4; m++) {
                for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
            }
            for (int l = 0; l < nb; l++) {
                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                    for (int m = 0; m < 4; m++) {
                        for (int j = 0; j < ncols_interleaved; j++) {
                            sumi = 0;
                            for (int i = 0; i < blocklen; ++i) {
                                const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
                                const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
                                sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
                                        (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
                            }
                            sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
                        }
                    }
                }
            }
            for (int m = 0; m < 4; m++) {
                for (int j = 0; j < ncols_interleaved; j++)
                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
            }
        }
    }
}

void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK8_0;
    const int nb = n / qk;
    const int ncols_interleaved = 8;
    const int blocklen = 8;

    assert (n % qk == 0);
    assert (nr % 4 == 0);
    assert (nc % ncols_interleaved == 0);

    UNUSED(s);
    UNUSED(bs);
    UNUSED(vx);
    UNUSED(vy);
    UNUSED(nr);
    UNUSED(nc);
    UNUSED(nb);
    UNUSED(ncols_interleaved);
    UNUSED(blocklen);

    float sumf[4][8];
    int sumi;

    for (int y = 0; y < nr / 4; y++) {
        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
        for (int x = 0; x < nc / ncols_interleaved; x++) {
            const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
            for (int m = 0; m < 4; m++) {
                for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
            }
            for (int l = 0; l < nb; l++) {
                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                    for (int m = 0; m < 4; m++) {
                        for (int j = 0; j < ncols_interleaved; j++) {
                            sumi = 0;
                            for (int i = 0; i < blocklen; ++i) {
                                const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
                                const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
                                sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
                                         (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
                            }
                            sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
                        }
                    }
                }
            }
            for (int m = 0; m < 4; m++) {
                for (int j = 0; j < ncols_interleaved; j++)
                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
            }
        }
    }
}

void ggml_gemm_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK_K;
    const int nb = n / qk;
    const int ncols_interleaved = 8;
    const int blocklen = 4;
    static const uint32_t kmask1 = 0x3f3f3f3f;
    static const uint32_t kmask2 = 0x0f0f0f0f;
    static const uint32_t kmask3 = 0x03030303;

    assert (n % qk == 0);
    assert (nr % 4 == 0);
    assert (nc % ncols_interleaved == 0);

    UNUSED(nb);
    UNUSED(ncols_interleaved);
    UNUSED(blocklen);

    float sumf[4][8];
    float sum_minf[4][8];
    uint32_t utmp[32];
    int sumi1;
    int sumi2;
    int sumi;

    for (int y = 0; y < nr / 4; y++) {
        const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
        for (int x = 0; x < nc / ncols_interleaved; x++) {
            const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
            for (int m = 0; m < 4; m++) {
                for (int j = 0; j < ncols_interleaved; j++) {
                    sumf[m][j] = 0.0;
                    sum_minf[m][j] = 0.0;
                }
            }
            for (int l = 0; l < nb; l++) {
                for (int sb = 0; sb < 8; sb++) {
                    memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
                    utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
                    const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
                    utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
                    utmp[sb * 4 + 2] = uaux_0;
                    utmp[sb * 4 + 0] &= kmask1;
                }
                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                    uint8_t * scales_0 = (uint8_t *) utmp + (k / 8) * 32;
                    uint8_t * scales_1 = (uint8_t *) utmp + (k / 8) * 32 + 16;
                    for (int m = 0; m < 4; m++) {
                        for (int j = 0; j < ncols_interleaved; j++) {
                            sumi1 = 0;
                            sumi2 = 0;
                            sumi = 0;
                            for (int i = 0; i < blocklen; ++i) {
                                const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
                                const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
                                sumi1 = (v0 * a_ptr[l].qs[(k / 8) * 256 + (k % 8) * 4 * blocklen + m * blocklen + i]);
                                sumi2 = (v1 * a_ptr[l].qs[(k / 8) * 256 + (k % 8) * 4 * blocklen + m * blocklen + i + 128]);
                                sumi1 = sumi1 * scales_0[j];
                                sumi2 = sumi2 * scales_1[j];
                                sumi += sumi1 + sumi2;
                            }
                            sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
                        }
                    }
                }
                for (int sb = 0; sb < 8; sb++) {
                    uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16;
                    for (int m = 0; m < 4; m++) {
                        const int16_t * bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
                        for (int j = 0; j < ncols_interleaved; j++) {
                            sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
                        }
                    }
                }
            }
            for (int m = 0; m < 4; m++) {
                for (int j = 0; j < ncols_interleaved; j++) {
                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
                }
            }
        }
    }
}

void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK_K;
    const int nb = n / qk;
    const int ncols_interleaved = 8;
    const int blocklen = 8;
    static const uint32_t kmask1 = 0x3f3f3f3f;
    static const uint32_t kmask2 = 0x0f0f0f0f;
    static const uint32_t kmask3 = 0x03030303;

    assert (n % qk == 0);
    assert (nr % 4 == 0);
    assert (nc % ncols_interleaved == 0);

    UNUSED(s);
    UNUSED(bs);
    UNUSED(vx);
    UNUSED(vy);
    UNUSED(nr);
    UNUSED(nc);
    UNUSED(nb);
    UNUSED(ncols_interleaved);
    UNUSED(blocklen);

    float sumf[4][8];
    float sum_minf[4][8];
    uint32_t utmp[32];
    int sumi1;
    int sumi2;
    int sumi;

    for (int y = 0; y < nr / 4; y++) {
        const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
        for (int x = 0; x < nc / ncols_interleaved; x++) {
            const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
            for (int m = 0; m < 4; m++) {
                for (int j = 0; j < ncols_interleaved; j++) {
                    sumf[m][j] = 0.0;
                    sum_minf[m][j] = 0.0;
                }
            }
            for (int l = 0; l < nb; l++) {
                for (int sb = 0; sb < 8; sb++) {
                    memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
                    utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
                    const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
                    utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
                    utmp[sb * 4 + 2] = uaux_0;
                    utmp[sb * 4 + 0] &= kmask1;
                }
                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                    uint8_t * scales_0 = (uint8_t *) utmp + (k / 4) * 32;
                    uint8_t * scales_1 = (uint8_t *) utmp + (k / 4) * 32 + 16;
                    for (int m = 0; m < 4; m++) {
                        for (int j = 0; j < ncols_interleaved; j++) {
                            sumi1 = 0;
                            sumi2 = 0;
                            sumi = 0;
                            for (int i = 0; i < blocklen; ++i) {
                                const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
                                const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
                                sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 256 + (k % 4) * 4 * blocklen + m * blocklen + i]);
                                sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 256 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]);
                                sumi1 = sumi1 * scales_0[j];
                                sumi2 = sumi2 * scales_1[j];
                                sumi += sumi1 + sumi2;
                            }
                            sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
                        }
                    }
                }
                for (int sb = 0; sb < 8; sb++) {
                    uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16;
                    for (int m = 0; m < 4; m++) {
                        const int16_t * bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
                        for (int j = 0; j < ncols_interleaved; j++) {
                            sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
                        }
                    }
                }
            }
            for (int m = 0; m < 4; m++) {
                for (int j = 0; j < ncols_interleaved; j++) {
                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
                }
            }
        }
    }
}

void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK_K;
    const int nb = n / qk;
    const int ncols_interleaved = 8;
    const int blocklen = 8;

    assert (n % qk == 0);
    assert (nr % 4 == 0);
    assert (nc % ncols_interleaved == 0);

    UNUSED(s);
    UNUSED(bs);
    UNUSED(vx);
    UNUSED(vy);
    UNUSED(nr);
    UNUSED(nc);
    UNUSED(nb);
    UNUSED(ncols_interleaved);
    UNUSED(blocklen);

    float sumf[4][8];
    float sum_minf[4][8];
    int sumi1, sumi2, sumi3, sumi4;
    int sumi;

    for (int y = 0; y < nr / 4; y++) {
        const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
        for (int x = 0; x < nc / ncols_interleaved; x++) {
            const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb);
            for (int m = 0; m < 4; m++) {
                for (int j = 0; j < ncols_interleaved; j++) {
                    sumf[m][j] = 0.0;
                    sum_minf[m][j] = 0.0;
                }
            }
            for (int l = 0; l < nb; l++) {
                for (int k = 0; k < (qk / (4 * blocklen)); k++) {

                    const uint8_t * scales_0 = b_ptr[l].scales + (k / 4) * 64;
                    const uint8_t * scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
                    const uint8_t * scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
                    const uint8_t * scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
                    for (int m = 0; m < 4; m++) {
                        for (int j = 0; j < ncols_interleaved; j++) {
                            sumi1 = 0;
                            sumi2 = 0;
                            sumi3 = 0;
                            sumi4 = 0;
                            sumi = 0;
                            int offset = ((k / 2) % 2) + j * 2;
                            for (int i = 0; i < blocklen; ++i) {
                                const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 3);
                                const int v1 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 2) & 3);
                                const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4) & 3);
                                const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6) & 3);
                                sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i]);
                                sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]);
                                sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 256]);
                                sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 384]);
                                sumi1 = sumi1 * (scales_0[offset] & 0xF);
                                sumi2 = sumi2 * (scales_1[offset] & 0xF);
                                sumi3 = sumi3 * (scales_2[offset] & 0xF);
                                sumi4 = sumi4 * (scales_3[offset] & 0xF);
                                sumi += sumi1 + sumi2 + sumi3 + sumi4;
                            }
                            sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
                        }
                    }
                }
                for (int sb = 0; sb < 8; sb++) {
                    const uint8_t * mins = b_ptr[l].scales + sb * 16;
                    for (int m = 0; m < 4; m++) {
                        const int16_t * bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
                        for (int j = 0; j < ncols_interleaved; j++) {
                            int mins_prod = ((mins[j * 2] >> 4) * bsums[0] + (mins[(j * 2) + 1] >> 4) * bsums[1]);
                            sum_minf[m][j] += (mins_prod) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
                        }
                    }
                }
            }

            for (int m = 0; m < 4; m++) {
                for (int j = 0; j < ncols_interleaved; j++) {
                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
                }
            }
        }
    }
}
1214
1215
1216
0
void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1217
0
    const int qk = QK8_0;
1218
0
    const int nb = n / qk;
1219
0
    const int ncols_interleaved = 4;
1220
0
    const int blocklen = 4;
1221
1222
0
    assert (n % qk == 0);
1223
0
    assert (nr % 4 == 0);
1224
0
    assert (nc % ncols_interleaved == 0);
1225
1226
0
    UNUSED(s);
1227
0
    UNUSED(bs);
1228
0
    UNUSED(vx);
1229
0
    UNUSED(vy);
1230
0
    UNUSED(nr);
1231
0
    UNUSED(nc);
1232
0
    UNUSED(nb);
1233
0
    UNUSED(ncols_interleaved);
1234
0
    UNUSED(blocklen);
1235
1236
0
    {
1237
0
        float sumf[4][4];
1238
0
        int sumi;
1239
1240
0
        for (int y = 0; y < nr / 4; y++) {
1241
0
            const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
1242
0
            for (int x = 0; x < nc / ncols_interleaved; x++) {
1243
0
                const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
1244
0
                for (int m = 0; m < 4; m++) {
1245
0
                    for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
1246
0
                }
1247
0
                for (int l = 0; l < nb; l++) {
1248
0
                    for (int k = 0; k < (qk / (2 * blocklen)); k++) {
1249
0
                        for (int m = 0; m < 4; m++) {
1250
0
                            for (int j = 0; j < ncols_interleaved; j++) {
1251
0
                                sumi = 0;
1252
0
                                for (int i = 0; i < blocklen; ++i) {
1253
0
                                    const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
1254
0
                                    const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
1255
0
                                    sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
1256
0
                                            (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
1257
0
                                }
1258
0
                                sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
1259
0
                            }
1260
0
                        }
1261
0
                    }
1262
0
                }
1263
0
                for (int m = 0; m < 4; m++) {
1264
0
                    for (int j = 0; j < ncols_interleaved; j++)
1265
0
                        s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
1266
0
                }
1267
0
            }
1268
0
        }
1269
0
    }
1270
0
}
1271
1272
0
void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1273
0
    const int qk = QK8_0;
1274
0
    const int nb = n / qk;
1275
0
    const int ncols_interleaved = 8;
1276
0
    const int blocklen = 8;
1277
1278
0
    assert(n % qk == 0);
1279
0
    assert(nr % 4 == 0);
1280
0
    assert(nc % ncols_interleaved == 0);
1281
1282
0
    float sumf[4][8];
1283
0
    int sumi;
1284
1285
0
    for (int y = 0; y < nr / 4; y++) {
1286
0
        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
1287
0
        for (int x = 0; x < nc / ncols_interleaved; x++) {
1288
0
            const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb);
1289
0
            for (int m = 0; m < 4; m++) {
1290
0
                for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
1291
0
            }
1292
0
            for (int l = 0; l < nb; l++) {
1293
0
                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
1294
0
                    for (int m = 0; m < 4; m++) {
1295
0
                        for (int j = 0; j < ncols_interleaved; j++) {
1296
0
                            sumi = 0;
1297
0
                            for (int i = 0; i < blocklen; ++i) {
1298
0
                                const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
1299
0
                                const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
1300
0
                                sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
1301
0
                                         (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
1302
0
                            }
1303
0
                            sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
1304
0
                        }
1305
0
                    }
1306
0
                }
1307
0
            }
1308
0
            for (int m = 0; m < 4; m++) {
1309
0
                for (int j = 0; j < ncols_interleaved; j++)
1310
0
                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
1311
0
            }
1312
0
        }
1313
0
    }
1314
0
}
1315
1316
void ggml_gemm_q8_0_4x4_q8_0_generic(int                        n,
1317
                                     float * GGML_RESTRICT      s,
1318
                                     size_t                     bs,
1319
                                     const void * GGML_RESTRICT vx,
1320
                                     const void * GGML_RESTRICT vy,
1321
                                     int                        nr,
1322
0
                                     int                        nc) {
1323
0
    const int qk                = QK8_0;
1324
0
    const int nb                = n / qk;
1325
0
    const int ncols_interleaved = 4;
1326
0
    const int blocklen          = 4;
1327
1328
0
    assert(n % qk == 0);
1329
0
    assert(nr % 4 == 0);
1330
0
    assert(nc % ncols_interleaved == 0);
1331
1332
0
    float sumf[4][4];
1333
0
    int   sumi;
1334
1335
0
    for (int y = 0; y < nr / 4; y++) {
1336
0
        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
1337
0
        for (int x = 0; x < nc / ncols_interleaved; x++) {
1338
0
            const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
1339
0
            for (int m = 0; m < 4; m++) {
1340
0
                for (int j = 0; j < ncols_interleaved; j++) {
1341
0
                    sumf[m][j] = 0.0;
1342
0
                }
1343
0
            }
1344
0
            for (int l = 0; l < nb; l++) {
1345
0
                for (int k = 0; k < (qk / blocklen); k++) {
1346
0
                    for (int m = 0; m < 4; m++) {
1347
0
                        for (int j = 0; j < ncols_interleaved; j++) {
1348
0
                            sumi = 0;
1349
0
                            for (int i = 0; i < blocklen; ++i) {
1350
0
                                const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
1351
0
                                sumi += v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i];
1352
0
                            }
1353
0
                            sumf[m][j] +=
1354
0
                                sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
1355
0
                        }
1356
0
                    }
1357
0
                }
1358
0
            }
1359
0
            for (int m = 0; m < 4; m++) {
1360
0
                for (int j = 0; j < ncols_interleaved; j++) {
1361
0
                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
1362
0
                }
1363
0
            }
1364
0
        }
1365
0
    }
1366
0
}
1367
1368
void ggml_gemm_q8_0_4x8_q8_0_generic(int                        n,
1369
                                     float * GGML_RESTRICT      s,
1370
                                     size_t                     bs,
1371
                                     const void * GGML_RESTRICT vx,
1372
                                     const void * GGML_RESTRICT vy,
1373
                                     int                        nr,
1374
0
                                     int                        nc) {
1375
0
    const int qk                = QK8_0;
1376
0
    const int nb                = n / qk;
1377
0
    const int ncols_interleaved = 4;
1378
0
    const int blocklen          = 8;
1379
1380
0
    assert(n % qk == 0);
1381
0
    assert(nr % 4 == 0);
1382
0
    assert(nc % ncols_interleaved == 0);
1383
1384
0
    float sumf[4][4];
1385
0
    int   sumi;
1386
1387
0
    for (int y = 0; y < nr / 4; y++) {
1388
0
        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
1389
0
        for (int x = 0; x < nc / ncols_interleaved; x++) {
1390
0
            const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
1391
0
            for (int m = 0; m < 4; m++) {
1392
0
                for (int j = 0; j < ncols_interleaved; j++) {
1393
0
                    sumf[m][j] = 0.0;
1394
0
                }
1395
0
            }
1396
0
            for (int l = 0; l < nb; l++) {
1397
0
                for (int k = 0; k < (qk / blocklen); k++) {
1398
0
                    for (int m = 0; m < 4; m++) {
1399
0
                        for (int j = 0; j < ncols_interleaved; j++) {
1400
0
                            sumi = 0;
1401
0
                            for (int i = 0; i < blocklen; ++i) {
1402
0
                                const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
1403
0
                                sumi += v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i];
1404
0
                            }
1405
0
                            sumf[m][j] +=
1406
0
                                sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
1407
0
                        }
1408
0
                    }
1409
0
                }
1410
0
            }
1411
0
            for (int m = 0; m < 4; m++) {
1412
0
                for (int j = 0; j < ncols_interleaved; j++) {
1413
0
                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
1414
0
                }
1415
0
            }
1416
0
        }
1417
0
    }
1418
0
}
1419
1420
} // extern "C"
1421
1422
0
static block_q8_0x4 make_block_q8_0x4(block_q8_0 * in, unsigned int blck_size_interleave) {
1423
0
    block_q8_0x4 out;
1424
1425
0
    for (int i = 0; i < 4; i++) {
1426
0
        out.d[i] = in[i].d;
1427
0
    }
1428
1429
0
    const int end = QK8_0 * 4 / blck_size_interleave;
1430
0
    for (int i = 0; i < end; ++i) {
1431
0
        int src_id     = i % 4;
1432
0
        int src_offset = (i / 4) * blck_size_interleave;
1433
0
        int dst_offset = i * blck_size_interleave;
1434
0
        memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], blck_size_interleave);
1435
0
    }
1436
0
    return out;
1437
0
}
1438
1439
0
static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
1440
0
    block_q4_0x4 out;
1441
1442
0
    for (int i = 0; i < 4; i++) {
1443
0
        out.d[i] = in[i].d;
1444
0
    }
1445
1446
0
    const int end = QK4_0 * 2 / blck_size_interleave;
1447
1448
0
    if (blck_size_interleave == 8) {
1449
0
        const uint64_t xor_mask = 0x8888888888888888ULL;
1450
0
        for (int i = 0; i < end; ++i) {
1451
0
            int src_id = i % 4;
1452
0
            int src_offset = (i / 4) * blck_size_interleave;
1453
0
            int dst_offset = i * blck_size_interleave;
1454
1455
0
            uint64_t elems;
1456
            // Using memcpy to avoid unaligned memory accesses
1457
0
            memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
1458
0
            elems ^= xor_mask;
1459
0
            memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
1460
0
        }
1461
0
    } else if (blck_size_interleave == 4) {
1462
0
        const uint32_t xor_mask = 0x88888888;
1463
0
        for (int i = 0; i < end; ++i) {
1464
0
            int src_id = i % 4;
1465
0
            int src_offset = (i / 4) * blck_size_interleave;
1466
0
            int dst_offset = i * blck_size_interleave;
1467
1468
0
            uint32_t elems;
1469
0
            memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint32_t));
1470
0
            elems ^= xor_mask;
1471
0
            memcpy(&out.qs[dst_offset], &elems, sizeof(uint32_t));
1472
0
        }
1473
0
    } else {
1474
0
        GGML_ASSERT(false);
1475
0
    }
1476
1477
0
    return out;
1478
0
}
1479
1480
// interleave 8 block_q4_0s in blocks of blck_size_interleave
1481
// returns an interleaved block_q4_0x8
1482
// in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks
1483
// first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave
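// Illustration of the mapping below with blck_size_interleave = 8 (QK4_0 = 32, so end = 16):
//   i = 0..7  copy bytes 0..7  of rows 0..7 into out.qs[0..63]
//   i = 8..15 copy bytes 8..15 of rows 0..7 into out.qs[64..127]
// Each 8-byte chunk is XORed with the mask to move the nibbles from bias-offset form to sign form.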
1484
0
static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave) {
1485
0
    block_q4_0x8 out;
1486
1487
0
    for (int i = 0; i < 8; i++) {
1488
0
        out.d[i] = in[i].d;
1489
0
    }
1490
1491
0
    const int end = QK4_0 * 4 / blck_size_interleave;
1492
0
    const uint64_t xor_mask = 0x8888888888888888ULL;
1493
1494
0
    for (int i = 0; i < end; ++i) {
1495
0
        int src_id = i % 8;
1496
0
        int src_offset = (i / 8) * blck_size_interleave;
1497
0
        int dst_offset = i * blck_size_interleave;
1498
1499
0
        uint64_t elems;
1500
0
        memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
1501
0
        elems ^= xor_mask;
1502
0
        memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
1503
0
    }
1504
1505
0
    return out;
1506
0
}
1507
1508
0
static block_q4_Kx8 make_block_q4_Kx8(block_q4_K * in, unsigned int blck_size_interleave) {
1509
0
    block_q4_Kx8 out;
1510
    // Delta(scale) and dmin values of the eight Q4_K structures are copied onto the output interleaved structure
1511
0
    for (int i = 0; i < 8; i++) {
1512
0
        out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
1513
0
    }
1514
1515
0
    for (int i = 0; i < 8; i++) {
1516
0
        out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin;
1517
0
    }
1518
1519
0
    const int end = QK_K * 4 / blck_size_interleave;
1520
1521
    // Interleave Q4_K quants by taking 8 bytes at a time
1522
0
    for (int i = 0; i < end; ++i) {
1523
0
        int src_id = i % 8;
1524
0
        int src_offset = (i / 8) * blck_size_interleave;
1525
0
        int dst_offset = i * blck_size_interleave;
1526
1527
0
        uint64_t elems;
1528
0
        memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
1529
0
        memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
1530
0
    }
1531
1532
    // The logic below unpacks and rearranges the scale and min values in Q4_K
1533
    // Currently the Q4_K structure has 8 scales and 8 mins packed in 12 bytes (6 bits for each value)
1534
    // The output Q4_Kx8 structure has 96 bytes
1535
    // Every 12-byte group is packed such that it contains the scales and mins for the corresponding sub-blocks from the Q4_K structures
1536
    // For example, the first 12 bytes contain 8 scales and 8 mins, one for the first sub-block of each of the Q4_K structures
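    // Concretely, for sub-block index i the 12 output bytes are laid out as:
    //   bytes 0..3  : low 6 bits of s[0..3], plus the top 2 bits of s[4..7] in bits 6..7
    //   bytes 4..7  : low 6 bits of m[0..3], plus the top 2 bits of m[4..7] in bits 6..7
    //   bytes 8..11 : low 4 bits of s[4..7], plus the low 4 bits of m[4..7] in bits 4..7
    // i.e. the usual K-quant 6-bit scale packing, repeated across the 8 interleaved blocks.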
1537
0
    uint8_t s[8], m[8];
1538
1539
0
    for (int i = 0; i < 4; i++) {
1540
0
        for (int j = 0; j < 8; j++) {
1541
0
            s[j] = in[j].scales[i] & 63;
1542
0
            m[j] = in[j].scales[i + 4] & 63;
1543
0
        }
1544
1545
0
        out.scales[i * 12]      = (s[0] & 63) + ((s[4] & 48) << 2);
1546
0
        out.scales[i * 12 + 1]  = (s[1] & 63) + ((s[5] & 48) << 2);
1547
0
        out.scales[i * 12 + 2]  = (s[2] & 63) + ((s[6] & 48) << 2);
1548
0
        out.scales[i * 12 + 3]  = (s[3] & 63) + ((s[7] & 48) << 2);
1549
0
        out.scales[i * 12 + 4]  = (m[0] & 63) + ((m[4] & 48) << 2);
1550
0
        out.scales[i * 12 + 5]  = (m[1] & 63) + ((m[5] & 48) << 2);
1551
0
        out.scales[i * 12 + 6]  = (m[2] & 63) + ((m[6] & 48) << 2);
1552
0
        out.scales[i * 12 + 7]  = (m[3] & 63) + ((m[7] & 48) << 2);
1553
0
        out.scales[i * 12 + 8]  = (s[4] & 15) + ((m[4] & 15) << 4);
1554
0
        out.scales[i * 12 + 9]  = (s[5] & 15) + ((m[5] & 15) << 4);
1555
0
        out.scales[i * 12 + 10] = (s[6] & 15) + ((m[6] & 15) << 4);
1556
0
        out.scales[i * 12 + 11] = (s[7] & 15) + ((m[7] & 15) << 4);
1557
1558
0
    }
1559
1560
0
    for (int i = 0; i < 4; i++) {
1561
0
        for (int j = 0; j < 8; j++) {
1562
0
            s[j] = ((in[j].scales[i] & 192) >> 2) | (in[j].scales[i+8] & 15);
1563
0
            m[j] = ((in[j].scales[i + 4] & 192) >> 2) | ((in[j].scales[i+8] & 240) >> 4);
1564
0
        }
1565
1566
0
        out.scales[i * 12 + 48] = (s[0] & 63) + ((s[4] & 48) << 2);
1567
0
        out.scales[i * 12 + 49] = (s[1] & 63) + ((s[5] & 48) << 2);
1568
0
        out.scales[i * 12 + 50] = (s[2] & 63) + ((s[6] & 48) << 2);
1569
0
        out.scales[i * 12 + 51] = (s[3] & 63) + ((s[7] & 48) << 2);
1570
0
        out.scales[i * 12 + 52] = (m[0] & 63) + ((m[4] & 48) << 2);
1571
0
        out.scales[i * 12 + 53] = (m[1] & 63) + ((m[5] & 48) << 2);
1572
0
        out.scales[i * 12 + 54] = (m[2] & 63) + ((m[6] & 48) << 2);
1573
0
        out.scales[i * 12 + 55] = (m[3] & 63) + ((m[7] & 48) << 2);
1574
0
        out.scales[i * 12 + 56] = (s[4] & 15) + ((m[4] & 15) << 4);
1575
0
        out.scales[i * 12 + 57] = (s[5] & 15) + ((m[5] & 15) << 4);
1576
0
        out.scales[i * 12 + 58] = (s[6] & 15) + ((m[6] & 15) << 4);
1577
0
        out.scales[i * 12 + 59] = (s[7] & 15) + ((m[7] & 15) << 4);
1578
1579
0
    }
1580
1581
0
    return out;
1582
0
}
1583
1584
0
static block_q2_Kx8 make_block_q2_Kx8(block_q2_K * in, unsigned int blck_size_interleave) {
1585
0
    block_q2_Kx8 out;
1586
1587
    // Delta(scale) and dmin values of the eight Q2_K structures are copied onto the output interleaved structure
1588
0
    for (int i = 0; i < 8; i++) {
1589
0
        out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
1590
0
    }
1591
1592
0
    for (int i = 0; i < 8; i++) {
1593
0
        out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin;
1594
0
    }
1595
1596
0
    const int end = QK_K * 2 / blck_size_interleave;
1597
1598
    // Interleave Q2_K quants by taking 8 bytes at a time
1599
0
    for (int i = 0; i < end; ++i) {
1600
0
        int src_id = i % 8;
1601
0
        int src_offset = (i / 8) * blck_size_interleave;
1602
0
        int dst_offset = i * blck_size_interleave;
1603
1604
0
        uint64_t elems;
1605
0
        memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
1606
0
        memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
1607
0
    }
1608
1609
    // The logic below unpacks and rearranges the scale and min values in Q2_K
1610
    // Currently the Q2_K structure has 16 scales and 16 mins packed in 16 bytes (4 bits for each value)
1611
    // The output Q2_Kx8 structure has 128 bytes for storing scales and mins
1612
    // Every 16-byte group is packed such that it contains the scales and mins for the corresponding sub-blocks from the Q2_K structures
1613
    // For example, the first 16 bytes contain the scale/min bytes of the first and second sub-blocks of each of the eight Q2_K structures
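    // Concretely: out.scales[0] = in[0].scales[0], out.scales[1] = in[0].scales[1],
    //             out.scales[2] = in[1].scales[0], ..., out.scales[15] = in[7].scales[1];
    // out.scales[16..31] repeat the pattern with scale bytes 2 and 3, and so on,
    // so each 16-byte group holds two consecutive sub-blocks from all 8 structures.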
1614
1615
0
    for(int i = 0; i < 128; i++){
1616
1617
        // Index for selecting which Q2_K super block
1618
0
        int src1 = (i % 16) / 2;
1619
        // Index for selecting scale
1620
0
        int src2 = ((i / 16) * 2) + (i % 2);
1621
1622
0
        out.scales[i] = in[src1].scales[src2];
1623
0
    }
1624
0
    return out;
1625
1626
0
}
1627
1628
0
static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
1629
0
    GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
1630
0
    GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
1631
0
    constexpr int nrows_interleaved = 4;
1632
1633
0
    block_q4_0x4 * dst = (block_q4_0x4 *)t->data;
1634
0
    const block_q4_0 * src = (const block_q4_0 *)data;
1635
0
    block_q4_0 dst_tmp[4];
1636
0
    int nrow = ggml_nrows(t);
1637
0
    int nblocks = t->ne[0] / QK4_0;
1638
1639
0
    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
1640
1641
0
    if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
1642
0
        return -1;
1643
0
    }
1644
1645
0
    for (int b = 0; b < nrow; b += nrows_interleaved) {
1646
0
        for (int64_t x = 0; x < nblocks; x++) {
1647
0
            for (int i = 0; i < nrows_interleaved; i++) {
1648
0
                dst_tmp[i] = src[x + i * nblocks];
1649
0
            }
1650
0
            *dst++ = make_block_q4_0x4(dst_tmp, interleave_block);
1651
0
        }
1652
0
        src += nrows_interleaved * nblocks;
1653
0
    }
1654
0
    return 0;
1655
1656
0
    GGML_UNUSED(data_size);
1657
0
}
1658
1659
0
static int repack_q4_K_to_q4_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
1660
0
    GGML_ASSERT(t->type == GGML_TYPE_Q4_K);
1661
0
    GGML_ASSERT(interleave_block == 8 || interleave_block == 4);
1662
0
    constexpr int nrows_interleaved = 8;
1663
1664
0
    block_q4_Kx8 * dst = (block_q4_Kx8*)t->data;
1665
0
    const block_q4_K * src = (const block_q4_K*) data;
1666
0
    block_q4_K dst_tmp[8];
1667
0
    int nrow = ggml_nrows(t);
1668
0
    int nblocks = t->ne[0] / QK_K;
1669
1670
0
    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_K));
1671
1672
0
    if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
1673
0
        return -1;
1674
0
    }
1675
1676
0
    for (int b = 0; b < nrow; b += nrows_interleaved) {
1677
0
        for (int64_t x = 0; x < nblocks; x++) {
1678
0
            for (int i  = 0; i < nrows_interleaved; i++ ) {
1679
0
                dst_tmp[i] = src[x + i * nblocks];
1680
0
            }
1681
0
            *dst++ = make_block_q4_Kx8(dst_tmp, interleave_block);
1682
0
        }
1683
0
        src += nrows_interleaved * nblocks;
1684
0
    }
1685
0
    return 0;
1686
1687
0
    GGML_UNUSED(data_size);
1688
0
}
1689
1690
0
static int repack_q2_K_to_q2_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
1691
0
    GGML_ASSERT(t->type == GGML_TYPE_Q2_K);
1692
0
    GGML_ASSERT(interleave_block == 8);
1693
0
    constexpr int nrows_interleaved = 8;
1694
1695
0
    block_q2_Kx8 * dst = (block_q2_Kx8*)t->data;
1696
0
    const block_q2_K * src = (const block_q2_K*) data;
1697
0
    block_q2_K dst_tmp[8];
1698
0
    int nrow = ggml_nrows(t);
1699
0
    int nblocks = t->ne[0] / QK_K;
1700
1701
0
    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q2_K));
1702
1703
0
    if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
1704
0
        return -1;
1705
0
    }
1706
1707
0
    for (int b = 0; b < nrow; b += nrows_interleaved) {
1708
0
        for (int64_t x = 0; x < nblocks; x++) {
1709
0
            for (int i  = 0; i < nrows_interleaved; i++ ) {
1710
0
                dst_tmp[i] = src[x + i * nblocks];
1711
0
            }
1712
0
            *dst++ = make_block_q2_Kx8(dst_tmp, interleave_block);
1713
0
        }
1714
0
        src += nrows_interleaved * nblocks;
1715
0
    }
1716
0
    return 0;
1717
1718
0
    GGML_UNUSED(data_size);
1719
0
}
1720
1721
0
static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
1722
0
    GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
1723
0
    GGML_ASSERT(interleave_block == 8);
1724
0
    constexpr int nrows_interleaved = 8;
1725
1726
0
    block_q4_0x8 * dst = (block_q4_0x8*)t->data;
1727
0
    const block_q4_0 * src = (const block_q4_0*) data;
1728
0
    block_q4_0 dst_tmp[8];
1729
0
    int nrow = ggml_nrows(t);
1730
0
    int nblocks = t->ne[0] / QK4_0;
1731
1732
0
    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
1733
1734
0
    if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
1735
0
        return -1;
1736
0
    }
1737
1738
0
    for (int b = 0; b < nrow; b += nrows_interleaved) {
1739
0
        for (int64_t x = 0; x < nblocks; x++) {
1740
0
            for (int i  = 0; i < nrows_interleaved; i++ ) {
1741
0
                dst_tmp[i] = src[x + i * nblocks];
1742
0
            }
1743
0
            *dst++ = make_block_q4_0x8(dst_tmp, interleave_block);
1744
0
        }
1745
0
        src += nrows_interleaved * nblocks;
1746
0
    }
1747
0
    return 0;
1748
1749
0
    GGML_UNUSED(data_size);
1750
0
}
1751
1752
static int repack_q8_0_to_q8_0_4_bl(struct ggml_tensor *       t,
1753
                                    int                        interleave_block,
1754
                                    const void * GGML_RESTRICT data,
1755
0
                                    size_t                     data_size) {
1756
0
    GGML_ASSERT(t->type == GGML_TYPE_Q8_0);
1757
0
    GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
1758
0
    constexpr int nrows_interleaved = 4;
1759
1760
0
    block_q8_0x4 *     dst = (block_q8_0x4 *) t->data;
1761
0
    const block_q8_0 * src = (const block_q8_0 *) data;
1762
0
    block_q8_0         dst_tmp[4];
1763
0
    int                nrow    = ggml_nrows(t);
1764
0
    int                nblocks = t->ne[0] / QK8_0;
1765
1766
0
    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q8_0));
1767
1768
0
    if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
1769
0
        return -1;
1770
0
    }
1771
1772
0
    for (int b = 0; b < nrow; b += nrows_interleaved) {
1773
0
        for (int64_t x = 0; x < nblocks; x++) {
1774
0
            for (int i = 0; i < nrows_interleaved; i++) {
1775
0
                dst_tmp[i] = src[x + i * nblocks];
1776
0
            }
1777
0
            *dst++ = make_block_q8_0x4(dst_tmp, interleave_block);
1778
0
        }
1779
0
        src += nrows_interleaved * nblocks;
1780
0
    }
1781
0
    return 0;
1782
0
}
1783
1784
0
static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_size_interleave) {
1785
0
    block_iq4_nlx4 out;
1786
1787
0
    for (int i = 0; i < 4; i++) {
1788
0
        out.d[i] = in[i].d;
1789
0
    }
1790
1791
0
    const int end = QK4_NL * 2 / blck_size_interleave;
1792
1793
    // TODO: this branch seems wrong
1794
    //if (blck_size_interleave == 8) {
1795
    //    for (int i = 0; i < end; ++i) {
1796
    //        int src_id = i % 4;
1797
    //        int src_offset = (i / 4) * blck_size_interleave;
1798
    //        int dst_offset = i * blck_size_interleave;
1799
1800
    //        // Using memcpy to avoid unaligned memory accesses
1801
    //        memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t));
1802
    //    }
1803
    //} else
1804
0
    if (blck_size_interleave == 4) {
1805
0
        for (int i = 0; i < end; ++i) {
1806
0
            int src_id = i % 4;
1807
0
            int src_offset = (i / 4) * blck_size_interleave;
1808
0
            int dst_offset = i * blck_size_interleave;
1809
1810
0
            memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint32_t));
1811
0
        }
1812
0
    } else {
1813
0
        GGML_ASSERT(false);
1814
0
    }
1815
1816
0
    return out;
1817
0
}
1818
1819
0
static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
1820
0
    GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
1821
0
    GGML_ASSERT(interleave_block == 4);
1822
1823
0
    const block_iq4_nl   * src = (const block_iq4_nl   *)data;
1824
0
          block_iq4_nlx4 * dst = (      block_iq4_nlx4 *)t->data;
1825
1826
0
    block_iq4_nl dst_tmp[4];
1827
1828
0
    int nrow = ggml_nrows(t);
1829
0
    int nrows_interleaved = 4;
1830
0
    int nblocks = t->ne[0] / QK4_NL;
1831
1832
0
    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
1833
1834
0
    if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
1835
0
        return -1;
1836
0
    }
1837
1838
0
    for (int b = 0; b < nrow; b += nrows_interleaved) {
1839
0
        for (int64_t x = 0; x < nblocks; x++) {
1840
0
            for (int i = 0; i < nrows_interleaved; i++) {
1841
0
                dst_tmp[i] = src[x + i * nblocks];
1842
0
            }
1843
0
            *dst++ = make_block_iq4_nlx4(dst_tmp, interleave_block);
1844
0
        }
1845
0
        src += nrows_interleaved * nblocks;
1846
0
    }
1847
0
    return 0;
1848
1849
0
    GGML_UNUSED(data_size);
1850
0
}
1851
1852
0
static block_iq4_nlx8 make_block_iq4_nlx8(block_iq4_nl * in, unsigned int blck_size_interleave) {
1853
0
    block_iq4_nlx8 out;
1854
1855
0
    for (int i = 0; i < 8; i++) {
1856
0
        out.d[i] = in[i].d;
1857
0
    }
1858
1859
0
    const int end = QK4_NL * 4 / blck_size_interleave;
1860
1861
0
    if (blck_size_interleave == 8) {
1862
0
        for (int i = 0; i < end; ++i) {
1863
0
            int src_id = i % 8;
1864
0
            int src_offset = (i / 8) * blck_size_interleave;
1865
0
            int dst_offset = i * blck_size_interleave;
1866
1867
0
            memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t));
1868
0
        }
1869
0
    } else {
1870
0
        GGML_ASSERT(false);
1871
0
    }
1872
1873
0
    return out;
1874
0
}
1875
1876
0
static int repack_iq4_nl_to_iq4_nl_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
1877
0
    GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
1878
0
    GGML_ASSERT(interleave_block == 8);
1879
1880
0
    const block_iq4_nl   * src = (const block_iq4_nl   *)data;
1881
0
          block_iq4_nlx8 * dst = (      block_iq4_nlx8 *)t->data;
1882
1883
0
    block_iq4_nl dst_tmp[8];
1884
1885
0
    int nrow = ggml_nrows(t);
1886
0
    int nrows_interleaved = 8;
1887
0
    int nblocks = t->ne[0] / QK4_NL;
1888
1889
0
    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
1890
1891
0
    if (t->ne[1] % nrows_interleaved != 0) {
1892
0
        return -1;
1893
0
    }
1894
1895
0
    for (int b = 0; b < nrow; b += nrows_interleaved) {
1896
0
        for (int64_t x = 0; x < nblocks; x++) {
1897
0
            for (int i = 0; i < nrows_interleaved; i++) {
1898
0
                dst_tmp[i] = src[x + i * nblocks];
1899
0
            }
1900
0
            *dst++ = make_block_iq4_nlx8(dst_tmp, interleave_block);
1901
0
        }
1902
0
        src += nrows_interleaved * nblocks;
1903
0
    }
1904
0
    return 0;
1905
1906
0
    GGML_UNUSED(data_size);
1907
0
}
1908
1909
namespace ggml::cpu::repack {
1910
// repack
1911
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
1912
int repack(struct ggml_tensor *, const void *, size_t);
1913
1914
// TODO: generalise.
1915
0
template <> int repack<block_q4_0, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
1916
0
    return repack_q4_0_to_q4_0_4_bl(t, 4, data, data_size);
1917
0
}
1918
1919
0
template <> int repack<block_q4_0, 8, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
1920
0
    return repack_q4_0_to_q4_0_4_bl(t, 8, data, data_size);
1921
0
}
1922
1923
0
template <> int repack<block_q4_0, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
1924
0
    return repack_q4_0_to_q4_0_8_bl(t, 8, data, data_size);
1925
0
}
1926
1927
0
template <> int repack<block_q4_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
1928
0
    return repack_q4_K_to_q4_K_8_bl(t, 8, data, data_size);
1929
0
}
1930
1931
0
template <> int repack<block_q4_K, 4, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
1932
0
    return repack_q4_K_to_q4_K_8_bl(t, 4, data, data_size);
1933
0
}
1934
1935
0
template <> int repack<block_q2_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
1936
0
    return repack_q2_K_to_q2_K_8_bl(t, 8, data, data_size);
1937
0
}
1938
1939
0
template <> int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
1940
0
    return repack_iq4_nl_to_iq4_nl_4_bl(t, 4, data, data_size);
1941
0
}
1942
1943
// TODO: needs to be revisited
1944
//template <> int repack<block_iq4_nl, 8, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
1945
//    return repack_iq4_nl_to_iq4_nl_4_bl(t, 8, data, data_size);
1946
//}
1947
1948
0
template <> int repack<block_iq4_nl, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
1949
0
    return repack_iq4_nl_to_iq4_nl_8_bl(t, 8, data, data_size);
1950
0
}
1951
1952
0
template <> int repack<block_q8_0, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
1953
0
    return repack_q8_0_to_q8_0_4_bl(t, 4, data, data_size);
1954
0
}
1955
1956
0
template <> int repack<block_q8_0, 8, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
1957
0
    return repack_q8_0_to_q8_0_4_bl(t, 8, data, data_size);
1958
0
}
1959
1960
// gemv
1961
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
1962
void gemv(int, float *, size_t, const void *, const void *, int, int);
1963
1964
0
template <> void gemv<block_q4_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
1965
0
    ggml_gemv_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
1966
0
}
1967
1968
0
template <> void gemv<block_q4_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
1969
0
    ggml_gemv_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
1970
0
}
1971
1972
0
template <> void gemv<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
1973
0
    ggml_gemv_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
1974
0
}
1975
1976
0
template <> void gemv<block_q4_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
1977
0
    ggml_gemv_q4_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
1978
0
}
1979
1980
0
template <> void gemv<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
1981
0
    ggml_gemv_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
1982
0
}
1983
1984
0
template <> void gemv<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
1985
0
    ggml_gemv_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
1986
0
}
1987
1988
0
template <> void gemv<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
1989
0
    ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
1990
0
}
1991
1992
0
template <> void gemv<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
1993
0
    ggml_gemv_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
1994
0
}
1995
1996
0
template <> void gemv<block_q8_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
1997
0
    ggml_gemv_q8_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
1998
0
}
1999
2000
0
template <> void gemv<block_q8_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
2001
0
    ggml_gemv_q8_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
2002
0
}
2003
2004
// gemm
2005
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
2006
void gemm(int, float *, size_t, const void *, const void *, int, int);
2007
2008
0
template <> void gemm<block_q4_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
2009
0
    ggml_gemm_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
2010
0
}
2011
2012
0
template <> void gemm<block_q4_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
2013
0
    ggml_gemm_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
2014
0
}
2015
2016
0
template <> void gemm<block_q4_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
2017
0
    ggml_gemm_q4_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
2018
0
}
2019
2020
0
template <> void gemm<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
2021
0
    ggml_gemm_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
2022
0
}
2023
2024
0
template <> void gemm<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
2025
0
    ggml_gemm_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
2026
0
}
2027
2028
0
template <> void gemm<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
2029
0
    ggml_gemm_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
2030
0
}
2031
2032
0
template <> void gemm<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
2033
0
    ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
2034
0
}
2035
2036
0
template <> void gemm<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
2037
0
    ggml_gemm_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
2038
0
}
2039
2040
0
template <> void gemm<block_q8_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
2041
0
    ggml_gemm_q8_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
2042
0
}
2043
2044
0
template <> void gemm<block_q8_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
2045
0
    ggml_gemm_q8_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
2046
0
}
2047
2048
class tensor_traits_base : public ggml::cpu::tensor_traits {
2049
  public:
2050
    virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0;
2051
};
2052
2053
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE> class tensor_traits : public tensor_traits_base {
2054
2055
0
    bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override {
2056
        // not realy a GGML_TYPE_Q8_0 but same size.
2057
0
        switch (op->op) {
2058
0
            case GGML_OP_MUL_MAT:
2059
0
                {
2060
0
                    size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1]));
2061
0
                    return true;
2062
0
                }
2063
0
            case GGML_OP_MUL_MAT_ID:
2064
0
                {
2065
0
                    size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1]));
2066
0
                    size = GGML_PAD(size, sizeof(int64_t)); // + padding for next block.
2067
2068
0
                    const int64_t ne02 = op->src[0]->ne[2]; // n_as, n_expert
2069
0
                    const int64_t ne12 = op->src[1]->ne[2]; // n_tokens
2070
2071
0
                    const size_t sizeof_mmid_row_mapping = sizeof(int64_t);
2072
2073
0
                    size += sizeof_mmid_row_mapping*ne02*(ne12 + 1);
2074
2075
0
                    return true;
2076
0
                }
2077
0
            default:
2078
                // GGML_ABORT("fatal error");
2079
0
                break;
2080
0
        }
2081
0
        return false;
2082
0
    }
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 4l, 4l, (ggml_type)8>::work_size(int, ggml_tensor const*, unsigned long&)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 8l, 4l, (ggml_type)8>::work_size(int, ggml_tensor const*, unsigned long&)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 8l, 8l, (ggml_type)8>::work_size(int, ggml_tensor const*, unsigned long&)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_K, 4l, 8l, (ggml_type)15>::work_size(int, ggml_tensor const*, unsigned long&)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_K, 8l, 8l, (ggml_type)15>::work_size(int, ggml_tensor const*, unsigned long&)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q2_K, 8l, 8l, (ggml_type)15>::work_size(int, ggml_tensor const*, unsigned long&)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_iq4_nl, 4l, 4l, (ggml_type)8>::work_size(int, ggml_tensor const*, unsigned long&)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_iq4_nl, 8l, 8l, (ggml_type)8>::work_size(int, ggml_tensor const*, unsigned long&)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q8_0, 4l, 4l, (ggml_type)8>::work_size(int, ggml_tensor const*, unsigned long&)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q8_0, 8l, 4l, (ggml_type)8>::work_size(int, ggml_tensor const*, unsigned long&)
2083
2084
0
    bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) override {
2085
0
        switch (op->op) {
2086
0
            case GGML_OP_MUL_MAT:
2087
0
                forward_mul_mat(params, op);
2088
0
                return true;
2089
0
            case GGML_OP_MUL_MAT_ID:
2090
0
                forward_mul_mat_id(params, op);
2091
0
                return true;
2092
0
            default:
2093
                // GGML_ABORT("fatal error");
2094
0
                break;
2095
0
        }
2096
0
        return false;
2097
0
    }
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 4l, 4l, (ggml_type)8>::compute_forward(ggml_compute_params*, ggml_tensor*)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 8l, 4l, (ggml_type)8>::compute_forward(ggml_compute_params*, ggml_tensor*)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 8l, 8l, (ggml_type)8>::compute_forward(ggml_compute_params*, ggml_tensor*)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_K, 4l, 8l, (ggml_type)15>::compute_forward(ggml_compute_params*, ggml_tensor*)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_K, 8l, 8l, (ggml_type)15>::compute_forward(ggml_compute_params*, ggml_tensor*)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q2_K, 8l, 8l, (ggml_type)15>::compute_forward(ggml_compute_params*, ggml_tensor*)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_iq4_nl, 4l, 4l, (ggml_type)8>::compute_forward(ggml_compute_params*, ggml_tensor*)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_iq4_nl, 8l, 8l, (ggml_type)8>::compute_forward(ggml_compute_params*, ggml_tensor*)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q8_0, 4l, 4l, (ggml_type)8>::compute_forward(ggml_compute_params*, ggml_tensor*)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q8_0, 8l, 4l, (ggml_type)8>::compute_forward(ggml_compute_params*, ggml_tensor*)
2098
2099
    void forward_mul_mat_one_chunk(ggml_compute_params * params,
2100
                                   ggml_tensor *         op,
2101
                                   int64_t               src0_start,
2102
                                   int64_t               src0_end,
2103
                                   int64_t               src1_start,
2104
0
                                   int64_t               src1_end) {
2105
0
        const ggml_tensor * src0 = op->src[0];
2106
0
        const ggml_tensor * src1 = op->src[1];
2107
0
        ggml_tensor *       dst  = op;
2108
2109
0
        GGML_TENSOR_BINARY_OP_LOCALS
2110
2111
0
        const size_t src1_col_stride = ggml_row_size(PARAM_TYPE, ne10);
2112
2113
0
        GGML_ASSERT(ne03 == 1 && ne13 == 1);
2114
0
        GGML_ASSERT(ne12 % ne02 == 0);
2115
0
        const int64_t r2 = ne12 / ne02;
2116
2117
0
        const int64_t i12 = src1_start / ne1;
2118
0
        const int64_t i11 = src1_start - i12 * ne1;
2119
2120
        // Determine batch index
2121
0
        const int64_t i02 = i12 / r2;
2122
2123
0
        const int64_t i1 = i11;
2124
0
        const int64_t i2 = i12;
2125
2126
0
        const char * src0_ptr = (const char *) src0->data + i02 * nb02;
2127
0
        const char * src1_ptr = (const char *) params->wdata + (i11 + i12 * ne11) * src1_col_stride;
2128
0
        char *       dst_ptr  = ((char *) dst->data + (i1 * nb1 + i2 * nb2));
2129
2130
0
        const int64_t nrows = src1_end - src1_start;
2131
0
        const int64_t ncols = src0_end - src0_start;
2132
2133
0
        GGML_ASSERT(src1_ptr + src1_col_stride * nrows <= (const char *) params->wdata + params->wsize);
2134
2135
        // If there are more than three rows in src1, use gemm; otherwise, use gemv.
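        // e.g. nrows = 10: gemm handles the first 8 rows (nrows - nrows % 4),
        // and the loop below routes the remaining rows 8 and 9 through gemv.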
2136
0
        if (nrows > 3) {
2137
0
            gemm<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00, (float *) (dst_ptr) + src0_start, nb1 / nb0,
2138
0
                                                             src0_ptr + src0_start * nb01, src1_ptr,
2139
0
                                                             nrows - (nrows % 4), ncols);
2140
0
        }
2141
0
        for (int iter = nrows - (nrows % 4); iter < nrows; iter++) {
2142
0
            gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00, (float *) (dst_ptr + (iter * nb1)) + src0_start,
2143
0
                                                             ne01, src0_ptr + src0_start * nb01,
2144
0
                                                             src1_ptr + (src1_col_stride * iter), 1 /* nrows */, ncols);
2145
0
        }
2146
0
    }
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 4l, 4l, (ggml_type)8>::forward_mul_mat_one_chunk(ggml_compute_params*, ggml_tensor*, long, long, long, long)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 8l, 4l, (ggml_type)8>::forward_mul_mat_one_chunk(ggml_compute_params*, ggml_tensor*, long, long, long, long)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 8l, 8l, (ggml_type)8>::forward_mul_mat_one_chunk(ggml_compute_params*, ggml_tensor*, long, long, long, long)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_K, 4l, 8l, (ggml_type)15>::forward_mul_mat_one_chunk(ggml_compute_params*, ggml_tensor*, long, long, long, long)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_K, 8l, 8l, (ggml_type)15>::forward_mul_mat_one_chunk(ggml_compute_params*, ggml_tensor*, long, long, long, long)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q2_K, 8l, 8l, (ggml_type)15>::forward_mul_mat_one_chunk(ggml_compute_params*, ggml_tensor*, long, long, long, long)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_iq4_nl, 4l, 4l, (ggml_type)8>::forward_mul_mat_one_chunk(ggml_compute_params*, ggml_tensor*, long, long, long, long)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_iq4_nl, 8l, 8l, (ggml_type)8>::forward_mul_mat_one_chunk(ggml_compute_params*, ggml_tensor*, long, long, long, long)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q8_0, 4l, 4l, (ggml_type)8>::forward_mul_mat_one_chunk(ggml_compute_params*, ggml_tensor*, long, long, long, long)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q8_0, 8l, 4l, (ggml_type)8>::forward_mul_mat_one_chunk(ggml_compute_params*, ggml_tensor*, long, long, long, long)
2147
2148
0
    void forward_mul_mat(ggml_compute_params * params, ggml_tensor * op) {
2149
0
        const ggml_tensor * src0 = op->src[0];
2150
0
        const ggml_tensor * src1 = op->src[1];
2151
0
        ggml_tensor *       dst  = op;
2152
2153
0
        GGML_TENSOR_BINARY_OP_LOCALS
2154
2155
0
        const int ith = params->ith;
2156
0
        const int nth = params->nth;
2157
2158
0
        GGML_ASSERT(ne0 == ne01);
2159
0
        GGML_ASSERT(ne1 == ne11);
2160
0
        GGML_ASSERT(ne2 == ne12);
2161
0
        GGML_ASSERT(ne3 == ne13);
2162
2163
        // dst cannot be transposed or permuted
2164
0
        GGML_ASSERT(nb0 == sizeof(float));
2165
0
        GGML_ASSERT(nb0 <= nb1);
2166
0
        GGML_ASSERT(nb1 <= nb2);
2167
0
        GGML_ASSERT(nb2 <= nb3);
2168
2169
        // TODO: General batched mul mat for 4D tensors
2170
        // Currently only supports 3D tensors
2171
0
        GGML_ASSERT(ne03 == 1);
2172
0
        GGML_ASSERT(ne13 == 1);
2173
0
        GGML_ASSERT(ne3 == 1);
2174
2175
0
        GGML_ASSERT(src1->type == GGML_TYPE_F32);
2176
2177
0
        GGML_ASSERT(ggml_n_dims(op->src[0]) == 2);
2178
        // GGML_ASSERT(ggml_n_dims(op->src[1]) == 2);
2179
2180
0
        char *       wdata = static_cast<char *>(params->wdata);
2181
0
        const size_t nbw1  = ggml_row_size(PARAM_TYPE, ne10);
2182
0
        const size_t nbw2  = nbw1 * ne11;
2183
2184
0
        assert(params->wsize >= nbw2 * ne12);
2185
2186
0
        const ggml_from_float_t from_float = ggml_get_type_traits_cpu(PARAM_TYPE)->from_float;
2187
2188
        // INFO: Quantization is done in planes to avoid extra complexity in chunking.
2189
        // Flattening dimensions that are not a multiple of INTER_SIZE would require extra handling depending on how
2190
        // the planes are broadcast.
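        // The quantized copy of src1 is laid out plane by plane in wdata:
        // row i11 of plane i12 starts at wdata + i12 * nbw2 + i11 * nbw1.
        // Rows are quantized 4 at a time into the interleaved layout; any
        // remainder rows are converted one by one with from_float below.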
2191
0
        for (int64_t i12 = 0; i12 < ne12; i12++) {
2192
0
            char * data_ptr  = (char *) src1->data + i12 * nb12;
2193
0
            char * wdata_ptr = wdata + i12 * nbw2;
2194
2195
0
            for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) {
2196
0
                ggml_quantize_mat_t<INTER_SIZE, PARAM_TYPE>((float *) (data_ptr + i11 * nb11),
2197
0
                                                            (void *) (wdata_ptr + i11 * nbw1), 4, ne10);
2198
0
            }
2199
2200
0
            const int64_t i11_processed = ne11 - ne11 % 4;
2201
0
            for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) {
2202
0
                from_float((float *) (data_ptr + i11 * nb11), (void *) (wdata_ptr + i11 * nbw1), ne10);
2203
0
            }
2204
0
        }
2205
2206
        // disable for NUMA
2207
0
        const bool disable_chunking = ggml_is_numa();
2208
2209
        // 4x chunks per thread
2210
0
        const int64_t nr0 = ggml_nrows(op->src[0]);
2211
2212
0
        int     nth_scaled  = nth * 4;
2213
0
        int64_t chunk_size0 = (nr0 + nth_scaled - 1) / nth_scaled;
2214
0
        int64_t nchunk0     = (nr0 + chunk_size0 - 1) / chunk_size0;
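        // Worked example: nr0 = 4096 rows and nth = 8 threads gives
        // nth_scaled = 32, chunk_size0 = 128 and nchunk0 = 32,
        // i.e. 4 chunks per thread as intended.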
2215
2216
        // src1 is chunked only by full planes.
2217
        // When we flatten, we need to handle dimensions that are not a multiple of the q8 INTER_SIZE
2218
        // to route them through GEMV.
2219
        // nchunk1 = ne12 also keeps the chunking unchanged for models with no 3D tensors
2220
        // to avoid affecting their performance
2221
0
        int64_t nchunk1 = ne12;
2222
2223
        // Ensure minimum chunk size to avoid alignment issues with high thread counts
2224
        // Minimum chunk size should be at least NB_COLS to prevent overlapping chunks after alignment
2225
0
        const int64_t min_chunk_size = NB_COLS;
2226
0
        if (nchunk0 > 0 && (nr0 / nchunk0) < min_chunk_size && nr0 >= min_chunk_size) {
2227
0
            nchunk0 = (nr0 + min_chunk_size - 1) / min_chunk_size;
2228
0
        }
2229
2230
0
        int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
2231
        // Only increase nchunk0 to nth if it won't make chunks too small
2232
0
        if (nth == 1 || ((nchunk0 < nth || disable_chunking) && (nr0 + nth - 1) / nth >= min_chunk_size)) {
2233
0
            nchunk0 = nth;
2234
0
            dr0 = (nr0 + nchunk0 - 1) / nchunk0;
2235
0
        }
2236
2237
        // Ensure nchunk doesn't exceed the number of rows divided by minimum chunk size
2238
        // This prevents creating too many tiny chunks that could overlap after alignment
2239
0
        const int64_t max_nchunk = (nr0 + min_chunk_size - 1) / min_chunk_size;
2240
0
        nchunk0                  = MIN(nchunk0, max_nchunk);
2241
2242
0
        if (ith == 0) {
2243
            // Every thread starts at ith, so the first unprocessed chunk is nth. This saves a bit of coordination right at the start.
2244
0
            ggml_threadpool_chunk_set(params->threadpool, nth);
2245
0
        }
2246
2247
0
        ggml_barrier(params->threadpool);
2248
2249
        // The first chunk comes from our thread_id, the rest will get auto-assigned.
2250
0
        int current_chunk = ith;
2251
2252
0
        while (current_chunk < nchunk0 * nchunk1) {
2253
0
            const int64_t ith0 = current_chunk % nchunk0;
2254
0
            const int64_t ith1 = current_chunk / nchunk0;
2255
2256
0
            int64_t src0_start = dr0 * ith0;
2257
0
            int64_t src0_end   = MIN(src0_start + dr0, nr0);
2258
2259
            // full-plane range for src1
2260
0
            int64_t src1_start = ith1 * ne11;
2261
0
            int64_t src1_end = (ith1 + 1) * ne11;
2262
2263
            // Align boundaries to NB_COLS - round up to ensure all data is included
2264
            // The chunk size limiting above ensures chunks are large enough to prevent overlaps
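            // e.g. with NB_COLS = 8: src0_start = 13 rounds up to 16 and
            // src0_end = 20 rounds up to 24 (clamped to ne01 below).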
2265
0
            src0_start = (src0_start % NB_COLS) ? src0_start + NB_COLS - (src0_start % NB_COLS) : src0_start;
2266
0
            src0_end   = (src0_end % NB_COLS) ? src0_end + NB_COLS - (src0_end % NB_COLS) : src0_end;
2267
0
            src0_end   = MIN(src0_end, ne01);
2268
2269
            // If alignment left an empty range, skip ahead to the next chunk
2270
0
            if (src0_start >= src0_end) {
2271
0
                current_chunk = ggml_threadpool_chunk_add(params->threadpool, 1);
2272
0
                continue;
2273
0
            }
2274
2275
0
            forward_mul_mat_one_chunk(params, dst, src0_start, src0_end, src1_start, src1_end);
2276
2277
0
            current_chunk = ggml_threadpool_chunk_add(params->threadpool, 1);
2278
0
        }
2279
0
    }
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 4l, 4l, (ggml_type)8>::forward_mul_mat(ggml_compute_params*, ggml_tensor*)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 8l, 4l, (ggml_type)8>::forward_mul_mat(ggml_compute_params*, ggml_tensor*)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 8l, 8l, (ggml_type)8>::forward_mul_mat(ggml_compute_params*, ggml_tensor*)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_K, 4l, 8l, (ggml_type)15>::forward_mul_mat(ggml_compute_params*, ggml_tensor*)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_K, 8l, 8l, (ggml_type)15>::forward_mul_mat(ggml_compute_params*, ggml_tensor*)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q2_K, 8l, 8l, (ggml_type)15>::forward_mul_mat(ggml_compute_params*, ggml_tensor*)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_iq4_nl, 4l, 4l, (ggml_type)8>::forward_mul_mat(ggml_compute_params*, ggml_tensor*)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_iq4_nl, 8l, 8l, (ggml_type)8>::forward_mul_mat(ggml_compute_params*, ggml_tensor*)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q8_0, 4l, 4l, (ggml_type)8>::forward_mul_mat(ggml_compute_params*, ggml_tensor*)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q8_0, 8l, 4l, (ggml_type)8>::forward_mul_mat(ggml_compute_params*, ggml_tensor*)
2280
2281
0
    void forward_mul_mat_id(ggml_compute_params * params, ggml_tensor * op) {
2282
0
        const ggml_tensor * src0 = op->src[0];
2283
0
        const ggml_tensor * src1 = op->src[1];
2284
0
        const ggml_tensor * ids  = op->src[2];
2285
0
        ggml_tensor *       dst  = op;
2286
2287
0
        GGML_TENSOR_BINARY_OP_LOCALS
2288
2289
0
        const int ith = params->ith;
2290
0
        const int nth = params->nth;
2291
2292
0
        const ggml_from_float_t from_float = ggml_get_type_traits_cpu(PARAM_TYPE)->from_float;
2293
2294
        // we don't support permuted src0 or src1
2295
0
        GGML_ASSERT(nb00 == ggml_type_size(src0->type));
2296
0
        GGML_ASSERT(nb10 == ggml_type_size(src1->type));
2297
2298
        // dst cannot be transposed or permuted
2299
0
        GGML_ASSERT(nb0 == sizeof(float));
2300
0
        GGML_ASSERT(nb0 <= nb1);
2301
0
        GGML_ASSERT(nb1 <= nb2);
2302
0
        GGML_ASSERT(nb2 <= nb3);
2303
2304
0
        GGML_ASSERT(ne03 == 1);
2305
0
        GGML_ASSERT(ne13 == 1);
2306
0
        GGML_ASSERT(ne3  == 1);
2307
2308
0
        GGML_ASSERT(src1->type == GGML_TYPE_F32);
2309
2310
        // row groups
2311
0
        const int n_ids = ids->ne[0]; // n_expert_used
2312
0
        const int n_as  = ne02;       // n_expert
2313
2314
0
        const size_t nbw1 = ggml_row_size(PARAM_TYPE, ne10);
2315
0
        const size_t nbw2 = nbw1*ne11;
2316
0
        const size_t nbw3 = nbw2*ne12;
2317
2318
0
        struct mmid_row_mapping {
2319
0
            int32_t i1;
2320
0
            int32_t i2;
2321
0
        };
2322
2323
0
        GGML_ASSERT(params->wsize >=
2324
0
                (GGML_PAD(nbw3, sizeof(int64_t)) +
2325
0
                 n_as*(ne12 + 1)*sizeof(mmid_row_mapping))
2326
0
                );
2327
2328
0
        auto * wdata          = (char *)params->wdata;
2329
0
        auto * wdata_src1_end = (char *)wdata + GGML_PAD(nbw3, sizeof(int64_t));
2330
2331
        // total of [n_as][ne12 + 1] elements of type mmid_row_mapping (2*int32_t = int64_t)
2332
0
        auto * matrix_row_counts = (int64_t *) (wdata_src1_end);                                        // [n_as]
2333
0
        struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *) (matrix_row_counts + n_as); // [n_as][ne12]
2334
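Note: the pointer arithmetic above carves params->wdata into three consecutive regions (a hedged reading; the sizes match the wsize assertion):

    [0, nbw3)                              quantized copy of src1 in PARAM_TYPE, padded to sizeof(int64_t)
    [GGML_PAD(nbw3, 8), + n_as * 8)        matrix_row_counts: one int64_t per expert
    [..., + n_as * ne12 * 8)               matrix_rows: per-expert buckets of mmid_row_mapping

Since sizeof(mmid_row_mapping) == sizeof(int64_t) == 8, the counts plus the buckets account for exactly the n_as*(ne12 + 1) entries reserved by the assertion.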
2335
        // src1: float32 => param type
2336
0
        for (int64_t i12 = 0; i12 < ne12; ++i12) {
2337
0
            for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
2338
0
                from_float((float *)((char *) src1->data + i12 * nb12 + i11 * nb11),
2339
0
                           (void *)               (wdata + i12 * nbw2 + i11 * nbw1),
2340
0
                           ne10);
2341
0
            }
2342
0
        }
2343
2344
0
#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id) * ne12 + (i1)]
2345
2346
0
        if (ith == 0) {
2347
            // initialize matrix_row_counts
2348
0
            memset(matrix_row_counts, 0, n_as * sizeof(int64_t));
2349
2350
            // group rows by src0 matrix
2351
0
            for (int32_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) {
2352
0
                for (int32_t id = 0; id < n_ids; ++id) {
2353
0
                    const int32_t i02 =
2354
0
                        *(const int32_t *) ((const char *) ids->data + iid1 * ids->nb[1] + id * ids->nb[0]);
2355
2356
0
                    GGML_ASSERT(i02 >= 0 && i02 < n_as);
2357
2358
0
                    MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = { id, iid1 };
2359
0
                    matrix_row_counts[i02] += 1;
2360
0
                }
2361
0
            }
2362
0
        }
2363
2364
0
        ggml_barrier(params->threadpool);
2365
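Note: the ith == 0 block above is a counting bucket sort. Every (token, slot) pair in ids appends a row mapping to the bucket of the expert it names, and the barrier publishes the result to all threads. A self-contained sketch of the same grouping, with hypothetical flat arrays in place of the ggml tensor views:

#include <cstdint>
#include <vector>

struct mmid_row_mapping { int32_t i1; int32_t i2; };

// ids[t * n_ids + s] = expert chosen by token t in slot s, with 0 <= expert < n_as.
void group_rows_by_expert(const int32_t * ids, int32_t n_tokens, int32_t n_ids, int32_t n_as,
                          std::vector<int64_t> & counts,
                          std::vector<mmid_row_mapping> & rows) {
    const size_t stride = (size_t) n_tokens * n_ids; // bucket capacity per expert
    counts.assign(n_as, 0);
    rows.assign((size_t) n_as * stride, {0, 0});
    for (int32_t t = 0; t < n_tokens; ++t) {
        for (int32_t s = 0; s < n_ids; ++s) {
            const int32_t expert = ids[t * n_ids + s];
            rows[expert * stride + counts[expert]] = { s, t }; // (slot, token)
            counts[expert] += 1;
        }
    }
}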
2366
        // compute each matrix multiplication in sequence
2367
0
        for (int cur_a = 0; cur_a < n_as; ++cur_a) {
2368
0
            const int64_t cne1 = matrix_row_counts[cur_a];
2369
2370
0
            if (cne1 == 0) {
2371
0
                continue;
2372
0
            }
2373
2374
0
            const auto * src0_cur = (const char *) src0->data + cur_a*nb02;
2375
2376
            //const int64_t nr0 = ne01; // src0 rows
2377
0
            const int64_t nr1 = cne1; // src1 rows
2378
2379
0
            int64_t src0_cur_start = (ith * ne01) / nth;
2380
0
            int64_t src0_cur_end   = ((ith + 1) * ne01) / nth;
2381
2382
            // Align boundaries to NB_COLS - round up to ensure all data is included
2383
0
            src0_cur_start = (src0_cur_start % NB_COLS) ? src0_cur_start + NB_COLS - (src0_cur_start % NB_COLS) : src0_cur_start;
2384
0
            src0_cur_end   = (src0_cur_end   % NB_COLS) ? src0_cur_end   + NB_COLS - (src0_cur_end   % NB_COLS) : src0_cur_end;
2385
0
            if (src0_cur_end > ne01) {
2386
0
                src0_cur_end = ne01;
2387
0
            }
2388
2389
0
            if (src0_cur_start >= src0_cur_end) {
2390
0
                return;
2391
0
            }
2392
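Note: the NB_COLS rounding above guarantees that a thread's slice never splits an interleaved group of rows. A hedged restatement with a worked example (round_up is a hypothetical name for the inline expression):

// Round x up to the next multiple of nb_cols; interleaved rows move in groups.
static int64_t round_up(int64_t x, int64_t nb_cols) {
    return (x % nb_cols) ? x + nb_cols - (x % nb_cols) : x;
}
// Example: ne01 = 100 rows, nth = 3 threads, NB_COLS = 8.
// Raw splits [0,33) [33,66) [66,100) become [0,40) [40,72) [72,100) after
// rounding both boundaries up and clamping the final end to ne01:
// disjoint, complete, and every interior boundary is a multiple of 8.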
2393
0
            for (int ir1 = 0; ir1 < nr1; ir1++) {
2394
0
                struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, ir1);
2395
2396
0
                const int id = row_mapping.i1; // expert slot for this row (0..n_expert_used-1)
2397
2398
0
                const int64_t i11 = id % ne11;
2399
0
                const int64_t i12 = row_mapping.i2; // row index in src1
2400
2401
0
                const int64_t i1 = id;  // dst row (expert slot)
2402
0
                const int64_t i2 = i12; // dst plane (token row)
2403
2404
0
                const auto * src1_col = (const char *) wdata + (i11 * nbw1 + i12 * nbw2);
2405
2406
0
                gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
2407
0
                        (float *)((char *) dst->data + (i1 * nb1 + i2 * nb2)) + src0_cur_start, ne01,
2408
0
                        src0_cur + src0_cur_start * nb01,
2409
0
                        src1_col, 1, src0_cur_end - src0_cur_start);
2410
0
            }
2411
0
        }
2412
0
#undef MMID_MATRIX_ROW
2413
0
    }
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 4l, 4l, (ggml_type)8>::forward_mul_mat_id(ggml_compute_params*, ggml_tensor*)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 8l, 4l, (ggml_type)8>::forward_mul_mat_id(ggml_compute_params*, ggml_tensor*)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 8l, 8l, (ggml_type)8>::forward_mul_mat_id(ggml_compute_params*, ggml_tensor*)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_K, 4l, 8l, (ggml_type)15>::forward_mul_mat_id(ggml_compute_params*, ggml_tensor*)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_K, 8l, 8l, (ggml_type)15>::forward_mul_mat_id(ggml_compute_params*, ggml_tensor*)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q2_K, 8l, 8l, (ggml_type)15>::forward_mul_mat_id(ggml_compute_params*, ggml_tensor*)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_iq4_nl, 4l, 4l, (ggml_type)8>::forward_mul_mat_id(ggml_compute_params*, ggml_tensor*)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_iq4_nl, 8l, 8l, (ggml_type)8>::forward_mul_mat_id(ggml_compute_params*, ggml_tensor*)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q8_0, 4l, 4l, (ggml_type)8>::forward_mul_mat_id(ggml_compute_params*, ggml_tensor*)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q8_0, 8l, 4l, (ggml_type)8>::forward_mul_mat_id(ggml_compute_params*, ggml_tensor*)
2414
2415
0
    int repack(struct ggml_tensor * t, const void * data, size_t data_size) override {
2416
0
        GGML_LOG_DEBUG("%s: repack tensor %s with %s_%dx%d\n", __func__, t->name, ggml_type_name(t->type),
2417
0
                       (int) NB_COLS, (int) INTER_SIZE);
2418
0
        return ggml::cpu::repack::repack<BLOC_TYPE, INTER_SIZE, NB_COLS>(t, data, data_size);
2419
0
    }
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 4l, 4l, (ggml_type)8>::repack(ggml_tensor*, void const*, unsigned long)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 8l, 4l, (ggml_type)8>::repack(ggml_tensor*, void const*, unsigned long)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_0, 8l, 8l, (ggml_type)8>::repack(ggml_tensor*, void const*, unsigned long)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_K, 4l, 8l, (ggml_type)15>::repack(ggml_tensor*, void const*, unsigned long)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q4_K, 8l, 8l, (ggml_type)15>::repack(ggml_tensor*, void const*, unsigned long)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q2_K, 8l, 8l, (ggml_type)15>::repack(ggml_tensor*, void const*, unsigned long)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_iq4_nl, 4l, 4l, (ggml_type)8>::repack(ggml_tensor*, void const*, unsigned long)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_iq4_nl, 8l, 8l, (ggml_type)8>::repack(ggml_tensor*, void const*, unsigned long)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q8_0, 4l, 4l, (ggml_type)8>::repack(ggml_tensor*, void const*, unsigned long)
Unexecuted instantiation: ggml::cpu::repack::tensor_traits<block_q8_0, 8l, 4l, (ggml_type)8>::repack(ggml_tensor*, void const*, unsigned long)
2420
};
2421
2422
}  // namespace ggml::cpu::repack
2423
2424
0
static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(const struct ggml_tensor * cur) {
2425
2426
    // instance for Q4_0
2427
0
    static const ggml::cpu::repack::tensor_traits<block_q4_0, 4, 4, GGML_TYPE_Q8_0> q4_0_4x4_q8_0;
2428
0
    static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 4, GGML_TYPE_Q8_0> q4_0_4x8_q8_0;
2429
0
    static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 8, GGML_TYPE_Q8_0> q4_0_8x8_q8_0;
2430
2431
    // instance for Q4_K
2432
0
    static const ggml::cpu::repack::tensor_traits<block_q4_K, 4, 8, GGML_TYPE_Q8_K> q4_K_8x4_q8_K;
2433
0
    static const ggml::cpu::repack::tensor_traits<block_q4_K, 8, 8, GGML_TYPE_Q8_K> q4_K_8x8_q8_K;
2434
2435
    // instance for Q2_K
2436
0
    static const ggml::cpu::repack::tensor_traits<block_q2_K, 8, 8, GGML_TYPE_Q8_K> q2_K_8x8_q8_K;
2437
2438
    // instance for IQ4_NL
2439
0
    static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;
2440
0
    static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0> iq4_nl_8x8_q8_0;
2441
2442
    // instance for Q8_0
2443
0
    static const ggml::cpu::repack::tensor_traits<block_q8_0, 4, 4, GGML_TYPE_Q8_0> q8_0_4x4_q8_0;
2444
0
    static const ggml::cpu::repack::tensor_traits<block_q8_0, 8, 4, GGML_TYPE_Q8_0> q8_0_4x8_q8_0;
2445
2446
0
    if (cur->type == GGML_TYPE_Q4_0) {
2447
0
        if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)
2448
0
            || (ggml_cpu_has_riscv_v() && (ggml_cpu_get_rvv_vlen() >= QK4_0))) {
2449
0
            if (cur->ne[1] % 8 == 0) {
2450
0
                return &q4_0_8x8_q8_0;
2451
0
            }
2452
0
        }
2453
0
        if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
2454
0
            if (cur->ne[1] % 4 == 0) {
2455
0
                return &q4_0_4x8_q8_0;
2456
0
            }
2457
0
        }
2458
0
        if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
2459
0
            if (cur->ne[1] % 4 == 0) {
2460
0
                return &q4_0_4x4_q8_0;
2461
0
            }
2462
0
        }
2463
0
    } else if (cur->type == GGML_TYPE_Q4_K) {
2464
0
        if (ggml_cpu_has_avx2()) {
2465
0
            if (cur->ne[1] % 8 == 0) {
2466
0
                return &q4_K_8x8_q8_K;
2467
0
            }
2468
0
        }
2469
0
        if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
2470
0
            if (cur->ne[1] % 8 == 0) {
2471
0
                return &q4_K_8x8_q8_K;
2472
0
            }
2473
0
        }
2474
0
        if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
2475
0
            if (cur->ne[1] % 8 == 0) {
2476
0
                return &q4_K_8x4_q8_K;
2477
0
            }
2478
0
        }
2479
0
    } else if (cur->type == GGML_TYPE_Q2_K) {
2480
0
        if (ggml_cpu_has_avx512()) {
2481
0
            if (cur->ne[1] % 8 == 0) {
2482
0
                return &q2_K_8x8_q8_K;
2483
0
            }
2484
0
        }
2485
0
    } else if (cur->type == GGML_TYPE_IQ4_NL) {
2486
0
        if (ggml_cpu_has_avx2()) {
2487
0
            if (cur->ne[1] % 8 == 0) {
2488
0
                return &iq4_nl_8x8_q8_0;
2489
0
            }
2490
0
        }
2491
0
        if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
2492
0
            if (cur->ne[1] % 4 == 0) {
2493
0
                return &iq4_nl_4x4_q8_0;
2494
0
            }
2495
0
        }
2496
0
    } else if (cur->type == GGML_TYPE_Q8_0) {
2497
0
        if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
2498
0
            if (cur->ne[1] % 4 == 0) {
2499
0
                return &q8_0_4x8_q8_0;
2500
0
            }
2501
0
        }
2502
0
        if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
2503
0
            if (cur->ne[1] % 4 == 0) {
2504
0
                return &q8_0_4x4_q8_0;
2505
0
            }
2506
0
        }
2507
0
    }
2508
2509
0
    return nullptr;
2510
0
}
2511
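Note: a condensed view of the selection above (checks run top to bottom within each type; the first satisfied row wins, and a nullptr result leaves the tensor in its plain layout):

    src0 type | feature gate                                             | traits returned  | column constraint
    Q4_0      | AVX2, or SVE(cnt == QK8_0) + i8mm, or RVV(vlen >= QK4_0) | q4_0_8x8_q8_0    | ne[1] % 8 == 0
    Q4_0      | NEON + i8mm                                              | q4_0_4x8_q8_0    | ne[1] % 4 == 0
    Q4_0      | NEON + dotprod                                           | q4_0_4x4_q8_0    | ne[1] % 4 == 0
    Q4_K      | AVX2, or NEON + i8mm                                     | q4_K_8x8_q8_K    | ne[1] % 8 == 0
    Q4_K      | NEON + dotprod                                           | q4_K_8x4_q8_K    | ne[1] % 8 == 0
    Q2_K      | AVX512                                                   | q2_K_8x8_q8_K    | ne[1] % 8 == 0
    IQ4_NL    | AVX2                                                     | iq4_nl_8x8_q8_0  | ne[1] % 8 == 0
    IQ4_NL    | NEON + dotprod                                           | iq4_nl_4x4_q8_0  | ne[1] % 4 == 0
    Q8_0      | NEON + i8mm                                              | q8_0_4x8_q8_0    | ne[1] % 4 == 0
    Q8_0      | NEON + dotprod                                           | q8_0_4x4_q8_0    | ne[1] % 4 == 0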
2512
0
static enum ggml_status ggml_backend_cpu_repack_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
2513
0
    tensor->extra = (void *) const_cast<ggml::cpu::tensor_traits *>(ggml_repack_get_optimal_repack_type(tensor));
2514
2515
0
    GGML_UNUSED(buffer);
2516
0
    return GGML_STATUS_SUCCESS;
2517
0
}
2518
2519
static void ggml_backend_cpu_repack_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
2520
0
                                                       const void * data, size_t offset, size_t size) {
2521
0
    GGML_ASSERT(offset == 0);
2522
0
    GGML_ASSERT(size == ggml_nbytes(tensor));
2523
2524
0
    auto tensor_traits = (ggml::cpu::repack::tensor_traits_base *) tensor->extra;
2525
0
    auto OK            = tensor_traits->repack(tensor, data, size);
2526
2527
0
    GGML_ASSERT(OK == 0);
2528
0
    GGML_UNUSED(buffer);
2529
0
}
2530
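Note: taken together, init_tensor and set_tensor mean that simply allocating weights in this buffer type and uploading their raw bytes performs the repack. A hedged usage sketch (ctx, t and data are assumed to exist; error handling omitted):

// Allocate every tensor of a ggml context in the CPU_REPACK buffer type, then
// upload raw quantized data; iface.set_tensor rewrites it into the interleaved layout.
ggml_backend_buffer_t buf =
    ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_repack_buffer_type());
ggml_backend_tensor_set(t, data, 0, ggml_nbytes(t)); // runs tensor_traits::repack()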
2531
0
static const char * ggml_backend_cpu_repack_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
2532
0
    return "CPU_REPACK";
2533
2534
0
    GGML_UNUSED(buft);
2535
0
}
2536
2537
0
static ggml_backend_buffer_t ggml_backend_cpu_repack_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
2538
0
    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
2539
2540
0
    if (buffer == nullptr) {
2541
0
        return nullptr;
2542
0
    }
2543
2544
0
    buffer->buft              = buft;
2545
0
    buffer->iface.init_tensor = ggml_backend_cpu_repack_buffer_init_tensor;
2546
0
    buffer->iface.set_tensor  = ggml_backend_cpu_repack_buffer_set_tensor;
2547
0
    buffer->iface.get_tensor  = nullptr;
2548
0
    buffer->iface.cpy_tensor  = nullptr;
2549
0
    return buffer;
2550
0
}
2551
2552
0
static size_t ggml_backend_cpu_repack_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
2553
0
    return TENSOR_ALIGNMENT;
2554
2555
0
    GGML_UNUSED(buft);
2556
0
}
2557
2558
namespace ggml::cpu::repack {
2559
class extra_buffer_type : ggml::cpu::extra_buffer_type {
2560
0
    bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override {
2561
0
        if (    op->op == GGML_OP_MUL_MAT &&
2562
0
                op->src[0]->buffer &&
2563
0
                (ggml_n_dims(op->src[0]) == 2) &&
2564
0
                op->src[0]->buffer->buft == ggml_backend_cpu_repack_buffer_type() &&
2565
0
                ggml_repack_get_optimal_repack_type(op->src[0])
2566
0
                ) {
2567
0
            if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
2568
0
                return false;
2569
0
            }
2570
0
            if (op->src[1]->type == GGML_TYPE_F32) {
2571
0
                return true;
2572
0
            }
2573
            //if (op->src[1]->type == GGML_TYPE_Q8_0) {
2574
            //    return true;
2575
            //}
2576
            // may become possible if src1 is pre-packed as Q8_0...
2577
0
        } else if (op->op == GGML_OP_MUL_MAT_ID
2578
0
                && op->src[0]->buffer
2579
0
                && (ggml_n_dims(op->src[0]) == 3)
2580
0
                && op->src[0]->buffer->buft == ggml_backend_cpu_repack_buffer_type()
2581
0
                && ggml_repack_get_optimal_repack_type(op->src[0])
2582
0
                ) {
2583
0
            if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
2584
0
                return false;
2585
0
            }
2586
0
            if (op->src[1]->type == GGML_TYPE_F32) {
2587
0
                return true;
2588
0
            }
2589
            //if (op->src[1]->type == GGML_TYPE_Q8_0) {
2590
            //    return true;
2591
            //}
2592
0
        }
2593
0
        return false;
2594
0
    }
2595
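Note: the two branches above differ only in the op kind and the expected rank of src0. A hedged restatement of the accept conditions as a single predicate (the helpers are the same functions called above):

static bool repack_supports(const struct ggml_tensor * op) {
    const struct ggml_tensor * a = op->src[0];
    const struct ggml_tensor * b = op->src[1];
    const int want_dims = op->op == GGML_OP_MUL_MAT    ? 2
                        : op->op == GGML_OP_MUL_MAT_ID ? 3 : -1;
    return want_dims > 0
        && a->buffer
        && ggml_n_dims(a) == want_dims
        && a->buffer->buft == ggml_backend_cpu_repack_buffer_type()
        && ggml_repack_get_optimal_repack_type(a) != nullptr
        && (!b->buffer || ggml_backend_buft_is_host(b->buffer->buft))
        && b->type == GGML_TYPE_F32;
}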
2596
0
    ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override {
2597
0
        if (op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_MUL_MAT_ID) {
2598
0
            if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_repack_buffer_type()) {
2599
0
                return (ggml::cpu::tensor_traits *) op->src[0]->extra;
2600
0
            }
2601
0
        }
2602
0
        return nullptr;
2603
0
    }
2604
};
2605
}  // namespace ggml::cpu::repack
2606
2607
0
ggml_backend_buffer_type_t ggml_backend_cpu_repack_buffer_type(void) {
2608
0
    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_repack = {
2609
0
        /* .iface    = */ {
2610
0
                           /* .get_name         = */ ggml_backend_cpu_repack_buffer_type_get_name,
2611
0
                           /* .alloc_buffer     = */ ggml_backend_cpu_repack_buffer_type_alloc_buffer,
2612
0
                           /* .get_alignment    = */ ggml_backend_cpu_repack_buffer_type_get_alignment,
2613
0
                           /* .get_max_size     = */ nullptr,  // defaults to SIZE_MAX
2614
0
                           /* .get_alloc_size   = */ nullptr,  // defaults to ggml_nbytes
2615
0
                           /* .is_host          = */ nullptr,
2616
0
                           },
2617
0
        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
2618
0
        /* .context = */ new ggml::cpu::repack::extra_buffer_type(),
2619
0
    };
2620
2621
0
    return &ggml_backend_cpu_buffer_type_repack;
2622
0
}
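Note: the function-local static above is the C++11 "Meyers singleton" pattern: the buffer type struct and its heap-allocated extra_buffer_type context are constructed exactly once, on first call, with initialization made thread-safe by the language, and every subsequent caller receives the same pointer.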