Coverage Report

Created: 2026-06-13 06:23

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/llama.cpp/ggml/src/ggml-cpu/quants.c
Line
Count
Source
1
#define GGML_COMMON_IMPL_C
2
#include "ggml-common.h"
3
4
#include "ggml-cpu-impl.h"
5
#include "simd-mappings.h"
6
#include "ggml-quants.h"
7
#include "quants.h"
8
9
#include "arch-fallback.h"
10
11
#include <string.h>
12
#include <assert.h>
13
#include <float.h>
14
#include <stdlib.h> // for qsort
15
#include <stdio.h>  // for GGML_ASSERT
16
17
#define GROUP_MAX_EPS 1e-15f
18
#define GROUP_MAX_EPS_IQ3_XXS 1e-8f
19
#define GROUP_MAX_EPS_IQ2_S 1e-8f
20
#define GROUP_MAX_EPS_IQ1_M 1e-7f
21
#define GROUP_MAX_EPS_IQ1_S 1e-12f
22
23
0
#define UNUSED GGML_UNUSED
24
25
0
// Q1_0 row quantizer.
// Forwards x, y and k unchanged to the reference routine quantize_row_q1_0_ref;
// no additional work is done here.
void quantize_row_q1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
    quantize_row_q1_0_ref(x, y, k);
}
28
29
0
// Q4_0 row quantizer.
// Forwards x, y and k unchanged to the reference routine quantize_row_q4_0_ref;
// no additional work is done here.
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
    quantize_row_q4_0_ref(x, y, k);
}
32
33
0
// Q4_1 row quantizer.
// Forwards x, y and k unchanged to the reference routine quantize_row_q4_1_ref;
// no additional work is done here.
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
    quantize_row_q4_1_ref(x, y, k);
}
36
37
0
// Q5_0 row quantizer.
// Forwards x, y and k unchanged to the reference routine quantize_row_q5_0_ref;
// no additional work is done here.
void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
    quantize_row_q5_0_ref(x, y, k);
}
40
41
0
// Q5_1 row quantizer.
// Forwards x, y and k unchanged to the reference routine quantize_row_q5_1_ref;
// no additional work is done here.
void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
    quantize_row_q5_1_ref(x, y, k);
}
44
45
0
// Generic Q8_0 row quantizer.
// Forwards x, y and k unchanged to the reference routine quantize_row_q8_0_ref;
// no additional work is done here.
void quantize_row_q8_0_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
    quantize_row_q8_0_ref(x, y, k);
}
48
49
0
// Generic Q8_1 row quantizer.
// Forwards x, y and k unchanged to the reference routine quantize_row_q8_1_ref;
// no additional work is done here.
void quantize_row_q8_1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
    quantize_row_q8_1_ref(x, y, k);
}
52
53
0
// MXFP4 row quantizer.
// Forwards x, y and k unchanged to the reference routine quantize_row_mxfp4_ref;
// no additional work is done here.
void quantize_row_mxfp4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
    quantize_row_mxfp4_ref(x, y, k);
}
56
57
0
// NVFP4 row quantizer.
// Forwards x, y and k unchanged to the reference routine quantize_row_nvfp4_ref;
// no additional work is done here.
void quantize_row_nvfp4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
    quantize_row_nvfp4_ref(x, y, k);
}
60
61
//
62
// 2-6 bit quantization in super-blocks
63
//
64
65
//========================= 2-bit (de)-quantization
66
67
0
// Q2_K row quantizer.
// Forwards x, vy and k unchanged to the reference routine quantize_row_q2_K_ref;
// no checks are performed in this wrapper itself.
void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
    quantize_row_q2_K_ref(x, vy, k);
}
70
71
//========================= 3-bit (de)-quantization
72
73
0
// Q3_K row quantizer.
// Forwards x, vy and k unchanged to the reference routine quantize_row_q3_K_ref;
// no checks are performed in this wrapper itself.
void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
    quantize_row_q3_K_ref(x, vy, k);
}
76
77
// ====================== 4-bit (de)-quantization
78
79
0
// Q4_K row quantizer.
// Asserts that k is a multiple of QK_K, then passes the destination buffer,
// viewed as an array of block_q4_K, to quantize_row_q4_K_ref.
void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
    assert(k % QK_K == 0);
    quantize_row_q4_K_ref(x, (block_q4_K *) vy, k);
}
84
85
// ====================== 5-bit (de)-quantization
86
87
0
// Q5_K row quantizer.
// Asserts that k is a multiple of QK_K, then passes the destination buffer,
// viewed as an array of block_q5_K, to quantize_row_q5_K_ref.
void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
    assert(k % QK_K == 0);
    quantize_row_q5_K_ref(x, (block_q5_K *) vy, k);
}
92
93
// ====================== 6-bit (de)-quantization
94
95
0
// Q6_K row quantizer.
// Asserts that k is a multiple of QK_K, then passes the destination buffer,
// viewed as an array of block_q6_K, to quantize_row_q6_K_ref.
void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
    assert(k % QK_K == 0);
    quantize_row_q6_K_ref(x, (block_q6_K *) vy, k);
}
100
101
// ====================== Ternary (de)-quantization (BitNet b1.58 and TriLMs)
102
103
0
// TQ1_0 (ternary) row quantizer.
// Asserts that k is a multiple of QK_K, then passes the destination buffer,
// viewed as an array of block_tq1_0, to quantize_row_tq1_0_ref.
void quantize_row_tq1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
    assert(k % QK_K == 0);
    quantize_row_tq1_0_ref(x, (block_tq1_0 *) vy, k);
}
108
109
0
// TQ2_0 (ternary) row quantizer.
// Asserts that k is a multiple of QK_K, then passes the destination buffer,
// viewed as an array of block_tq2_0, to quantize_row_tq2_0_ref.
void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
    assert(k % QK_K == 0);
    quantize_row_tq2_0_ref(x, (block_tq2_0 *) vy, k);
}
114
115
//===================================== Q8_K ==============================================
116
117
0
// Generic Q8_K row quantizer.
// Forwards x, y and k unchanged to the reference routine quantize_row_q8_K_ref;
// no additional work is done here.
void quantize_row_q8_K_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
    quantize_row_q8_K_ref(x, y, k);
}
120
121
//===================================== Dot products =================================
122
123
0
// Scalar dot product of a Q1_0 row (vx) with a Q8_0 row (vy); the result is written to *s.
//
//   n   - element count; asserted to be a multiple of QK1_0
//   nrc - asserted to be 1
//   bs, bx, by - not used by this implementation
//
// Every block_q1_0 is matched with four consecutive block_q8_0 blocks. Each bit of
// x[].qs picks the sign applied to the corresponding q8 value: bit set -> +q,
// bit clear -> -q. The signed integer sum of each q8 block is scaled by that
// block's d, and the four partial results are then scaled by the Q1_0 block's d.
void ggml_vec_dot_q1_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    const int qk = QK1_0;
    const int nb = n / qk;

    assert(n % qk == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q1_0 * GGML_RESTRICT xb = vx;
    const block_q8_0 * GGML_RESTRICT yb = vy;

    float total = 0.0;

    for (int ib = 0; ib < nb; ib++) {
        const float dx = GGML_CPU_FP16_TO_FP32(xb[ib].d);

        float block_sum = 0.0f;

        for (int sub = 0; sub < 4; sub++) {
            const block_q8_0 * GGML_RESTRICT ysub = &yb[ib * 4 + sub];
            const float dy = GGML_CPU_FP16_TO_FP32(ysub->d);

            // 4 sign bytes (32 bits) cover the 32 int8 values of this q8 block.
            const uint8_t * GGML_RESTRICT sign_bytes = &xb[ib].qs[sub * 4];
            const int8_t  * GGML_RESTRICT q8         = ysub->qs;

            int isum = 0;
            for (int byte = 0; byte < 4; ++byte) {
                const unsigned mask = sign_bytes[byte];
                for (int bit = 0; bit < 8; ++bit) {
                    const int q = q8[byte * 8 + bit];
                    isum += ((mask >> bit) & 1u) ? q : -q;
                }
            }

            block_sum += dy * isum;
        }

        total += dx * block_sum;
    }

    *s = total;
}
172
173
174
0
// Scalar dot product of a Q4_0 row (vx) with a Q8_0 row (vy); the result is written to *s.
//
//   n   - element count; asserted to be a multiple of QK8_0
//   nrc - asserted to be 1
//   bs, bx, by - not used by this implementation
//
// Each x byte carries two 4-bit values: the low nibble pairs with y.qs[j], the
// high nibble with y.qs[j + qk/2]. Both nibbles are re-centred by subtracting 8.
void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    const int qk = QK8_0;
    const int nb = n / qk;

    assert(n % qk == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q4_0 * GGML_RESTRICT x = vx;
    const block_q8_0 * GGML_RESTRICT y = vy;

    float acc = 0;

    for (int ib = 0; ib < nb; ++ib) {
        int isum = 0;

        for (int j = 0; j < qk/2; ++j) {
            const uint8_t packed = x[ib].qs[j];

            const int lo = (packed & 0x0F) - 8;
            const int hi = (packed >>   4) - 8;

            isum += lo * y[ib].qs[j];
            isum += hi * y[ib].qs[j + qk/2];
        }

        acc += isum*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
    }

    *s = acc;
}
209
210
// TODO: add WASM SIMD
211
0
// Scalar dot product of a Q4_1 row (vx) with a Q8_1 row (vy); the result is written to *s.
//
//   n   - element count; asserted to be a multiple of QK8_1
//   nrc - asserted to be 1
//   bs, bx, by - not used by this implementation
//
// Nibbles are used as unsigned values (no re-centring). Per block the result is
// (x.d * y.d) * integer_sum + x.m * y.s.
void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    const int qk = QK8_1;
    const int nb = n / qk;

    assert(n % qk == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q4_1 * GGML_RESTRICT x = vx;
    const block_q8_1 * GGML_RESTRICT y = vy;

    float acc = 0;

    for (int ib = 0; ib < nb; ++ib) {
        int isum = 0;

        for (int j = 0; j < qk/2; ++j) {
            const uint8_t packed = x[ib].qs[j];

            const int lo = (packed & 0x0F);
            const int hi = (packed >>   4);

            isum += lo * y[ib].qs[j];
            isum += hi * y[ib].qs[j + qk/2];
        }

        acc += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*isum + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
    }

    *s = acc;
}
246
247
0
// Scalar dot product of an MXFP4 row (vx) with a Q8_0 row (vy); the result is written to *s.
//
//   n   - element count; asserted to be a multiple of QK_MXFP4
//   nrc - asserted to be 1
//   bs, bx, by - not used by this implementation
//
// The two formats must share a block size (enforced by static_assert). Each 4-bit
// code is mapped through kvalues_mxfp4; low nibbles pair with the first half of
// the q8 block and high nibbles with the second half.
void ggml_vec_dot_mxfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    assert(n % QK_MXFP4 == 0);
    static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same");

    const block_mxfp4 * GGML_RESTRICT x = vx;
    const block_q8_0 * GGML_RESTRICT y = vy;

    const int nb = n / QK_MXFP4;

    float acc = 0;

    for (int ib = 0; ib < nb; ++ib) {
        // Combined scale: q8 block scale times the scale derived from x[ib].e.
        const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_E8M0_TO_FP32_HALF(x[ib].e);

        int isum = 0;
        for (int j = 0; j < QK_MXFP4/2; ++j) {
            const uint8_t packed = x[ib].qs[j];

            isum += y[ib].qs[j +          0] * kvalues_mxfp4[packed & 0xf];
            isum += y[ib].qs[j + QK_MXFP4/2] * kvalues_mxfp4[packed >>  4];
        }
        acc += d * isum;
    }
    *s = acc;
}
277
278
// NVFP4: super-block of 64 elements = 4 sub-blocks of 16 = 2 q8_0 blocks
279
0
// Scalar dot product of an NVFP4 row (vx) with a Q8_0 row (vy); the result is written to *s.
//
//   n   - element count; asserted to be a multiple of QK_NVFP4
//   nrc - asserted to be 1
//   bs, bx, by - not used by this implementation
//
// Each block_nvfp4 has four sub-blocks, each with its own scale x.d[sub]
// (converted with ggml_ue4m3_to_fp32). Sub-blocks 0-1 pair with q8 block 2*ib
// and sub-blocks 2-3 with q8 block 2*ib + 1. Within a sub-block the low nibbles
// pair with its first half and the high nibbles with its second half.
void ggml_vec_dot_nvfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    assert(n % QK_NVFP4 == 0);

    const block_nvfp4 * GGML_RESTRICT x = vx;
    const block_q8_0 * GGML_RESTRICT y = vy;

    const int nb = n / QK_NVFP4;

    float acc = 0;

    for (int ib = 0; ib < nb; ++ib) {
        for (int sub = 0; sub < 4; ++sub) {
            const float d = ggml_ue4m3_to_fp32(x[ib].d[sub]);

            // Locate the q8 block and the offset inside it that this sub-block covers.
            const block_q8_0 * GGML_RESTRICT yb = &y[2*ib + sub / 2];
            const int8_t     * GGML_RESTRICT q8 = yb->qs + (sub % 2) * QK_NVFP4_SUB;
            const float dy = GGML_CPU_FP16_TO_FP32(yb->d);

            const uint8_t * GGML_RESTRICT q4 = &x[ib].qs[sub*(QK_NVFP4_SUB/2)];

            int isum = 0;
            for (int j = 0; j < QK_NVFP4_SUB/2; ++j) {
                const uint8_t packed = q4[j];
                isum += q8[j +               0] * kvalues_mxfp4[packed & 0xf];
                isum += q8[j + QK_NVFP4_SUB/2] * kvalues_mxfp4[packed >>  4];
            }

            acc += dy * d * isum;
        }
    }
    *s = acc;
}
313
314
0
// Scalar dot product of a Q5_0 row (vx) with a Q8_0 row (vy); the result is written to *s.
//
//   n   - element count; asserted to be a multiple of QK8_0 (which must equal QK5_0)
//   nrc - asserted to be 1
//   bs, bx, by - not used by this implementation
//
// Each 5-bit value is a nibble from x.qs plus one extra bit from the 32-bit word
// x.qh: bit j supplies the fifth bit of element j, bit j + 16 that of element
// j + qk/2. Values are re-centred by subtracting 16.
void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    const int qk = QK8_0;
    const int nb = n / qk;

    assert(n % qk == 0);
    assert(qk == QK5_0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q5_0 * GGML_RESTRICT x = vx;
    const block_q8_0 * GGML_RESTRICT y = vy;

    float acc = 0;

    for (int ib = 0; ib < nb; ++ib) {
        // qh is stored as bytes; copy it out to avoid an unaligned/aliased load.
        uint32_t high_bits;
        memcpy(&high_bits, x[ib].qh, sizeof(high_bits));

        int isum = 0;

        for (int j = 0; j < qk/2; ++j) {
            const uint8_t packed = x[ib].qs[j];

            // Move the relevant qh bit into bit position 4.
            const uint8_t h_lo = (uint8_t)(((high_bits >> j) & 1u) << 4);
            const uint8_t h_hi = (uint8_t)((high_bits >> (j + 12)) & 0x10u);

            const int32_t v_lo = (int8_t)(((packed & 0x0F) | h_lo) - 16);
            const int32_t v_hi = (int8_t)(((packed >>   4) | h_hi) - 16);

            isum += v_lo * y[ib].qs[j];
            isum += v_hi * y[ib].qs[j + qk/2];
        }

        acc += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * isum;
    }

    *s = acc;
}
356
357
0
// Scalar dot product of a Q5_1 row (vx) with a Q8_1 row (vy); the result is written to *s.
//
//   n   - element count; asserted to be a multiple of QK8_1 (which must equal QK5_1)
//   nrc - asserted to be 1
//   bs, bx, by - not used by this implementation
//
// Each 5-bit value is a nibble from x.qs plus one bit from the 32-bit word x.qh
// (bit j for element j, bit j + 16 for element j + qk/2) and is used unsigned.
// Per block the result is (x.d * y.d) * integer_sum + x.m * y.s.
void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    const int qk = QK8_1;
    const int nb = n / qk;

    assert(n % qk == 0);
    assert(qk == QK5_1);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q5_1 * GGML_RESTRICT x = vx;
    const block_q8_1 * GGML_RESTRICT y = vy;

    float acc = 0;

    for (int ib = 0; ib < nb; ++ib) {
        // qh is stored as bytes; copy it out to avoid an unaligned/aliased load.
        uint32_t high_bits;
        memcpy(&high_bits, x[ib].qh, sizeof(high_bits));

        int isum = 0;

        for (int j = 0; j < qk/2; ++j) {
            const uint8_t packed = x[ib].qs[j];

            // Move the relevant qh bit into bit position 4.
            const uint8_t h_lo = (uint8_t)(((high_bits >> j) & 1u) << 4);
            const uint8_t h_hi = (uint8_t)((high_bits >> (j + 12)) & 0x10u);

            const int32_t v_lo = (packed & 0xF) | h_lo;
            const int32_t v_hi = (packed >>  4) | h_hi;

            isum += v_lo * y[ib].qs[j];
            isum += v_hi * y[ib].qs[j + qk/2];
        }

        acc += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*isum + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
    }

    *s = acc;
}
399
400
0
// Scalar dot product of two Q8_0 rows (vx and vy); the result is written to *s.
//
//   n   - element count; asserted to be a multiple of QK8_0
//   nrc - asserted to be 1
//   bs, bx, by - not used by this implementation
//
// For every block pair the int8 values are multiplied and summed as integers,
// then scaled by the product of the two block scales.
void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    const int qk = QK8_0;
    const int nb = n / qk;

    assert(n % qk == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q8_0 * GGML_RESTRICT x = vx;
    const block_q8_0 * GGML_RESTRICT y = vy;

    float acc = 0;

    for (int ib = 0; ib < nb; ++ib) {
        const int8_t * GGML_RESTRICT qx = x[ib].qs;
        const int8_t * GGML_RESTRICT qy = y[ib].qs;

        int isum = 0;
        for (int j = 0; j < qk; j++) {
            isum += qx[j]*qy[j];
        }

        acc += isum*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
    }

    *s = acc;
}
429
430
0
// Scalar dot product of a TQ1_0 (ternary) row (vx) with a Q8_K row (vy); the
// result is written to *s.
//
//   n   - element count; asserted to be a multiple of QK_K
//   nrc - asserted to be 1
//   bs, bx, by - not used by this implementation
//
// Every byte of x.qs packs five ternary digits and every byte of x.qh packs four.
// Digit l of a byte is extracted by multiplying the byte by 3^l (keeping only the
// low 8 bits) and then taking the top part of (q * 3) >> 8, which yields 0, 1 or 2;
// subtracting 1 gives the signed ternary value that multiplies the q8 entry.
void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    // Same precondition the other *_q8_K dot products in this file check:
    // a trailing partial block would otherwise be silently ignored.
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_tq1_0 * GGML_RESTRICT x = vx;
    const block_q8_K  * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

    // Powers of three used to shift the wanted digit to the top of the byte.
    static const uint8_t pow3[6] = {1, 3, 9, 27, 81, 243};

    float sumf = 0.0f;

    for (int i = 0; i < nb; ++i) {
        int sum = 0;

        // Part of qs that divides evenly into 32-byte groups.
        for (size_t j = 0; j < sizeof(x->qs) - sizeof(x->qs) % 32; j += 32) {
            for (size_t l = 0; l < 5; ++l) {
                for (size_t m = 0; m < 32; ++m) {
                    uint8_t q = x[i].qs[j + m] * pow3[l];
                    uint16_t xi = ((uint16_t) q * 3) >> 8;
                    sum += (xi - 1) * y[i].qs[j*5 + l*32 + m];
                }
            }
        }
        // Remaining qs bytes, handled in 16-byte groups.
        for (size_t j = sizeof(x->qs) - sizeof(x->qs) % 32; j < sizeof(x->qs); j += 16) {
            for (size_t l = 0; l < 5; ++l) {
                for (size_t m = 0; m < 16; ++m) {
                    uint8_t q = x[i].qs[j + m] * pow3[l];
                    uint16_t xi = ((uint16_t) q * 3) >> 8;
                    sum += (xi - 1) * y[i].qs[j*5 + l*16 + m];
                }
            }
        }

        // qh bytes: four digits each, mapped after all the qs-derived elements.
        for (size_t l = 0; l < 4; ++l) {
            for (size_t j = 0; j < sizeof(x->qh); ++j) {
                uint8_t q = x[i].qh[j] * pow3[l];
                uint16_t xi = ((uint16_t) q * 3) >> 8;
                sum += (xi - 1) * y[i].qs[sizeof(x->qs)*5 + l*sizeof(x->qh) + j];
            }
        }

        sumf += (float) sum * (GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d);
    }

    *s = sumf;
}
481
482
0
// Scalar dot product of a TQ2_0 (ternary) row (vx) with a Q8_K row (vy); the
// result is written to *s.
//
//   n   - element count; asserted to be a multiple of QK_K
//   nrc - asserted to be 1
//   bs, bx, by - not used by this implementation
//
// Every byte of x.qs holds four 2-bit fields. Field l of byte j + k pairs with
// q8 entry j*4 + l*32 + k, and 1 is subtracted from the field before multiplying.
void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    // Same precondition the other *_q8_K dot products in this file check:
    // a trailing partial block would otherwise be silently ignored.
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_tq2_0 * GGML_RESTRICT x = vx;
    const block_q8_K  * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;
    float sumf = 0.0f;

    for (int i = 0; i < nb; ++i) {
        int32_t sumi = 0;

        for (size_t j = 0; j < sizeof(x->qs); j += 32) {
            for (size_t l = 0; l < 4; ++l) {
                for (size_t k = 0; k < 32; ++k) {
                    sumi += y[i].qs[j*4 + l*32 + k] * (((x[i].qs[j + k] >> (l*2)) & 3) - 1);
                }
            }
        }

        const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);

        sumf += (float) sumi * d;
    }

    *s = sumf;
}
513
514
0
// Scalar dot product of a Q2_K row (vx) with a Q8_K row (vy); the result is
// written to *s.
//
//   n   - element count; asserted to be a multiple of QK_K
//   nrc - asserted to be 1
//   bs, bx, by - not used by this implementation
//
// Each of the 16 bytes in x.scales carries two 4-bit fields: the low nibble
// scales the 2-bit quants of one 16-element group, the high nibble weights that
// group's y.bsums entry for the "min" term. Per super-block the contribution is
//   (y.d * x.d) * isum - (y.d * x.dmin) * summs.
void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    // Same precondition the other *_q8_K dot products in this file check:
    // a trailing partial block would otherwise be silently ignored.
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q2_K * GGML_RESTRICT x = vx;
    const block_q8_K * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

    float sumf = 0;

    for (int i = 0; i < nb; ++i) {

        const uint8_t * q2 = x[i].qs;
        const  int8_t * q8 = y[i].qs;
        const uint8_t * sc = x[i].scales;

        // "min" term: high nibble of each scale byte times the matching group sum of y.
        int summs = 0;
        for (int j = 0; j < 16; ++j) {
            summs += y[i].bsums[j] * (sc[j] >> 4);
        }

        const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
        const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);

        int isum = 0;
        int is = 0;
        for (int k = 0; k < QK_K/128; ++k) {
            // The same 32 q2 bytes are reused four times, two bits at a time.
            int shift = 0;
            for (int j = 0; j < 4; ++j) {
                const int d_lo = sc[is++] & 0xF;
                int isuml = 0;
                for (int l =  0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
                isum += d_lo * isuml;

                const int d_hi = sc[is++] & 0xF;
                isuml = 0;
                for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
                isum += d_hi * isuml;

                shift += 2;
                q8 += 32;
            }
            q2 += 32;
        }
        sumf += dall * isum - dmin * summs;
    }
    *s = sumf;
}
565
566
0
// Scalar dot product of a Q3_K row (vx) with a Q8_K row (vy); the result is
// written to *s.
//
//   n   - element count; asserted to be a multiple of QK_K
//   nrc - asserted to be 1
//   bs, bx, by - not used by this implementation
//
// Per super-block the routine works in three steps:
//   1. expand the quants into int8 values: two bits come from x.qs, and 4 is
//      subtracted whenever the matching bit of x.hmask is clear;
//   2. expand the 12 packed bytes of x.scales into 16 byte-wide scales
//      (32 is subtracted from each when it is applied);
//   3. accumulate q8 * value * scale into eight independent integer lanes.
// The eight-lane accumulators keep the inner loops simple and fixed-length so
// the compiler has a chance to auto-vectorize them.
void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const uint32_t kmask1 = 0x03030303;
    const uint32_t kmask2 = 0x0f0f0f0f;

    const block_q3_K * GGML_RESTRICT x = vx;
    const block_q8_K * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

    int8_t  unpacked[QK_K];
    float   lane_sums[8];
    int32_t lane_acc[8];
    memset(lane_sums, 0, sizeof(lane_sums));

    uint32_t sc_words[4];
    const int8_t * sc_bytes = (const int8_t *) sc_words;

    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
        const uint8_t * GGML_RESTRICT hm = x[i].hmask;
        const  int8_t * GGML_RESTRICT q8 = y[i].qs;

        memset(lane_acc, 0, sizeof(lane_acc));

        // Step 1: unpack. Each group of 32 qs bytes yields 4 x 32 values, and
        // every 32-value slice consumes the next bit of the hmask bytes.
        int8_t * GGML_RESTRICT a = unpacked;
        uint8_t m = 1;
        for (int j = 0; j < QK_K; j += 128) {
            for (int shift = 0; shift < 8; shift += 2) {
                for (int l = 0; l < 32; ++l) {
                    a[l] = (int8_t)(((q3[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
                }
                a += 32;
                m <<= 1;
            }
            q3 += 32;
        }

        // Step 2: rebuild 16 scale bytes from the 12 packed ones.
        memcpy(sc_words, x[i].scales, 12);
        const uint32_t hi_bits = sc_words[2];
        sc_words[2] = ((sc_words[0] >> 4) & kmask2) | (((hi_bits >> 4) & kmask1) << 4);
        sc_words[3] = ((sc_words[1] >> 4) & kmask2) | (((hi_bits >> 6) & kmask1) << 4);
        sc_words[0] = (sc_words[0] & kmask2) | (((hi_bits >> 0) & kmask1) << 4);
        sc_words[1] = (sc_words[1] & kmask2) | (((hi_bits >> 2) & kmask1) << 4);

        // Step 3: one scale per 16 elements, processed as two runs of 8 lanes.
        a = unpacked;
        for (int j = 0; j < QK_K/16; ++j) {
            const int sc = sc_bytes[j] - 32;
            for (int half = 0; half < 2; ++half) {
                for (int l = 0; l < 8; ++l) {
                    const int16_t prod = (int16_t)(q8[l] * a[l]);
                    lane_acc[l] += sc * prod;
                }
                q8 += 8;
                a  += 8;
            }
        }

        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) {
            lane_sums[l] += d * lane_acc[l];
        }
    }

    float sumf = 0;
    for (int l = 0; l < 8; ++l) {
        sumf += lane_sums[l];
    }
    *s = sumf;
}
644
645
0
// Scalar dot product of a Q4_K row (vx) with a Q8_K row (vy); the result is
// written to *s.
//
//   n   - element count; asserted to be a multiple of QK_K
//   nrc - asserted to be 1
//   bs, bx, by - not used by this implementation
//
// Per super-block:
//   1. the 4-bit quants are expanded to int8 (32 low nibbles, then 32 high
//      nibbles, for each group of 32 packed bytes);
//   2. the 12 packed bytes of x.scales are rearranged into 8 scale bytes
//      (utmp[0..1]) and 8 min bytes (utmp[2..3]);
//   3. the "min" term sum(bsums[j] * mins[j/2]) * dmin is subtracted, and
//      q8 * quant * scale is accumulated into eight integer lanes.
void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q4_K * GGML_RESTRICT x = vx;
    const block_q8_K * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

    static const uint32_t kmask1 = 0x3f3f3f3f;
    static const uint32_t kmask2 = 0x0f0f0f0f;
    static const uint32_t kmask3 = 0x03030303;

    uint32_t utmp[4];

    const uint8_t * scales = (const uint8_t*)&utmp[0];
    const uint8_t * mins   = (const uint8_t*)&utmp[2];

    int8_t  unpacked[QK_K];
    float   lane_sums[8];
    int32_t lane_acc[8];
    memset(lane_sums, 0, sizeof(lane_sums));

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
        const  int8_t * GGML_RESTRICT q8 = y[i].qs;

        memset(lane_acc, 0, sizeof(lane_acc));

        // Step 1: unpack nibbles.
        int8_t * GGML_RESTRICT a = unpacked;
        for (int j = 0; j < QK_K/64; ++j) {
            for (int l = 0; l < 32; ++l) {
                a[l +  0] = (int8_t)(q4[l] & 0xF);
                a[l + 32] = (int8_t)(q4[l]  >> 4);
            }
            a  += 64;
            q4 += 32;
        }

        // Step 2: unpack the 6-bit scales and mins.
        memcpy(utmp, x[i].scales, 12);
        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
        const uint32_t uaux = utmp[1] & kmask1;
        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
        utmp[2] = uaux;
        utmp[0] &= kmask1;

        // Step 3a: min contribution, one min per pair of 16-element sums.
        int min_sum = 0;
        for (int j = 0; j < QK_K/16; ++j) {
            min_sum += y[i].bsums[j] * mins[j/2];
        }

        // Step 3b: one scale per 32 elements, processed as four runs of 8 lanes.
        a = unpacked;
        for (int j = 0; j < QK_K/32; ++j) {
            const int32_t scale = scales[j];
            for (int part = 0; part < 4; ++part) {
                for (int l = 0; l < 8; ++l) {
                    const int16_t prod = (int16_t)(q8[l] * a[l]);
                    lane_acc[l] += scale * prod;
                }
                q8 += 8;
                a  += 8;
            }
        }

        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) {
            lane_sums[l] += d * lane_acc[l];
        }
        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
        sumf -= dmin * min_sum;
    }
    for (int l = 0; l < 8; ++l) {
        sumf += lane_sums[l];
    }
    *s = sumf;
}
719
720
0
// Scalar dot product of a Q5_K row (vx) with a Q8_K row (vy); the result is
// written to *s.
//
//   n   - element count; asserted to be a multiple of QK_K
//   nrc - asserted to be 1
//   bs, bx, by - not used by this implementation
//
// Per super-block:
//   1. the quants are expanded to int8: a nibble from x.qs plus 16 when the
//      matching bit of x.qh is set (one qh bit per 32-value slice);
//   2. the 12 packed bytes of x.scales are rearranged into 8 scale bytes
//      (utmp[0..1]) and 8 min bytes (utmp[2..3]);
//   3. the "min" term sum(bsums[j] * mins[j/2]) * dmin is subtracted, and
//      q8 * quant * scale is accumulated into eight integer lanes.
void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy,  size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q5_K * GGML_RESTRICT x = vx;
    const block_q8_K * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

    static const uint32_t kmask1 = 0x3f3f3f3f;
    static const uint32_t kmask2 = 0x0f0f0f0f;
    static const uint32_t kmask3 = 0x03030303;

    uint32_t utmp[4];

    const uint8_t * scales = (const uint8_t*)&utmp[0];
    const uint8_t * mins   = (const uint8_t*)&utmp[2];

    int8_t  unpacked[QK_K];
    float   lane_sums[8];
    int32_t lane_acc[8];
    memset(lane_sums, 0, sizeof(lane_sums));

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
        const uint8_t * GGML_RESTRICT hm = x[i].qh;
        const  int8_t * GGML_RESTRICT q8 = y[i].qs;

        memset(lane_acc, 0, sizeof(lane_acc));

        // Step 1: unpack nibbles and add the fifth bit taken from qh.
        int8_t * GGML_RESTRICT a = unpacked;
        uint8_t m = 1;
        for (int j = 0; j < QK_K/64; ++j) {
            for (int l = 0; l < 32; ++l) {
                a[l] = (int8_t)((q4[l] & 0xF) + ((hm[l] & m) ? 16 : 0));
            }
            a += 32;
            m <<= 1;
            for (int l = 0; l < 32; ++l) {
                a[l] = (int8_t)((q4[l]  >> 4) + ((hm[l] & m) ? 16 : 0));
            }
            a += 32;
            m <<= 1;
            q4 += 32;
        }

        // Step 2: unpack the 6-bit scales and mins.
        memcpy(utmp, x[i].scales, 12);
        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
        const uint32_t uaux = utmp[1] & kmask1;
        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
        utmp[2] = uaux;
        utmp[0] &= kmask1;

        // Step 3a: min contribution, one min per pair of 16-element sums.
        int min_sum = 0;
        for (int j = 0; j < QK_K/16; ++j) {
            min_sum += y[i].bsums[j] * mins[j/2];
        }

        // Step 3b: one scale per 32 elements, processed as four runs of 8 lanes.
        a = unpacked;
        for (int j = 0; j < QK_K/32; ++j) {
            const int32_t scale = scales[j];
            for (int part = 0; part < 4; ++part) {
                for (int l = 0; l < 8; ++l) {
                    const int16_t prod = (int16_t)(q8[l] * a[l]);
                    lane_acc[l] += scale * prod;
                }
                q8 += 8;
                a  += 8;
            }
        }

        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) {
            lane_sums[l] += d * lane_acc[l];
        }
        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
        sumf -= dmin * min_sum;
    }
    for (int l = 0; l < 8; ++l) {
        sumf += lane_sums[l];
    }
    *s = sumf;
}
799
800
0
// Scalar dot product of a Q6_K row (vx) with a Q8_K row (vy); the result is
// written to *s.
//
//   n   - element count; asserted to be a multiple of QK_K
//   nrc - asserted to be 1
//   bs, bx, by - not used by this implementation
//
// Per super-block the 6-bit quants are rebuilt from a nibble of x.ql and two
// bits of x.qh, re-centred by subtracting 32, then multiplied with the q8
// values and the per-16-element entry of x.scales, accumulating into eight
// integer lanes.
void ggml_vec_dot_q6_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q6_K * GGML_RESTRICT x = vx;
    const block_q8_K * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

    int8_t  unpacked[QK_K];
    float   lane_sums[8];
    int32_t lane_acc[8];
    memset(lane_sums, 0, sizeof(lane_sums));

    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT ql = x[i].ql;
        const uint8_t * GGML_RESTRICT qh = x[i].qh;
        const  int8_t * GGML_RESTRICT q8 = y[i].qs;

        memset(lane_acc, 0, sizeof(lane_acc));

        // Unpack 128 values at a time: 64 ql bytes supply the nibbles and
        // 32 qh bytes supply four 2-bit high parts each.
        int8_t * GGML_RESTRICT a = unpacked;
        for (int j = 0; j < QK_K; j += 128) {
            for (int l = 0; l < 32; ++l) {
                const uint8_t h = qh[l];
                a[l +  0] = (int8_t)((ql[l +  0] & 0xF) | (((h >> 0) & 3) << 4)) - 32;
                a[l + 32] = (int8_t)((ql[l + 32] & 0xF) | (((h >> 2) & 3) << 4)) - 32;
                a[l + 64] = (int8_t)((ql[l +  0] >>  4) | (((h >> 4) & 3) << 4)) - 32;
                a[l + 96] = (int8_t)((ql[l + 32] >>  4) | (((h >> 6) & 3) << 4)) - 32;
            }
            a  += 128;
            ql += 64;
            qh += 32;
        }

        // One scale per 16 elements, processed as two runs of 8 lanes.
        a = unpacked;
        for (int j = 0; j < QK_K/16; ++j) {
            const int scale = x[i].scales[j];
            for (int half = 0; half < 2; ++half) {
                for (int l = 0; l < 8; ++l) {
                    const int16_t prod = (int16_t)(q8[l] * a[l]);
                    lane_acc[l] += scale * prod;
                }
                q8 += 8;
                a  += 8;
            }
        }

        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) {
            lane_sums[l] += d * lane_acc[l];
        }
    }

    float sumf = 0;
    for (int l = 0; l < 8; ++l) {
        sumf += lane_sums[l];
    }
    *s = sumf;
}
854
855
0
// Scalar reference dot product between an iq2_xxs row (vx) and a q8_K row (vy).
//
// n   - number of elements; must be a multiple of QK_K (asserted)
// s   - receives the resulting dot product
// nrc - must be 1 (asserted)
// bs, bx, by are not used by this implementation.
void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_iq2_xxs * GGML_RESTRICT x = vx;
    const block_q8_K    * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

    float sumf = 0.f;
    for (int i = 0; i < nb; ++i) {
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint16_t * GGML_RESTRICT packed = x[i].qs;
        const int8_t   * GGML_RESTRICT act    = y[i].qs;

        int32_t block_acc = 0;
        for (int ib32 = 0; ib32 < QK_K/32; ++ib32, packed += 4) {
            // Each group of 32 values is described by two 32-bit words:
            // the bytes of word 0 are four grid indices, word 1 carries four
            // 7-bit sign selectors and, in its top 4 bits, the group scale.
            uint32_t words[2];
            memcpy(words, packed, sizeof(words));
            const uint8_t * grid_idx  = (const uint8_t *)words;
            const uint32_t  sign_word = words[1];

            // kept unsigned so the multiply below uses the same arithmetic as before
            const uint32_t ls = 2*(sign_word >> 28) + 1;

            int32_t group_acc = 0;
            for (int l = 0; l < 4; ++l, act += 8) {
                const uint8_t * grid  = (const uint8_t *)(iq2xxs_grid + grid_idx[l]);
                const uint8_t   signs = ksigns_iq2xs[(sign_word >> 7*l) & 127];
                for (int j = 0; j < 8; ++j) {
                    const int prod = grid[j] * act[j];
                    group_acc += (signs & kmask_iq2xs[j]) ? -prod : prod;
                }
            }
            block_acc += group_acc * ls;
        }
        sumf += d * block_acc;
    }
    *s = 0.125f * sumf;
}
896
897
0
// Scalar reference dot product between an iq2_xs row (vx) and a q8_K row (vy).
//
// n   - number of elements; must be a multiple of QK_K (asserted)
// s   - receives the resulting dot product
// nrc - must be 1 (asserted)
// bs, bx, by are not used by this implementation.
void ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_iq2_xs * GGML_RESTRICT x = vx;
    const block_q8_K   * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

    float sumf = 0.f;
    for (int i = 0; i < nb; ++i) {
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint16_t * GGML_RESTRICT codes = x[i].qs;
        const uint8_t  * GGML_RESTRICT sc    = x[i].scales;
        const int8_t   * GGML_RESTRICT act   = y[i].qs;

        int32_t block_acc = 0;
        for (int ib32 = 0; ib32 < QK_K/32; ++ib32, codes += 4) {
            // One scale byte per 32 values: the low nibble scales the first
            // 16 values, the high nibble the last 16.
            for (int half = 0; half < 2; ++half) {
                const uint16_t ls = 2*((sc[ib32] >> 4*half) & 0xf) + 1;

                int32_t part = 0;
                for (int l = 0; l < 2; ++l, act += 8) {
                    // low 9 bits select the grid entry, the upper 7 bits the sign pattern
                    const uint16_t  code  = codes[2*half + l];
                    const uint8_t * grid  = (const uint8_t *)(iq2xs_grid + (code & 511));
                    const uint8_t   signs = ksigns_iq2xs[code >> 9];
                    for (int j = 0; j < 8; ++j) {
                        const int prod = grid[j] * act[j];
                        part += (signs & kmask_iq2xs[j]) ? -prod : prod;
                    }
                }
                block_acc += part * ls;
            }
        }
        sumf += d * block_acc;
    }
    *s = 0.125f * sumf;
}
946
947
0
// Scalar reference dot product between an iq2_s row (vx) and a q8_K row (vy).
//
// n   - number of elements; must be a multiple of QK_K (asserted)
// s   - receives the resulting dot product
// nrc - must be 1 (asserted)
// bs, bx, by are not used by this implementation.
void ggml_vec_dot_iq2_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_iq2_s * GGML_RESTRICT x = vx;
    const block_q8_K  * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

    float sumf = 0;
    for (int i = 0; i < nb; i++) {

        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;

        const int8_t  * act   = y[i].qs;
        const uint8_t * idx   = x[i].qs;
        const uint8_t * qh    = x[i].qh;
        // the sign bytes are read from the same array, starting QK_K/8 bytes in
        const uint8_t * signs = x[i].qs + QK_K/8;

        int block_acc = 0;
        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
            // low nibble scales the first two groups of 8, high nibble the last two
            const int ls_lo = 1 + 2*(x[i].scales[ib32] & 0xf);
            const int ls_hi = 1 + 2*(x[i].scales[ib32] >>  4);

            int part[2] = { 0, 0 };
            for (int l = 0; l < 4; ++l) {
                // bits 8..9 of the grid index come from a 2-bit field of qh[ib32]
                const int high_bits = (qh[ib32] << (8 - 2*l)) & 0x300;
                const uint8_t * grid = (const uint8_t *)(iq2s_grid + (idx[l] | high_bits));

                int dot = 0;
                for (int j = 0; j < 8; ++j) {
                    const int prod = act[j] * grid[j];
                    dot += (signs[l] & kmask_iq2xs[j]) ? -prod : prod;
                }
                part[l/2] += dot;
                act += 8;
            }

            block_acc += ls_lo * part[0] + ls_hi * part[1];
            idx   += 4;
            signs += 4;
        }

        sumf += d * block_acc;
    }

    *s = 0.125f * sumf;
}
998
999
0
// Scalar reference dot product between an iq3_xxs row (vx) and a q8_K row (vy).
//
// n   - number of elements; must be a multiple of QK_K (asserted)
// s   - receives the resulting dot product
// nrc - must be 1 (asserted)
// bs, bx, by are not used by this implementation.
void ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_iq3_xxs * GGML_RESTRICT x = vx;
    const block_q8_K    * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

    float sumf = 0.f;
    for (int i = 0; i < nb; ++i) {
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint8_t * GGML_RESTRICT q3  = x[i].qs;
        // per-group 32-bit words (signs + scale) are read starting QK_K/4 bytes into qs
        const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
        const int8_t  * GGML_RESTRICT q8  = y[i].qs;

        int32_t block_acc = 0;
        for (int ib32 = 0; ib32 < QK_K/32; ++ib32, q3 += 8) {
            // four 7-bit sign selectors in the low 28 bits, scale in the top 4 bits
            uint32_t meta;
            memcpy(&meta, gas + ib32*sizeof(uint32_t), sizeof(meta));

            // kept unsigned so the multiply below uses the same arithmetic as before
            const uint32_t ls = 2*(meta >> 28) + 1;

            int32_t group_acc = 0;
            for (int l = 0; l < 4; ++l, q8 += 8) {
                // each grid entry supplies 4 magnitudes; two entries cover 8 values
                const uint8_t * grid_lo = (const uint8_t *)(iq3xxs_grid + q3[2*l+0]);
                const uint8_t * grid_hi = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]);
                const uint8_t   signs   = ksigns_iq2xs[(meta >> 7*l) & 127];
                for (int j = 0; j < 8; ++j) {
                    const int mag  = j < 4 ? grid_lo[j] : grid_hi[j - 4];
                    const int prod = mag * q8[j];
                    group_acc += (signs & kmask_iq2xs[j]) ? -prod : prod;
                }
            }
            block_acc += group_acc * ls;
        }
        sumf += d * block_acc;
    }
    *s = 0.25f * sumf;
}
1042
1043
0
// Scalar reference dot product between an iq3_s row (vx) and a q8_K row (vy).
//
// n   - number of elements; must be a multiple of QK_K (asserted)
// s   - receives the resulting dot product
// nrc - must be 1 (asserted)
// bs, bx, by are not used by this implementation.
void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_iq3_s * GGML_RESTRICT x = vx;
    const block_q8_K  * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

    float sumf = 0.f;
    for (int i = 0; i < nb; ++i) {
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint8_t * GGML_RESTRICT qs    = x[i].qs;
        const uint8_t * GGML_RESTRICT qh    = x[i].qh;
        const uint8_t * GGML_RESTRICT signs = x[i].signs;
        const int8_t  * GGML_RESTRICT q8    = y[i].qs;

        int32_t block_acc = 0;
        // Groups of 32 values are handled in pairs because one scale entry
        // packs two scales: low nibble for the even group, high nibble for the odd one.
        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
            for (int half = 0; half < 2; ++half) {
                // kept unsigned so the multiply below uses the same arithmetic as before
                const uint32_t ls = 2*(half ? (x[i].scales[ib32/2] >> 4)
                                            : (x[i].scales[ib32/2] & 0xf)) + 1;
                const uint8_t high = qh[ib32 + half];

                int32_t group_acc = 0;
                for (int l = 0; l < 4; ++l, q8 += 8) {
                    // bit 8 of each grid index is taken from successive bits of `high`
                    const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((high << (8-2*l)) & 256)));
                    const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((high << (7-2*l)) & 256)));
                    for (int j = 0; j < 8; ++j) {
                        const int mag  = j < 4 ? grid1[j] : grid2[j - 4];
                        const int prod = mag * q8[j];
                        group_acc += (signs[l] & kmask_iq2xs[j]) ? -prod : prod;
                    }
                }
                qs    += 8;
                signs += 4;
                block_acc += group_acc * ls;
            }
        }
        sumf += d * block_acc;
    }
    *s = sumf;
}
1098
1099
0
// Scalar reference dot product between an iq1_s row (vx) and a q8_K row (vy).
//
// n   - number of elements; must be a multiple of QK_K (asserted)
// s   - receives the resulting dot product
// nrc - must be 1 (asserted)
// bs, bx, by are not used by this implementation.
void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_iq1_s * GGML_RESTRICT x = vx;
    const block_q8_K  * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

    float sumf = 0;
    for (int i = 0; i < nb; i++) {

        const int8_t   * q8 = y[i].qs;
        const uint8_t  * qs = x[i].qs;
        const uint16_t * qh = x[i].qh;

        // grid_acc: scaled sum of grid * q8 products
        // bias_acc: scaled, signed sum of the q8 block sums (weighted by IQ1S_DELTA below)
        int grid_acc = 0, bias_acc = 0;
        for (int ib = 0; ib < QK_K/32; ++ib, qs += 4) {
            // qh[ib] layout as used here: bits 0..11 = four 3-bit high index
            // parts, bits 12..14 = scale, bit 15 = sign of the delta term
            const uint16_t hbits = qh[ib];
            const int ls = 2*((hbits >> 12) & 7) + 1;

            int signed_ls = ls;
            if (hbits & 0x8000) {
                signed_ls = -ls;
            }

            int lsum = 0;
            for (int l = 0; l < 4; ++l, q8 += 8) {
                const int hi3 = (hbits >> 3*l) & 7;
                const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (hi3 << 8)));
                for (int j = 0; j < 8; ++j) {
                    lsum += q8[j] * grid[j];
                }
            }

            grid_acc += ls * lsum;
            bias_acc += signed_ls * (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]);
        }

        sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (grid_acc + IQ1S_DELTA * bias_acc);
    }

    *s = sumf;
}
1141
1142
0
// Scalar reference dot product between an iq1_m row (vx) and a q8_K row (vy).
//
// n   - number of elements; must be a multiple of QK_K (asserted)
// s   - receives the resulting dot product
// nrc - must be 1 (asserted)
// bs, bx, by are not used by this implementation.
void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_iq1_m * GGML_RESTRICT x = vx;
    const block_q8_K  * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

    iq1m_scale_t scale;

    int sum1[2], sum2[2], delta[4];

    float sumf = 0;
    for (int i = 0; i < nb; i++) {

        const int8_t   * q8 = y[i].qs;
        const uint8_t  * qs = x[i].qs;
        const uint8_t  * qh = x[i].qh;

        // Load the four 16-bit scale words with memcpy instead of casting the
        // scales pointer to uint16_t*: the cast reads the storage through an
        // incompatible lvalue type and may be misaligned. The same 8 bytes are
        // read, in the same (native) byte order.
        uint16_t sc[4];
        memcpy(sc, x[i].scales, sizeof(sc));

        // the top nibble of each scale word contributes 4 bits of the 16-bit block scale
        scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);

        int sumi1 = 0, sumi2 = 0;
        for (int ib = 0; ib < QK_K/32; ++ib) {
            // bit 3 / bit 7 of each qh byte select the sign of the delta term
            // for the corresponding group of 8 values
            delta[0] = qh[0] & 0x08 ? -1 : 1;
            delta[1] = qh[0] & 0x80 ? -1 : 1;
            delta[2] = qh[1] & 0x08 ? -1 : 1;
            delta[3] = qh[1] & 0x80 ? -1 : 1;
            sum1[0] = sum1[1] = sum2[0] = sum2[1] = 0;
            for (int l = 0; l < 4; ++l) {
                // 11-bit grid index: low 8 bits from qs, high 3 bits from a qh nibble
                const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((uint16_t)qh[l/2] << (8 - 4*(l%2))) & 0x700)));
                int lsum1 = 0, lsum2 = 0;
                for (int j = 0; j < 8; ++j) {
                    lsum1 += q8[j] * grid[j];
                    lsum2 += q8[j];
                }
                q8 += 8;
                sum1[l/2] += lsum1;
                sum2[l/2] += lsum2*delta[l];
            }

            // two 3-bit sub-scales per group of 32, packed 6 bits per group
            const int ls1 = 2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1;
            const int ls2 = 2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1;

            sumi1 += sum1[0] * ls1 + sum1[1] * ls2;
            sumi2 += sum2[0] * ls1 + sum2[1] * ls2;
            qs += 4;
            qh += 2;
        }

        sumf += GGML_CPU_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
    }

    *s = sumf;
}
1202
1203
0
// Scalar reference dot product between an iq4_nl row (vx) and a q8_0 row (vy).
//
// n   - number of elements; must be a multiple of QK4_NL (asserted)
// s   - receives the resulting dot product
// nrc - must be 1 (asserted)
// bs, bx, by are not used by this implementation.
void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    assert(n % QK4_NL == 0);
    static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same");

    const block_iq4_nl * GGML_RESTRICT x = vx;
    const block_q8_0   * GGML_RESTRICT y = vy;

    const int nb = n / QK4_NL;

    float sumf = 0;
    for (int ib = 0; ib < nb; ++ib) {
        const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);

        // Each packed byte holds two 4-bit indices into kvalues_iq4nl: the low
        // nibble pairs with the first half of the q8 block, the high nibble
        // with the second half.
        int sumi = 0;
        for (int j = 0; j < QK4_NL/2; ++j) {
            sumi += y[ib].qs[j           ] * kvalues_iq4nl[x[ib].qs[j] & 0xf]
                  + y[ib].qs[j + QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >>  4];
        }
        sumf += d * sumi;
    }
    *s = sumf;
}
1231
1232
0
// Scalar reference dot product between an iq4_xs row (vx) and a q8_K row (vy).
//
// n   - number of elements; must be a multiple of QK_K (asserted)
// s   - receives the resulting dot product
// nrc - must be 1 (asserted)
// bs, bx, by are not used by this implementation.
void ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    assert(n % QK_K == 0);

    const block_iq4_xs * GGML_RESTRICT x = vx;
    const block_q8_K   * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

    float sumf = 0;
    for (int ibl = 0; ibl < nb; ++ibl) {
        const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
        uint16_t h = x[ibl].scales_h;
        const uint8_t * qs = x[ibl].qs;
        const int8_t  * q8 = y[ibl].qs;

        // Groups of 32 values are processed in pairs: one scales_l byte holds
        // the low 4 scale bits of both groups, and each pair consumes 4 bits
        // of `h` (2 high scale bits per group).
        for (int ib = 0; ib < QK_K/32; ib += 2) {
            for (int half = 0; half < 2; ++half) {
                const uint8_t ls = (half ? (x[ibl].scales_l[ib/2] >> 4)
                                         : (x[ibl].scales_l[ib/2] & 0xf))
                                 | (((h >> 2*half) & 3) << 4);
                // the 6-bit scale is stored with an offset of 32
                const float dl = d4d8*(ls - 32);

                int lo_acc = 0, hi_acc = 0;
                for (int j = 0; j < 16; ++j) {
                    lo_acc += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
                    hi_acc += q8[j+16] * kvalues_iq4nl[qs[j] >>  4];
                }
                sumf += dl * (lo_acc + hi_acc);

                qs += 16;
                q8 += 32;
            }
            h >>= 4;
        }
    }
    *s = sumf;
}
1277
1278
// ============================ 4-bit non-linear quants
1279
1280
0
// Thin wrapper: checks that k is a multiple of QK4_NL (assert) and forwards
// x, y and k unchanged to quantize_row_iq4_nl_ref.
void quantize_row_iq4_nl(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
    assert((k % QK4_NL) == 0);

    quantize_row_iq4_nl_ref(x, y, k);
}
1284
1285
0
// Thin wrapper: checks that k is a multiple of QK_K (assert) and forwards to
// quantize_iq4_xs with the extra arguments fixed to 1 and NULL.
// NOTE(review): presumably these mean "one row" and "no importance matrix" -
// confirm against the quantize_iq4_xs declaration.
void quantize_row_iq4_xs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
    assert((k % QK_K) == 0);

    quantize_iq4_xs(x, y, 1, k, NULL);
}