Coverage Report

Created: 2026-01-13 06:15

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/llama.cpp/ggml/src/ggml-cpu/quants.c
Line
Count
Source
1
#define GGML_COMMON_IMPL_C
2
#include "ggml-common.h"
3
4
#include "ggml-cpu-impl.h"
5
#include "simd-mappings.h"
6
#include "ggml-quants.h"
7
#include "quants.h"
8
9
#include "arch-fallback.h"
10
11
#include <string.h>
12
#include <assert.h>
13
#include <float.h>
14
#include <stdlib.h> // for qsort
15
#include <stdio.h>  // for GGML_ASSERT
16
17
#define GROUP_MAX_EPS 1e-15f
18
#define GROUP_MAX_EPS_IQ3_XXS 1e-8f
19
#define GROUP_MAX_EPS_IQ2_S 1e-8f
20
#define GROUP_MAX_EPS_IQ1_M 1e-7f
21
#define GROUP_MAX_EPS_IQ1_S 1e-12f
22
23
0
#define UNUSED GGML_UNUSED
24
25
0
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
26
0
    quantize_row_q4_0_ref(x, y, k);
27
0
}
28
29
0
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
30
0
    quantize_row_q4_1_ref(x, y, k);
31
0
}
32
33
0
void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
34
0
    quantize_row_q5_0_ref(x, y, k);
35
0
}
36
37
0
void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
38
0
    quantize_row_q5_1_ref(x, y, k);
39
0
}
40
41
0
void quantize_row_q8_0_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
42
0
    quantize_row_q8_0_ref(x, y, k);
43
0
}
44
45
0
void quantize_row_q8_1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
46
0
    quantize_row_q8_1_ref(x, y, k);
47
0
}
48
49
0
void quantize_row_mxfp4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
50
0
    quantize_row_mxfp4_ref(x, y, k);
51
0
}
52
53
//
54
// 2-6 bit quantization in super-blocks
55
//
56
57
//========================- 2-bit (de)-quantization
58
59
0
void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
60
0
    quantize_row_q2_K_ref(x, vy, k);
61
0
}
62
63
//========================= 3-bit (de)-quantization
64
65
0
void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
66
0
    quantize_row_q3_K_ref(x, vy, k);
67
0
}
68
69
// ====================== 4-bit (de)-quantization
70
71
0
void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
72
0
    assert(k % QK_K == 0);
73
0
    block_q4_K * GGML_RESTRICT y = vy;
74
0
    quantize_row_q4_K_ref(x, y, k);
75
0
}
76
77
// ====================== 5-bit (de)-quantization
78
79
0
void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
80
0
    assert(k % QK_K == 0);
81
0
    block_q5_K * GGML_RESTRICT y = vy;
82
0
    quantize_row_q5_K_ref(x, y, k);
83
0
}
84
85
// ====================== 6-bit (de)-quantization
86
87
0
void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
88
0
    assert(k % QK_K == 0);
89
0
    block_q6_K * GGML_RESTRICT y = vy;
90
0
    quantize_row_q6_K_ref(x, y, k);
91
0
}
92
93
// ====================== Ternary (de)-quantization (BitNet b1.58 and TriLMs)
94
95
0
void quantize_row_tq1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
96
0
    assert(k % QK_K == 0);
97
0
    block_tq1_0 * GGML_RESTRICT y = vy;
98
0
    quantize_row_tq1_0_ref(x, y, k);
99
0
}
100
101
0
void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
102
0
    assert(k % QK_K == 0);
103
0
    block_tq2_0 * GGML_RESTRICT y = vy;
104
0
    quantize_row_tq2_0_ref(x, y, k);
105
0
}
106
107
//===================================== Q8_K ==============================================
108
109
0
void quantize_row_q8_K_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
110
0
    quantize_row_q8_K_ref(x, y, k);
111
0
}
112
113
//===================================== Dot products =================================
114
115
0
void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
116
0
    const int qk = QK8_0;
117
0
    const int nb = n / qk;
118
119
0
    assert(n % qk == 0);
120
0
    assert(nrc == 1);
121
0
    UNUSED(nrc);
122
0
    UNUSED(bx);
123
0
    UNUSED(by);
124
0
    UNUSED(bs);
125
126
0
    const block_q4_0 * GGML_RESTRICT x = vx;
127
0
    const block_q8_0 * GGML_RESTRICT y = vy;
128
129
0
    int ib = 0;
130
0
    float sumf = 0;
131
132
0
    for (; ib < nb; ++ib) {
133
0
        int sumi0 = 0;
134
0
        int sumi1 = 0;
135
136
0
        for (int j = 0; j < qk/2; ++j) {
137
0
            const int v0 = (x[ib].qs[j] & 0x0F) - 8;
138
0
            const int v1 = (x[ib].qs[j] >>   4) - 8;
139
140
0
            sumi0 += (v0 * y[ib].qs[j]);
141
0
            sumi1 += (v1 * y[ib].qs[j + qk/2]);
142
0
        }
143
144
0
        int sumi = sumi0 + sumi1;
145
0
        sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
146
0
    }
147
148
0
    *s = sumf;
149
0
}
150
151
// TODO: add WASM SIMD
152
0
void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
153
0
    const int qk = QK8_1;
154
0
    const int nb = n / qk;
155
156
0
    assert(n % qk == 0);
157
0
    assert(nrc == 1);
158
0
    UNUSED(nrc);
159
0
    UNUSED(bx);
160
0
    UNUSED(by);
161
0
    UNUSED(bs);
162
163
0
    const block_q4_1 * GGML_RESTRICT x = vx;
164
0
    const block_q8_1 * GGML_RESTRICT y = vy;
165
166
0
    int ib = 0;
167
0
    float sumf = 0;
168
169
0
    for (; ib < nb; ++ib) {
170
0
        int sumi0 = 0;
171
0
        int sumi1 = 0;
172
173
0
        for (int j = 0; j < qk/2; ++j) {
174
0
            const int v0 = (x[ib].qs[j] & 0x0F);
175
0
            const int v1 = (x[ib].qs[j] >>   4);
176
177
0
            sumi0 += (v0 * y[ib].qs[j]);
178
0
            sumi1 += (v1 * y[ib].qs[j + qk/2]);
179
0
        }
180
181
0
        int sumi = sumi0 + sumi1;
182
0
        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
183
0
    }
184
185
0
    *s = sumf;
186
0
}
187
188
0
void ggml_vec_dot_mxfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
189
0
    assert(nrc == 1);
190
0
    UNUSED(nrc);
191
0
    UNUSED(bx);
192
0
    UNUSED(by);
193
0
    UNUSED(bs);
194
0
    assert(n % QK_MXFP4 == 0);
195
0
    static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same");
196
197
0
    const block_mxfp4 * GGML_RESTRICT x = vx;
198
0
    const block_q8_0 * GGML_RESTRICT y = vy;
199
200
0
    const int nb = n / QK_MXFP4;
201
202
0
    int ib = 0;
203
0
    float sumf = 0;
204
205
0
    for (; ib < nb; ++ib) {
206
0
        const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_E8M0_TO_FP32_HALF(x[ib].e);
207
208
0
        int sumi1 = 0;
209
0
        int sumi2 = 0;
210
0
        for (int j = 0; j < QK_MXFP4/2; ++j) {
211
0
            sumi1 += y[ib].qs[j +          0] * kvalues_mxfp4[x[ib].qs[j] & 0xf];
212
0
            sumi2 += y[ib].qs[j + QK_MXFP4/2] * kvalues_mxfp4[x[ib].qs[j] >>  4];
213
0
        }
214
0
        sumf += d * (sumi1 + sumi2);
215
0
    }
216
0
    *s = sumf;
217
0
}
218
219
0
void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
220
0
    const int qk = QK8_0;
221
0
    const int nb = n / qk;
222
223
0
    int ib = 0;
224
0
    float sumf = 0;
225
226
0
    assert(n % qk == 0);
227
0
    assert(qk == QK5_0);
228
0
    assert(nrc == 1);
229
0
    UNUSED(nrc);
230
0
    UNUSED(bx);
231
0
    UNUSED(by);
232
0
    UNUSED(bs);
233
234
0
    const block_q5_0 * GGML_RESTRICT x = vx;
235
0
    const block_q8_0 * GGML_RESTRICT y = vy;
236
237
0
    for (; ib < nb; ++ib) {
238
0
        uint32_t qh;
239
0
        memcpy(&qh, x[ib].qh, sizeof(qh));
240
241
0
        int sumi0 = 0;
242
0
        int sumi1 = 0;
243
244
0
        for (int j = 0; j < qk/2; ++j) {
245
0
            const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
246
0
            const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));
247
248
0
            const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16);
249
0
            const int32_t x1 = (int8_t)(((x[ib].qs[j] >>   4) | xh_1) - 16);
250
251
0
            sumi0 += (x0 * y[ib].qs[j]);
252
0
            sumi1 += (x1 * y[ib].qs[j + qk/2]);
253
0
        }
254
255
0
        int sumi = sumi0 + sumi1;
256
0
        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
257
0
    }
258
259
0
    *s = sumf;
260
0
}
261
262
0
void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
263
0
    const int qk = QK8_1;
264
0
    const int nb = n / qk;
265
266
0
    int ib = 0;
267
0
    float sumf = 0;
268
269
0
    assert(n % qk == 0);
270
0
    assert(qk == QK5_1);
271
0
    assert(nrc == 1);
272
0
    UNUSED(nrc);
273
0
    UNUSED(bx);
274
0
    UNUSED(by);
275
0
    UNUSED(bs);
276
277
0
    const block_q5_1 * GGML_RESTRICT x = vx;
278
0
    const block_q8_1 * GGML_RESTRICT y = vy;
279
280
0
    for (; ib < nb; ++ib) {
281
0
        uint32_t qh;
282
0
        memcpy(&qh, x[ib].qh, sizeof(qh));
283
284
0
        int sumi0 = 0;
285
0
        int sumi1 = 0;
286
287
0
        for (int j = 0; j < qk/2; ++j) {
288
0
            const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10;
289
0
            const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10;
290
291
0
            const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0;
292
0
            const int32_t x1 = (x[ib].qs[j] >>  4) | xh_1;
293
294
0
            sumi0 += (x0 * y[ib].qs[j]);
295
0
            sumi1 += (x1 * y[ib].qs[j + qk/2]);
296
0
        }
297
298
0
        int sumi = sumi0 + sumi1;
299
0
        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
300
0
    }
301
302
0
    *s = sumf;
303
0
}
304
305
0
void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
306
0
    const int qk = QK8_0;
307
0
    const int nb = n / qk;
308
309
0
    assert(n % qk == 0);
310
0
    assert(nrc == 1);
311
0
    UNUSED(nrc);
312
0
    UNUSED(bx);
313
0
    UNUSED(by);
314
0
    UNUSED(bs);
315
316
0
    const block_q8_0 * GGML_RESTRICT x = vx;
317
0
    const block_q8_0 * GGML_RESTRICT y = vy;
318
319
0
    int ib = 0;
320
0
    float sumf = 0;
321
322
0
    for (; ib < nb; ++ib) {
323
0
        int sumi = 0;
324
325
0
        for (int j = 0; j < qk; j++) {
326
0
            sumi += x[ib].qs[j]*y[ib].qs[j];
327
0
        }
328
329
0
        sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
330
0
    }
331
332
0
    *s = sumf;
333
0
}
334
335
0
void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
336
0
    assert(nrc == 1);
337
0
    UNUSED(nrc);
338
0
    UNUSED(bx);
339
0
    UNUSED(by);
340
0
    UNUSED(bs);
341
342
0
    const block_tq1_0 * GGML_RESTRICT x = vx;
343
0
    const block_q8_K  * GGML_RESTRICT y = vy;
344
345
0
    const int nb = n / QK_K;
346
347
0
    const uint8_t pow3[6] = {1, 3, 9, 27, 81, 243};
348
349
0
    float sumf = 0.0f;
350
351
0
    for (int i = 0; i < nb; ++i) {
352
0
        int sum = 0;
353
354
0
        for (size_t j = 0; j < sizeof(x->qs) - sizeof(x->qs) % 32; j += 32) {
355
0
            for (size_t l = 0; l < 5; ++l) {
356
0
                for (size_t m = 0; m < 32; ++m) {
357
0
                    uint8_t q = x[i].qs[j + m] * pow3[l];
358
0
                    uint16_t xi = ((uint16_t) q * 3) >> 8;
359
0
                    sum += (xi - 1) * y[i].qs[j*5 + l*32 + m];
360
0
                }
361
0
            }
362
0
        }
363
0
        for (size_t j = sizeof(x->qs) - sizeof(x->qs) % 32; j < sizeof(x->qs); j += 16) {
364
0
            for (size_t l = 0; l < 5; ++l) {
365
0
                for (size_t m = 0; m < 16; ++m) {
366
0
                    uint8_t q = x[i].qs[j + m] * pow3[l];
367
0
                    uint16_t xi = ((uint16_t) q * 3) >> 8;
368
0
                    sum += (xi - 1) * y[i].qs[j*5 + l*16 + m];
369
0
                }
370
0
            }
371
0
        }
372
373
0
        for (size_t l = 0; l < 4; ++l) {
374
0
            for (size_t j = 0; j < sizeof(x->qh); ++j) {
375
0
                uint8_t q = x[i].qh[j] * pow3[l];
376
0
                uint16_t xi = ((uint16_t) q * 3) >> 8;
377
0
                sum += (xi - 1) * y[i].qs[sizeof(x->qs)*5 + l*sizeof(x->qh) + j];
378
0
            }
379
0
        }
380
381
0
        sumf += (float) sum * (GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d);
382
0
    }
383
384
0
    *s = sumf;
385
0
}
386
387
0
void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
388
0
    assert(nrc == 1);
389
0
    UNUSED(nrc);
390
0
    UNUSED(bx);
391
0
    UNUSED(by);
392
0
    UNUSED(bs);
393
394
0
    const block_tq2_0 * GGML_RESTRICT x = vx;
395
0
    const block_q8_K  * GGML_RESTRICT y = vy;
396
397
0
    const int nb = n / QK_K;
398
0
    float sumf = 0.0f;
399
400
0
    for (int i = 0; i < nb; ++i) {
401
0
        int32_t sumi = 0;
402
403
0
        for (size_t j = 0; j < sizeof(x->qs); j += 32) {
404
0
            for (size_t l = 0; l < 4; ++l) {
405
0
                for (size_t k = 0; k < 32; ++k) {
406
0
                    sumi += y[i].qs[j*4 + l*32 + k] * (((x[i].qs[j + k] >> (l*2)) & 3) - 1);
407
0
                }
408
0
            }
409
0
        }
410
411
0
        const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
412
413
0
        sumf += (float) sumi * d;
414
0
    }
415
416
0
    *s = sumf;
417
0
}
418
419
0
void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
420
0
    assert(nrc == 1);
421
0
    UNUSED(nrc);
422
0
    UNUSED(bx);
423
0
    UNUSED(by);
424
0
    UNUSED(bs);
425
426
0
    const block_q2_K * GGML_RESTRICT x = vx;
427
0
    const block_q8_K * GGML_RESTRICT y = vy;
428
429
0
    const int nb = n / QK_K;
430
431
0
    float sumf = 0;
432
433
0
    for (int i = 0; i < nb; ++i) {
434
435
0
        const uint8_t * q2 = x[i].qs;
436
0
        const  int8_t * q8 = y[i].qs;
437
0
        const uint8_t * sc = x[i].scales;
438
439
0
        int summs = 0;
440
0
        for (int j = 0; j < 16; ++j) {
441
0
            summs += y[i].bsums[j] * (sc[j] >> 4);
442
0
        }
443
444
0
        const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
445
0
        const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
446
447
0
        int isum = 0;
448
0
        int is = 0;
449
0
        int d;
450
0
        for (int k = 0; k < QK_K/128; ++k) {
451
0
            int shift = 0;
452
0
            for (int j = 0; j < 4; ++j) {
453
0
                d = sc[is++] & 0xF;
454
0
                int isuml = 0;
455
0
                for (int l =  0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
456
0
                isum += d * isuml;
457
0
                d = sc[is++] & 0xF;
458
0
                isuml = 0;
459
0
                for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
460
0
                isum += d * isuml;
461
0
                shift += 2;
462
0
                q8 += 32;
463
0
            }
464
0
            q2 += 32;
465
0
        }
466
0
        sumf += dall * isum - dmin * summs;
467
0
    }
468
0
    *s = sumf;
469
0
}
470
471
0
void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
472
0
    assert(n % QK_K == 0);
473
0
    assert(nrc == 1);
474
0
    UNUSED(nrc);
475
0
    UNUSED(bx);
476
0
    UNUSED(by);
477
0
    UNUSED(bs);
478
479
0
    const uint32_t kmask1 = 0x03030303;
480
0
    const uint32_t kmask2 = 0x0f0f0f0f;
481
482
0
    const block_q3_K * GGML_RESTRICT x = vx;
483
0
    const block_q8_K * GGML_RESTRICT y = vy;
484
485
0
    const int nb = n / QK_K;
486
487
    // scalar version
488
    // This function is written like this so the compiler can manage to vectorize most of it
489
    // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
490
    // manually vectorized version above. Every other version I tried would run at least 4 times slower.
491
    // The ideal situation would be if we could just write the code once, and the compiler would
492
    // automatically produce the best possible set of machine instructions, instead of us having to manually
493
    // write vectorized versions for AVX, ARM_NEON, etc.
494
495
0
    int8_t  aux8[QK_K];
496
0
    int16_t aux16[8];
497
0
    float   sums [8];
498
0
    int32_t aux32[8];
499
0
    memset(sums, 0, 8*sizeof(float));
500
501
0
    uint32_t auxs[4];
502
0
    const int8_t * scales = (const int8_t*)auxs;
503
504
0
    float sumf = 0;
505
0
    for (int i = 0; i < nb; ++i) {
506
0
        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
507
0
        const uint8_t * GGML_RESTRICT hm = x[i].hmask;
508
0
        const  int8_t * GGML_RESTRICT q8 = y[i].qs;
509
0
        memset(aux32, 0, 8*sizeof(int32_t));
510
0
        int8_t * GGML_RESTRICT a = aux8;
511
0
        uint8_t m = 1;
512
0
        for (int j = 0; j < QK_K; j += 128) {
513
0
            for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
514
0
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
515
0
            a += 32; m <<= 1;
516
0
            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
517
0
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
518
0
            a += 32; m <<= 1;
519
0
            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
520
0
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
521
0
            a += 32; m <<= 1;
522
0
            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
523
0
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
524
0
            a += 32; m <<= 1;
525
0
            q3 += 32;
526
0
        }
527
0
        a = aux8;
528
529
0
        memcpy(auxs, x[i].scales, 12);
530
0
        uint32_t tmp = auxs[2];
531
0
        auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
532
0
        auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
533
0
        auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
534
0
        auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
535
0
        for (int j = 0; j < QK_K/16; ++j) {
536
0
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
537
0
            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
538
0
            q8 += 8; a += 8;
539
0
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
540
0
            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
541
0
            q8 += 8; a += 8;
542
0
        }
543
0
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
544
0
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
545
0
    }
546
0
    for (int l = 0; l < 8; ++l) sumf += sums[l];
547
0
    *s = sumf;
548
0
}
549
550
0
void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
551
0
    assert(n % QK_K == 0);
552
0
    assert(nrc == 1);
553
0
    UNUSED(nrc);
554
0
    UNUSED(bx);
555
0
    UNUSED(by);
556
0
    UNUSED(bs);
557
558
0
    const block_q4_K * GGML_RESTRICT x = vx;
559
0
    const block_q8_K * GGML_RESTRICT y = vy;
560
561
0
    const int nb = n / QK_K;
562
563
0
    static const uint32_t kmask1 = 0x3f3f3f3f;
564
0
    static const uint32_t kmask2 = 0x0f0f0f0f;
565
0
    static const uint32_t kmask3 = 0x03030303;
566
567
0
    uint32_t utmp[4];
568
569
0
    const uint8_t * scales = (const uint8_t*)&utmp[0];
570
0
    const uint8_t * mins   = (const uint8_t*)&utmp[2];
571
572
0
    int8_t  aux8[QK_K];
573
0
    int16_t aux16[8];
574
0
    float   sums [8];
575
0
    int32_t aux32[8];
576
0
    memset(sums, 0, 8*sizeof(float));
577
578
0
    float sumf = 0;
579
0
    for (int i = 0; i < nb; ++i) {
580
0
        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
581
0
        const  int8_t * GGML_RESTRICT q8 = y[i].qs;
582
0
        memset(aux32, 0, 8*sizeof(int32_t));
583
0
        int8_t * GGML_RESTRICT a = aux8;
584
0
        for (int j = 0; j < QK_K/64; ++j) {
585
0
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
586
0
            a += 32;
587
0
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l]  >> 4);
588
0
            a += 32; q4 += 32;
589
0
        }
590
0
        memcpy(utmp, x[i].scales, 12);
591
0
        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
592
0
        const uint32_t uaux = utmp[1] & kmask1;
593
0
        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
594
0
        utmp[2] = uaux;
595
0
        utmp[0] &= kmask1;
596
597
0
        int sumi = 0;
598
0
        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
599
0
        a = aux8;
600
0
        int is = 0;
601
0
        for (int j = 0; j < QK_K/32; ++j) {
602
0
            int32_t scale = scales[is++];
603
0
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
604
0
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
605
0
            q8 += 8; a += 8;
606
0
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
607
0
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
608
0
            q8 += 8; a += 8;
609
0
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
610
0
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
611
0
            q8 += 8; a += 8;
612
0
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
613
0
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
614
0
            q8 += 8; a += 8;
615
0
        }
616
0
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
617
0
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
618
0
        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
619
0
        sumf -= dmin * sumi;
620
0
    }
621
0
    for (int l = 0; l < 8; ++l) sumf += sums[l];
622
0
    *s = sumf;
623
0
}
624
625
0
void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy,  size_t by, int nrc) {
626
0
    assert(n % QK_K == 0);
627
0
    assert(nrc == 1);
628
0
    UNUSED(nrc);
629
0
    UNUSED(bx);
630
0
    UNUSED(by);
631
0
    UNUSED(bs);
632
633
0
    const block_q5_K * GGML_RESTRICT x = vx;
634
0
    const block_q8_K * GGML_RESTRICT y = vy;
635
636
0
    const int nb = n / QK_K;
637
638
0
    static const uint32_t kmask1 = 0x3f3f3f3f;
639
0
    static const uint32_t kmask2 = 0x0f0f0f0f;
640
0
    static const uint32_t kmask3 = 0x03030303;
641
642
0
    uint32_t utmp[4];
643
644
0
    const uint8_t * scales = (const uint8_t*)&utmp[0];
645
0
    const uint8_t * mins   = (const uint8_t*)&utmp[2];
646
647
0
    int8_t  aux8[QK_K];
648
0
    int16_t aux16[8];
649
0
    float   sums [8];
650
0
    int32_t aux32[8];
651
0
    memset(sums, 0, 8*sizeof(float));
652
653
0
    float sumf = 0;
654
0
    for (int i = 0; i < nb; ++i) {
655
0
        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
656
0
        const uint8_t * GGML_RESTRICT hm = x[i].qh;
657
0
        const  int8_t * GGML_RESTRICT q8 = y[i].qs;
658
0
        memset(aux32, 0, 8*sizeof(int32_t));
659
0
        int8_t * GGML_RESTRICT a = aux8;
660
0
        uint8_t m = 1;
661
0
        for (int j = 0; j < QK_K/64; ++j) {
662
0
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
663
0
            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
664
0
            a += 32; m <<= 1;
665
0
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l]  >> 4);
666
0
            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
667
0
            a += 32; m <<= 1;
668
0
            q4 += 32;
669
0
        }
670
0
        memcpy(utmp, x[i].scales, 12);
671
0
        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
672
0
        const uint32_t uaux = utmp[1] & kmask1;
673
0
        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
674
0
        utmp[2] = uaux;
675
0
        utmp[0] &= kmask1;
676
677
0
        int sumi = 0;
678
0
        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
679
0
        a = aux8;
680
0
        int is = 0;
681
0
        for (int j = 0; j < QK_K/32; ++j) {
682
0
            int32_t scale = scales[is++];
683
0
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
684
0
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
685
0
            q8 += 8; a += 8;
686
0
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
687
0
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
688
0
            q8 += 8; a += 8;
689
0
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
690
0
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
691
0
            q8 += 8; a += 8;
692
0
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
693
0
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
694
0
            q8 += 8; a += 8;
695
0
        }
696
0
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
697
0
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
698
0
        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
699
0
        sumf -= dmin * sumi;
700
0
    }
701
0
    for (int l = 0; l < 8; ++l) sumf += sums[l];
702
0
    *s = sumf;
703
0
}
704
705
0
void ggml_vec_dot_q6_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
706
0
    assert(n % QK_K == 0);
707
0
    assert(nrc == 1);
708
0
    UNUSED(nrc);
709
0
    UNUSED(bx);
710
0
    UNUSED(by);
711
0
    UNUSED(bs);
712
713
0
    const block_q6_K * GGML_RESTRICT x = vx;
714
0
    const block_q8_K * GGML_RESTRICT y = vy;
715
716
0
    const int nb = n / QK_K;
717
718
0
    int8_t  aux8[QK_K];
719
0
    int16_t aux16[8];
720
0
    float   sums [8];
721
0
    int32_t aux32[8];
722
0
    memset(sums, 0, 8*sizeof(float));
723
724
0
    float sumf = 0;
725
0
    for (int i = 0; i < nb; ++i) {
726
0
        const uint8_t * GGML_RESTRICT q4 = x[i].ql;
727
0
        const uint8_t * GGML_RESTRICT qh = x[i].qh;
728
0
        const  int8_t * GGML_RESTRICT q8 = y[i].qs;
729
0
        memset(aux32, 0, 8*sizeof(int32_t));
730
0
        int8_t * GGML_RESTRICT a = aux8;
731
0
        for (int j = 0; j < QK_K; j += 128) {
732
0
            for (int l = 0; l < 32; ++l) {
733
0
                a[l +  0] = (int8_t)((q4[l +  0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
734
0
                a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
735
0
                a[l + 64] = (int8_t)((q4[l +  0] >>  4) | (((qh[l] >> 4) & 3) << 4)) - 32;
736
0
                a[l + 96] = (int8_t)((q4[l + 32] >>  4) | (((qh[l] >> 6) & 3) << 4)) - 32;
737
0
            }
738
0
            a  += 128;
739
0
            q4 += 64;
740
0
            qh += 32;
741
0
        }
742
0
        a = aux8;
743
0
        int is = 0;
744
0
        for (int j = 0; j < QK_K/16; ++j) {
745
0
            int scale = x[i].scales[is++];
746
0
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
747
0
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
748
0
            q8 += 8; a += 8;
749
0
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
750
0
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
751
0
            q8 += 8; a += 8;
752
0
        }
753
0
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
754
0
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
755
0
    }
756
0
    for (int l = 0; l < 8; ++l) sumf += sums[l];
757
0
    *s = sumf;
758
0
}
759
760
0
void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
761
0
    assert(n % QK_K == 0);
762
0
    assert(nrc == 1);
763
0
    UNUSED(nrc);
764
0
    UNUSED(bx);
765
0
    UNUSED(by);
766
0
    UNUSED(bs);
767
768
0
    const block_iq2_xxs * GGML_RESTRICT x = vx;
769
0
    const block_q8_K    * GGML_RESTRICT y = vy;
770
771
0
    const int nb = n / QK_K;
772
773
0
    uint32_t aux32[2];
774
0
    const uint8_t * aux8 = (const uint8_t *)aux32;
775
776
0
    float sumf = 0.f;
777
0
    for (int i = 0; i < nb; ++i) {
778
0
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
779
0
        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
780
0
        const int8_t   * GGML_RESTRICT q8 = y[i].qs;
781
0
        int32_t bsum = 0;
782
0
        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
783
0
            memcpy(aux32, q2, 2*sizeof(uint32_t));
784
0
            q2 += 4;
785
0
            const uint32_t ls = 2*(aux32[1] >> 28) + 1;
786
0
            int32_t sumi = 0;
787
0
            for (int l = 0; l < 4; ++l) {
788
0
                const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
789
0
                const uint8_t  signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127];
790
0
                for (int j = 0; j < 8; ++j) {
791
0
                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
792
0
                }
793
0
                q8 += 8;
794
0
            }
795
0
            bsum += sumi * ls;
796
0
        }
797
0
        sumf += d * bsum;
798
0
    }
799
0
    *s = 0.125f * sumf;
800
0
}
801
802
0
void ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
803
0
    assert(n % QK_K == 0);
804
0
    assert(nrc == 1);
805
0
    UNUSED(nrc);
806
0
    UNUSED(bx);
807
0
    UNUSED(by);
808
0
    UNUSED(bs);
809
810
0
    const block_iq2_xs * GGML_RESTRICT x = vx;
811
0
    const block_q8_K   * GGML_RESTRICT y = vy;
812
813
0
    const int nb = n / QK_K;
814
815
0
    float sumf = 0.f;
816
0
    for (int i = 0; i < nb; ++i) {
817
0
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
818
0
        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
819
0
        const uint8_t  * GGML_RESTRICT sc = x[i].scales;
820
0
        const int8_t   * GGML_RESTRICT q8 = y[i].qs;
821
0
        int32_t bsum = 0;
822
0
        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
823
0
            const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1;
824
0
            const uint16_t ls2 = 2*(sc[ib32] >>  4) + 1;
825
0
            int32_t sumi = 0;
826
0
            for (int l = 0; l < 2; ++l) {
827
0
                const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
828
0
                const uint8_t  signs = ksigns_iq2xs[q2[l] >> 9];
829
0
                for (int j = 0; j < 8; ++j) {
830
0
                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
831
0
                }
832
0
                q8 += 8;
833
0
            }
834
0
            bsum += sumi * ls1;
835
0
            sumi = 0;
836
0
            for (int l = 2; l < 4; ++l) {
837
0
                const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
838
0
                const uint8_t  signs = ksigns_iq2xs[q2[l] >> 9];
839
0
                for (int j = 0; j < 8; ++j) {
840
0
                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
841
0
                }
842
0
                q8 += 8;
843
0
            }
844
0
            bsum += sumi * ls2;
845
0
            q2 += 4;
846
0
        }
847
0
        sumf += d * bsum;
848
0
    }
849
0
    *s = 0.125f * sumf;
850
0
}
851
852
0
// Scalar reference dot product of one IQ2_S row with one Q8_K row.
// Grid indices are 8 bits in qs extended to 10 bits from qh; explicit
// per-group sign bytes follow the indices inside qs.
void ggml_vec_dot_iq2_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_iq2_s * GGML_RESTRICT x = vx;
    const block_q8_K  * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

    float sumf = 0;
    for (int i = 0; i < nb; i++) {

        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const int8_t  * q8 = y[i].qs;
        const uint8_t * qs = x[i].qs;
        const uint8_t * qh = x[i].qh;
        const uint8_t * signs = qs + QK_K/8;   // sign bytes live right after the grid indices

        int bsum = 0;
        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
            // two 4-bit scales per byte, remapped to odd values 1..31
            const int ls[2] = {
                1 + 2*(x[i].scales[ib32] & 0xf),
                1 + 2*(x[i].scales[ib32] >>  4),
            };
            int part[2] = { 0, 0 };
            for (int l = 0; l < 4; ++l) {
                // 8-bit index widened to 10 bits with two bits taken from qh
                const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
                int lsum = 0;
                for (int j = 0; j < 8; ++j) {
                    lsum += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
                }
                q8 += 8;
                part[l/2] += lsum;   // l = 0,1 -> first half scale; l = 2,3 -> second
            }
            bsum += ls[0] * part[0] + ls[1] * part[1];
            qs += 4;
            signs += 4;
        }

        sumf += d * bsum;
    }

    // 0.125f folds out the extra factor carried by the stored scales
    *s = 0.125f * sumf;
}
903
904
0
// Scalar reference dot product of one IQ3_XXS row with one Q8_K row.
// Each 32-value sub-block stores 8 grid-index bytes in qs, followed (after
// QK_K/4 bytes) by a packed 32-bit word holding 4x7 sign-selector bits and a
// 4-bit scale in the top nibble.
void ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_iq3_xxs * GGML_RESTRICT x = vx;
    const block_q8_K    * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

    uint32_t aux32;

    float sumf = 0.f;
    for (int i = 0; i < nb; ++i) {
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
        // scale+sign words are stored after the QK_K/4 grid-index bytes
        const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
        const int8_t  * GGML_RESTRICT q8 = y[i].qs;
        int32_t bsum = 0;
        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
            // gas may be unaligned: pull the 32-bit word out via memcpy
            memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t);
            // top 4 bits: sub-block scale, remapped to odd values 1..31
            const uint32_t ls = 2*(aux32 >> 28) + 1;
            int32_t sumi = 0;
            for (int l = 0; l < 4; ++l) {
                // each qs byte indexes a 4-value entry of the iq3xxs codebook
                const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*l+0]);
                const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]);
                // 7 sign-selector bits per group of 8, expanded via ksigns_iq2xs
                const uint8_t  signs = ksigns_iq2xs[(aux32 >> 7*l) & 127];
                for (int j = 0; j < 4; ++j) {
                    sumi += grid1[j] * q8[j+0] * (signs & kmask_iq2xs[j+0] ? -1 : 1);
                    sumi += grid2[j] * q8[j+4] * (signs & kmask_iq2xs[j+4] ? -1 : 1);
                }
                q8 += 8;
            }
            q3 += 8;
            bsum += sumi * ls;
        }
        sumf += d * bsum;
    }
    // 0.25f compensates for the scaling convention of the stored values
    // (NOTE(review): matches the factor used by the IQ3_XXS quantizer — confirm there)
    *s = 0.25f * sumf;
}
947
948
0
// Scalar reference dot product of one IQ3_S row with one Q8_K row.
// Processes two 32-value sub-blocks per outer iteration: one byte of
// x[i].scales holds both 4-bit scales, qh supplies the 9th bit of each
// grid index, and signs[] holds explicit per-8-value sign masks.
void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_iq3_s * GGML_RESTRICT x = vx;
    const block_q8_K  * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

    float sumf = 0.f;
    for (int i = 0; i < nb; ++i) {
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint8_t * GGML_RESTRICT qs = x[i].qs;
        const uint8_t * GGML_RESTRICT qh = x[i].qh;
        const uint8_t * GGML_RESTRICT signs = x[i].signs;
        const int8_t  * GGML_RESTRICT q8 = y[i].qs;
        int32_t bsum = 0;
        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
            // two 4-bit scales per byte, remapped to odd values 1..31
            const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
            const uint32_t ls2 = 2*(x[i].scales[ib32/2] >>  4) + 1;
            int32_t sumi = 0;
            // first sub-block: qh[ib32+0] provides the high index bit for each qs byte
            for (int l = 0; l < 4; ++l) {
                const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
                const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
                for (int j = 0; j < 4; ++j) {
                    sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
                    sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
                }
                q8 += 8;
            }
            qs += 8;
            signs += 4;
            bsum += sumi * ls1;
            sumi = 0;
            // second sub-block: same structure, high bits from qh[ib32+1]
            for (int l = 0; l < 4; ++l) {
                const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256)));
                const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256)));
                for (int j = 0; j < 4; ++j) {
                    sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
                    sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
                }
                q8 += 8;
            }
            qs += 8;
            signs += 4;
            bsum += sumi * ls2;
        }
        sumf += d * bsum;
    }
    *s = sumf;
}
1003
1004
0
// Scalar reference dot product of one IQ1_S row with one Q8_K row.
// qh packs, per 32-value sub-block: 3 index-extension bits for each of the
// 4 qs bytes, a 3-bit scale (bits 12..14) and the delta sign (bit 15).
void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_iq1_s * GGML_RESTRICT x = vx;
    const block_q8_K  * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

    float sumf = 0;
    for (int i = 0; i < nb; i++) {

        const int8_t   * q8 = y[i].qs;
        const uint8_t  * qs = x[i].qs;
        const uint16_t * qh = x[i].qh;

        int sumi_grid = 0, sumi_delta = 0;
        for (int ib = 0; ib < QK_K/32; ++ib) {
            const uint16_t h = qh[ib];
            const int ls    = 2*((h >> 12) & 7) + 1;   // 3-bit scale -> odd values 1..15
            const int delta = h & 0x8000 ? -1 : 1;     // sign applied to the shift term
            int lsum = 0;
            for (int l = 0; l < 4; ++l) {
                // 8-bit index widened to 11 bits with 3 bits from qh
                const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((h >> 3*l) & 7) << 8)));
                for (int j = 0; j < 8; ++j) {
                    lsum += q8[j] * grid[j];
                }
                q8 += 8;
            }
            sumi_grid  += ls * lsum;
            // the delta term only needs the q8 sums, which q8_K precomputes in bsums
            sumi_delta += ls * delta * (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]);
            qs += 4;
        }

        sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi_grid + IQ1S_DELTA * sumi_delta);
    }

    *s = sumf;
}
1046
1047
0
// Scalar reference dot product of one IQ1_M row with one Q8_K row.
// The per-super-block fp16 scale is scattered across the top nibbles of the
// four 16-bit scale words and reassembled below; each qh byte carries both
// index-extension bits (low nibbles) and per-8-value delta signs (bits 3/7).
void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_iq1_m * GGML_RESTRICT x = vx;
    const block_q8_K  * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

    iq1m_scale_t scale;

    // sum1: grid dot products; sum2: plain q8 sums (for the delta term);
    // delta: per-group-of-8 signs; indices [0]/[1] select ls1/ls2 below
    int sum1[2], sum2[2], delta[4];

    float sumf = 0;
    for (int i = 0; i < nb; i++) {

        const int8_t   * q8 = y[i].qs;
        const uint8_t  * qs = x[i].qs;
        const uint8_t  * qh = x[i].qh;
        const uint16_t * sc = (const uint16_t *)x[i].scales;

        // rebuild the fp16 super-block scale from the top nibble of each scale word
        scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);

        int sumi1 = 0, sumi2 = 0;
        for (int ib = 0; ib < QK_K/32; ++ib) {
            // delta sign for each group of 8 values: bits 3 and 7 of each qh byte
            delta[0] = qh[0] & 0x08 ? -1 : 1;
            delta[1] = qh[0] & 0x80 ? -1 : 1;
            delta[2] = qh[1] & 0x08 ? -1 : 1;
            delta[3] = qh[1] & 0x80 ? -1 : 1;
            sum1[0] = sum1[1] = sum2[0] = sum2[1] = 0;
            for (int l = 0; l < 4; ++l) {
                // 8-bit index widened to 11 bits with 3 bits from the matching qh nibble
                const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((uint16_t)qh[l/2] << (8 - 4*(l%2))) & 0x700)));
                int lsum1 = 0, lsum2 = 0;
                for (int j = 0; j < 8; ++j) {
                    lsum1 += q8[j] * grid[j];
                    lsum2 += q8[j];
                }
                q8 += 8;
                sum1[l/2] += lsum1;
                sum2[l/2] += lsum2*delta[l];
            }

            // two 3-bit sub-block scales per 6-bit field, remapped to odd 1..15
            const int ls1 = 2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1;
            const int ls2 = 2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1;

            sumi1 += sum1[0] * ls1 + sum1[1] * ls2;
            sumi2 += sum2[0] * ls1 + sum2[1] * ls2;
            qs += 4;
            qh += 2;
        }

        sumf += GGML_CPU_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
    }

    *s = sumf;
}
1107
1108
0
// Scalar reference dot product of one IQ4_NL row with one Q8_0 row.
// Each qs byte holds two 4-bit codes that index the non-linear kvalues table.
void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    assert(n % QK4_NL == 0);
    static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same");

    const block_iq4_nl * GGML_RESTRICT x = vx;
    const block_q8_0   * GGML_RESTRICT y = vy;

    const int nb = n / QK4_NL;

    float sumf = 0;

    for (int ib = 0; ib < nb; ++ib) {
        // combined scale of the two blocks
        const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
        int acc_lo = 0, acc_hi = 0;
        for (int j = 0; j < QK4_NL/2; ++j) {
            // low nibbles pair with the first half of q8, high nibbles with the second
            acc_lo += y[ib].qs[j+       0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
            acc_hi += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >>  4];
        }
        sumf += d * (acc_lo + acc_hi);
    }
    *s = sumf;
}
1136
1137
0
// Scalar reference dot product of one IQ4_XS row with one Q8_K row.
// Uses the same non-linear 4-bit codebook as IQ4_NL but with 6-bit
// sub-block scales split across scales_l (low nibble) and scales_h.
void ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    assert(n % QK_K == 0);

    const block_iq4_xs * GGML_RESTRICT x = vx;
    const block_q8_K   * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

    float sumf = 0;
    for (int ibl = 0; ibl < nb; ++ibl) {
        const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
        uint16_t h = x[ibl].scales_h;   // consumed 4 bits per pair of sub-blocks
        const uint8_t * qs = x[ibl].qs;
        const int8_t  * q8 = y[ibl].qs;
        for (int ib = 0; ib < QK_K/32; ib += 2) {
            // 6-bit scale: low nibble from scales_l, top two bits from scales_h
            const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30);
            const uint8_t ls2 = (x[ibl].scales_l[ib/2] >>  4) | ((h << 2) & 0x30);
            h >>= 4;
            // scales are stored with a bias of 32
            const float dl[2] = { d4d8*(ls1 - 32), d4d8*(ls2 - 32) };
            for (int half = 0; half < 2; ++half) {
                int lo = 0, hi = 0;
                for (int j = 0; j < 16; ++j) {
                    // low nibbles pair with the first 16 q8 values, high with the next 16
                    lo += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
                    hi += q8[j+16] * kvalues_iq4nl[qs[j] >>  4];
                }
                sumf += dl[half] * (lo + hi);
                qs += 16;
                q8 += 32;
            }
        }
    }
    *s = sumf;
}
1182
1183
// ============================ 4-bit non-linear quants
1184
1185
0
// Quantize a row of k floats to IQ4_NL.
// No vectorized fast path in the generic build: defer to the reference quantizer.
void quantize_row_iq4_nl(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
    // row length must be a whole number of IQ4_NL blocks
    assert(k % QK4_NL == 0);
    quantize_row_iq4_nl_ref(x, y, k);
}
1189
1190
0
// Quantize a row of k floats to IQ4_XS.
// Delegates to the generic quantizer for a single row with no importance matrix.
void quantize_row_iq4_xs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
    // row length must be a whole number of QK_K super-blocks
    assert(k % QK_K == 0);
    quantize_iq4_xs(x, y, 1, k, NULL);
}