Coverage Report

Created: 2026-01-18 06:10

/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c
Line | Count | Source
1
#define GGML_COMMON_IMPL_C
2
#include "ggml-common.h"
3
#include "ggml-quants.h"
4
#include "ggml-impl.h"
5
#include "ggml-cpu.h"
6
#include "simd-mappings.h"
7
8
#include "../../quants.h"
9
#include "../../ggml-cpu-impl.h"
10
11
#include <math.h>
12
#include <string.h>
13
#include <assert.h>
14
#include <stdlib.h> // for qsort
15
#include <stdio.h>  // for GGML_ASSERT
16
17
#define GROUP_MAX_EPS 1e-15f
18
#define GROUP_MAX_EPS_IQ3_XXS 1e-8f
19
#define GROUP_MAX_EPS_IQ2_S 1e-8f
20
#define GROUP_MAX_EPS_IQ1_M 1e-7f
21
#define GROUP_MAX_EPS_IQ1_S 1e-12f
22
23
0
#define UNUSED GGML_UNUSED
24
25
// some compilers don't provide _mm256_set_m128i, e.g. gcc 7
26
0
#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
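// NOTE (editor addition, not part of the instrumented source): MM256_SET_M128I(a, b) builds a
// 256-bit vector with `b` in the low 128 bits and `a` in the high 128 bits, i.e. it emulates
// _mm256_set_m128i(a, b) for compilers that lack that intrinsic.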
27
28
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
29
// multiply int8_t, add results pairwise twice
30
0
static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
31
0
    // Get absolute values of x vectors
32
0
    const __m128i ax = _mm_sign_epi8(x, x);
33
0
    // Sign the values of the y vectors
34
0
    const __m128i sy = _mm_sign_epi8(y, x);
35
0
    // Perform multiplication and create 16-bit values
36
0
    const __m128i dot = _mm_maddubs_epi16(ax, sy);
37
0
    const __m128i ones = _mm_set1_epi16(1);
38
0
    return _mm_madd_epi16(ones, dot);
39
0
}
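// Illustrative sketch (editor addition, hypothetical helper, not part of the instrumented
// source): the scalar equivalent of mul_sum_i8_pairs. Each of the four output lanes holds the
// sum of four signed int8 products; the abs/sign step above exists because _mm_maddubs_epi16
// requires an unsigned first operand, and |x| * (y * sign(x)) == x * y.
static inline void mul_sum_i8_pairs_scalar(const int8_t x[16], const int8_t y[16], int32_t out[4]) {
    for (int lane = 0; lane < 4; ++lane) {
        int32_t acc = 0;
        for (int k = 0; k < 4; ++k) {
            acc += (int32_t) x[4*lane + k] * (int32_t) y[4*lane + k];
        }
        out[lane] = acc; // matches _mm_madd_epi16(ones, dot), barring int16 saturation in maddubs
    }
}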
40
41
#if __AVX__ || __AVX2__ || __AVX512F__
42
// horizontally add 8 floats
43
0
static inline float hsum_float_8(const __m256 x) {
44
0
    __m128 res = _mm256_extractf128_ps(x, 1);
45
0
    res = _mm_add_ps(res, _mm256_castps256_ps128(x));
46
0
    res = _mm_add_ps(res, _mm_movehl_ps(res, res));
47
0
    res = _mm_add_ss(res, _mm_movehdup_ps(res));
48
0
    return _mm_cvtss_f32(res);
49
0
}
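// Illustrative sketch (editor addition, hypothetical helper): hsum_float_8 reduces a __m256 to
// the scalar sum of its eight lanes, equivalent (up to floating-point association order) to:
static inline float hsum_float_8_scalar(const float v[8]) {
    float sum = 0.0f;
    for (int i = 0; i < 8; ++i) {
        sum += v[i];
    }
    return sum;
}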
50
51
// horizontally add 8 int32_t
52
0
static inline int hsum_i32_8(const __m256i a) {
53
0
    const __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
54
0
    const __m128i hi64 = _mm_unpackhi_epi64(sum128, sum128);
55
0
    const __m128i sum64 = _mm_add_epi32(hi64, sum128);
56
0
    const __m128i hi32  = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1));
57
0
    return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
58
0
}
59
60
// horizontally add 4 int32_t
61
0
static inline int hsum_i32_4(const __m128i a) {
62
0
    const __m128i hi64 = _mm_unpackhi_epi64(a, a);
63
0
    const __m128i sum64 = _mm_add_epi32(hi64, a);
64
0
    const __m128i hi32  = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1));
65
0
    return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
66
0
}
67
68
#if defined(__AVX2__) || defined(__AVX512F__)
69
0
static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
70
0
    const __m256i ax = _mm256_sign_epi8(x, x);
71
0
    const __m256i sy = _mm256_sign_epi8(y, x);
72
0
    return _mm256_maddubs_epi16(ax, sy);
73
0
}
74
75
// spread 32 bits to 32 bytes { 0x00, 0xFF }
76
0
static inline __m256i bytes_from_bits_32(const uint8_t * x) {
77
0
    uint32_t x32;
78
0
    memcpy(&x32, x, sizeof(uint32_t));
79
0
    const __m256i shuf_mask = _mm256_set_epi64x(
80
0
            0x0303030303030303, 0x0202020202020202,
81
0
            0x0101010101010101, 0x0000000000000000);
82
0
    __m256i bytes = _mm256_shuffle_epi8(_mm256_set1_epi32(x32), shuf_mask);
83
0
    const __m256i bit_mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe);
84
0
    bytes = _mm256_or_si256(bytes, bit_mask);
85
0
    return _mm256_cmpeq_epi8(bytes, _mm256_set1_epi64x(-1));
86
0
}
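// Illustrative sketch (editor addition, hypothetical helper): what bytes_from_bits_32 computes,
// written as scalar code. Output byte i is 0xFF when bit i of the 32-bit input is set, 0x00 otherwise.
static inline void bytes_from_bits_32_scalar(const uint8_t * x, uint8_t out[32]) {
    uint32_t x32;
    memcpy(&x32, x, sizeof(uint32_t)); // <string.h> is already included above
    for (int i = 0; i < 32; ++i) {
        out[i] = (x32 >> i) & 1 ? 0xFF : 0x00;
    }
}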
87
88
// Unpack 32 4-bit fields into 32 bytes
89
// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
90
static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
91
0
{
92
0
    const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi);
93
0
    const __m256i bytes = MM256_SET_M128I(_mm_srli_epi16(tmp, 4), tmp);
94
0
    const __m256i lowMask = _mm256_set1_epi8( 0xF );
95
0
    return _mm256_and_si256(lowMask, bytes);
96
0
}
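// Illustrative sketch (editor addition, hypothetical helper): the scalar equivalent of
// bytes_from_nibbles_32. The 16 low nibbles land in output bytes 0..15 and the 16 high nibbles
// in bytes 16..31.
static inline void bytes_from_nibbles_32_scalar(const uint8_t * rsi, uint8_t out[32]) {
    for (int i = 0; i < 16; ++i) {
        out[i]      = rsi[i] & 0x0F; // low nibble
        out[16 + i] = rsi[i] >> 4;   // high nibble
    }
}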
97
98
// add int16_t pairwise and return as float vector
99
0
static inline __m256 sum_i16_pairs_float(const __m256i x) {
100
0
    const __m256i ones = _mm256_set1_epi16(1);
101
0
    const __m256i summed_pairs = _mm256_madd_epi16(ones, x);
102
0
    return _mm256_cvtepi32_ps(summed_pairs);
103
0
}
104
105
0
static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
106
#if defined(__AVX512VNNI__) && defined(__AVX512VL__)
107
    const __m256i zero = _mm256_setzero_si256();
108
    const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
109
    return _mm256_cvtepi32_ps(summed_pairs);
110
#elif defined(__AVXVNNI__)
111
    const __m256i zero = _mm256_setzero_si256();
112
    const __m256i summed_pairs = _mm256_dpbusd_avx_epi32(zero, ax, sy);
113
    return _mm256_cvtepi32_ps(summed_pairs);
114
#else
115
    // Perform multiplication and create 16-bit values
116
0
    const __m256i dot = _mm256_maddubs_epi16(ax, sy);
117
0
    return sum_i16_pairs_float(dot);
118
0
#endif
119
0
}
120
121
// multiply int8_t, add results pairwise twice and return as float vector
122
0
static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
123
#if __AVXVNNIINT8__
124
    const __m256i zero = _mm256_setzero_si256();
125
    const __m256i summed_pairs = _mm256_dpbssd_epi32(zero, x, y);
126
    return _mm256_cvtepi32_ps(summed_pairs);
127
#else
128
    // Get absolute values of x vectors
129
0
    const __m256i ax = _mm256_sign_epi8(x, x);
130
    // Sign the values of the y vectors
131
0
    const __m256i sy = _mm256_sign_epi8(y, x);
132
0
    return mul_sum_us8_pairs_float(ax, sy);
133
0
#endif
134
0
}
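// Illustrative sketch (editor addition, hypothetical helper): mul_sum_i8_pairs_float over 32
// signed int8 pairs produces eight float lanes, each holding the sum of four products.
static inline void mul_sum_i8_pairs_float_scalar(const int8_t x[32], const int8_t y[32], float out[8]) {
    for (int lane = 0; lane < 8; ++lane) {
        int32_t acc = 0;
        for (int k = 0; k < 4; ++k) {
            acc += (int32_t) x[4*lane + k] * (int32_t) y[4*lane + k];
        }
        out[lane] = (float) acc;
    }
}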
135
136
static inline __m128i packNibbles( __m256i bytes )
137
0
{
138
0
    // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
139
0
#if __AVX512F__
140
0
    const __m256i bytes_srli_4 = _mm256_srli_epi16(bytes, 4);   // 0000_0000_abcd_0000
141
0
    bytes = _mm256_or_si256(bytes, bytes_srli_4);               // 0000_abcd_abcd_efgh
142
0
    return _mm256_cvtepi16_epi8(bytes);                         // abcd_efgh
143
0
#else
144
0
    const __m256i lowByte = _mm256_set1_epi16( 0xFF );
145
0
    __m256i high = _mm256_andnot_si256( lowByte, bytes );
146
0
    __m256i low = _mm256_and_si256( lowByte, bytes );
147
0
    high = _mm256_srli_epi16( high, 4 );
148
0
    bytes = _mm256_or_si256( low, high );
149
0
150
0
    // Compress uint16_t lanes into bytes
151
0
    __m128i r0 = _mm256_castsi256_si128( bytes );
152
0
    __m128i r1 = _mm256_extracti128_si256( bytes, 1 );
153
0
    return _mm_packus_epi16( r0, r1 );
154
0
#endif
155
0
}
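// Illustrative sketch (editor addition, hypothetical helper): for in-range (nibble) inputs,
// each 16-bit lane of packNibbles' argument holds a value of the form 0000_abcd_0000_efgh and
// is compressed into one output byte abcd_efgh, in lane order.
static inline void packNibbles_scalar(const uint16_t lanes[16], uint8_t out[16]) {
    for (int i = 0; i < 16; ++i) {
        const uint8_t lo = (uint8_t) ( lanes[i]       & 0x0F); // 0000_efgh from the low byte
        const uint8_t hi = (uint8_t) ((lanes[i] >> 8) & 0x0F); // 0000_abcd from the high byte
        out[i] = (uint8_t) ((hi << 4) | lo);                   // abcd_efgh
    }
}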
156
#elif defined(__AVX__)
157
static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
158
{
159
    // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
160
    const __m128i lowByte = _mm_set1_epi16( 0xFF );
161
    __m128i high = _mm_andnot_si128( lowByte, bytes1 );
162
    __m128i low = _mm_and_si128( lowByte, bytes1 );
163
    high = _mm_srli_epi16( high, 4 );
164
    bytes1 = _mm_or_si128( low, high );
165
    high = _mm_andnot_si128( lowByte, bytes2 );
166
    low = _mm_and_si128( lowByte, bytes2 );
167
    high = _mm_srli_epi16( high, 4 );
168
    bytes2 = _mm_or_si128( low, high );
169
170
    return _mm_packus_epi16( bytes1, bytes2);
171
}
172
173
static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) {
174
    const __m128i ax = _mm_sign_epi8(x, x);
175
    const __m128i sy = _mm_sign_epi8(y, x);
176
    return _mm_maddubs_epi16(ax, sy);
177
}
178
179
// spread 32 bits to 32 bytes { 0x00, 0xFF }
180
static inline __m256i bytes_from_bits_32(const uint8_t * x) {
181
    uint32_t x32;
182
    memcpy(&x32, x, sizeof(uint32_t));
183
    const __m128i shuf_maskl = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000);
184
    const __m128i shuf_maskh = _mm_set_epi64x(0x0303030303030303, 0x0202020202020202);
185
    __m128i bytesl = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskl);
186
    __m128i bytesh = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskh);
187
    const __m128i bit_mask = _mm_set1_epi64x(0x7fbfdfeff7fbfdfe);
188
    bytesl = _mm_or_si128(bytesl, bit_mask);
189
    bytesh = _mm_or_si128(bytesh, bit_mask);
190
    bytesl = _mm_cmpeq_epi8(bytesl, _mm_set1_epi64x(-1));
191
    bytesh = _mm_cmpeq_epi8(bytesh, _mm_set1_epi64x(-1));
192
    return MM256_SET_M128I(bytesh, bytesl);
193
}
194
195
// Unpack 32 4-bit fields into 32 bytes
196
// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
197
static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
198
{
199
    // Load 16 bytes from memory
200
    __m128i tmpl = _mm_loadu_si128((const __m128i *)rsi);
201
    __m128i tmph = _mm_srli_epi16(tmpl, 4);
202
    const __m128i lowMask = _mm_set1_epi8(0xF);
203
    tmpl = _mm_and_si128(lowMask, tmpl);
204
    tmph = _mm_and_si128(lowMask, tmph);
205
    return MM256_SET_M128I(tmph, tmpl);
206
}
207
208
// add int16_t pairwise and return as float vector
209
static inline __m256 sum_i16_pairs_float(const __m128i xh, const __m128i xl) {
210
    const __m128i ones = _mm_set1_epi16(1);
211
    const __m128i summed_pairsl = _mm_madd_epi16(ones, xl);
212
    const __m128i summed_pairsh = _mm_madd_epi16(ones, xh);
213
    const __m256i summed_pairs = MM256_SET_M128I(summed_pairsh, summed_pairsl);
214
    return _mm256_cvtepi32_ps(summed_pairs);
215
}
216
217
static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
218
    const __m128i axl = _mm256_castsi256_si128(ax);
219
    const __m128i axh = _mm256_extractf128_si256(ax, 1);
220
    const __m128i syl = _mm256_castsi256_si128(sy);
221
    const __m128i syh = _mm256_extractf128_si256(sy, 1);
222
    // Perform multiplication and create 16-bit values
223
    const __m128i dotl = _mm_maddubs_epi16(axl, syl);
224
    const __m128i doth = _mm_maddubs_epi16(axh, syh);
225
    return sum_i16_pairs_float(doth, dotl);
226
}
227
228
// multiply int8_t, add results pairwise twice and return as float vector
229
static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
230
    const __m128i xl = _mm256_castsi256_si128(x);
231
    const __m128i xh = _mm256_extractf128_si256(x, 1);
232
    const __m128i yl = _mm256_castsi256_si128(y);
233
    const __m128i yh = _mm256_extractf128_si256(y, 1);
234
    // Get absolute values of x vectors
235
    const __m128i axl = _mm_sign_epi8(xl, xl);
236
    const __m128i axh = _mm_sign_epi8(xh, xh);
237
    // Sign the values of the y vectors
238
    const __m128i syl = _mm_sign_epi8(yl, xl);
239
    const __m128i syh = _mm_sign_epi8(yh, xh);
240
    // Perform multiplication and create 16-bit values
241
    const __m128i dotl = _mm_maddubs_epi16(axl, syl);
242
    const __m128i doth = _mm_maddubs_epi16(axh, syh);
243
    return sum_i16_pairs_float(doth, dotl);
244
}
245
246
// larger version of mul_sum_i8_pairs_float where x and y are each represented by four 128-bit vectors
247
static inline __m256 mul_sum_i8_quad_float(const __m128i x_1_0, const __m128i x_1_1, const __m128i x_2_0, const __m128i x_2_1,
248
                                           const __m128i y_1_0, const __m128i y_1_1, const __m128i y_2_0, const __m128i y_2_1) {
249
    const __m128i mone = _mm_set1_epi16(1);
250
251
    const __m128i p16_1_0 = mul_add_epi8_sse(x_1_0, y_1_0);
252
    const __m128i p16_1_1 = mul_add_epi8_sse(x_1_1, y_1_1);
253
    const __m128i p16_2_0 = mul_add_epi8_sse(x_2_0, y_2_0);
254
    const __m128i p16_2_1 = mul_add_epi8_sse(x_2_1, y_2_1);
255
    const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone);
256
    const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone);
257
    const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone);
258
    const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone);
259
    const __m128i p_1 = _mm_add_epi32(p_1_0, p_1_1);
260
    const __m128i p_2 = _mm_add_epi32(p_2_0, p_2_1);
261
    return _mm256_cvtepi32_ps(MM256_SET_M128I(p_2, p_1));
262
}
263
264
// quad fp16 delta calculation
265
static inline __m256 quad_fp16_delta_float(const float x0, const float y0, const float x1, const float y1) {
266
    // GGML_CPU_FP16_TO_FP32 is faster than Intel F16C
267
    return _mm256_set_m128(_mm_set1_ps(GGML_CPU_FP16_TO_FP32(x1) * GGML_CPU_FP16_TO_FP32(y1)),
268
                           _mm_set1_ps(GGML_CPU_FP16_TO_FP32(x0) * GGML_CPU_FP16_TO_FP32(y0)));
269
}
270
271
static inline __m256 quad_mx_delta_float(const int8_t x0, const float y0, const int8_t x1, const float y1) {
272
    return _mm256_set_m128(_mm_set1_ps(GGML_E8M0_TO_FP32_HALF(x1) * GGML_CPU_FP16_TO_FP32(y1)),
273
                           _mm_set1_ps(GGML_E8M0_TO_FP32_HALF(x0) * GGML_CPU_FP16_TO_FP32(y0)));
274
}
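// NOTE (editor addition, illustrative only): quad_fp16_delta_float broadcasts two per-block
// scale products into one __m256: lanes 0..3 hold fp32(x0)*fp32(y0) and lanes 4..7 hold
// fp32(x1)*fp32(y1), so a single vector multiply applies both blocks' scales at once.
// quad_mx_delta_float does the same, but converts the x-side scale from an E8M0 block exponent.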
275
#endif
276
#elif defined(__SSSE3__)
277
// horizontally add 4x4 floats
278
static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 c, const __m128 d) {
279
    __m128 res_0 =_mm_hadd_ps(a, b);
280
    __m128 res_1 =_mm_hadd_ps(c, d);
281
    __m128 res =_mm_hadd_ps(res_0, res_1);
282
    res =_mm_hadd_ps(res, res);
283
    res =_mm_hadd_ps(res, res);
284
285
    return _mm_cvtss_f32(res);
286
}
287
#endif // __AVX__ || __AVX2__ || __AVX512F__
288
#endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
289
290
0
void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
291
0
    assert(QK8_0 == 32);
292
0
    assert(k % QK8_0 == 0);
293
0
    const int nb = k / QK8_0;
294
295
0
    block_q8_0 * GGML_RESTRICT y = vy;
296
297
0
#if defined(__AVX2__) || defined(__AVX__)
298
0
    for (int i = 0; i < nb; i++) {
299
        // Load elements into 4 AVX vectors
300
0
        __m256 v0 = _mm256_loadu_ps( x );
301
0
        __m256 v1 = _mm256_loadu_ps( x + 8 );
302
0
        __m256 v2 = _mm256_loadu_ps( x + 16 );
303
0
        __m256 v3 = _mm256_loadu_ps( x + 24 );
304
0
        x += 32;
305
306
        // Compute max(abs(e)) for the block
307
0
        const __m256 signBit = _mm256_set1_ps( -0.0f );
308
0
        __m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
309
0
        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
310
0
        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
311
0
        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
312
313
0
        __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
314
0
        max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
315
0
        max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
316
0
        const float maxScalar = _mm_cvtss_f32( max4 );
317
318
        // Quantize these floats
319
0
        const float d = maxScalar / 127.f;
320
0
        y[i].d = GGML_CPU_FP32_TO_FP16(d);
321
0
        const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
322
0
        const __m256 mul = _mm256_set1_ps( id );
323
324
        // Apply the multiplier
325
0
        v0 = _mm256_mul_ps( v0, mul );
326
0
        v1 = _mm256_mul_ps( v1, mul );
327
0
        v2 = _mm256_mul_ps( v2, mul );
328
0
        v3 = _mm256_mul_ps( v3, mul );
329
330
        // Round to nearest integer
331
0
        v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
332
0
        v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
333
0
        v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
334
0
        v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
335
336
        // Convert floats to integers
337
0
        __m256i i0 = _mm256_cvtps_epi32( v0 );
338
0
        __m256i i1 = _mm256_cvtps_epi32( v1 );
339
0
        __m256i i2 = _mm256_cvtps_epi32( v2 );
340
0
        __m256i i3 = _mm256_cvtps_epi32( v3 );
341
342
0
#if defined(__AVX2__)
343
        // Convert int32 to int16
344
0
        i0 = _mm256_packs_epi32( i0, i1 );  // 0, 1, 2, 3,  8, 9, 10, 11,  4, 5, 6, 7, 12, 13, 14, 15
345
0
        i2 = _mm256_packs_epi32( i2, i3 );  // 16, 17, 18, 19,  24, 25, 26, 27,  20, 21, 22, 23, 28, 29, 30, 31
346
                                            // Convert int16 to int8
347
0
        i0 = _mm256_packs_epi16( i0, i2 );  // 0, 1, 2, 3,  8, 9, 10, 11,  16, 17, 18, 19,  24, 25, 26, 27,  4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
348
349
        // We got our precious signed bytes, but the order is now wrong
350
        // These AVX2 pack instructions process 16-byte pieces independently
351
        // The following instruction fixes the order
352
0
        const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
353
0
        i0 = _mm256_permutevar8x32_epi32( i0, perm );
354
355
0
        _mm256_storeu_si256((__m256i *)y[i].qs, i0);
356
#else
357
        // AVX lacks some of the integer instructions needed here,
358
        // so split the registers in half and use the SSE equivalents of the AVX2 instructions
359
        __m128i ni0 = _mm256_castsi256_si128( i0 );
360
        __m128i ni1 = _mm256_extractf128_si256( i0, 1);
361
        __m128i ni2 = _mm256_castsi256_si128( i1 );
362
        __m128i ni3 = _mm256_extractf128_si256( i1, 1);
363
        __m128i ni4 = _mm256_castsi256_si128( i2 );
364
        __m128i ni5 = _mm256_extractf128_si256( i2, 1);
365
        __m128i ni6 = _mm256_castsi256_si128( i3 );
366
        __m128i ni7 = _mm256_extractf128_si256( i3, 1);
367
368
        // Convert int32 to int16
369
        ni0 = _mm_packs_epi32( ni0, ni1 );
370
        ni2 = _mm_packs_epi32( ni2, ni3 );
371
        ni4 = _mm_packs_epi32( ni4, ni5 );
372
        ni6 = _mm_packs_epi32( ni6, ni7 );
373
        // Convert int16 to int8
374
        ni0 = _mm_packs_epi16( ni0, ni2 );
375
        ni4 = _mm_packs_epi16( ni4, ni6 );
376
377
        _mm_storeu_si128((__m128i *)(y[i].qs +  0), ni0);
378
        _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
379
#endif
380
0
    }
381
#else
382
    GGML_UNUSED(nb);
383
    // scalar
384
    quantize_row_q8_0_ref(x, y, k);
385
#endif
386
0
}
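// Illustrative sketch (editor addition, hypothetical helper name): the per-block arithmetic
// performed by the vector code above, in scalar form. Each 32-float block is scaled by
// d = max|x| / 127 (stored as fp16) and rounded to int8. The SIMD path rounds to nearest-even,
// whereas roundf() rounds halves away from zero.
static void quantize_q8_0_block_scalar(const float x[32], float * d_out, int8_t q[32]) {
    float amax = 0.0f;
    for (int j = 0; j < 32; ++j) {
        const float ax = fabsf(x[j]); // <math.h> is already included above
        if (ax > amax) {
            amax = ax;
        }
    }
    const float d  = amax / 127.0f;
    const float id = d != 0.0f ? 1.0f / d : 0.0f;
    for (int j = 0; j < 32; ++j) {
        q[j] = (int8_t) roundf(x[j] * id);
    }
    *d_out = d; // the real code stores this via GGML_CPU_FP32_TO_FP16
}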
387
388
0
void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
389
0
    assert(k % QK8_1 == 0);
390
0
    const int nb = k / QK8_1;
391
392
0
    block_q8_1 * GGML_RESTRICT y = vy;
393
0
#if defined(__AVX2__) || defined(__AVX__)
394
0
    for (int i = 0; i < nb; i++) {
395
        // Load elements into 4 AVX vectors
396
0
        __m256 v0 = _mm256_loadu_ps( x );
397
0
        __m256 v1 = _mm256_loadu_ps( x + 8 );
398
0
        __m256 v2 = _mm256_loadu_ps( x + 16 );
399
0
        __m256 v3 = _mm256_loadu_ps( x + 24 );
400
0
        x += 32;
401
402
        // Compute max(abs(e)) for the block
403
0
        const __m256 signBit = _mm256_set1_ps( -0.0f );
404
0
        __m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
405
0
        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
406
0
        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
407
0
        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
408
409
0
        __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
410
0
        max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
411
0
        max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
412
0
        const float max_scalar = _mm_cvtss_f32( max4 );
413
414
        // Quantize these floats
415
0
        const float d = max_scalar / 127.f;
416
0
        y[i].d = GGML_CPU_FP32_TO_FP16(d);
417
0
        const float id = ( max_scalar != 0.0f ) ? 127.f / max_scalar : 0.0f;
418
0
        const __m256 mul = _mm256_set1_ps( id );
419
420
        // Apply the multiplier
421
0
        v0 = _mm256_mul_ps( v0, mul );
422
0
        v1 = _mm256_mul_ps( v1, mul );
423
0
        v2 = _mm256_mul_ps( v2, mul );
424
0
        v3 = _mm256_mul_ps( v3, mul );
425
426
        // Round to nearest integer
427
0
        v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
428
0
        v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
429
0
        v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
430
0
        v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
431
432
        // Convert floats to integers
433
0
        __m256i i0 = _mm256_cvtps_epi32( v0 );
434
0
        __m256i i1 = _mm256_cvtps_epi32( v1 );
435
0
        __m256i i2 = _mm256_cvtps_epi32( v2 );
436
0
        __m256i i3 = _mm256_cvtps_epi32( v3 );
437
438
0
#if defined(__AVX2__)
439
        // Compute the sum of the quants and set y[i].s
440
0
        y[i].s = GGML_CPU_FP32_TO_FP16(d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))));
441
442
        // Convert int32 to int16
443
0
        i0 = _mm256_packs_epi32( i0, i1 );  // 0, 1, 2, 3,  8, 9, 10, 11,  4, 5, 6, 7, 12, 13, 14, 15
444
0
        i2 = _mm256_packs_epi32( i2, i3 );  // 16, 17, 18, 19,  24, 25, 26, 27,  20, 21, 22, 23, 28, 29, 30, 31
445
                                            // Convert int16 to int8
446
0
        i0 = _mm256_packs_epi16( i0, i2 );  // 0, 1, 2, 3,  8, 9, 10, 11,  16, 17, 18, 19,  24, 25, 26, 27,  4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
447
448
        // We got our precious signed bytes, but the order is now wrong
449
        // These AVX2 pack instructions process 16-byte pieces independently
450
        // The following instruction fixes the order
451
0
        const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
452
0
        i0 = _mm256_permutevar8x32_epi32( i0, perm );
453
454
0
        _mm256_storeu_si256((__m256i *)y[i].qs, i0);
455
#else
456
        // AVX lacks some of the integer instructions needed here,
457
        // so split the registers in half and use the SSE equivalents of the AVX2 instructions
458
        __m128i ni0 = _mm256_castsi256_si128( i0 );
459
        __m128i ni1 = _mm256_extractf128_si256( i0, 1);
460
        __m128i ni2 = _mm256_castsi256_si128( i1 );
461
        __m128i ni3 = _mm256_extractf128_si256( i1, 1);
462
        __m128i ni4 = _mm256_castsi256_si128( i2 );
463
        __m128i ni5 = _mm256_extractf128_si256( i2, 1);
464
        __m128i ni6 = _mm256_castsi256_si128( i3 );
465
        __m128i ni7 = _mm256_extractf128_si256( i3, 1);
466
467
        // Compute the sum of the quants and set y[i].s
468
        const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3));
469
        const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7));
470
        y[i].s = GGML_CPU_FP32_TO_FP16(d * hsum_i32_4(_mm_add_epi32(s0, s1)));
471
472
        // Convert int32 to int16
473
        ni0 = _mm_packs_epi32( ni0, ni1 );
474
        ni2 = _mm_packs_epi32( ni2, ni3 );
475
        ni4 = _mm_packs_epi32( ni4, ni5 );
476
        ni6 = _mm_packs_epi32( ni6, ni7 );
477
        // Convert int16 to int8
478
        ni0 = _mm_packs_epi16( ni0, ni2 );
479
        ni4 = _mm_packs_epi16( ni4, ni6 );
480
481
        _mm_storeu_si128((__m128i *)(y[i].qs +  0), ni0);
482
        _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
483
#endif
484
0
    }
485
#else
486
    GGML_UNUSED(nb);
487
    // scalar
488
    quantize_row_q8_1_ref(x, y, k);
489
#endif
490
0
}
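// NOTE (editor addition, illustrative only): quantize_row_q8_1 performs the same per-block
// scaling as quantize_row_q8_0 and additionally stores y[i].s = d * sum(q[0..31]) as fp16; the
// q4_1/q5_1 dot products below use that precomputed sum (see the `summs` accumulation) to
// account for the per-block minimum with a single multiply.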
491
492
// placeholder implementation for Apple targets
493
0
void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
494
0
    quantize_row_q8_K_ref(x, y, k);
495
0
}
496
497
//===================================== Dot products =================================
498
499
//
500
// Helper functions
501
//
502
503
#if __AVX__ || __AVX2__ || __AVX512F__
504
505
// shuffles to pick the required scales in dot products
506
0
static inline __m256i get_scale_shuffle_q3k(int i) {
507
0
    static const uint8_t k_shuffle[128] = {
508
0
         0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,     2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
509
0
         4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5,     6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7,
510
0
         8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9,    10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,
511
0
        12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,    14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,
512
0
    };
513
0
    return _mm256_loadu_si256((const __m256i*)k_shuffle + i);
514
0
}
515
0
static inline __m256i get_scale_shuffle_k4(int i) {
516
0
    static const uint8_t k_shuffle[256] = {
517
0
         0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
518
0
         2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
519
0
         4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5,
520
0
         6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7,
521
0
         8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9,
522
0
        10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,
523
0
        12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,
524
0
        14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15
525
0
    };
526
0
    return _mm256_loadu_si256((const __m256i*)k_shuffle + i);
527
0
}
528
0
static inline __m128i get_scale_shuffle(int i) {
529
0
    static const uint8_t k_shuffle[128] = {
530
0
         0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
531
0
         2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
532
0
         4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5,
533
0
         6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7,
534
0
         8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9,
535
0
        10,10,10,10,10,10,10,10, 11,11,11,11,11,11,11,11,
536
0
        12,12,12,12,12,12,12,12, 13,13,13,13,13,13,13,13,
537
0
        14,14,14,14,14,14,14,14, 15,15,15,15,15,15,15,15
538
0
    };
539
0
    return _mm_loadu_si128((const __m128i*)k_shuffle + i);
540
0
}
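// NOTE (editor addition, illustrative only): these tables feed _mm256_shuffle_epi8 /
// _mm_shuffle_epi8 in the K-quant dot products; row i of each table replicates the i-th scale
// entry (or pair of adjacent entries) across the whole vector, so one byte shuffle broadcasts a
// block scale to every lane before the fused multiply-adds.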
541
#endif
542
543
0
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
544
0
    const int qk = QK8_0;
545
0
    const int nb = n / qk;
546
547
0
    assert(n % qk == 0);
548
0
    assert(nrc == 1);
549
0
    UNUSED(nrc);
550
0
    UNUSED(bx);
551
0
    UNUSED(by);
552
0
    UNUSED(bs);
553
554
0
    const block_q4_0 * GGML_RESTRICT x = vx;
555
0
    const block_q8_0 * GGML_RESTRICT y = vy;
556
557
0
    int ib = 0;
558
0
    float sumf = 0;
559
560
0
#if defined(__AVX2__)
561
    // Initialize accumulator with zeros
562
0
    __m256 acc = _mm256_setzero_ps();
563
564
    // Main loop
565
0
    for (; ib < nb; ++ib) {
566
        /* Compute combined scale for the block */
567
0
        const __m256 d = _mm256_set1_ps( GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) );
568
569
0
        __m256i qx = bytes_from_nibbles_32(x[ib].qs);
570
571
        // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
572
0
        const __m256i off = _mm256_set1_epi8( 8 );
573
0
        qx = _mm256_sub_epi8( qx, off );
574
575
0
        __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs);
576
577
0
        const __m256 q = mul_sum_i8_pairs_float(qx, qy);
578
579
        /* Multiply q with scale and accumulate */
580
0
        acc = _mm256_fmadd_ps( d, q, acc );
581
0
    }
582
583
0
    sumf = hsum_float_8(acc);
584
#elif defined(__AVX__)
585
    __m256 accum = _mm256_setzero_ps();
586
    for (; ib + 1 < nb; ib += 2) {
587
        const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs);
588
        const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
589
        const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs);
590
        const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs + 1);
591
        const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs);
592
        const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1);
593
594
        const __m128i q4b_1_0 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), q4bits_1), _mm_set1_epi8(8));
595
        const __m128i q4b_1_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_1, 4)), _mm_set1_epi8(8));
596
        const __m128i q4b_2_0 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), q4bits_2), _mm_set1_epi8(8));
597
        const __m128i q4b_2_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_2, 4)), _mm_set1_epi8(8));
598
599
        const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0);
600
        const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
601
        const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
602
        const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
603
        const __m128i p_1 = _mm_add_epi16(p16_1_0, p16_1_1);
604
        const __m128i p_2 = _mm_add_epi16(p16_2_0, p16_2_1);
605
        const __m256 p =  sum_i16_pairs_float(p_2, p_1);
606
607
        const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d);
608
        accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum);
609
    }
610
611
    sumf = hsum_float_8(accum);
612
#elif defined(__SSSE3__)
613
    // set constants
614
    const __m128i lowMask = _mm_set1_epi8(0xF);
615
    const __m128i off = _mm_set1_epi8(8);
616
617
    // Initialize accumulator with zeros
618
    __m128 acc_0 = _mm_setzero_ps();
619
    __m128 acc_1 = _mm_setzero_ps();
620
    __m128 acc_2 = _mm_setzero_ps();
621
    __m128 acc_3 = _mm_setzero_ps();
622
623
    for (; ib + 1 < nb; ib += 2) {
624
        _mm_prefetch(&x[ib] + sizeof(block_q4_0), _MM_HINT_T0);
625
        _mm_prefetch(&y[ib] + sizeof(block_q8_0), _MM_HINT_T0);
626
627
        // Compute combined scale for the block 0 and 1
628
        const __m128 d_0_1 = _mm_set1_ps( GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) );
629
630
        const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[ib].qs);
631
632
        __m128i bx_0 = _mm_and_si128(lowMask, tmp_0_1);
633
        __m128i by_0 = _mm_loadu_si128((const __m128i *)y[ib].qs);
634
        bx_0 = _mm_sub_epi8(bx_0, off);
635
        const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0);
636
637
        __m128i bx_1 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_0_1, 4));
638
        __m128i by_1 = _mm_loadu_si128((const __m128i *)(y[ib].qs + 16));
639
        bx_1 = _mm_sub_epi8(bx_1, off);
640
        const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1);
641
642
        _mm_prefetch(&x[ib] + 2 * sizeof(block_q4_0), _MM_HINT_T0);
643
        _mm_prefetch(&y[ib] + 2 * sizeof(block_q8_0), _MM_HINT_T0);
644
645
        // Compute combined scale for the block 2 and 3
646
        const __m128 d_2_3 = _mm_set1_ps( GGML_CPU_FP16_TO_FP32(x[ib + 1].d) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d) );
647
648
        const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
649
650
        __m128i bx_2 = _mm_and_si128(lowMask, tmp_2_3);
651
        __m128i by_2 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs);
652
        bx_2 = _mm_sub_epi8(bx_2, off);
653
        const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2);
654
655
        __m128i bx_3 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_2_3, 4));
656
        __m128i by_3 = _mm_loadu_si128((const __m128i *)(y[ib + 1].qs + 16));
657
        bx_3 = _mm_sub_epi8(bx_3, off);
658
        const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3);
659
660
        // Convert int32_t to float
661
        __m128 p0 = _mm_cvtepi32_ps(i32_0);
662
        __m128 p1 = _mm_cvtepi32_ps(i32_1);
663
        __m128 p2 = _mm_cvtepi32_ps(i32_2);
664
        __m128 p3 = _mm_cvtepi32_ps(i32_3);
665
666
        // Apply the scale
667
        __m128 p0_d = _mm_mul_ps( d_0_1, p0 );
668
        __m128 p1_d = _mm_mul_ps( d_0_1, p1 );
669
        __m128 p2_d = _mm_mul_ps( d_2_3, p2 );
670
        __m128 p3_d = _mm_mul_ps( d_2_3, p3 );
671
672
        // Accumulate
673
        acc_0 = _mm_add_ps(p0_d, acc_0);
674
        acc_1 = _mm_add_ps(p1_d, acc_1);
675
        acc_2 = _mm_add_ps(p2_d, acc_2);
676
        acc_3 = _mm_add_ps(p3_d, acc_3);
677
    }
678
679
    sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
680
681
#endif
682
0
    for (; ib < nb; ++ib) {
683
0
        int sumi0 = 0;
684
0
        int sumi1 = 0;
685
686
0
        for (int j = 0; j < qk/2; ++j) {
687
0
            const int v0 = (x[ib].qs[j] & 0x0F) - 8;
688
0
            const int v1 = (x[ib].qs[j] >>   4) - 8;
689
690
0
            sumi0 += (v0 * y[ib].qs[j]);
691
0
            sumi1 += (v1 * y[ib].qs[j + qk/2]);
692
0
        }
693
694
0
        int sumi = sumi0 + sumi1;
695
0
        sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
696
0
    }
697
698
0
    *s = sumf;
699
0
}
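// NOTE (editor addition, illustrative only): per block, every path above computes the same
// quantity as the scalar tail loop,
//   sumf += fp32(x[ib].d) * fp32(y[ib].d)
//           * sum_j ( ((x[ib].qs[j] & 0x0F) - 8) * y[ib].qs[j]
//                   + ((x[ib].qs[j] >>   4) - 8) * y[ib].qs[j + 16] );
// the SIMD variants only differ in how many blocks are processed per iteration and how the
// int8 products are reduced.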
700
701
0
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
702
0
    const int qk = QK8_1;
703
0
    const int nb = n / qk;
704
705
0
    assert(n % qk == 0);
706
0
    assert(nrc == 1);
707
0
    UNUSED(nrc);
708
0
    UNUSED(bx);
709
0
    UNUSED(by);
710
0
    UNUSED(bs);
711
712
0
    const block_q4_1 * GGML_RESTRICT x = vx;
713
0
    const block_q8_1 * GGML_RESTRICT y = vy;
714
715
0
    int ib = 0;
716
717
0
#if defined(__AVX2__) || defined(__AVX__)
718
    // Initialize accumulator with zeros
719
0
    __m256 acc = _mm256_setzero_ps();
720
721
0
    float summs = 0;
722
723
    // Main loop
724
0
    for (; ib < nb; ++ib) {
725
0
        const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
726
0
        const float d1 = GGML_CPU_FP16_TO_FP32(y[ib].d);
727
728
0
        summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s);
729
730
0
        const __m256 d0v = _mm256_set1_ps( d0 );
731
0
        const __m256 d1v = _mm256_set1_ps( d1 );
732
733
        // Compute combined scales
734
0
        const __m256 d0d1 = _mm256_mul_ps( d0v, d1v );
735
736
        // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
737
0
        const __m256i qx = bytes_from_nibbles_32(x[ib].qs);
738
0
        const __m256i qy = _mm256_loadu_si256( (const __m256i *)y[ib].qs );
739
740
0
        const __m256 xy = mul_sum_us8_pairs_float(qx, qy);
741
742
        // Accumulate d0*d1*x*y
743
0
#if defined(__AVX2__)
744
0
        acc = _mm256_fmadd_ps( d0d1, xy, acc );
745
#else
746
        acc = _mm256_add_ps( _mm256_mul_ps( d0d1, xy ), acc );
747
#endif
748
0
    }
749
750
0
    *s = hsum_float_8(acc) + summs;
751
#else
752
    UNUSED(nb);
753
    UNUSED(x);
754
    UNUSED(y);
755
    UNUSED(ib);
756
    ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
757
#endif
758
0
}
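// NOTE (editor addition, illustrative only): q4_1 stores values as d*q + m, so the per-block dot
// product splits as sum((d*q + m) * dy*qy) = d*dy * (q . qy) + m * (dy * sum(qy)). The second
// term is exactly m * y[ib].s, accumulated in `summs`, which is why quantize_row_q8_1
// precomputes s.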
759
760
0
void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
761
0
    assert(nrc == 1);
762
0
    UNUSED(nrc);
763
0
    UNUSED(bx);
764
0
    UNUSED(by);
765
0
    UNUSED(bs);
766
0
    assert(n % QK_MXFP4 == 0);
767
0
    static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same");
768
769
0
    const block_mxfp4 * GGML_RESTRICT x = vx;
770
0
    const block_q8_0 * GGML_RESTRICT y = vy;
771
772
0
    const int nb = n / QK_MXFP4;
773
774
0
    int ib = 0;
775
0
    float sumf = 0;
776
777
0
#if defined __AVX2__
778
779
0
    const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_mxfp4);
780
0
    const __m128i m4b  = _mm_set1_epi8(0x0f);
781
0
    const __m256i mone = _mm256_set1_epi16(1);
782
783
0
    __m256 accum1 = _mm256_setzero_ps();
784
0
    __m256 accum2 = _mm256_setzero_ps();
785
0
    for (; ib + 1 < nb; ib += 2) {
786
0
        const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)x[ib + 0].qs);
787
0
        const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[ib + 1].qs);
788
0
        const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[ib + 0].qs);
789
0
        const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[ib + 1].qs);
790
0
        const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
791
0
                                              _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
792
0
        const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
793
0
                                              _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
794
0
        const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
795
0
        const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
796
0
        const __m256i p_1 = _mm256_madd_epi16(p16_1, mone);
797
0
        const __m256i p_2 = _mm256_madd_epi16(p16_2, mone);
798
0
        accum1 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 0].d)*GGML_E8M0_TO_FP32_HALF(x[ib + 0].e)),
799
0
                _mm256_cvtepi32_ps(p_1), accum1);
800
0
        accum2 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 1].d)*GGML_E8M0_TO_FP32_HALF(x[ib + 1].e)),
801
0
                _mm256_cvtepi32_ps(p_2), accum2);
802
0
    }
803
804
0
    sumf = hsum_float_8(_mm256_add_ps(accum1, accum2));
805
806
#elif defined __AVX__
807
    const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_mxfp4);
808
    const __m128i m4b  = _mm_set1_epi8(0x0f);
809
810
    __m256 accum = _mm256_setzero_ps();
811
    for (; ib + 1 < nb; ib += 2) {
812
        const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs);
813
        const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
814
        const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs);
815
        const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs + 1);
816
        const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs);
817
        const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1);
818
819
        const __m128i q4b_1_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b));
820
        const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b));
821
        const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b));
822
        const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b));
823
824
        const __m256 p = mul_sum_i8_quad_float(q4b_1_0, q4b_1_1, q4b_2_0, q4b_2_1, q8b_1_0, q8b_1_1, q8b_2_0, q8b_2_1);
825
        const __m256 deltas = quad_mx_delta_float(x[ib].e, y[ib].d, x[ib + 1].e, y[ib + 1].d);
826
        accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum);
827
    }
828
829
    sumf = hsum_float_8(accum);
830
831
#endif
832
0
    for (; ib < nb; ++ib) {
833
0
        const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_E8M0_TO_FP32_HALF(x[ib].e);
834
0
        int sumi1 = 0;
835
0
        int sumi2 = 0;
836
0
        for (int j = 0; j < QK_MXFP4/2; ++j) {
837
0
            sumi1 += y[ib].qs[j +          0] * kvalues_mxfp4[x[ib].qs[j] & 0xf];
838
0
            sumi2 += y[ib].qs[j + QK_MXFP4/2] * kvalues_mxfp4[x[ib].qs[j] >>  4];
839
0
        }
840
0
        sumf += d * (sumi1 + sumi2);
841
0
    }
842
0
    *s = sumf;
843
0
}
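// NOTE (editor addition, illustrative only): mxfp4 blocks store 4-bit indices into the
// kvalues_mxfp4 table plus a shared E8M0 exponent x[ib].e, so the kernel decodes nibbles with a
// pshufb table lookup instead of the subtract-8 used for q4_0, and scales each block by
// GGML_E8M0_TO_FP32_HALF(x[ib].e) * fp32(y[ib].d), exactly as in the scalar tail loop above.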
844
845
0
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
846
0
    const int qk = QK8_0;
847
0
    const int nb = n / qk;
848
849
0
    int ib = 0;
850
851
0
    assert(n % qk == 0);
852
0
    assert(qk == QK5_0);
853
0
    assert(nrc == 1);
854
0
    UNUSED(nrc);
855
0
    UNUSED(bx);
856
0
    UNUSED(by);
857
0
    UNUSED(bs);
858
859
0
    const block_q5_0 * GGML_RESTRICT x = vx;
860
0
    const block_q8_0 * GGML_RESTRICT y = vy;
861
862
0
#if defined(__AVX2__)
863
    // Initialize accumulator with zeros
864
0
    __m256 acc = _mm256_setzero_ps();
865
866
    // Main loop
867
0
    for (; ib < nb; ++ib) {
868
        /* Compute combined scale for the block */
869
0
        const __m256 d = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
870
871
0
        __m256i qx = bytes_from_nibbles_32(x[ib].qs);
872
0
        __m256i bxhi = bytes_from_bits_32(x[ib].qh);
873
0
        bxhi = _mm256_andnot_si256(bxhi, _mm256_set1_epi8((char)0xF0));
874
0
        qx = _mm256_or_si256(qx, bxhi);
875
876
0
        __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs);
877
878
0
        const __m256 q = mul_sum_i8_pairs_float(qx, qy);
879
880
        /* Multiply q with scale and accumulate */
881
0
        acc = _mm256_fmadd_ps(d, q, acc);
882
0
    }
883
884
0
    *s = hsum_float_8(acc);
885
#elif defined(__AVX__)
886
    // Initialize accumulator with zeros
887
    __m256 acc = _mm256_setzero_ps();
888
    __m128i mask = _mm_set1_epi8((char)0xF0);
889
890
    // Main loop
891
    for (; ib < nb; ++ib) {
892
        /* Compute combined scale for the block */
893
        const __m256 d = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
894
895
        __m256i bx_0 = bytes_from_nibbles_32(x[ib].qs);
896
        const __m256i bxhi = bytes_from_bits_32(x[ib].qh);
897
        __m128i bxhil = _mm256_castsi256_si128(bxhi);
898
        __m128i bxhih = _mm256_extractf128_si256(bxhi, 1);
899
        bxhil = _mm_andnot_si128(bxhil, mask);
900
        bxhih = _mm_andnot_si128(bxhih, mask);
901
        __m128i bxl = _mm256_castsi256_si128(bx_0);
902
        __m128i bxh = _mm256_extractf128_si256(bx_0, 1);
903
        bxl = _mm_or_si128(bxl, bxhil);
904
        bxh = _mm_or_si128(bxh, bxhih);
905
        bx_0 = MM256_SET_M128I(bxh, bxl);
906
907
        const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[ib].qs);
908
909
        const __m256 q = mul_sum_i8_pairs_float(bx_0, by_0);
910
911
        /* Multiply q with scale and accumulate */
912
        acc = _mm256_add_ps(_mm256_mul_ps(d, q), acc);
913
    }
914
915
    *s = hsum_float_8(acc);
916
#else
917
    UNUSED(nb);
918
    UNUSED(ib);
919
    UNUSED(x);
920
    UNUSED(y);
921
    ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
922
#endif
923
0
}
924
925
0
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
926
0
    const int qk = QK8_1;
927
0
    const int nb = n / qk;
928
929
0
    int ib = 0;
930
931
0
    assert(n % qk == 0);
932
0
    assert(qk == QK5_1);
933
0
    assert(nrc == 1);
934
0
    UNUSED(nrc);
935
0
    UNUSED(bx);
936
0
    UNUSED(by);
937
0
    UNUSED(bs);
938
939
0
    const block_q5_1 * GGML_RESTRICT x = vx;
940
0
    const block_q8_1 * GGML_RESTRICT y = vy;
941
942
0
#if defined(__AVX2__)
943
    // Initialize accumulator with zeros
944
0
    __m256 acc = _mm256_setzero_ps();
945
946
0
    float summs = 0.0f;
947
948
    // Main loop
949
0
    for (; ib < nb; ++ib) {
950
0
        const __m256 dx = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d));
951
952
0
        summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s);
953
954
0
        __m256i qx = bytes_from_nibbles_32(x[ib].qs);
955
0
        __m256i bxhi = bytes_from_bits_32(x[ib].qh);
956
0
        bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10));
957
0
        qx = _mm256_or_si256(qx, bxhi);
958
959
0
        const __m256 dy = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib].d));
960
0
        const __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs);
961
962
0
        const __m256 q = mul_sum_us8_pairs_float(qx, qy);
963
964
0
        acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc);
965
0
    }
966
967
0
    *s = hsum_float_8(acc) + summs;
968
#elif defined(__AVX__)
969
    // Initialize accumulator with zeros
970
    __m256 acc = _mm256_setzero_ps();
971
    __m128i mask = _mm_set1_epi8(0x10);
972
973
    float summs = 0.0f;
974
975
    // Main loop
976
    for (; ib < nb; ++ib) {
977
        const __m256 dx = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d));
978
979
        summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s);
980
981
        __m256i bx_0 = bytes_from_nibbles_32(x[ib].qs);
982
        const __m256i bxhi = bytes_from_bits_32(x[ib].qh);
983
        __m128i bxhil = _mm256_castsi256_si128(bxhi);
984
        __m128i bxhih = _mm256_extractf128_si256(bxhi, 1);
985
        bxhil = _mm_and_si128(bxhil, mask);
986
        bxhih = _mm_and_si128(bxhih, mask);
987
        __m128i bxl = _mm256_castsi256_si128(bx_0);
988
        __m128i bxh = _mm256_extractf128_si256(bx_0, 1);
989
        bxl = _mm_or_si128(bxl, bxhil);
990
        bxh = _mm_or_si128(bxh, bxhih);
991
        bx_0 = MM256_SET_M128I(bxh, bxl);
992
993
        const __m256 dy = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib].d));
994
        const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[ib].qs);
995
996
        const __m256 q = mul_sum_us8_pairs_float(bx_0, by_0);
997
998
        acc = _mm256_add_ps(_mm256_mul_ps(q, _mm256_mul_ps(dx, dy)), acc);
999
    }
1000
1001
    *s = hsum_float_8(acc) + summs;
1002
#else
1003
    UNUSED(nb);
1004
    UNUSED(ib);
1005
    UNUSED(x);
1006
    UNUSED(y);
1007
    ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
1008
#endif
1009
0
}
1010
1011
0
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
1012
0
    const int qk = QK8_0;
1013
0
    const int nb = n / qk;
1014
1015
0
    assert(n % qk == 0);
1016
0
    assert(nrc == 1);
1017
0
    UNUSED(nrc);
1018
0
    UNUSED(bx);
1019
0
    UNUSED(by);
1020
0
    UNUSED(bs);
1021
1022
0
    const block_q8_0 * GGML_RESTRICT x = vx;
1023
0
    const block_q8_0 * GGML_RESTRICT y = vy;
1024
1025
0
    int ib = 0;
1026
0
    float sumf = 0;
1027
1028
0
#if defined(__AVX2__)
1029
    // Initialize accumulator with zeros
1030
0
    __m256 acc = _mm256_setzero_ps();
1031
1032
    // Main loop
1033
0
    for (; ib < nb; ++ib) {
1034
        // Compute combined scale for the block
1035
0
        const __m256 d = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
1036
0
        __m256i qx = _mm256_loadu_si256((const __m256i *)x[ib].qs);
1037
0
        __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs);
1038
1039
0
        const __m256 q = mul_sum_i8_pairs_float(qx, qy);
1040
1041
        // Multiply q with scale and accumulate
1042
0
        acc = _mm256_fmadd_ps( d, q, acc );
1043
0
    }
1044
1045
0
    sumf = hsum_float_8(acc);
1046
#elif defined(__AVX__)
1047
    __m256 accum = _mm256_setzero_ps();
1048
1049
    for (; ib + 1 < nb; ib += 2) {
1050
        const __m128i qx_1_0 = _mm_loadu_si128((const __m128i *)x[ib].qs);
1051
        const __m128i qx_1_1 = _mm_loadu_si128((const __m128i *)x[ib].qs + 1);
1052
        const __m128i qx_2_0 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
1053
        const __m128i qx_2_1 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs + 1);
1054
        const __m128i qy_1_0 = _mm_loadu_si128((const __m128i *)y[ib].qs);
1055
        const __m128i qy_1_1 = _mm_loadu_si128((const __m128i *)y[ib].qs + 1);
1056
        const __m128i qy_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs);
1057
        const __m128i qy_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1);
1058
1059
        const __m256 p = mul_sum_i8_quad_float(qx_1_0, qx_1_1, qx_2_0, qx_2_1, qy_1_0, qy_1_1, qy_2_0, qy_2_1);
1060
        const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d);
1061
        accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum);
1062
    }
1063
1064
    sumf = hsum_float_8(accum);
1065
#endif
1066
0
    for (; ib < nb; ++ib) {
1067
0
        int sumi = 0;
1068
1069
0
        for (int j = 0; j < qk; j++) {
1070
0
            sumi += x[ib].qs[j]*y[ib].qs[j];
1071
0
        }
1072
1073
0
        sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
1074
0
    }
1075
1076
0
    *s = sumf;
1077
0
}
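// NOTE (editor addition, illustrative only): with both operands already in int8, each block
// reduces to
//   sumf += fp32(x[ib].d) * fp32(y[ib].d) * sum_j x[ib].qs[j] * y[ib].qs[j],
// as in the scalar tail loop; the AVX path pairs two blocks per iteration so it can use
// quad_fp16_delta_float for the combined scales.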
1078
1079
0
void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
1080
0
    assert(nrc == 1);
1081
0
    UNUSED(nrc);
1082
0
    UNUSED(bx);
1083
0
    UNUSED(by);
1084
0
    UNUSED(bs);
1085
1086
0
    const block_tq1_0 * GGML_RESTRICT x = vx;
1087
0
    const block_q8_K  * GGML_RESTRICT y = vy;
1088
1089
0
    const int nb = n / QK_K;
1090
1091
0
#if defined(__AVX2__)
1092
0
    __m256 sumf = _mm256_setzero_ps();
1093
1094
0
    for (int i = 0; i < nb; ++i) {
1095
        // 16-bit sums
1096
0
        __m256i sumi0 = _mm256_setzero_si256();
1097
0
        __m256i sumi1 = _mm256_setzero_si256();
1098
0
        __m256i sumi2 = _mm256_setzero_si256();
1099
1100
        // first 32 qs bytes, 5 elements packed per byte
1101
0
        {
1102
0
            __m256i qx0 = _mm256_loadu_si256((const __m256i *) (x[i].qs));
1103
            // 8-bit multiplies with shifts, masks and adds
1104
0
            __m256i qx1 = _mm256_add_epi8(qx0, _mm256_add_epi8(qx0, qx0)); // 1 * 3
1105
0
            __m256i qx2 = _mm256_add_epi8(_mm256_and_si256(_mm256_slli_epi16(qx0, 3), _mm256_set1_epi8(-8)), qx0); // 1 * 9
1106
0
            __m256i qx3 = _mm256_add_epi8(_mm256_and_si256(_mm256_slli_epi16(qx1, 3), _mm256_set1_epi8(-8)), qx1); // 3 * 9
1107
0
            __m256i qx4 = _mm256_add_epi8(_mm256_and_si256(_mm256_slli_epi16(qx2, 3), _mm256_set1_epi8(-8)), qx2); // 9 * 9
1108
1109
            // TODO: can _mm256_mulhi_epu16 be faster even if 16-bits?
1110
1111
            // Cancel the +1 from avg so that it behaves like a halving add
1112
0
            qx0 = _mm256_subs_epu8(qx0, _mm256_set1_epi8(1));
1113
0
            qx1 = _mm256_subs_epu8(qx1, _mm256_set1_epi8(1));
1114
0
            qx2 = _mm256_subs_epu8(qx2, _mm256_set1_epi8(1));
1115
0
            qx3 = _mm256_subs_epu8(qx3, _mm256_set1_epi8(1));
1116
0
            qx4 = _mm256_subs_epu8(qx4, _mm256_set1_epi8(1));
1117
            // Multiply by 3 and get the top 2 bits
1118
0
            qx0 = _mm256_avg_epu8(qx0, _mm256_avg_epu8(qx0, _mm256_setzero_si256()));
1119
0
            qx1 = _mm256_avg_epu8(qx1, _mm256_avg_epu8(qx1, _mm256_setzero_si256()));
1120
0
            qx2 = _mm256_avg_epu8(qx2, _mm256_avg_epu8(qx2, _mm256_setzero_si256()));
1121
0
            qx3 = _mm256_avg_epu8(qx3, _mm256_avg_epu8(qx3, _mm256_setzero_si256()));
1122
0
            qx4 = _mm256_avg_epu8(qx4, _mm256_avg_epu8(qx4, _mm256_setzero_si256()));
1123
0
            qx0 = _mm256_and_si256(_mm256_srli_epi16(qx0, 6), _mm256_set1_epi8(3));
1124
0
            qx1 = _mm256_and_si256(_mm256_srli_epi16(qx1, 6), _mm256_set1_epi8(3));
1125
0
            qx2 = _mm256_and_si256(_mm256_srli_epi16(qx2, 6), _mm256_set1_epi8(3));
1126
0
            qx3 = _mm256_and_si256(_mm256_srli_epi16(qx3, 6), _mm256_set1_epi8(3));
1127
0
            qx4 = _mm256_and_si256(_mm256_srli_epi16(qx4, 6), _mm256_set1_epi8(3));
1128
1129
0
            const __m256i qy0 = _mm256_loadu_si256((const __m256i *) (y[i].qs +   0));
1130
0
            const __m256i qy1 = _mm256_loadu_si256((const __m256i *) (y[i].qs +  32));
1131
0
            const __m256i qy2 = _mm256_loadu_si256((const __m256i *) (y[i].qs +  64));
1132
0
            const __m256i qy3 = _mm256_loadu_si256((const __m256i *) (y[i].qs +  96));
1133
0
            const __m256i qy4 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 128));
1134
1135
0
            qx0 = _mm256_maddubs_epi16(qx0, qy0);
1136
0
            qx1 = _mm256_maddubs_epi16(qx1, qy1);
1137
0
            qx2 = _mm256_maddubs_epi16(qx2, qy2);
1138
0
            qx3 = _mm256_maddubs_epi16(qx3, qy3);
1139
0
            qx4 = _mm256_maddubs_epi16(qx4, qy4);
1140
1141
0
            sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(qx0, qx1));
1142
0
            sumi1 = _mm256_add_epi16(sumi1, _mm256_add_epi16(qx2, qx3));
1143
0
            sumi2 = _mm256_add_epi16(sumi2, qx4);
1144
0
        }
1145
1146
        // last 16 qs bytes (5 elements per byte), along with the 4 qh bytes (4 elements per byte)
1147
0
        {
1148
0
            __m128i qx0 = _mm_loadu_si128((const __m128i *) (x[i].qs + 32));
1149
0
            uint32_t qh;
1150
0
            memcpy(&qh, x[i].qh, sizeof(qh)); // potentially unaligned
1151
0
            __m256i qx5_l = _mm256_cvtepu8_epi16(_mm_set1_epi32(qh));
1152
0
            __m128i qx1 = _mm_add_epi8(qx0, _mm_add_epi8(qx0, qx0)); // 1 * 3
1153
0
            __m128i qx2 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(qx0, 3), _mm_set1_epi8(-8)), qx0); // 1 * 9
1154
0
            __m128i qx3 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(qx1, 3), _mm_set1_epi8(-8)), qx1); // 3 * 9
1155
0
            __m128i qx4 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(qx2, 3), _mm_set1_epi8(-8)), qx2); // 9 * 9
1156
0
            __m256i qx01 = MM256_SET_M128I(qx1, qx0);
1157
0
            __m256i qx23 = MM256_SET_M128I(qx3, qx2);
1158
1159
            // avx2 does not have 8-bit multiplies, so 16-bit it is.
1160
0
            qx5_l = _mm256_mullo_epi16(qx5_l, _mm256_set_epi16(27, 27, 27, 27, 9, 9, 9, 9, 3, 3, 3, 3, 1, 1, 1, 1));
1161
0
            qx5_l = _mm256_and_si256(qx5_l, _mm256_set1_epi16(0xFF));
1162
0
            __m128i qx5 = _mm_packus_epi16(_mm256_castsi256_si128(qx5_l), _mm256_extracti128_si256(qx5_l, 1));
1163
1164
0
            __m256i qx45 = MM256_SET_M128I(qx5, qx4);
1165
1166
            // Cancel the +1 from avg so that it behaves like a halving add
1167
0
            qx01 = _mm256_subs_epu8(qx01, _mm256_set1_epi8(1));
1168
0
            qx23 = _mm256_subs_epu8(qx23, _mm256_set1_epi8(1));
1169
0
            qx45 = _mm256_subs_epu8(qx45, _mm256_set1_epi8(1));
1170
            // Multiply by 3 and get the top 2 bits
1171
0
            qx01 = _mm256_avg_epu8(qx01, _mm256_avg_epu8(qx01, _mm256_setzero_si256()));
1172
0
            qx23 = _mm256_avg_epu8(qx23, _mm256_avg_epu8(qx23, _mm256_setzero_si256()));
1173
0
            qx45 = _mm256_avg_epu8(qx45, _mm256_avg_epu8(qx45, _mm256_setzero_si256()));
1174
0
            qx01 = _mm256_and_si256(_mm256_srli_epi16(qx01, 6), _mm256_set1_epi8(3));
1175
0
            qx23 = _mm256_and_si256(_mm256_srli_epi16(qx23, 6), _mm256_set1_epi8(3));
1176
0
            qx45 = _mm256_and_si256(_mm256_srli_epi16(qx45, 6), _mm256_set1_epi8(3));
1177
1178
0
            const __m256i qy01 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 160));
1179
0
            const __m256i qy23 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 192));
1180
0
            const __m256i qy45 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 224));
1181
1182
0
            qx01 = _mm256_maddubs_epi16(qx01, qy01);
1183
0
            qx23 = _mm256_maddubs_epi16(qx23, qy23);
1184
0
            qx45 = _mm256_maddubs_epi16(qx45, qy45);
1185
1186
0
            sumi0 = _mm256_add_epi16(sumi0, qx01);
1187
0
            sumi1 = _mm256_add_epi16(sumi1, qx23);
1188
0
            sumi2 = _mm256_add_epi16(sumi2, qx45);
1189
0
        }
1190
1191
0
        const __m256i ysum = _mm256_loadu_si256((const __m256i *) y[i].bsums);
1192
0
        const __m256 d = _mm256_set1_ps(y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d));
1193
1194
0
        sumi0 = _mm256_sub_epi16(sumi0, ysum);
1195
0
        sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(sumi1, sumi2));
1196
0
        sumi0 = _mm256_madd_epi16(sumi0, _mm256_set1_epi16(1));
1197
1198
0
        sumf = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(sumi0), d), sumf);
1199
0
    }
1200
1201
0
    *s = hsum_float_8(sumf);
1202
1203
#else
1204
    UNUSED(x);
1205
    UNUSED(y);
1206
    UNUSED(nb);
1207
    ggml_vec_dot_tq1_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
1208
#endif
1209
0
}
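// NOTE (editor addition, illustrative only): tq1_0 packs ternary digits base-3, five per qs byte
// and four per qh byte. Repeatedly multiplying a packed byte by powers of 3 (the shift/mask/add
// sequence above) and then taking the top 2 bits after the avg_epu8 "multiply by 3" step
// extracts each digit as a value in {0,1,2}; subtracting the per-block q8 sums (y[i].bsums)
// afterwards shifts the digits back to {-1,0,+1} before the final scaling by d.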
1210
1211
0
void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
1212
0
    assert(nrc == 1);
1213
0
    UNUSED(nrc);
1214
0
    UNUSED(bx);
1215
0
    UNUSED(by);
1216
0
    UNUSED(bs);
1217
1218
0
    const block_tq2_0 * GGML_RESTRICT x = vx;
1219
0
    const block_q8_K  * GGML_RESTRICT y = vy;
1220
1221
0
    const int nb = n / QK_K;
1222
1223
0
#if defined(__AVX2__)
1224
0
    __m256 sumf = _mm256_setzero_ps();
1225
1226
0
    for (int i = 0; i < nb; ++i) {
1227
        // 16-bit sums, because 256*127 still fits
1228
0
        __m256i sumi0 = _mm256_setzero_si256();
1229
0
        __m256i sumi1 = _mm256_setzero_si256();
1230
1231
0
        for (size_t j = 0; j < sizeof(x->qs); j += 32) {
1232
0
            __m256i qx0 = _mm256_loadu_si256((const __m256i *) (x[i].qs + j));
1233
0
            __m256i qx1 = _mm256_srli_epi16(qx0, 2);
1234
0
            __m256i qx2 = _mm256_srli_epi16(qx0, 4);
1235
0
            __m256i qx3 = _mm256_srli_epi16(qx0, 6);
1236
1237
            // each 2-bit value is 0, 1 or 2 (3 should not occur in valid TQ2_0 data)
1238
0
            qx0 = _mm256_and_si256(qx0, _mm256_set1_epi8(3));
1239
0
            qx1 = _mm256_and_si256(qx1, _mm256_set1_epi8(3));
1240
0
            qx2 = _mm256_and_si256(qx2, _mm256_set1_epi8(3));
1241
0
            qx3 = _mm256_and_si256(qx3, _mm256_set1_epi8(3));
1242
1243
0
            const __m256i qy0 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 +  0));
1244
0
            const __m256i qy1 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 32));
1245
0
            const __m256i qy2 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 64));
1246
0
            const __m256i qy3 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 96));
1247
1248
0
            qx0 = _mm256_maddubs_epi16(qx0, qy0);
1249
0
            qx1 = _mm256_maddubs_epi16(qx1, qy1);
1250
0
            qx2 = _mm256_maddubs_epi16(qx2, qy2);
1251
0
            qx3 = _mm256_maddubs_epi16(qx3, qy3);
1252
1253
0
            sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(qx0, qx1));
1254
0
            sumi1 = _mm256_add_epi16(sumi1, _mm256_add_epi16(qx2, qx3));
1255
0
        }
1256
1257
0
        const __m256i ysum = _mm256_loadu_si256((const __m256i *) y[i].bsums);
1258
0
        const __m256 d = _mm256_set1_ps(y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d));
1259
1260
0
        sumi0 = _mm256_add_epi16(sumi0, sumi1);
1261
0
        sumi0 = _mm256_sub_epi16(sumi0, ysum);
1262
0
        sumi0 = _mm256_madd_epi16(sumi0, _mm256_set1_epi16(1));
1263
1264
0
        sumf = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(sumi0), d), sumf);
1265
0
    }
1266
1267
0
    *s = hsum_float_8(sumf);
1268
1269
#else
1270
    UNUSED(x);
1271
    UNUSED(y);
1272
    UNUSED(nb);
1273
    ggml_vec_dot_tq2_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
1274
#endif
1275
0
}
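TQ2_0 stores four 2-bit values per byte (the signed ternary weights offset by +1), and the j*4 offsets above pair each 2-bit plane of a 32-byte group with the next 32 q8 values. Here is a scalar sketch of the same per-block accumulation, assuming QK_K == 256 so that x->qs holds 64 bytes; the function name is illustrative.

#include <stdint.h>

// Scalar sketch of the integer part of the TQ2_0 x Q8_K block dot product.
static int32_t tq2_0_block_dot(const uint8_t *qs, const int8_t *q8) {
    int32_t sumi = 0, q8sum = 0;
    for (int j = 0; j < 64; j += 32) {            // same 32-byte steps as the loop above
        for (int k = 0; k < 4; ++k) {             // the four 2-bit planes of each byte
            for (int l = 0; l < 32; ++l) {
                const int q = (qs[j + l] >> (2*k)) & 3;   // 0, 1 or 2
                sumi  += q * q8[j*4 + k*32 + l];
                q8sum += q8[j*4 + k*32 + l];
            }
        }
    }
    return sumi - q8sum;   // stored values are weight + 1, so the q8 sum is subtracted once
}

The caller scales this integer result by y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d), which is what the float multiply-accumulate at the end of the AVX2 loop does, with y[i].bsums standing in for q8sum.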
1276
1277
0
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
1278
0
    assert(nrc == 1);
1279
0
    UNUSED(nrc);
1280
0
    UNUSED(bx);
1281
0
    UNUSED(by);
1282
0
    UNUSED(bs);
1283
1284
0
    const block_q2_K * GGML_RESTRICT x = vx;
1285
0
    const block_q8_K * GGML_RESTRICT y = vy;
1286
1287
0
    const int nb = n / QK_K;
1288
1289
0
#if defined __AVX2__
1290
1291
0
    const __m256i m3 = _mm256_set1_epi8(3);
1292
0
    const __m128i m4 = _mm_set1_epi8(0xF);
1293
1294
0
    __m256 acc = _mm256_setzero_ps();
1295
1296
0
    for (int i = 0; i < nb; ++i) {
1297
1298
0
        const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
1299
0
        const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
1300
1301
0
        const uint8_t * GGML_RESTRICT q2 = x[i].qs;
1302
0
        const int8_t  * GGML_RESTRICT q8 = y[i].qs;
1303
1304
0
        const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales);
1305
0
        const __m128i scales8 = _mm_and_si128(mins_and_scales, m4);
1306
0
        const __m128i mins8 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4);
1307
0
        const __m256i mins = _mm256_cvtepi8_epi16(mins8);
1308
0
        const __m256i prod = _mm256_madd_epi16(mins, _mm256_loadu_si256((const __m256i*)y[i].bsums));
1309
1310
0
        acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(prod), acc);
1311
1312
0
        const __m256i all_scales = _mm256_cvtepi8_epi16(scales8);
1313
0
        const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
1314
0
        const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
1315
0
        const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)};
1316
1317
0
        __m256i sumi = _mm256_setzero_si256();
1318
1319
0
        for (int j = 0; j < QK_K/128; ++j) {
1320
1321
0
            const __m256i q2bits = _mm256_loadu_si256((const __m256i*)q2); q2 += 32;
1322
1323
0
            const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
1324
0
            const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
1325
0
            const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
1326
0
            const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
1327
1328
0
            const __m256i q2_0 = _mm256_and_si256(q2bits, m3);
1329
0
            const __m256i q2_1 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 2), m3);
1330
0
            const __m256i q2_2 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 4), m3);
1331
0
            const __m256i q2_3 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 6), m3);
1332
1333
0
            __m256i p0 = _mm256_maddubs_epi16(q2_0, q8_0);
1334
0
            __m256i p1 = _mm256_maddubs_epi16(q2_1, q8_1);
1335
0
            __m256i p2 = _mm256_maddubs_epi16(q2_2, q8_2);
1336
0
            __m256i p3 = _mm256_maddubs_epi16(q2_3, q8_3);
1337
1338
0
            p0 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(0)), p0);
1339
0
            p1 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(1)), p1);
1340
0
            p2 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(2)), p2);
1341
0
            p3 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(3)), p3);
1342
1343
0
            p0 = _mm256_add_epi32(p0, p1);
1344
0
            p2 = _mm256_add_epi32(p2, p3);
1345
1346
0
            sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p0, p2));
1347
0
        }
1348
1349
0
        acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc);
1350
1351
0
    }
1352
1353
0
    *s = hsum_float_8(acc);
1354
1355
#elif defined __AVX__
1356
1357
    const __m128i m3 = _mm_set1_epi8(0x3);
1358
    const __m128i m4 = _mm_set1_epi8(0xF);
1359
    const __m128i m2 = _mm_set1_epi8(0x2);
1360
1361
    __m256 acc = _mm256_setzero_ps();
1362
1363
    for (int i = 0; i < nb; ++i) {
1364
1365
        const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
1366
        const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
1367
1368
        const uint8_t * GGML_RESTRICT q2 = x[i].qs;
1369
        const int8_t  * GGML_RESTRICT q8 = y[i].qs;
1370
1371
        // load mins and scales from block_q2_K.scales[QK_K/16]
1372
        const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales);
1373
        const __m128i scales16 = _mm_and_si128(mins_and_scales, m4);
1374
        const __m128i mins16 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4);
1375
        const __m128i mins_0 = _mm_cvtepi8_epi16(mins16);
1376
        const __m128i mins_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(mins16, mins16));
1377
1378
        // summs = y[i].bsums * (x[i].scales >> 4) in 16bits*8*2 to 32bits*4*2
1379
        const __m128i summs_0 = _mm_madd_epi16(mins_0, _mm_loadu_si128((const __m128i*)&y[i].bsums[0]));
1380
        const __m128i summs_1 = _mm_madd_epi16(mins_1, _mm_loadu_si128((const __m128i*)&y[i].bsums[8]));
1381
1382
        // sumf += -dmin * summs in 32bits*8
1383
        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(MM256_SET_M128I(summs_1, summs_0))), acc);
1384
1385
        const __m128i scales_0 = _mm_cvtepi8_epi16(scales16);
1386
        const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales16, scales16));
1387
        const __m128i scales[2] = { scales_0, scales_1 };
1388
1389
        __m128i sumi_0 = _mm_setzero_si128();
1390
        __m128i sumi_1 = _mm_setzero_si128();
1391
1392
        for (int j = 0; j < QK_K/128; ++j) {
1393
1394
            // load Q8 quants int8*16*8 from block_q8_K.qs[QK_K]
1395
            const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
1396
            const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
1397
            const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
1398
            const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
1399
            const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
1400
            const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
1401
            const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
1402
            const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
1403
1404
            // load 2bits*16*8 from block_q2_K.qs[QK_K/4]
1405
            __m128i q2bits = _mm_loadu_si128((const __m128i*)q2); q2 += 16;
1406
            const __m128i q2_0 = _mm_and_si128(q2bits, m3);
1407
            const __m128i q2_2 = _mm_and_si128(_mm_srli_epi16(q2bits, 2), m3);
1408
            const __m128i q2_4 = _mm_and_si128(_mm_srli_epi16(q2bits, 4), m3);
1409
            const __m128i q2_6 = _mm_and_si128(_mm_srli_epi16(q2bits, 6), m3);
1410
            q2bits = _mm_loadu_si128((const __m128i*)q2); q2 += 16;
1411
            const __m128i q2_1 = _mm_and_si128(q2bits, m3);
1412
            const __m128i q2_3 = _mm_and_si128(_mm_srli_epi16(q2bits, 2), m3);
1413
            const __m128i q2_5 = _mm_and_si128(_mm_srli_epi16(q2bits, 4), m3);
1414
            const __m128i q2_7 = _mm_and_si128(_mm_srli_epi16(q2bits, 6), m3);
1415
1416
            // isuml = q8[l] * ((q2[l] >> shift) & 3) in 8bits*16*8 to 16bits*8*8
1417
            __m128i p0 = _mm_maddubs_epi16(q2_0, q8_0);
1418
            __m128i p1 = _mm_maddubs_epi16(q2_1, q8_1);
1419
            __m128i p2 = _mm_maddubs_epi16(q2_2, q8_2);
1420
            __m128i p3 = _mm_maddubs_epi16(q2_3, q8_3);
1421
            __m128i p4 = _mm_maddubs_epi16(q2_4, q8_4);
1422
            __m128i p5 = _mm_maddubs_epi16(q2_5, q8_5);
1423
            __m128i p6 = _mm_maddubs_epi16(q2_6, q8_6);
1424
            __m128i p7 = _mm_maddubs_epi16(q2_7, q8_7);
1425
1426
            // isum += (x[i].scales[is++] & 0xF) * isuml in 16bits*8*8 to 32bits*4*8
1427
            __m128i shuffle = _mm_set1_epi16(0x0100);
1428
            p0 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p0);
1429
            shuffle = _mm_add_epi16(shuffle, m2);
1430
            p1 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p1);
1431
            shuffle = _mm_add_epi16(shuffle, m2);
1432
            p2 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p2);
1433
            shuffle = _mm_add_epi16(shuffle, m2);
1434
            p3 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p3);
1435
            shuffle = _mm_add_epi16(shuffle, m2);
1436
            p4 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p4);
1437
            shuffle = _mm_add_epi16(shuffle, m2);
1438
            p5 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p5);
1439
            shuffle = _mm_add_epi16(shuffle, m2);
1440
            p6 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p6);
1441
            shuffle = _mm_add_epi16(shuffle, m2);
1442
            p7 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p7);
1443
1444
            p0 = _mm_add_epi32(p0, p1);
1445
            p2 = _mm_add_epi32(p2, p3);
1446
            p4 = _mm_add_epi32(p4, p5);
1447
            p6 = _mm_add_epi32(p6, p7);
1448
1449
            // isum in 32bits*4*2
1450
            sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p0, p2));
1451
            sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p4, p6));
1452
        }
1453
1454
        // sumf += dall * isum - dmin * summs in 32bits
1455
        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
1456
        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dall), _mm256_cvtepi32_ps(sumi)), acc);
1457
    }
1458
1459
    *s = hsum_float_8(acc);
1460
1461
#else
1462
    UNUSED(x);
1463
    UNUSED(y);
1464
    UNUSED(nb);
1465
    ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
1466
#endif
1467
0
}
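Both Q2_K paths above rely on the block_q2_K scale packing: for each group of 16 weights, the low nibble of scales[g] is a 4-bit scale and the high nibble a 4-bit minimum, and the minimum term is folded in through y->bsums. Below is a scalar sketch of the per-block value the SIMD code computes, assuming QK_K == 256; the function name is illustrative and, unlike the code above, dmin is not pre-negated here.

#include <stdint.h>

// Scalar sketch: d = y->d * fp16(x->d), dmin = y->d * fp16(x->dmin).
static float q2_K_block_dot(float d, float dmin,
                            const uint8_t scales[16], const uint8_t *q2,
                            const int8_t *q8, const int16_t bsums[16]) {
    int32_t isum = 0, summs = 0;
    for (int g = 0; g < 16; ++g) {
        const int sc = scales[g] & 0xF;    // 4-bit group scale
        const int mn = scales[g] >> 4;     // 4-bit group minimum
        summs += mn * bsums[g];
        int32_t dot = 0;
        for (int l = 0; l < 16; ++l) {
            const int idx   = g*16 + l;
            const int shift = 2 * ((idx % 128) / 32);                     // which 2-bit field
            const int q     = (q2[(idx/128)*32 + (idx % 32)] >> shift) & 3;
            dot += q * q8[idx];
        }
        isum += sc * dot;
    }
    return d * isum - dmin * summs;
}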
1468
1469
0
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
1470
0
    assert(n % QK_K == 0);
1471
0
    assert(nrc == 1);
1472
0
    UNUSED(nrc);
1473
0
    UNUSED(bx);
1474
0
    UNUSED(by);
1475
0
    UNUSED(bs);
1476
1477
0
    const uint32_t kmask1 = 0x03030303;
1478
0
    const uint32_t kmask2 = 0x0f0f0f0f;
1479
1480
0
    const block_q3_K * GGML_RESTRICT x = vx;
1481
0
    const block_q8_K * GGML_RESTRICT y = vy;
1482
1483
0
    const int nb = n / QK_K;
1484
1485
0
#if defined __AVX2__
1486
1487
0
    const __m256i m3 = _mm256_set1_epi8(3);
1488
0
    const __m256i mone = _mm256_set1_epi8(1);
1489
0
    const __m128i m32 = _mm_set1_epi8(32);
1490
1491
0
    __m256 acc = _mm256_setzero_ps();
1492
1493
0
    uint32_t aux[3];
1494
1495
0
    for (int i = 0; i < nb; ++i) {
1496
1497
0
        const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
1498
1499
0
        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
1500
0
        const int8_t  * GGML_RESTRICT q8 = y[i].qs;
1501
1502
        // Set up scales
1503
0
        memcpy(aux, x[i].scales, 12);
1504
0
        __m128i scales128 = _mm_set_epi32(
1505
0
                ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4),
1506
0
                ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4),
1507
0
                (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4),
1508
0
                (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4));
1509
0
        scales128 = _mm_sub_epi8(scales128, m32);
1510
0
        const __m256i all_scales = _mm256_cvtepi8_epi16(scales128);
1511
0
        const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
1512
0
        const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
1513
0
        const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)};
1514
1515
        // high bits from block_q3_K.hmask[QK_K/8]
1516
0
        const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].hmask);
1517
1518
        // integer accumulator
1519
0
        __m256i sumi = _mm256_setzero_si256();
1520
1521
0
        int bit = 0;
1522
0
        int is  = 0;
1523
1524
0
        for (int j = 0; j < QK_K/128; ++j) {
1525
            // load low 2 bits
1526
0
            const __m256i q3bits = _mm256_loadu_si256((const __m256i*)q3); q3 += 32;
1527
1528
            // prepare low and high bits
1529
0
            const __m256i q3l_0 = _mm256_and_si256(q3bits, m3);
1530
0
            const __m256i q3h_0 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2);
1531
0
            ++bit;
1532
1533
0
            const __m256i q3l_1 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 2), m3);
1534
0
            const __m256i q3h_1 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2);
1535
0
            ++bit;
1536
1537
0
            const __m256i q3l_2 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 4), m3);
1538
0
            const __m256i q3h_2 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2);
1539
0
            ++bit;
1540
1541
0
            const __m256i q3l_3 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 6), m3);
1542
0
            const __m256i q3h_3 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2);
1543
0
            ++bit;
1544
1545
            // load Q8 quants
1546
0
            const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
1547
0
            const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
1548
0
            const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
1549
0
            const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
1550
1551
            // Dot product: the 2 low bits and the 1 high-bit correction are multiplied with q8 separately,
1552
            // so _mm256_maddubs_epi16 can be used, and the high-bit product is then subtracted. Because the mask
1553
            // is built with andnot, the high-bit term is 4 when the high bit is NOT set and 0 when it is set, giving quants in [-4, 3]
1554
0
            __m256i q8s_0 = _mm256_maddubs_epi16(q3h_0, q8_0);
1555
0
            __m256i q8s_1 = _mm256_maddubs_epi16(q3h_1, q8_1);
1556
0
            __m256i q8s_2 = _mm256_maddubs_epi16(q3h_2, q8_2);
1557
0
            __m256i q8s_3 = _mm256_maddubs_epi16(q3h_3, q8_3);
1558
1559
0
            __m256i p16_0 = _mm256_maddubs_epi16(q3l_0, q8_0);
1560
0
            __m256i p16_1 = _mm256_maddubs_epi16(q3l_1, q8_1);
1561
0
            __m256i p16_2 = _mm256_maddubs_epi16(q3l_2, q8_2);
1562
0
            __m256i p16_3 = _mm256_maddubs_epi16(q3l_3, q8_3);
1563
1564
0
            p16_0 = _mm256_sub_epi16(p16_0, q8s_0);
1565
0
            p16_1 = _mm256_sub_epi16(p16_1, q8s_1);
1566
0
            p16_2 = _mm256_sub_epi16(p16_2, q8s_2);
1567
0
            p16_3 = _mm256_sub_epi16(p16_3, q8s_3);
1568
1569
            // multiply with scales
1570
0
            p16_0 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 0)), p16_0);
1571
0
            p16_1 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 1)), p16_1);
1572
0
            p16_2 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 2)), p16_2);
1573
0
            p16_3 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 3)), p16_3);
1574
1575
            // accumulate
1576
0
            p16_0 = _mm256_add_epi32(p16_0, p16_1);
1577
0
            p16_2 = _mm256_add_epi32(p16_2, p16_3);
1578
0
            sumi  = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_2));
1579
1580
0
        }
1581
1582
        // multiply with block scale and accumulate
1583
0
        acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc);
1584
1585
0
    }
1586
1587
0
    *s = hsum_float_8(acc);
1588
1589
#elif defined __AVX__
1590
1591
    const __m128i m3 = _mm_set1_epi8(3);
1592
    const __m128i mone = _mm_set1_epi8(1);
1593
    const __m128i m32 = _mm_set1_epi8(32);
1594
    const __m128i m2 = _mm_set1_epi8(2);
1595
1596
    __m256 acc = _mm256_setzero_ps();
1597
1598
    const uint32_t *aux;
1599
1600
    for (int i = 0; i < nb; ++i) {
1601
1602
        const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
1603
1604
        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
1605
        const int8_t  * GGML_RESTRICT q8 = y[i].qs;
1606
1607
        // Set up scales
1608
        aux = (const uint32_t *)x[i].scales;
1609
        __m128i scales128 = _mm_set_epi32(
1610
                ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4),
1611
                ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4),
1612
                (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4),
1613
                (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4));
1614
        scales128 = _mm_sub_epi8(scales128, m32);
1615
        const __m128i scales_0 = _mm_cvtepi8_epi16(scales128);
1616
        const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales128, scales128));
1617
        const __m128i scales[2] = { scales_0, scales_1 };
1618
1619
        // high bit *128*2 from block_q3_K.hmask[QK_K/8]
1620
        const __m128i hbits_0 = _mm_loadu_si128((const __m128i*)&x[i].hmask[0]);
1621
        const __m128i hbits_1 = _mm_loadu_si128((const __m128i*)&x[i].hmask[16]);
1622
1623
        // integer accumulator
1624
        __m128i sumi_0 = _mm_setzero_si128();
1625
        __m128i sumi_1 = _mm_setzero_si128();
1626
1627
        for (int j = 0; j < QK_K/128; ++j) {
1628
            // load low 2 bits *64*2 from block_q3_K.qs[QK_K/4]
1629
            const __m128i q3bits_0 = _mm_loadu_si128((const __m128i*)q3); q3 += 16;
1630
            const __m128i q3bits_1 = _mm_loadu_si128((const __m128i*)q3); q3 += 16;
1631
1632
            // prepare low and high bits
1633
            const int bit = j << 2;
1634
1635
            const __m128i q3l_0 = _mm_and_si128(q3bits_0, m3);
1636
            const __m128i q3l_1 = _mm_and_si128(q3bits_1, m3);
1637
            const __m128i q3h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit)), bit), 2);
1638
            const __m128i q3h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit)), bit), 2);
1639
1640
            const __m128i q3l_2 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 2), m3);
1641
            const __m128i q3l_3 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 2), m3);
1642
            const __m128i q3h_2 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+1)), bit+1), 2);
1643
            const __m128i q3h_3 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+1)), bit+1), 2);
1644
1645
            const __m128i q3l_4 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 4), m3);
1646
            const __m128i q3l_5 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 4), m3);
1647
            const __m128i q3h_4 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+2)), bit+2), 2);
1648
            const __m128i q3h_5 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+2)), bit+2), 2);
1649
1650
            const __m128i q3l_6 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 6), m3);
1651
            const __m128i q3l_7 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 6), m3);
1652
            const __m128i q3h_6 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+3)), bit+3), 2);
1653
            const __m128i q3h_7 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+3)), bit+3), 2);
1654
1655
            // load Q8 quants from block_q8_K.qs[QK_K]
1656
            const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
1657
            const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
1658
            const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
1659
            const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
1660
            const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
1661
            const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
1662
            const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
1663
            const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
1664
1665
            // Dot product: the 2 low bits and the 1 high-bit correction are multiplied with q8 separately,
1666
            // so _mm_maddubs_epi16 can be used, and the high-bit product is then subtracted. Because the mask
1667
            // is built with andnot, the high-bit term is 4 when the high bit is NOT set and 0 when it is set, giving quants in [-4, 3]
1668
            __m128i q8s_0 = _mm_maddubs_epi16(q3h_0, q8_0);
1669
            __m128i q8s_1 = _mm_maddubs_epi16(q3h_1, q8_1);
1670
            __m128i q8s_2 = _mm_maddubs_epi16(q3h_2, q8_2);
1671
            __m128i q8s_3 = _mm_maddubs_epi16(q3h_3, q8_3);
1672
            __m128i q8s_4 = _mm_maddubs_epi16(q3h_4, q8_4);
1673
            __m128i q8s_5 = _mm_maddubs_epi16(q3h_5, q8_5);
1674
            __m128i q8s_6 = _mm_maddubs_epi16(q3h_6, q8_6);
1675
            __m128i q8s_7 = _mm_maddubs_epi16(q3h_7, q8_7);
1676
1677
            __m128i p16_0 = _mm_maddubs_epi16(q3l_0, q8_0);
1678
            __m128i p16_1 = _mm_maddubs_epi16(q3l_1, q8_1);
1679
            __m128i p16_2 = _mm_maddubs_epi16(q3l_2, q8_2);
1680
            __m128i p16_3 = _mm_maddubs_epi16(q3l_3, q8_3);
1681
            __m128i p16_4 = _mm_maddubs_epi16(q3l_4, q8_4);
1682
            __m128i p16_5 = _mm_maddubs_epi16(q3l_5, q8_5);
1683
            __m128i p16_6 = _mm_maddubs_epi16(q3l_6, q8_6);
1684
            __m128i p16_7 = _mm_maddubs_epi16(q3l_7, q8_7);
1685
1686
            p16_0 = _mm_sub_epi16(p16_0, q8s_0);
1687
            p16_1 = _mm_sub_epi16(p16_1, q8s_1);
1688
            p16_2 = _mm_sub_epi16(p16_2, q8s_2);
1689
            p16_3 = _mm_sub_epi16(p16_3, q8s_3);
1690
            p16_4 = _mm_sub_epi16(p16_4, q8s_4);
1691
            p16_5 = _mm_sub_epi16(p16_5, q8s_5);
1692
            p16_6 = _mm_sub_epi16(p16_6, q8s_6);
1693
            p16_7 = _mm_sub_epi16(p16_7, q8s_7);
1694
1695
            // multiply with scales
1696
            __m128i shuffle = _mm_set1_epi16(0x0100);
1697
            p16_0 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_0);
1698
            shuffle = _mm_add_epi16(shuffle, m2);
1699
            p16_1 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_1);
1700
            shuffle = _mm_add_epi16(shuffle, m2);
1701
            p16_2 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_2);
1702
            shuffle = _mm_add_epi16(shuffle, m2);
1703
            p16_3 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_3);
1704
            shuffle = _mm_add_epi16(shuffle, m2);
1705
            p16_4 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_4);
1706
            shuffle = _mm_add_epi16(shuffle, m2);
1707
            p16_5 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_5);
1708
            shuffle = _mm_add_epi16(shuffle, m2);
1709
            p16_6 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_6);
1710
            shuffle = _mm_add_epi16(shuffle, m2);
1711
            p16_7 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_7);
1712
1713
            // accumulate
1714
            p16_0 = _mm_add_epi32(p16_0, p16_1);
1715
            p16_2 = _mm_add_epi32(p16_2, p16_3);
1716
            p16_4 = _mm_add_epi32(p16_4, p16_5);
1717
            p16_6 = _mm_add_epi32(p16_6, p16_7);
1718
            sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
1719
            sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_4, p16_6));
1720
1721
        }
1722
1723
        // multiply with block scale and accumulate
1724
        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
1725
        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);
1726
1727
    }
1728
1729
    *s = hsum_float_8(acc);
1730
1731
#else
1732
    UNUSED(kmask1);
1733
    UNUSED(kmask2);
1734
    UNUSED(x);
1735
    UNUSED(y);
1736
    UNUSED(nb);
1737
    ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
1738
#endif
1739
0
}
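The andnot/shift sequence above reconstructs the signed Q3_K quants: two low bits from qs plus one high bit from hmask, where a cleared high bit means "subtract 4". Below is a scalar sketch of the same reconstruction, assuming QK_K == 256; the helper name is illustrative.

#include <stdint.h>

// Scalar sketch of the Q3_K quant at position idx (0..255) of a super-block.
static inline int q3_K_quant(const uint8_t *qs, const uint8_t *hmask, int idx) {
    const int shift = 2 * ((idx % 128) / 32);                  // 2-bit field within the byte
    const int low2  = (qs[(idx/128)*32 + (idx % 32)] >> shift) & 3;
    const int hbit  = (hmask[idx % 32] >> (idx / 32)) & 1;     // same bit walk as the ++bit steps above
    return low2 - (hbit ? 0 : 4);                              // value in [-4, 3]
}

The 12-byte scales field is unpacked by the kmask1/kmask2 expressions into sixteen 6-bit scales with a bias of 32 (hence the _mm_sub_epi8(scales128, m32)); the same 6-bit unpack reappears, together with mins, in the Q4_K and Q5_K kernels below.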
1740
1741
0
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
1742
0
    assert(n % QK_K == 0);
1743
0
    assert(nrc == 1);
1744
0
    UNUSED(nrc);
1745
0
    UNUSED(bx);
1746
0
    UNUSED(by);
1747
0
    UNUSED(bs);
1748
1749
0
    const block_q4_K * GGML_RESTRICT x = vx;
1750
0
    const block_q8_K * GGML_RESTRICT y = vy;
1751
1752
0
    const int nb = n / QK_K;
1753
1754
0
    static const uint32_t kmask1 = 0x3f3f3f3f;
1755
0
    static const uint32_t kmask2 = 0x0f0f0f0f;
1756
0
    static const uint32_t kmask3 = 0x03030303;
1757
1758
0
    uint32_t utmp[4];
1759
1760
0
#if defined __AVX2__
1761
1762
0
    const __m256i m4 = _mm256_set1_epi8(0xF);
1763
1764
0
    __m256 acc = _mm256_setzero_ps();
1765
0
    __m128 acc_m = _mm_setzero_ps();
1766
1767
0
    for (int i = 0; i < nb; ++i) {
1768
1769
0
        const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
1770
0
        const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
1771
1772
0
        memcpy(utmp, x[i].scales, 12);
1773
0
        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
1774
0
        const uint32_t uaux = utmp[1] & kmask1;
1775
0
        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
1776
0
        utmp[2] = uaux;
1777
0
        utmp[0] &= kmask1;
1778
1779
0
        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
1780
0
        const int8_t  * GGML_RESTRICT q8 = y[i].qs;
1781
1782
0
        const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]));
1783
1784
0
        const __m256i q8sums = _mm256_loadu_si256((const __m256i*)y[i].bsums);
1785
0
        const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1));
1786
0
        const __m128i prod = _mm_madd_epi16(_mm256_extracti128_si256(mins_and_scales, 1), q8s);
1787
0
        acc_m = _mm_fmadd_ps(_mm_set1_ps(dmin), _mm_cvtepi32_ps(prod), acc_m);
1788
1789
0
        const __m128i sc128  = _mm256_extracti128_si256(mins_and_scales, 0);
1790
0
        const __m256i scales = MM256_SET_M128I(sc128, sc128);
1791
1792
0
        __m256i sumi = _mm256_setzero_si256();
1793
1794
0
        for (int j = 0; j < QK_K/64; ++j) {
1795
1796
0
            const __m256i scale_l = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+0));
1797
0
            const __m256i scale_h = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+1));
1798
1799
0
            const __m256i q4bits = _mm256_loadu_si256((const __m256i*)q4); q4 += 32;
1800
0
            const __m256i q4l = _mm256_and_si256(q4bits, m4);
1801
0
            const __m256i q4h = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), m4);
1802
1803
0
            const __m256i q8l = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
1804
0
            __m256i p16l = _mm256_maddubs_epi16(q4l, q8l);
1805
0
            p16l = _mm256_madd_epi16(scale_l, p16l);
1806
1807
0
            const __m256i q8h = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
1808
0
            __m256i p16h = _mm256_maddubs_epi16(q4h, q8h);
1809
0
            p16h = _mm256_madd_epi16(scale_h, p16h);
1810
0
            const __m256i sumj = _mm256_add_epi32(p16l, p16h);
1811
1812
0
            sumi = _mm256_add_epi32(sumi, sumj);
1813
0
        }
1814
1815
0
        __m256 vd = _mm256_set1_ps(d);
1816
0
        acc = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi), acc);
1817
1818
0
    }
1819
1820
0
    acc_m = _mm_add_ps(acc_m, _mm_movehl_ps(acc_m, acc_m));
1821
0
    acc_m = _mm_add_ss(acc_m, _mm_movehdup_ps(acc_m));
1822
1823
0
    *s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m);
1824
1825
#elif defined __AVX__
1826
1827
    const __m128i m4 = _mm_set1_epi8(0xF);
1828
    const __m128i m2 = _mm_set1_epi8(0x2);
1829
1830
    __m256 acc = _mm256_setzero_ps();
1831
    __m128 acc_m = _mm_setzero_ps();
1832
1833
    for (int i = 0; i < nb; ++i) {
1834
1835
        const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
1836
        const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
1837
1838
        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
1839
        const int8_t  * GGML_RESTRICT q8 = y[i].qs;
1840
1841
        memcpy(utmp, x[i].scales, 12);
1842
        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
1843
        const uint32_t uaux = utmp[1] & kmask1;
1844
        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
1845
        utmp[2] = uaux;
1846
        utmp[0] &= kmask1;
1847
1848
        const __m128i utmps = _mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]);
1849
        const __m128i scales = _mm_cvtepu8_epi16(utmps);
1850
        const __m128i mins = _mm_cvtepu8_epi16(_mm_unpackhi_epi64(utmps, utmps));
1851
1852
        const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)&y[i].bsums[0]);
1853
        const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)&y[i].bsums[8]);
1854
        const __m128i q8s = _mm_hadd_epi16(q8sums_0, q8sums_1);
1855
        const __m128i prod = _mm_madd_epi16(mins, q8s);
1856
        acc_m = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(dmin), _mm_cvtepi32_ps(prod)), acc_m);
1857
1858
        __m128i sumi_0 = _mm_setzero_si128();
1859
        __m128i sumi_1 = _mm_setzero_si128();
1860
1861
        __m128i shuffle = _mm_set1_epi16(0x0100);
1862
        for (int j = 0; j < QK_K/64; ++j) {
1863
1864
            const __m128i scale_l = _mm_shuffle_epi8(scales, shuffle);
1865
            shuffle = _mm_add_epi16(shuffle, m2);
1866
            const __m128i scale_h = _mm_shuffle_epi8(scales, shuffle);
1867
            shuffle = _mm_add_epi16(shuffle, m2);
1868
1869
            __m128i q4bits = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
1870
            const __m128i q4l_0 = _mm_and_si128(q4bits, m4);
1871
            const __m128i q4h_0 = _mm_and_si128(_mm_srli_epi16(q4bits, 4), m4);
1872
            q4bits = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
1873
            const __m128i q4l_1 = _mm_and_si128(q4bits, m4);
1874
            const __m128i q4h_1 = _mm_and_si128(_mm_srli_epi16(q4bits, 4), m4);
1875
1876
            const __m128i q8l_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
1877
            __m128i p16l = _mm_maddubs_epi16(q4l_0, q8l_0);
1878
            p16l = _mm_madd_epi16(scale_l, p16l);
1879
            sumi_0 = _mm_add_epi32(sumi_0, p16l);
1880
            const __m128i q8l_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
1881
            p16l = _mm_maddubs_epi16(q4l_1, q8l_1);
1882
            p16l = _mm_madd_epi16(scale_l, p16l);
1883
            sumi_1 = _mm_add_epi32(sumi_1, p16l);
1884
1885
            const __m128i q8h_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
1886
            __m128i p16h = _mm_maddubs_epi16(q4h_0, q8h_0);
1887
            p16h = _mm_madd_epi16(scale_h, p16h);
1888
            sumi_0 = _mm_add_epi32(sumi_0, p16h);
1889
            const __m128i q8h_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
1890
            p16h = _mm_maddubs_epi16(q4h_1, q8h_1);
1891
            p16h = _mm_madd_epi16(scale_h, p16h);
1892
            sumi_1 = _mm_add_epi32(sumi_1, p16h);
1893
1894
        }
1895
1896
        __m256 vd = _mm256_set1_ps(d);
1897
        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
1898
        acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);
1899
1900
    }
1901
1902
    acc_m = _mm_add_ps(acc_m, _mm_movehl_ps(acc_m, acc_m));
1903
    acc_m = _mm_add_ss(acc_m, _mm_movehdup_ps(acc_m));
1904
1905
    *s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m);
1906
1907
#else
1908
    UNUSED(x);
1909
    UNUSED(y);
1910
    UNUSED(nb);
1911
    UNUSED(kmask1);
1912
    UNUSED(kmask2);
1913
    UNUSED(kmask3);
1914
    UNUSED(utmp);
1915
    ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
1916
#endif
1917
0
}
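The utmp/kmask shuffling above is a vector form of unpacking eight 6-bit scales and eight 6-bit mins (one pair per 32-weight group) from the 12-byte block_q4_K scales field. Below is a scalar sketch of the equivalent per-group unpack, derived from the byte arithmetic of the utmp expressions; the helper name is illustrative.

#include <stdint.h>

// Scalar view of the 6-bit scale/min unpack done with kmask1/kmask2/kmask3 above.
// Groups 0..3 store both values directly; groups 4..7 split them across the
// low/high nibbles of bytes 8..11 and the spare top two bits of bytes 0..7.
static inline void k4_scale_min(const uint8_t sc[12], int g, uint8_t *d, uint8_t *m) {
    if (g < 4) {
        *d = sc[g]     & 63;
        *m = sc[g + 4] & 63;
    } else {
        *d = (sc[g + 4] & 0xF) | ((sc[g - 4] >> 6) << 4);
        *m = (sc[g + 4] >>  4) | ((sc[g    ] >> 6) << 4);
    }
}

Each group of 32 nibbles then contributes d*scale_g*sum(q4*q8) - dmin*min_g*sum(q8), which is why the kernel keeps a separate accumulator (acc_m) fed from the per-16 bsums folded pairwise with _mm_hadd_epi16.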
1918
1919
0
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy,  size_t by, int nrc) {
1920
0
    assert(n % QK_K == 0);
1921
0
    assert(nrc == 1);
1922
0
    UNUSED(nrc);
1923
0
    UNUSED(bx);
1924
0
    UNUSED(by);
1925
0
    UNUSED(bs);
1926
1927
0
    const block_q5_K * GGML_RESTRICT x = vx;
1928
0
    const block_q8_K * GGML_RESTRICT y = vy;
1929
1930
0
    const int nb = n / QK_K;
1931
1932
0
    static const uint32_t kmask1 = 0x3f3f3f3f;
1933
0
    static const uint32_t kmask2 = 0x0f0f0f0f;
1934
0
    static const uint32_t kmask3 = 0x03030303;
1935
1936
0
    uint32_t utmp[4];
1937
1938
0
#if defined __AVX2__
1939
1940
0
    const __m256i m4 = _mm256_set1_epi8(0xF);
1941
0
    const __m128i mzero = _mm_setzero_si128();
1942
0
    const __m256i mone  = _mm256_set1_epi8(1);
1943
1944
0
    __m256 acc = _mm256_setzero_ps();
1945
1946
0
    float summs = 0.f;
1947
1948
0
    for (int i = 0; i < nb; ++i) {
1949
0
        const uint8_t * GGML_RESTRICT q5 = x[i].qs;
1950
0
        const int8_t  * GGML_RESTRICT q8 = y[i].qs;
1951
1952
0
        const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
1953
0
        const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
1954
1955
0
        memcpy(utmp, x[i].scales, 12);
1956
0
        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
1957
0
        const uint32_t uaux = utmp[1] & kmask1;
1958
0
        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
1959
0
        utmp[2] = uaux;
1960
0
        utmp[0] &= kmask1;
1961
1962
0
        const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]));
1963
1964
0
        const __m256i q8sums = _mm256_loadu_si256((const __m256i*)y[i].bsums);
1965
0
        const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1));
1966
0
        const __m128i prod = _mm_madd_epi16(_mm256_extracti128_si256(mins_and_scales, 1), q8s);
1967
0
        const __m128i hsum = _mm_hadd_epi32(_mm_hadd_epi32(prod, mzero), mzero);
1968
0
        summs += dmin * _mm_extract_epi32(hsum, 0);
1969
1970
0
        const __m128i sc128  = _mm256_extracti128_si256(mins_and_scales, 0);
1971
0
        const __m256i scales = MM256_SET_M128I(sc128, sc128);
1972
1973
0
        const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].qh);
1974
0
        __m256i hmask = mone;
1975
1976
0
        __m256i sumi = _mm256_setzero_si256();
1977
1978
0
        int bit = 0;
1979
1980
0
        for (int j = 0; j < QK_K/64; ++j) {
1981
1982
0
            const __m256i scale_0 = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+0));
1983
0
            const __m256i scale_1 = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+1));
1984
1985
0
            const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5); q5 += 32;
1986
1987
0
            const __m256i q5l_0 = _mm256_and_si256(q5bits, m4);
1988
0
            const __m256i q5h_0 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_and_si256(hbits, hmask), bit++), 4);
1989
0
            const __m256i q5_0  = _mm256_add_epi8(q5l_0, q5h_0);
1990
0
            hmask = _mm256_slli_epi16(hmask, 1);
1991
1992
0
            const __m256i q5l_1 = _mm256_and_si256(_mm256_srli_epi16(q5bits, 4), m4);
1993
0
            const __m256i q5h_1 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_and_si256(hbits, hmask), bit++), 4);
1994
0
            const __m256i q5_1  = _mm256_add_epi8(q5l_1, q5h_1);
1995
0
            hmask = _mm256_slli_epi16(hmask, 1);
1996
1997
0
            const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
1998
0
            const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
1999
2000
0
            __m256i p16_0 = _mm256_maddubs_epi16(q5_0, q8_0);
2001
0
            __m256i p16_1 = _mm256_maddubs_epi16(q5_1, q8_1);
2002
2003
0
            p16_0 = _mm256_madd_epi16(scale_0, p16_0);
2004
0
            p16_1 = _mm256_madd_epi16(scale_1, p16_1);
2005
2006
0
            sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_1));
2007
2008
0
        }
2009
2010
0
        __m256 vd = _mm256_set1_ps(d);
2011
0
        acc = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi), acc);
2012
2013
0
    }
2014
2015
0
    *s = hsum_float_8(acc) + summs;
2016
2017
#elif defined __AVX__
2018
2019
    const __m128i m4 = _mm_set1_epi8(0xF);
2020
    const __m128i mzero = _mm_setzero_si128();
2021
    const __m128i mone  = _mm_set1_epi8(1);
2022
    const __m128i m2 = _mm_set1_epi8(2);
2023
2024
    __m256 acc = _mm256_setzero_ps();
2025
2026
    float summs = 0.f;
2027
2028
    for (int i = 0; i < nb; ++i) {
2029
2030
        const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
2031
        const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
2032
2033
        const uint8_t * GGML_RESTRICT q5 = x[i].qs;
2034
        const int8_t  * GGML_RESTRICT q8 = y[i].qs;
2035
2036
        memcpy(utmp, x[i].scales, 12);
2037
        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
2038
        const uint32_t uaux = utmp[1] & kmask1;
2039
        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
2040
        utmp[2] = uaux;
2041
        utmp[0] &= kmask1;
2042
2043
        const __m128i utmps = _mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]);
2044
        const __m128i scales = _mm_cvtepu8_epi16(utmps);
2045
        const __m128i mins = _mm_cvtepu8_epi16(_mm_unpackhi_epi64(utmps, utmps));
2046
2047
        const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)&y[i].bsums[0]);
2048
        const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)&y[i].bsums[8]);
2049
        const __m128i q8s = _mm_hadd_epi16(q8sums_0, q8sums_1);
2050
        const __m128i prod = _mm_madd_epi16(mins, q8s);
2051
        const __m128i hsum = _mm_hadd_epi32(_mm_hadd_epi32(prod, mzero), mzero);
2052
        summs += dmin * _mm_extract_epi32(hsum, 0);
2053
2054
        const __m128i hbits_0 = _mm_loadu_si128((const __m128i*)&x[i].qh[0]);
2055
        const __m128i hbits_1 = _mm_loadu_si128((const __m128i*)&x[i].qh[16]);
2056
        __m128i hmask = mone;
2057
2058
        __m128i sumi_0 = _mm_setzero_si128();
2059
        __m128i sumi_1 = _mm_setzero_si128();
2060
2061
        int bit = 0;
2062
2063
        __m128i shuffle = _mm_set1_epi16(0x0100);
2064
        for (int j = 0; j < QK_K/64; ++j) {
2065
2066
            const __m128i scale_0 = _mm_shuffle_epi8(scales, shuffle);
2067
            shuffle = _mm_add_epi16(shuffle, m2);
2068
            const __m128i scale_1 = _mm_shuffle_epi8(scales, shuffle);
2069
            shuffle = _mm_add_epi16(shuffle, m2);
2070
2071
            const __m128i q5bits_0 = _mm_loadu_si128((const __m128i*)q5); q5 += 16;
2072
            const __m128i q5bits_1 = _mm_loadu_si128((const __m128i*)q5); q5 += 16;
2073
2074
            __m128i q5l_0 = _mm_and_si128(q5bits_0, m4);
2075
            __m128i q5l_1 = _mm_and_si128(q5bits_1, m4);
2076
            __m128i q5h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_0, hmask), bit), 4);
2077
            __m128i q5h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_1, hmask), bit++), 4);
2078
            __m128i q5_0  = _mm_add_epi8(q5l_0, q5h_0);
2079
            __m128i q5_1  = _mm_add_epi8(q5l_1, q5h_1);
2080
            hmask = _mm_slli_epi16(hmask, 1);
2081
2082
            __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
2083
            __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
2084
            __m128i p16_0 = _mm_maddubs_epi16(q5_0, q8_0);
2085
            __m128i p16_1 = _mm_maddubs_epi16(q5_1, q8_1);
2086
            p16_0 = _mm_madd_epi16(scale_0, p16_0);
2087
            p16_1 = _mm_madd_epi16(scale_0, p16_1);
2088
2089
            q5l_0 = _mm_and_si128(_mm_srli_epi16(q5bits_0, 4), m4);
2090
            q5l_1 = _mm_and_si128(_mm_srli_epi16(q5bits_1, 4), m4);
2091
            q5h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_0, hmask), bit), 4);
2092
            q5h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_1, hmask), bit++), 4);
2093
            q5_0  = _mm_add_epi8(q5l_0, q5h_0);
2094
            q5_1  = _mm_add_epi8(q5l_1, q5h_1);
2095
            hmask = _mm_slli_epi16(hmask, 1);
2096
2097
            q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
2098
            q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
2099
            __m128i p16_2 = _mm_maddubs_epi16(q5_0, q8_0);
2100
            __m128i p16_3 = _mm_maddubs_epi16(q5_1, q8_1);
2101
            p16_2 = _mm_madd_epi16(scale_1, p16_2);
2102
            p16_3 = _mm_madd_epi16(scale_1, p16_3);
2103
2104
            sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
2105
            sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));
2106
2107
        }
2108
2109
        __m256 vd = _mm256_set1_ps(d);
2110
        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
2111
        acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);
2112
2113
    }
2114
2115
    *s = hsum_float_8(acc) + summs;
2116
2117
#else
2118
    UNUSED(x);
2119
    UNUSED(y);
2120
    UNUSED(nb);
2121
    UNUSED(kmask1);
2122
    UNUSED(kmask2);
2123
    UNUSED(kmask3);
2124
    UNUSED(utmp);
2125
    ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
2126
#endif
2127
0
}
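Q5_K extends the Q4_K layout with one extra bit per weight stored in qh; the loop above shifts that bit into position 4 with a moving hmask. Below is a scalar sketch of the 5-bit reconstruction, assuming QK_K == 256; the helper name is illustrative.

#include <stdint.h>

// Scalar sketch of the Q5_K quant at position idx (0..255): a 4-bit nibble from
// qs plus bit idx/32 of qh[idx % 32] as the fifth bit.
static inline int q5_K_quant(const uint8_t *qs, const uint8_t *qh, int idx) {
    const uint8_t byte = qs[(idx/64)*32 + (idx % 32)];
    const int nib  = ((idx % 64) < 32) ? (byte & 0xF) : (byte >> 4);
    const int hbit = (qh[idx % 32] >> (idx / 32)) & 1;
    return nib | (hbit << 4);   // unsigned value in [0, 31], scaled as d*sc*q - dmin*m
}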
2128
2129
0
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
2130
0
    assert(n % QK_K == 0);
2131
0
    assert(nrc == 1);
2132
0
    UNUSED(nrc);
2133
0
    UNUSED(bx);
2134
0
    UNUSED(by);
2135
0
    UNUSED(bs);
2136
2137
0
    const block_q6_K * GGML_RESTRICT x = vx;
2138
0
    const block_q8_K * GGML_RESTRICT y = vy;
2139
2140
0
    const int nb = n / QK_K;
2141
2142
0
#if defined __AVX2__
2143
2144
0
    const __m256i m4 = _mm256_set1_epi8(0xF);
2145
0
    const __m256i m2 = _mm256_set1_epi8(3);
2146
0
    const __m256i m32s = _mm256_set1_epi8(32);
2147
2148
0
    __m256 acc = _mm256_setzero_ps();
2149
2150
0
    for (int i = 0; i < nb; ++i) {
2151
2152
0
        const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
2153
2154
0
        const uint8_t * GGML_RESTRICT q4 = x[i].ql;
2155
0
        const uint8_t * GGML_RESTRICT qh = x[i].qh;
2156
0
        const int8_t  * GGML_RESTRICT q8 = y[i].qs;
2157
2158
0
        const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales);
2159
2160
0
        __m256i sumi = _mm256_setzero_si256();
2161
2162
0
        int is = 0;
2163
2164
0
        for (int j = 0; j < QK_K/128; ++j) {
2165
2166
0
            const __m128i scale_0 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 0));
2167
0
            const __m128i scale_1 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 1));
2168
0
            const __m128i scale_2 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 2));
2169
0
            const __m128i scale_3 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 3));
2170
0
            is += 4;
2171
2172
0
            const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4); q4 += 32;
2173
0
            const __m256i q4bits2 = _mm256_loadu_si256((const __m256i*)q4); q4 += 32;
2174
0
            const __m256i q4bitsH = _mm256_loadu_si256((const __m256i*)qh); qh += 32;
2175
2176
0
            const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(q4bitsH, m2), 4);
2177
0
            const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 2), m2), 4);
2178
0
            const __m256i q4h_2 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 4), m2), 4);
2179
0
            const __m256i q4h_3 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 6), m2), 4);
2180
2181
0
            const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m4), q4h_0);
2182
0
            const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(q4bits2, m4), q4h_1);
2183
0
            const __m256i q4_2 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m4), q4h_2);
2184
0
            const __m256i q4_3 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits2, 4), m4), q4h_3);
2185
2186
0
            const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
2187
0
            const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
2188
0
            const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
2189
0
            const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
2190
2191
0
            __m256i q8s_0 = _mm256_maddubs_epi16(m32s, q8_0);
2192
0
            __m256i q8s_1 = _mm256_maddubs_epi16(m32s, q8_1);
2193
0
            __m256i q8s_2 = _mm256_maddubs_epi16(m32s, q8_2);
2194
0
            __m256i q8s_3 = _mm256_maddubs_epi16(m32s, q8_3);
2195
2196
0
            __m256i p16_0 = _mm256_maddubs_epi16(q4_0, q8_0);
2197
0
            __m256i p16_1 = _mm256_maddubs_epi16(q4_1, q8_1);
2198
0
            __m256i p16_2 = _mm256_maddubs_epi16(q4_2, q8_2);
2199
0
            __m256i p16_3 = _mm256_maddubs_epi16(q4_3, q8_3);
2200
2201
0
            p16_0 = _mm256_sub_epi16(p16_0, q8s_0);
2202
0
            p16_1 = _mm256_sub_epi16(p16_1, q8s_1);
2203
0
            p16_2 = _mm256_sub_epi16(p16_2, q8s_2);
2204
0
            p16_3 = _mm256_sub_epi16(p16_3, q8s_3);
2205
2206
0
            p16_0 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_0), p16_0);
2207
0
            p16_1 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_1), p16_1);
2208
0
            p16_2 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_2), p16_2);
2209
0
            p16_3 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_3), p16_3);
2210
2211
0
            sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_1));
2212
0
            sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_2, p16_3));
2213
2214
0
        }
2215
2216
0
        acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc);
2217
0
    }
2218
2219
0
    *s = hsum_float_8(acc);
2220
2221
#elif defined __AVX__
2222
2223
    const __m128i m3 = _mm_set1_epi8(3);
2224
    const __m128i m15 = _mm_set1_epi8(15);
2225
2226
    __m256 acc = _mm256_setzero_ps();
2227
2228
    for (int i = 0; i < nb; ++i) {
2229
2230
        const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
2231
2232
        const uint8_t * GGML_RESTRICT q4 = x[i].ql;
2233
        const uint8_t * GGML_RESTRICT qh = x[i].qh;
2234
        const int8_t  * GGML_RESTRICT q8 = y[i].qs;
2235
2236
        // handle the q6_k -32 offset separately using bsums
2237
        const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)y[i].bsums);
2238
        const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)y[i].bsums + 1);
2239
        const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales);
2240
        const __m128i scales_16_0 = _mm_cvtepi8_epi16(scales);
2241
        const __m128i scales_16_1 = _mm_cvtepi8_epi16(_mm_bsrli_si128(scales, 8));
2242
        const __m128i q8sclsub_0 = _mm_slli_epi32(_mm_madd_epi16(q8sums_0, scales_16_0), 5);
2243
        const __m128i q8sclsub_1 = _mm_slli_epi32(_mm_madd_epi16(q8sums_1, scales_16_1), 5);
2244
2245
        __m128i sumi_0 = _mm_setzero_si128();
2246
        __m128i sumi_1 = _mm_setzero_si128();
2247
2248
        int is = 0;
2249
2250
        for (int j = 0; j < QK_K/128; ++j) {
2251
2252
            const __m128i q4bitsH_0 = _mm_loadu_si128((const __m128i*)qh); qh += 16;
2253
            const __m128i q4bitsH_1 = _mm_loadu_si128((const __m128i*)qh); qh += 16;
2254
2255
            const __m128i q4h_0 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, m3), 4);
2256
            const __m128i q4h_1 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, m3), 4);
2257
            const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, _mm_set1_epi8(12)), 2);
2258
            const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, _mm_set1_epi8(12)), 2);
2259
            const __m128i q4h_4 = _mm_and_si128(q4bitsH_0, _mm_set1_epi8(48));
2260
            const __m128i q4h_5 = _mm_and_si128(q4bitsH_1, _mm_set1_epi8(48));
2261
            const __m128i q4h_6 = _mm_srli_epi16(_mm_and_si128(q4bitsH_0, _mm_set1_epi8(-64)), 2);
2262
            const __m128i q4h_7 = _mm_srli_epi16(_mm_and_si128(q4bitsH_1, _mm_set1_epi8(-64)), 2);
2263
2264
            const __m128i q4bits1_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
2265
            const __m128i q4bits1_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
2266
            const __m128i q4bits2_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
2267
            const __m128i q4bits2_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
2268
2269
            const __m128i q4_0 = _mm_or_si128(_mm_and_si128(q4bits1_0, m15), q4h_0);
2270
            const __m128i q4_1 = _mm_or_si128(_mm_and_si128(q4bits1_1, m15), q4h_1);
2271
            const __m128i q4_2 = _mm_or_si128(_mm_and_si128(q4bits2_0, m15), q4h_2);
2272
            const __m128i q4_3 = _mm_or_si128(_mm_and_si128(q4bits2_1, m15), q4h_3);
2273
            const __m128i q4_4 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_0, 4), m15), q4h_4);
2274
            const __m128i q4_5 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_1, 4), m15), q4h_5);
2275
            const __m128i q4_6 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_0, 4), m15), q4h_6);
2276
            const __m128i q4_7 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_1, 4), m15), q4h_7);
2277
2278
            const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
2279
            const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
2280
            const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
2281
            const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
2282
            const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
2283
            const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
2284
            const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
2285
            const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
2286
2287
            __m128i p16_0 = _mm_maddubs_epi16(q4_0, q8_0);
2288
            __m128i p16_1 = _mm_maddubs_epi16(q4_1, q8_1);
2289
            __m128i p16_2 = _mm_maddubs_epi16(q4_2, q8_2);
2290
            __m128i p16_3 = _mm_maddubs_epi16(q4_3, q8_3);
2291
            __m128i p16_4 = _mm_maddubs_epi16(q4_4, q8_4);
2292
            __m128i p16_5 = _mm_maddubs_epi16(q4_5, q8_5);
2293
            __m128i p16_6 = _mm_maddubs_epi16(q4_6, q8_6);
2294
            __m128i p16_7 = _mm_maddubs_epi16(q4_7, q8_7);
2295
2296
            const __m128i scale_0 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 0));
2297
            const __m128i scale_1 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 1));
2298
            const __m128i scale_2 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 2));
2299
            const __m128i scale_3 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 3));
2300
            is += 4;
2301
2302
            p16_0 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_0), p16_0);
2303
            p16_1 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_0, 8)), p16_1);
2304
            p16_2 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_1), p16_2);
2305
            p16_3 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_1, 8)), p16_3);
2306
            p16_4 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_2), p16_4);
2307
            p16_5 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_2, 8)), p16_5);
2308
            p16_6 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_3), p16_6);
2309
            p16_7 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_3, 8)), p16_7);
2310
2311
            sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
2312
            sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));
2313
            sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_4, p16_6));
2314
            sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_5, p16_7));
2315
2316
        }
2317
2318
        sumi_0 = _mm_sub_epi32(sumi_0, q8sclsub_0);
2319
        sumi_1 = _mm_sub_epi32(sumi_1, q8sclsub_1);
2320
        const __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
2321
        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sumi)), acc);
2322
    }
2323
2324
    *s = hsum_float_8(acc);
2325
2326
#else
2327
    UNUSED(x);
2328
    UNUSED(y);
2329
    UNUSED(nb);
2330
    ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
2331
#endif
2332
0
}
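Q6_K packs each weight as a low nibble in ql plus two high bits in qh, with a -32 offset; the AVX2 path removes the offset through _mm256_maddubs_epi16(m32s, q8), while the AVX path removes it at the end through the bsums*scales term shifted left by 5 (that is, multiplied by 32). Below is a scalar sketch of the reconstruction, assuming QK_K == 256; the helper name is illustrative.

#include <stdint.h>

// Scalar sketch of the Q6_K quant at position idx (0..255) of a super-block.
static inline int q6_K_quant(const uint8_t *ql, const uint8_t *qh, int idx) {
    const int c   = idx / 128;         // which 128-weight half of the super-block
    const int sub = (idx % 128) / 32;  // which of the four 32-weight strips
    const int l   = idx % 32;
    const uint8_t lowbyte = ql[c*64 + (sub & 1)*32 + l];
    const int lo  = (sub & 2) ? (lowbyte >> 4) : (lowbyte & 0xF);
    const int hi  = (qh[c*32 + l] >> (2*sub)) & 3;
    return (lo | (hi << 4)) - 32;      // signed value in [-32, 31]
}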
2333
2334
#if defined (__AVX__) || defined (__AVX2__)
2335
static const int8_t keven_signs_q2xs[1024] = {
2336
     1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1, -1,  1, -1,  1,  1,  1,  1,  1, -1, -1, -1,  1,  1,  1,  1,  1,  1,
2337
     1,  1, -1,  1,  1,  1,  1, -1, -1,  1, -1,  1,  1,  1,  1,  1,  1, -1, -1,  1,  1,  1,  1,  1, -1, -1, -1,  1,  1,  1,  1, -1,
2338
     1,  1,  1, -1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1,  1,  1,  1, -1,  1, -1,  1,  1,  1,  1, -1, -1,  1, -1,  1,  1,  1, -1,
2339
     1,  1, -1, -1,  1,  1,  1,  1, -1,  1, -1, -1,  1,  1,  1, -1,  1, -1, -1, -1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1,  1,
2340
     1,  1,  1,  1, -1,  1,  1, -1, -1,  1,  1,  1, -1,  1,  1,  1,  1, -1,  1,  1, -1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1, -1,
2341
     1,  1, -1,  1, -1,  1,  1,  1, -1,  1, -1,  1, -1,  1,  1, -1,  1, -1, -1,  1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1,  1,  1,
2342
     1,  1,  1, -1, -1,  1,  1,  1, -1,  1,  1, -1, -1,  1,  1, -1,  1, -1,  1, -1, -1,  1,  1, -1, -1, -1,  1, -1, -1,  1,  1,  1,
2343
     1,  1, -1, -1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1,  1,  1, -1, -1, -1, -1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1, -1,
2344
     1,  1,  1,  1,  1, -1,  1, -1, -1,  1,  1,  1,  1, -1,  1,  1,  1, -1,  1,  1,  1, -1,  1,  1, -1, -1,  1,  1,  1, -1,  1, -1,
2345
     1,  1, -1,  1,  1, -1,  1,  1, -1,  1, -1,  1,  1, -1,  1, -1,  1, -1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1,  1, -1,  1,  1,
2346
     1,  1,  1, -1,  1, -1,  1,  1, -1,  1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1,  1, -1,  1,  1,
2347
     1,  1, -1, -1,  1, -1,  1, -1, -1,  1, -1, -1,  1, -1,  1,  1,  1, -1, -1, -1,  1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1, -1,
2348
     1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1,  1, -1, -1,  1, -1,  1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1, -1, -1,  1,  1,
2349
     1,  1, -1,  1, -1, -1,  1, -1, -1,  1, -1,  1, -1, -1,  1,  1,  1, -1, -1,  1, -1, -1,  1,  1, -1, -1, -1,  1, -1, -1,  1, -1,
2350
     1,  1,  1, -1, -1, -1,  1, -1, -1,  1,  1, -1, -1, -1,  1,  1,  1, -1,  1, -1, -1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1, -1,
2351
     1,  1, -1, -1, -1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1, -1,  1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1,  1,
2352
     1,  1,  1,  1,  1,  1, -1, -1, -1,  1,  1,  1,  1,  1, -1,  1,  1, -1,  1,  1,  1,  1, -1,  1, -1, -1,  1,  1,  1,  1, -1, -1,
2353
     1,  1, -1,  1,  1,  1, -1,  1, -1,  1, -1,  1,  1,  1, -1, -1,  1, -1, -1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1, -1,  1,
2354
     1,  1,  1, -1,  1,  1, -1,  1, -1,  1,  1, -1,  1,  1, -1, -1,  1, -1,  1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1,  1, -1,  1,
2355
     1,  1, -1, -1,  1,  1, -1, -1, -1,  1, -1, -1,  1,  1, -1,  1,  1, -1, -1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1,  1, -1, -1,
2356
     1,  1,  1,  1, -1,  1, -1,  1, -1,  1,  1,  1, -1,  1, -1, -1,  1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1,  1, -1,  1, -1,  1,
2357
     1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1,  1, -1,  1, -1,  1,  1, -1, -1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1,  1, -1, -1,
2358
     1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1, -1, -1,  1, -1,  1,  1, -1,  1, -1, -1,  1, -1,  1, -1, -1,  1, -1, -1,  1, -1, -1,
2359
     1,  1, -1, -1, -1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1, -1,  1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1,  1,
2360
     1,  1,  1,  1,  1, -1, -1,  1, -1,  1,  1,  1,  1, -1, -1, -1,  1, -1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1, -1, -1,  1,
2361
     1,  1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1,  1, -1, -1,  1,  1, -1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1, -1, -1, -1,
2362
     1,  1,  1, -1,  1, -1, -1, -1, -1,  1,  1, -1,  1, -1, -1,  1,  1, -1,  1, -1,  1, -1, -1,  1, -1, -1,  1, -1,  1, -1, -1, -1,
2363
     1,  1, -1, -1,  1, -1, -1,  1, -1,  1, -1, -1,  1, -1, -1, -1,  1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1,  1,
2364
     1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1, -1, -1, -1,  1,  1, -1,  1,  1, -1, -1, -1,  1, -1, -1,  1,  1, -1, -1, -1, -1,
2365
     1,  1, -1,  1, -1, -1, -1,  1, -1,  1, -1,  1, -1, -1, -1, -1,  1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1,  1,
2366
     1,  1,  1, -1, -1, -1, -1,  1, -1,  1,  1, -1, -1, -1, -1, -1,  1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1,  1,
2367
     1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1,  1,  1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1,
2368
};
2369
#endif
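For reference, a minimal scalar sketch (not part of the instrumented source) of the sign expansion that the kernels below obtain by indexing keven_signs_q2xs as 64-bit entries (signs64[idx & 127]): a 7-bit index selects the first seven signs, and the eighth is implied so that every group of eight contains an even number of -1s, which is exactly the pattern visible in the table above.

static inline void expand_even_signs_ref(uint32_t idx7, int8_t signs[8]) {
    // idx7 is the 7-bit field extracted with (aux32 >> 7*k) & 127 below
    int parity = 0;
    for (int j = 0; j < 7; ++j) {
        const int bit = (idx7 >> j) & 1;
        signs[j] = bit ? -1 : 1;
        parity ^= bit;
    }
    signs[7] = parity ? -1 : 1; // complete the group to even parity
}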
2370
2371
0
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
2372
0
    assert(n % QK_K == 0);
2373
0
    assert(nrc == 1);
2374
0
    UNUSED(nrc);
2375
0
    UNUSED(bx);
2376
0
    UNUSED(by);
2377
0
    UNUSED(bs);
2378
2379
0
    const block_iq2_xxs * GGML_RESTRICT x = vx;
2380
0
    const block_q8_K    * GGML_RESTRICT y = vy;
2381
2382
0
    const int nb = n / QK_K;
2383
2384
0
#if defined(__AVX2__)
2385
2386
0
    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
2387
2388
0
    uint32_t aux32[4];
2389
0
    const uint8_t * aux8 = (const uint8_t *)aux32;
2390
2391
0
    __m256 accumf = _mm256_setzero_ps();
2392
0
    for (int i = 0; i < nb; ++i) {
2393
0
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
2394
0
        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
2395
0
        const int8_t   * GGML_RESTRICT q8 = y[i].qs;
2396
0
        __m256i sumi1 = _mm256_setzero_si256();
2397
0
        __m256i sumi2 = _mm256_setzero_si256();
2398
0
        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
2399
0
            const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
2400
0
            const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
2401
0
            memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8;
2402
0
            const __m256i q2_1 = _mm256_set_epi64x(iq2xxs_grid[aux8[ 3]], iq2xxs_grid[aux8[ 2]], iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]);
2403
0
            const __m256i q2_2 = _mm256_set_epi64x(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]], iq2xxs_grid[aux8[9]], iq2xxs_grid[aux8[8]]);
2404
0
            const __m256i s2_1 = _mm256_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127],
2405
0
                                                   signs64[(aux32[1] >>  7) & 127], signs64[(aux32[1] >>  0) & 127]);
2406
0
            const __m256i s2_2 = _mm256_set_epi64x(signs64[(aux32[3] >> 21) & 127], signs64[(aux32[3] >> 14) & 127],
2407
0
                                                   signs64[(aux32[3] >>  7) & 127], signs64[(aux32[3] >>  0) & 127]);
2408
0
            const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1);
2409
0
            const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2);
2410
0
            const __m256i dot1  = _mm256_maddubs_epi16(q2_1, q8s_1);
2411
0
            const __m256i dot2  = _mm256_maddubs_epi16(q2_2, q8s_2);
2412
0
            const uint16_t ls1 = aux32[1] >> 28;
2413
0
            const uint16_t ls2 = aux32[3] >> 28;
2414
0
            const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1));
2415
0
            const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1));
2416
0
            sumi1 = _mm256_add_epi32(sumi1, p1);
2417
0
            sumi2 = _mm256_add_epi32(sumi2, p2);
2418
0
        }
2419
2420
0
        accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
2421
2422
0
    }
2423
2424
0
    *s = 0.125f * hsum_float_8(accumf);
2425
2426
#elif defined(__AVX__)
2427
    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
2428
2429
    uint32_t aux32[4];
2430
    const uint8_t * aux8 = (const uint8_t *)aux32;
2431
2432
    __m256 accumf = _mm256_setzero_ps();
2433
    for (int i = 0; i < nb; ++i) {
2434
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
2435
        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
2436
        const int8_t   * GGML_RESTRICT q8 = y[i].qs;
2437
        __m128i sumi1_0 = _mm_setzero_si128();
2438
        __m128i sumi1_1 = _mm_setzero_si128();
2439
        __m128i sumi2_0 = _mm_setzero_si128();
2440
        __m128i sumi2_1 = _mm_setzero_si128();
2441
        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
2442
            const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
2443
            const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
2444
            const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
2445
            const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
2446
            memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8;
2447
            const __m128i q2_1_0 = _mm_set_epi64x(iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]);
2448
            const __m128i q2_1_1 = _mm_set_epi64x(iq2xxs_grid[aux8[3]], iq2xxs_grid[aux8[2]]);
2449
            const __m128i q2_2_0 = _mm_set_epi64x(iq2xxs_grid[aux8[9]], iq2xxs_grid[aux8[8]]);
2450
            const __m128i q2_2_1 = _mm_set_epi64x(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]]);
2451
            const __m128i s2_1_0 = _mm_set_epi64x(signs64[(aux32[1] >>  7) & 127], signs64[(aux32[1] >>  0) & 127]);
2452
            const __m128i s2_1_1 = _mm_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127]);
2453
            const __m128i s2_2_0 = _mm_set_epi64x(signs64[(aux32[3] >>  7) & 127], signs64[(aux32[3] >>  0) & 127]);
2454
            const __m128i s2_2_1 = _mm_set_epi64x(signs64[(aux32[3] >> 21) & 127], signs64[(aux32[3] >> 14) & 127]);
2455
            const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, s2_1_0);
2456
            const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, s2_1_1);
2457
            const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, s2_2_0);
2458
            const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, s2_2_1);
2459
            const __m128i dot1_0  = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
2460
            const __m128i dot1_1  = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
2461
            const __m128i dot2_0  = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
2462
            const __m128i dot2_1  = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
2463
            const uint16_t ls1 = aux32[1] >> 28;
2464
            const uint16_t ls2 = aux32[3] >> 28;
2465
            const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1));
2466
            const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1));
2467
            const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1));
2468
            const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1));
2469
            sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
2470
            sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
2471
            sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
2472
            sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
2473
        }
2474
2475
        accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
2476
2477
    }
2478
2479
    *s = 0.125f * hsum_float_8(accumf);
2480
2481
#else
2482
    UNUSED(x);
2483
    UNUSED(y);
2484
    UNUSED(nb);
2485
    ggml_vec_dot_iq2_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
2486
#endif
2487
0
}
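A minimal scalar sketch of the per-sub-block math that the AVX2/AVX paths above vectorize, assuming the 32 grid bytes and signs have already been unpacked (the kernel does this with the grid and sign tables); ls is the 4-bit scale taken from the top of the corresponding aux32 word. The row result accumulates d times these terms, with the final 0.125f factor applied once to *s.

static int iq2_xxs_subblock_dot_ref(const uint8_t grid[32],  // unpacked grid bytes
                                    const int8_t  sign[32],  // +1/-1 from the even-sign table
                                    const int8_t  q8[32],    // q8_K quants
                                    int ls) {                // 4-bit scale (aux32 >> 28)
    int sumi = 0;
    for (int j = 0; j < 32; ++j) {
        sumi += (int)grid[j] * sign[j] * q8[j];
    }
    return (2*ls + 1) * sumi;
}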
2488
2489
0
void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
2490
0
    assert(n % QK_K == 0);
2491
0
    assert(nrc == 1);
2492
0
    UNUSED(nrc);
2493
0
    UNUSED(bx);
2494
0
    UNUSED(by);
2495
0
    UNUSED(bs);
2496
2497
0
    const block_iq2_xs * GGML_RESTRICT x = vx;
2498
0
    const block_q8_K   * GGML_RESTRICT y = vy;
2499
2500
0
    const int nb = n / QK_K;
2501
2502
0
#if defined(__AVX2__)
2503
2504
0
    const __m256i mone = _mm256_set1_epi8(1);
2505
0
    static const char block_sign_shuffle_mask_1[32] = {
2506
0
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
2507
0
        0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
2508
0
    };
2509
0
    static const char block_sign_shuffle_mask_2[32] = {
2510
0
        0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a,
2511
0
        0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e,
2512
0
    };
2513
0
    static const uint8_t bit_selector_mask_bytes[32] = {
2514
0
        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
2515
0
        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
2516
0
    };
2517
2518
0
    const __m256i bit_selector_mask = _mm256_loadu_si256((const __m256i*)bit_selector_mask_bytes);
2519
0
    const __m256i block_sign_shuffle_1 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_1);
2520
0
    const __m256i block_sign_shuffle_2 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_2);
2521
2522
0
    static const uint8_t k_bit_helper[32] = {
2523
0
        0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
2524
0
        0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
2525
0
    };
2526
0
    const __m256i bit_helper = _mm256_loadu_si256((const __m256i*)k_bit_helper);
2527
0
    const __m256i m511 = _mm256_set1_epi16(511);
2528
0
    const __m128i m4 = _mm_set1_epi8(0xf);
2529
0
    const __m128i m1 = _mm_set1_epi8(1);
2530
2531
0
    uint64_t aux64;
2532
2533
    // somewhat hacky, but gives a significant boost in performance
2534
0
    __m256i aux_gindex;
2535
0
    const uint16_t * gindex = (const uint16_t *)&aux_gindex;
2536
2537
0
    __m256 accumf = _mm256_setzero_ps();
2538
0
    for (int i = 0; i < nb; ++i) {
2539
0
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
2540
0
        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
2541
0
        const int8_t   * GGML_RESTRICT q8 = y[i].qs;
2542
2543
0
        memcpy(&aux64, x[i].scales, 8);
2544
0
        __m128i stmp = _mm_set1_epi64x(aux64);
2545
0
        stmp = _mm_unpacklo_epi8(_mm_and_si128(stmp, m4), _mm_and_si128(_mm_srli_epi16(stmp, 4), m4));
2546
0
        const __m128i scales = _mm_add_epi8(_mm_slli_epi16(stmp, 1), m1);
2547
2548
0
        __m256i sumi1 = _mm256_setzero_si256();
2549
0
        __m256i sumi2 = _mm256_setzero_si256();
2550
0
        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 4) {
2551
2552
0
            const __m256i q2_data = _mm256_loadu_si256((const __m256i*)q2);  q2 += 16;
2553
0
            aux_gindex = _mm256_and_si256(q2_data, m511);
2554
2555
0
            const __m256i partial_sign_bits = _mm256_srli_epi16(q2_data, 9);
2556
0
            const __m256i partial_sign_bits_upper = _mm256_srli_epi16(q2_data, 13);
2557
0
            const __m256i partial_sign_bits_for_counting = _mm256_xor_si256(partial_sign_bits, partial_sign_bits_upper);
2558
2559
0
            const __m256i odd_bits = _mm256_shuffle_epi8(bit_helper, partial_sign_bits_for_counting);
2560
0
            const __m256i full_sign_bits = _mm256_or_si256(partial_sign_bits, odd_bits);
2561
2562
0
            const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
2563
0
            const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
2564
0
            const __m256i q8_3 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
2565
0
            const __m256i q8_4 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
2566
2567
0
            const __m256i q2_1 = _mm256_set_epi64x(iq2xs_grid[gindex[ 3]], iq2xs_grid[gindex[ 2]],
2568
0
                                                   iq2xs_grid[gindex[ 1]], iq2xs_grid[gindex[ 0]]);
2569
0
            const __m256i q2_2 = _mm256_set_epi64x(iq2xs_grid[gindex[ 7]], iq2xs_grid[gindex[ 6]],
2570
0
                                                   iq2xs_grid[gindex[ 5]], iq2xs_grid[gindex[ 4]]);
2571
0
            const __m256i q2_3 = _mm256_set_epi64x(iq2xs_grid[gindex[11]], iq2xs_grid[gindex[10]],
2572
0
                                                   iq2xs_grid[gindex[ 9]], iq2xs_grid[gindex[ 8]]);
2573
0
            const __m256i q2_4 = _mm256_set_epi64x(iq2xs_grid[gindex[15]], iq2xs_grid[gindex[14]],
2574
0
                                                   iq2xs_grid[gindex[13]], iq2xs_grid[gindex[12]]);
2575
2576
0
            const __m128i full_signs_l = _mm256_castsi256_si128(full_sign_bits);
2577
0
            const __m128i full_signs_h = _mm256_extractf128_si256(full_sign_bits, 1);
2578
0
            const __m256i full_signs_1 = MM256_SET_M128I(full_signs_l, full_signs_l);
2579
0
            const __m256i full_signs_2 = MM256_SET_M128I(full_signs_h, full_signs_h);
2580
2581
0
            __m256i signs;
2582
0
            signs = _mm256_shuffle_epi8(full_signs_1, block_sign_shuffle_1);
2583
0
            signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
2584
0
            const __m256i q8s_1 = _mm256_sign_epi8(q8_1, _mm256_or_si256(signs, mone));
2585
2586
0
            signs = _mm256_shuffle_epi8(full_signs_1, block_sign_shuffle_2);
2587
0
            signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
2588
0
            const __m256i q8s_2 = _mm256_sign_epi8(q8_2, _mm256_or_si256(signs, mone));
2589
2590
0
            signs = _mm256_shuffle_epi8(full_signs_2, block_sign_shuffle_1);
2591
0
            signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
2592
0
            const __m256i q8s_3 = _mm256_sign_epi8(q8_3, _mm256_or_si256(signs, mone));
2593
2594
0
            signs = _mm256_shuffle_epi8(full_signs_2, block_sign_shuffle_2);
2595
0
            signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
2596
0
            const __m256i q8s_4 = _mm256_sign_epi8(q8_4, _mm256_or_si256(signs, mone));
2597
2598
0
            const __m256i dot1  = _mm256_maddubs_epi16(q2_1, q8s_1);
2599
0
            const __m256i dot2  = _mm256_maddubs_epi16(q2_2, q8s_2);
2600
0
            const __m256i dot3  = _mm256_maddubs_epi16(q2_3, q8s_3);
2601
0
            const __m256i dot4  = _mm256_maddubs_epi16(q2_4, q8s_4);
2602
2603
0
            const __m256i sc1 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+0)));
2604
0
            const __m256i sc2 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+1)));
2605
0
            const __m256i sc3 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+2)));
2606
0
            const __m256i sc4 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+3)));
2607
2608
0
            sumi1 = _mm256_add_epi32(sumi1, _mm256_madd_epi16(dot1, sc1));
2609
0
            sumi2 = _mm256_add_epi32(sumi2, _mm256_madd_epi16(dot2, sc2));
2610
0
            sumi1 = _mm256_add_epi32(sumi1, _mm256_madd_epi16(dot3, sc3));
2611
0
            sumi2 = _mm256_add_epi32(sumi2, _mm256_madd_epi16(dot4, sc4));
2612
0
        }
2613
2614
0
        accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
2615
2616
0
    }
2617
2618
0
    *s = 0.125f * hsum_float_8(accumf);
2619
2620
#elif defined(__AVX__)
2621
    const __m128i mone = _mm_set1_epi8(1);
2622
    static const char block_sign_shuffle_mask_1[32] = {
2623
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
2624
        0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
2625
    };
2626
    static const char block_sign_shuffle_mask_2[32] = {
2627
        0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a,
2628
        0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e,
2629
    };
2630
    static const uint8_t bit_selector_mask_bytes[32] = {
2631
        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
2632
        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
2633
    };
2634
2635
    const __m128i bit_selector_mask_0 = _mm_loadu_si128((const __m128i*)bit_selector_mask_bytes);
2636
    const __m128i bit_selector_mask_1 = _mm_loadu_si128((const __m128i*)bit_selector_mask_bytes + 1);
2637
    const __m128i block_sign_shuffle_1_0 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_1);
2638
    const __m128i block_sign_shuffle_1_1 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_1 + 1);
2639
    const __m128i block_sign_shuffle_2_0 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_2);
2640
    const __m128i block_sign_shuffle_2_1 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_2 + 1);
2641
2642
    static const uint8_t k_bit_helper[32] = {
2643
        0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
2644
        0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
2645
    };
2646
    const __m128i bit_helper_0 = _mm_loadu_si128((const __m128i*)k_bit_helper);
2647
    const __m128i bit_helper_1 = _mm_loadu_si128((const __m128i*)k_bit_helper + 1);
2648
    const __m128i m511 = _mm_set1_epi16(511);
2649
    const __m128i m4 = _mm_set1_epi8(0xf);
2650
    const __m128i m1 = _mm_set1_epi8(1);
2651
2652
    uint64_t aux64;
2653
2654
    // somewhat hacky, but gives a significant boost in performance
2655
    __m256i aux_gindex;
2656
    const uint16_t * gindex = (const uint16_t *)&aux_gindex;
2657
2658
    __m256 accumf = _mm256_setzero_ps();
2659
    for (int i = 0; i < nb; ++i) {
2660
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
2661
        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
2662
        const int8_t   * GGML_RESTRICT q8 = y[i].qs;
2663
2664
        memcpy(&aux64, x[i].scales, 8);
2665
        __m128i stmp = _mm_set1_epi64x(aux64);
2666
        stmp = _mm_unpacklo_epi8(_mm_and_si128(stmp, m4), _mm_and_si128(_mm_srli_epi16(stmp, 4), m4));
2667
        const __m128i scales = _mm_add_epi8(_mm_slli_epi16(stmp, 1), m1);
2668
2669
        __m128i sumi1_0 = _mm_setzero_si128();
2670
        __m128i sumi1_1 = _mm_setzero_si128();
2671
        __m128i sumi2_0 = _mm_setzero_si128();
2672
        __m128i sumi2_1 = _mm_setzero_si128();
2673
        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 4) {
2674
2675
            const __m128i q2_data_0 = _mm_loadu_si128((const __m128i*)q2);
2676
            const __m128i q2_data_1 = _mm_loadu_si128((const __m128i*)q2 + 1);  q2 += 16;
2677
            aux_gindex = MM256_SET_M128I(_mm_and_si128(q2_data_1, m511), _mm_and_si128(q2_data_0, m511));
2678
2679
            const __m128i partial_sign_bits_0 = _mm_srli_epi16(q2_data_0, 9);
2680
            const __m128i partial_sign_bits_1 = _mm_srli_epi16(q2_data_1, 9);
2681
            const __m128i partial_sign_bits_upper_0 = _mm_srli_epi16(q2_data_0, 13);
2682
            const __m128i partial_sign_bits_upper_1 = _mm_srli_epi16(q2_data_1, 13);
2683
            const __m128i partial_sign_bits_for_counting_0 = _mm_xor_si128(partial_sign_bits_0, partial_sign_bits_upper_0);
2684
            const __m128i partial_sign_bits_for_counting_1 = _mm_xor_si128(partial_sign_bits_1, partial_sign_bits_upper_1);
2685
2686
            const __m128i odd_bits_0 = _mm_shuffle_epi8(bit_helper_0, partial_sign_bits_for_counting_0);
2687
            const __m128i odd_bits_1 = _mm_shuffle_epi8(bit_helper_1, partial_sign_bits_for_counting_1);
2688
            const __m128i full_sign_bits_0 = _mm_or_si128(partial_sign_bits_0, odd_bits_0);
2689
            const __m128i full_sign_bits_1 = _mm_or_si128(partial_sign_bits_1, odd_bits_1);
2690
2691
            const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
2692
            const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
2693
            const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
2694
            const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
2695
            const __m128i q8_3_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
2696
            const __m128i q8_3_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
2697
            const __m128i q8_4_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
2698
            const __m128i q8_4_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
2699
2700
            const __m128i q2_1_0 = _mm_set_epi64x(iq2xs_grid[gindex[1]], iq2xs_grid[gindex[0]]);
2701
            const __m128i q2_1_1 = _mm_set_epi64x(iq2xs_grid[gindex[3]], iq2xs_grid[gindex[2]]);
2702
            const __m128i q2_2_0 = _mm_set_epi64x(iq2xs_grid[gindex[5]], iq2xs_grid[gindex[4]]);
2703
            const __m128i q2_2_1 = _mm_set_epi64x(iq2xs_grid[gindex[7]], iq2xs_grid[gindex[6]]);
2704
            const __m128i q2_3_0 = _mm_set_epi64x(iq2xs_grid[gindex[9]], iq2xs_grid[gindex[8]]);
2705
            const __m128i q2_3_1 = _mm_set_epi64x(iq2xs_grid[gindex[11]], iq2xs_grid[gindex[10]]);
2706
            const __m128i q2_4_0 = _mm_set_epi64x(iq2xs_grid[gindex[13]], iq2xs_grid[gindex[12]]);
2707
            const __m128i q2_4_1 = _mm_set_epi64x(iq2xs_grid[gindex[15]], iq2xs_grid[gindex[14]]);
2708
2709
            // AVX2 full_signs_1 is full_sign_bits_0 here
2710
            // AVX2 full_signs_2 is full_sign_bits_1 here
2711
            __m128i signs_0, signs_1;
2712
            signs_0 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_1_0);
2713
            signs_1 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_1_1);
2714
            signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
2715
            signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
2716
            const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, _mm_or_si128(signs_0, mone));
2717
            const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, _mm_or_si128(signs_1, mone));
2718
2719
            signs_0 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_2_0);
2720
            signs_1 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_2_1);
2721
            signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
2722
            signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
2723
            const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, _mm_or_si128(signs_0, mone));
2724
            const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, _mm_or_si128(signs_1, mone));
2725
2726
            signs_0 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_1_0);
2727
            signs_1 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_1_1);
2728
            signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
2729
            signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
2730
            const __m128i q8s_3_0 = _mm_sign_epi8(q8_3_0, _mm_or_si128(signs_0, mone));
2731
            const __m128i q8s_3_1 = _mm_sign_epi8(q8_3_1, _mm_or_si128(signs_1, mone));
2732
2733
            signs_0 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_2_0);
2734
            signs_1 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_2_1);
2735
            signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
2736
            signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
2737
            const __m128i q8s_4_0 = _mm_sign_epi8(q8_4_0, _mm_or_si128(signs_0, mone));
2738
            const __m128i q8s_4_1 = _mm_sign_epi8(q8_4_1, _mm_or_si128(signs_1, mone));
2739
2740
            const __m128i dot1_0  = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
2741
            const __m128i dot1_1  = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
2742
            const __m128i dot2_0  = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
2743
            const __m128i dot2_1  = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
2744
            const __m128i dot3_0  = _mm_maddubs_epi16(q2_3_0, q8s_3_0);
2745
            const __m128i dot3_1  = _mm_maddubs_epi16(q2_3_1, q8s_3_1);
2746
            const __m128i dot4_0  = _mm_maddubs_epi16(q2_4_0, q8s_4_0);
2747
            const __m128i dot4_1  = _mm_maddubs_epi16(q2_4_1, q8s_4_1);
2748
2749
            __m128i sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+0));
2750
            const __m128i sc1_0 = _mm_cvtepi8_epi16(sc_tmp);
2751
            const __m128i sc1_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
2752
            sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+1));
2753
            const __m128i sc2_0 = _mm_cvtepi8_epi16(sc_tmp);
2754
            const __m128i sc2_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
2755
            sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+2));
2756
            const __m128i sc3_0 = _mm_cvtepi8_epi16(sc_tmp);
2757
            const __m128i sc3_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
2758
            sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+3));
2759
            const __m128i sc4_0 = _mm_cvtepi8_epi16(sc_tmp);
2760
            const __m128i sc4_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
2761
2762
            sumi1_0 = _mm_add_epi32(sumi1_0, _mm_madd_epi16(dot1_0, sc1_0));
2763
            sumi1_1 = _mm_add_epi32(sumi1_1, _mm_madd_epi16(dot1_1, sc1_1));
2764
            sumi2_0 = _mm_add_epi32(sumi2_0, _mm_madd_epi16(dot2_0, sc2_0));
2765
            sumi2_1 = _mm_add_epi32(sumi2_1, _mm_madd_epi16(dot2_1, sc2_1));
2766
            sumi1_0 = _mm_add_epi32(sumi1_0, _mm_madd_epi16(dot3_0, sc3_0));
2767
            sumi1_1 = _mm_add_epi32(sumi1_1, _mm_madd_epi16(dot3_1, sc3_1));
2768
            sumi2_0 = _mm_add_epi32(sumi2_0, _mm_madd_epi16(dot4_0, sc4_0));
2769
            sumi2_1 = _mm_add_epi32(sumi2_1, _mm_madd_epi16(dot4_1, sc4_1));
2770
        }
2771
2772
        accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
2773
2774
    }
2775
2776
    *s = 0.125f * hsum_float_8(accumf);
2777
2778
#else
2779
    UNUSED(x);
2780
    UNUSED(y);
2781
    UNUSED(nb);
2782
    ggml_vec_dot_iq2_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
2783
#endif
2784
0
}
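A scalar sketch of the scale unpacking that the SSE sequence above (_mm_unpacklo_epi8 / _mm_slli_epi16 / _mm_add_epi8) performs on x[i].scales: each of the 8 packed bytes yields two 4-bit scales, and the value fed into the dot product is 2*nibble + 1.

static void iq2_xs_unpack_scales_ref(const uint8_t packed[8], int8_t out[16]) {
    for (int k = 0; k < 8; ++k) {
        out[2*k + 0] = (int8_t)(2*(packed[k] & 0xf) + 1); // low nibble first
        out[2*k + 1] = (int8_t)(2*(packed[k] >>  4) + 1); // then high nibble
    }
}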
2785
2786
0
void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
2787
0
    assert(n % QK_K == 0);
2788
0
    assert(nrc == 1);
2789
0
    UNUSED(nrc);
2790
0
    UNUSED(bx);
2791
0
    UNUSED(by);
2792
0
    UNUSED(bs);
2793
2794
0
    const block_iq2_s * GGML_RESTRICT x = vx;
2795
0
    const block_q8_K  * GGML_RESTRICT y = vy;
2796
2797
0
    const int nb = n / QK_K;
2798
2799
0
#if defined(__AVX2__)
2800
2801
0
   static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
2802
0
                                       0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
2803
0
   };
2804
2805
0
    static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
2806
0
                                        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
2807
0
    };
2808
2809
0
    const __m128i m4 = _mm_set1_epi8(0xf);
2810
0
    const __m128i m1 = _mm_set1_epi8(1);
2811
2812
0
    const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1);
2813
0
    const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2);
2814
2815
0
    uint64_t aux64;
2816
2817
0
    __m256 accumf = _mm256_setzero_ps();
2818
0
    for (int i = 0; i < nb; ++i) {
2819
0
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
2820
0
        const uint8_t * GGML_RESTRICT qs = x[i].qs;
2821
0
        const uint8_t * GGML_RESTRICT qh = x[i].qh;
2822
0
        const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
2823
0
        const int8_t  * GGML_RESTRICT q8 = y[i].qs;
2824
2825
0
        memcpy(&aux64, x[i].scales, 8);
2826
0
        const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1);
2827
0
        const __m256i scales16 = _mm256_cvtepi8_epi16(scales8); // 0 2 4 6 8 10 12 14 1 3 5 7 9 11 13 15
2828
2829
0
        __m256i sumi1 = _mm256_setzero_si256();
2830
0
        __m256i sumi2 = _mm256_setzero_si256();
2831
0
        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
2832
0
            const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
2833
0
            const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
2834
0
            const __m256i q2_1 = _mm256_set_epi64x(iq2s_grid[qs[3] | ((qh[ib32+0] << 2) & 0x300)],
2835
0
                                                   iq2s_grid[qs[2] | ((qh[ib32+0] << 4) & 0x300)],
2836
0
                                                   iq2s_grid[qs[1] | ((qh[ib32+0] << 6) & 0x300)],
2837
0
                                                   iq2s_grid[qs[0] | ((qh[ib32+0] << 8) & 0x300)]);
2838
0
            const __m256i q2_2 = _mm256_set_epi64x(iq2s_grid[qs[7] | ((qh[ib32+1] << 2) & 0x300)],
2839
0
                                                   iq2s_grid[qs[6] | ((qh[ib32+1] << 4) & 0x300)],
2840
0
                                                   iq2s_grid[qs[5] | ((qh[ib32+1] << 6) & 0x300)],
2841
0
                                                   iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]);
2842
0
            qs += 8;
2843
2844
0
            __m256i aux256 = _mm256_set1_epi32(signs[0] | ((uint32_t) signs[1] << 16));
2845
0
            aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
2846
0
            const __m256i s2_1 = _mm256_cmpeq_epi8(aux256, mask2);
2847
0
            const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(s2_1, q8_1), s2_1);
2848
2849
0
            aux256 = _mm256_set1_epi32(signs[2] | ((uint32_t) signs[3] << 16));
2850
0
            aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
2851
0
            const __m256i s2_2 = _mm256_cmpeq_epi8(aux256, mask2);
2852
0
            const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(s2_2, q8_2), s2_2);
2853
2854
0
            signs += 4;
2855
2856
0
            const __m256i dot1  = _mm256_maddubs_epi16(q2_1, q8s_1); // blocks 2*ib32+0, 2*ib32+1
2857
0
            const __m256i dot2  = _mm256_maddubs_epi16(q2_2, q8s_2); // blocks 2*ib32+2, 2*ib32+3
2858
2859
0
            const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_shuffle_epi8(scales16, get_scale_shuffle_k4(ib32+0)));
2860
0
            const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_shuffle_epi8(scales16, get_scale_shuffle_k4(ib32+1)));
2861
0
            sumi1 = _mm256_add_epi32(sumi1, p1);
2862
0
            sumi2 = _mm256_add_epi32(sumi2, p2);
2863
0
        }
2864
2865
0
        accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
2866
2867
0
    }
2868
2869
0
    *s = 0.125f * hsum_float_8(accumf);
2870
2871
#elif defined(__AVX__)
2872
   static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
2873
                                       0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
2874
   };
2875
2876
    static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
2877
                                        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
2878
    };
2879
2880
    const __m128i m4 = _mm_set1_epi8(0xf);
2881
    const __m128i m1 = _mm_set1_epi8(1);
2882
2883
    const __m128i mask1_0 = _mm_loadu_si128((const __m128i*)k_mask1);
2884
    const __m128i mask1_1 = _mm_loadu_si128((const __m128i*)k_mask1 + 1);
2885
    const __m128i mask2_0 = _mm_loadu_si128((const __m128i*)k_mask2);
2886
    const __m128i mask2_1 = _mm_loadu_si128((const __m128i*)k_mask2 + 1);
2887
2888
    uint64_t aux64;
2889
2890
    __m256 accumf = _mm256_setzero_ps();
2891
    for (int i = 0; i < nb; ++i) {
2892
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
2893
        const uint8_t * GGML_RESTRICT qs = x[i].qs;
2894
        const uint8_t * GGML_RESTRICT qh = x[i].qh;
2895
        const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
2896
        const int8_t  * GGML_RESTRICT q8 = y[i].qs;
2897
2898
        memcpy(&aux64, x[i].scales, 8);
2899
        const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1);
2900
        const __m128i scales16_0 = _mm_cvtepi8_epi16(scales8);
2901
        const __m128i scales16_1 = _mm_cvtepi8_epi16(_mm_srli_si128(scales8, 8));
2902
2903
        __m128i sumi1_0 = _mm_setzero_si128();
2904
        __m128i sumi1_1 = _mm_setzero_si128();
2905
        __m128i sumi2_0 = _mm_setzero_si128();
2906
        __m128i sumi2_1 = _mm_setzero_si128();
2907
        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
2908
            const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
2909
            const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
2910
            const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
2911
            const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
2912
            const __m128i q2_1_0 = _mm_set_epi64x(iq2s_grid[qs[1] | ((qh[ib32+0] << 6) & 0x300)],
2913
                                                  iq2s_grid[qs[0] | ((qh[ib32+0] << 8) & 0x300)]);
2914
            const __m128i q2_1_1 = _mm_set_epi64x(iq2s_grid[qs[3] | ((qh[ib32+0] << 2) & 0x300)],
2915
                                                  iq2s_grid[qs[2] | ((qh[ib32+0] << 4) & 0x300)]);
2916
            const __m128i q2_2_0 = _mm_set_epi64x(iq2s_grid[qs[5] | ((qh[ib32+1] << 6) & 0x300)],
2917
                                                  iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]);
2918
            const __m128i q2_2_1 = _mm_set_epi64x(iq2s_grid[qs[7] | ((qh[ib32+1] << 2) & 0x300)],
2919
                                                  iq2s_grid[qs[6] | ((qh[ib32+1] << 4) & 0x300)]);
2920
            qs += 8;
2921
2922
            __m128i aux128_0 = _mm_set1_epi32(signs[0] | ((uint32_t) signs[1] << 16));
2923
            __m128i aux128_1 = aux128_0;
2924
            aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
2925
            aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
2926
            const __m128i s2_1_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
2927
            const __m128i s2_1_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
2928
            const __m128i q8s_1_0 = _mm_sub_epi8(_mm_xor_si128(s2_1_0, q8_1_0), s2_1_0);
2929
            const __m128i q8s_1_1 = _mm_sub_epi8(_mm_xor_si128(s2_1_1, q8_1_1), s2_1_1);
2930
2931
            aux128_0 = _mm_set1_epi32(signs[2] | ((uint32_t) signs[3] << 16));
2932
            aux128_1 = aux128_0;
2933
            aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
2934
            aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
2935
            const __m128i s2_2_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
2936
            const __m128i s2_2_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
2937
            const __m128i q8s_2_0 = _mm_sub_epi8(_mm_xor_si128(s2_2_0, q8_2_0), s2_2_0);
2938
            const __m128i q8s_2_1 = _mm_sub_epi8(_mm_xor_si128(s2_2_1, q8_2_1), s2_2_1);
2939
2940
            signs += 4;
2941
2942
            const __m128i dot1_0  = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
2943
            const __m128i dot1_1  = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
2944
            const __m128i dot2_0  = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
2945
            const __m128i dot2_1  = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
2946
2947
            const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_shuffle_epi8(scales16_0, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+0), 0)));
2948
            const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_shuffle_epi8(scales16_1, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+0), 1)));
2949
            const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_shuffle_epi8(scales16_0, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+1), 0)));
2950
            const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_shuffle_epi8(scales16_1, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+1), 1)));
2951
            sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
2952
            sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
2953
            sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
2954
            sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
2955
        }
2956
2957
        accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
2958
2959
    }
2960
2961
    *s = 0.125f * hsum_float_8(accumf);
2962
2963
#else
2964
    UNUSED(x);
2965
    UNUSED(y);
2966
    UNUSED(nb);
2967
    ggml_vec_dot_iq2_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
2968
#endif
2969
0
}
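A scalar sketch of the sign application used in the loops above: the comparison against mask2 produces a per-byte mask s that is either 0x00 or 0xFF, and (q ^ s) - s leaves the q8 value unchanged in the first case or negates it (two's complement: ~q + 1) in the second.

static inline int8_t apply_sign_mask_ref(int8_t q, int8_t s) {
    // s == 0x00 -> q, s == (int8_t)0xFF -> -q
    return (int8_t)((q ^ s) - s);
}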
2970
2971
0
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
2972
0
    assert(n % QK_K == 0);
2973
0
    assert(nrc == 1);
2974
0
    UNUSED(nrc);
2975
0
    UNUSED(bx);
2976
0
    UNUSED(by);
2977
0
    UNUSED(bs);
2978
2979
0
    const block_iq3_xxs * GGML_RESTRICT x = vx;
2980
0
    const block_q8_K    * GGML_RESTRICT y = vy;
2981
2982
0
    const int nb = n / QK_K;
2983
2984
0
#if defined(__AVX2__)
2985
2986
0
    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
2987
2988
0
    uint32_t aux32[2];
2989
2990
0
    __m256 accumf = _mm256_setzero_ps();
2991
0
    for (int i = 0; i < nb; ++i) {
2992
0
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
2993
0
        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
2994
0
        const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
2995
0
        const int8_t  * GGML_RESTRICT q8 = y[i].qs;
2996
0
        __m256i sumi1 = _mm256_setzero_si256();
2997
0
        __m256i sumi2 = _mm256_setzero_si256();
2998
0
        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
2999
0
            const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
3000
0
            const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
3001
0
            const __m256i q2_1 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]],
3002
0
                                                  iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
3003
0
            q3 += 8;
3004
0
            const __m256i q2_2 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]],
3005
0
                                                  iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
3006
0
            q3 += 8;
3007
0
            memcpy(aux32, gas, 8); gas += 8;
3008
0
            const __m256i s2_1 = _mm256_set_epi64x(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127],
3009
0
                                                   signs64[(aux32[0] >>  7) & 127], signs64[(aux32[0] >>  0) & 127]);
3010
0
            const __m256i s2_2 = _mm256_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127],
3011
0
                                                   signs64[(aux32[1] >>  7) & 127], signs64[(aux32[1] >>  0) & 127]);
3012
0
            const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1);
3013
0
            const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2);
3014
0
            const __m256i dot1  = _mm256_maddubs_epi16(q2_1, q8s_1);
3015
0
            const __m256i dot2  = _mm256_maddubs_epi16(q2_2, q8s_2);
3016
0
            const uint16_t ls1 = aux32[0] >> 28;
3017
0
            const uint16_t ls2 = aux32[1] >> 28;
3018
0
            const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1));
3019
0
            const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1));
3020
0
            sumi1 = _mm256_add_epi32(sumi1, p1);
3021
0
            sumi2 = _mm256_add_epi32(sumi2, p2);
3022
0
        }
3023
3024
0
        accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
3025
3026
0
    }
3027
3028
0
    *s = 0.25f * hsum_float_8(accumf);
3029
3030
#elif defined(__AVX__)
3031
    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
3032
3033
    uint32_t aux32[2];
3034
3035
    __m256 accumf = _mm256_setzero_ps();
3036
    for (int i = 0; i < nb; ++i) {
3037
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
3038
        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
3039
        const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
3040
        const int8_t  * GGML_RESTRICT q8 = y[i].qs;
3041
        __m128i sumi1_0 = _mm_setzero_si128();
3042
        __m128i sumi1_1 = _mm_setzero_si128();
3043
        __m128i sumi2_0 = _mm_setzero_si128();
3044
        __m128i sumi2_1 = _mm_setzero_si128();
3045
        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
3046
            const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
3047
            const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
3048
            const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
3049
            const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
3050
            const __m128i q2_1_0 = _mm_set_epi32(iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
3051
            const __m128i q2_1_1 = _mm_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]]);
3052
            q3 += 8;
3053
            const __m128i q2_2_0 = _mm_set_epi32(iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
3054
            const __m128i q2_2_1 = _mm_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]]);
3055
            q3 += 8;
3056
            memcpy(aux32, gas, 8); gas += 8;
3057
            const __m128i s2_1_0 = _mm_set_epi64x(signs64[(aux32[0] >>  7) & 127], signs64[(aux32[0] >>  0) & 127]);
3058
            const __m128i s2_1_1 = _mm_set_epi64x(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127]);
3059
            const __m128i s2_2_0 = _mm_set_epi64x(signs64[(aux32[1] >>  7) & 127], signs64[(aux32[1] >>  0) & 127]);
3060
            const __m128i s2_2_1 = _mm_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127]);
3061
            const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, s2_1_0);
3062
            const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, s2_1_1);
3063
            const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, s2_2_0);
3064
            const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, s2_2_1);
3065
            const __m128i dot1_0  = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
3066
            const __m128i dot1_1  = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
3067
            const __m128i dot2_0  = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
3068
            const __m128i dot2_1  = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
3069
            const uint16_t ls1 = aux32[0] >> 28;
3070
            const uint16_t ls2 = aux32[1] >> 28;
3071
            const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1));
3072
            const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1));
3073
            const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1));
3074
            const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1));
3075
            sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
3076
            sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
3077
            sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
3078
            sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
3079
        }
3080
3081
        accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
3082
3083
    }
3084
3085
    *s = 0.25f * hsum_float_8(accumf);
3086
3087
#else
3088
    UNUSED(x);
3089
    UNUSED(y);
3090
    UNUSED(nb);
3091
    ggml_vec_dot_iq3_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
3092
#endif
3093
0
}
3094
3095
0
void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
3096
0
    assert(n % QK_K == 0);
3097
0
    assert(nrc == 1);
3098
0
    UNUSED(nrc);
3099
0
    UNUSED(bx);
3100
0
    UNUSED(by);
3101
0
    UNUSED(bs);
3102
3103
0
    const block_iq3_s * GGML_RESTRICT x = vx;
3104
0
    const block_q8_K  * GGML_RESTRICT y = vy;
3105
3106
0
    const int nb = n / QK_K;
3107
3108
0
#if defined(__AVX2__)
3109
3110
0
   static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
3111
0
                                       0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
3112
0
   };
3113
3114
0
    static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
3115
0
                                        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
3116
0
    };
3117
3118
0
    const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1);
3119
0
    const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2);
3120
3121
0
    const __m256i idx_shift = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
3122
0
    const __m256i idx_mask  = _mm256_set1_epi32(256);
3123
3124
0
    typedef union {
3125
0
        __m256i  vec[2];
3126
0
        uint32_t index[16];
3127
0
    } index_t;
3128
3129
0
    index_t idx;
3130
3131
0
    __m256 accumf = _mm256_setzero_ps();
3132
0
    for (int i = 0; i < nb; ++i) {
3133
0
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
3134
0
        const uint8_t * GGML_RESTRICT qs = x[i].qs;
3135
0
        const uint8_t * GGML_RESTRICT qh = x[i].qh;
3136
0
        const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;
3137
0
        const int8_t  * GGML_RESTRICT q8 = y[i].qs;
3138
0
        __m256i sumi1 = _mm256_setzero_si256();
3139
0
        __m256i sumi2 = _mm256_setzero_si256();
3140
0
        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
3141
0
            const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
3142
0
            const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
3143
0
            const __m256i idx_l = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)qs)); qs += 16;
3144
0
            idx.vec[0] = _mm256_set1_epi32(qh[ib32+0]);
3145
0
            idx.vec[1] = _mm256_set1_epi32(qh[ib32+1]);
3146
0
            idx.vec[0] = _mm256_and_si256(_mm256_sllv_epi32(idx.vec[0], idx_shift), idx_mask);
3147
0
            idx.vec[1] = _mm256_and_si256(_mm256_sllv_epi32(idx.vec[1], idx_shift), idx_mask);
3148
0
            idx.vec[0] = _mm256_or_si256(idx.vec[0], _mm256_cvtepi16_epi32(_mm256_castsi256_si128(idx_l)));
3149
0
            idx.vec[1] = _mm256_or_si256(idx.vec[1], _mm256_cvtepi16_epi32(_mm256_extractf128_si256(idx_l, 1)));
3150
3151
            // At least on my CPU (Ryzen 7950X), using _mm256_i32gather_epi32 is slower than _mm256_set_epi32. Strange.
3152
            //const __m256i q2_1 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[0], 4);
3153
            //const __m256i q2_2 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[1], 4);
3154
0
            const __m256i q2_1 = _mm256_set_epi32(
3155
0
                    iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]],
3156
0
                    iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]]
3157
0
            );
3158
0
            const __m256i q2_2 = _mm256_set_epi32(
3159
0
                    iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]],
3160
0
                    iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[ 9]], iq3s_grid[idx.index[ 8]]
3161
0
            );
3162
3163
0
            __m256i aux256 = _mm256_set1_epi32(signs[0] | (signs[1] << 16));
3164
0
            aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
3165
0
            const __m256i s2_1 = _mm256_cmpeq_epi8(aux256, mask2);
3166
0
            const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(s2_1, q8_1), s2_1);
3167
3168
0
            aux256 = _mm256_set1_epi32(signs[2] | (signs[3] << 16));
3169
0
            aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
3170
0
            const __m256i s2_2 = _mm256_cmpeq_epi8(aux256, mask2);
3171
0
            const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(s2_2, q8_2), s2_2);
3172
3173
0
            signs += 4;
3174
3175
0
            const __m256i dot1  = _mm256_maddubs_epi16(q2_1, q8s_1);
3176
0
            const __m256i dot2  = _mm256_maddubs_epi16(q2_2, q8s_2);
3177
0
            const uint16_t ls1 = x[i].scales[ib32/2] & 0xf;
3178
0
            const uint16_t ls2 = x[i].scales[ib32/2] >>  4;
3179
0
            const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1));
3180
0
            const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1));
3181
0
            sumi1 = _mm256_add_epi32(sumi1, p1);
3182
0
            sumi2 = _mm256_add_epi32(sumi2, p2);
3183
0
        }
3184
3185
0
        accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
3186
3187
0
    }
3188
3189
0
    *s = hsum_float_8(accumf);
3190
3191
#elif defined(__AVX__)
3192
   static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
3193
                                       0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
3194
   };
3195
3196
    static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
3197
                                        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
3198
    };
3199
3200
    const __m128i mask1_0 = _mm_loadu_si128((const __m128i*)k_mask1);
3201
    const __m128i mask1_1 = _mm_loadu_si128((const __m128i*)k_mask1 + 1);
3202
    const __m128i mask2_0 = _mm_loadu_si128((const __m128i*)k_mask2);
3203
    const __m128i mask2_1 = _mm_loadu_si128((const __m128i*)k_mask2 + 1);
3204
3205
    const __m128i idx_mul_0 = _mm_set_epi32(32, 64, 128, 256);
3206
    const __m128i idx_mul_1 = _mm_set_epi32(2, 4, 8, 16);
3207
    const __m128i idx_mask  = _mm_set1_epi32(256);
3208
3209
    typedef union {
3210
        __m128i  vec[4];
3211
        uint32_t index[16];
3212
    } index_t;
3213
3214
    index_t idx;
3215
3216
    __m256 accumf = _mm256_setzero_ps();
3217
    for (int i = 0; i < nb; ++i) {
3218
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
3219
        const uint8_t * GGML_RESTRICT qs = x[i].qs;
3220
        const uint8_t * GGML_RESTRICT qh = x[i].qh;
3221
        const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;
3222
        const int8_t  * GGML_RESTRICT q8 = y[i].qs;
3223
        __m128i sumi1_0 = _mm_setzero_si128();
3224
        __m128i sumi1_1 = _mm_setzero_si128();
3225
        __m128i sumi2_0 = _mm_setzero_si128();
3226
        __m128i sumi2_1 = _mm_setzero_si128();
3227
        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
3228
            const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
3229
            const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
3230
            const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
3231
            const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
3232
            const __m128i qs_tmp = _mm_loadu_si128((const __m128i *)qs);
3233
            const __m128i idx_l_0 = _mm_cvtepu8_epi16(qs_tmp);
3234
            const __m128i idx_l_1 = _mm_cvtepu8_epi16(_mm_srli_si128(qs_tmp, 8)); qs += 16;
3235
            idx.vec[0] = _mm_set1_epi32(qh[ib32+0]);
3236
            idx.vec[1] = idx.vec[0];
3237
            idx.vec[2] = _mm_set1_epi32(qh[ib32+1]);
3238
            idx.vec[3] = idx.vec[2];
3239
3240
            idx.vec[0] = _mm_and_si128(_mm_mullo_epi32(idx.vec[0], idx_mul_0), idx_mask);
3241
            idx.vec[1] = _mm_and_si128(_mm_mullo_epi32(idx.vec[1], idx_mul_1), idx_mask);
3242
            idx.vec[2] = _mm_and_si128(_mm_mullo_epi32(idx.vec[2], idx_mul_0), idx_mask);
3243
            idx.vec[3] = _mm_and_si128(_mm_mullo_epi32(idx.vec[3], idx_mul_1), idx_mask);
3244
3245
            idx.vec[0] = _mm_or_si128(idx.vec[0], _mm_cvtepi16_epi32(idx_l_0));
3246
            idx.vec[1] = _mm_or_si128(idx.vec[1], _mm_cvtepi16_epi32(_mm_srli_si128(idx_l_0, 8)));
3247
            idx.vec[2] = _mm_or_si128(idx.vec[2], _mm_cvtepi16_epi32(idx_l_1));
3248
            idx.vec[3] = _mm_or_si128(idx.vec[3], _mm_cvtepi16_epi32(_mm_srli_si128(idx_l_1, 8)));
3249
3250
            const __m128i q2_1_0 = _mm_set_epi32(iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]]);
3251
            const __m128i q2_1_1 = _mm_set_epi32(iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]]);
3252
            const __m128i q2_2_0 = _mm_set_epi32(iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[9]], iq3s_grid[idx.index[8]]);
3253
            const __m128i q2_2_1 = _mm_set_epi32(iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]]);
3254
3255
            __m128i aux128_0 = _mm_set1_epi32(signs[0] | (signs[1] << 16));
3256
            __m128i aux128_1 = aux128_0;
3257
            aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
3258
            aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
3259
            const __m128i s2_1_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
3260
            const __m128i s2_1_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
3261
            const __m128i q8s_1_0 = _mm_sub_epi8(_mm_xor_si128(s2_1_0, q8_1_0), s2_1_0);
3262
            const __m128i q8s_1_1 = _mm_sub_epi8(_mm_xor_si128(s2_1_1, q8_1_1), s2_1_1);
3263
3264
            aux128_0 = _mm_set1_epi32(signs[2] | (signs[3] << 16));
3265
            aux128_1 = aux128_0;
3266
            aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
3267
            aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
3268
            const __m128i s2_2_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
3269
            const __m128i s2_2_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
3270
            const __m128i q8s_2_0 = _mm_sub_epi8(_mm_xor_si128(s2_2_0, q8_2_0), s2_2_0);
3271
            const __m128i q8s_2_1 = _mm_sub_epi8(_mm_xor_si128(s2_2_1, q8_2_1), s2_2_1);
3272
3273
            signs += 4;
3274
3275
            const __m128i dot1_0  = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
3276
            const __m128i dot1_1  = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
3277
            const __m128i dot2_0  = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
3278
            const __m128i dot2_1  = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
3279
            const uint16_t ls1 = x[i].scales[ib32/2] & 0xf;
3280
            const uint16_t ls2 = x[i].scales[ib32/2] >>  4;
3281
            const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1));
3282
            const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1));
3283
            const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1));
3284
            const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1));
3285
            sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
3286
            sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
3287
            sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
3288
            sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
3289
        }
3290
3291
        accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
3292
3293
    }
3294
3295
    *s = hsum_float_8(accumf);
3296
3297
#else
3298
    UNUSED(x);
3299
    UNUSED(y);
3300
    UNUSED(nb);
3301
    ggml_vec_dot_iq3_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
3302
#endif
3303
0
}
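Editor's note (not part of the covered file): the iq3_s tail above does two things per 32-weight sub-block that are easy to miss in the intrinsics: it builds 9-bit grid indices whose low 8 bits come from a qs byte and whose bit 8 comes from one bit of qh (the idx_mul/idx_mask arithmetic), and it applies the packed sign bits by expanding each bit into a 0x00/0xFF byte mask and computing (q8 ^ mask) - mask. A minimal scalar sketch of both steps follows; the helper names are illustrative and not part of the file.

#include <stdint.h>

// Scalar model of the idx_mul/idx_mask arithmetic above: grid index k of a
// 32-weight sub-block takes its low 8 bits from qs[k] and bit 8 from bit k of qh.
static uint16_t iq3s_grid_index(const uint8_t * qs, uint8_t qh, int k /* 0..7 */) {
    return (uint16_t)(qs[k] | (((qh >> k) & 1) << 8));
}

// Scalar model of the _mm_shuffle_epi8/_mm_cmpeq_epi8/_mm_xor_si128/_mm_sub_epi8
// sequence: bit j of `signs` flips the sign of q8[j].
// (q ^ m) - m with m = 0xFF is two's-complement negation; with m = 0 it is a no-op.
static void iq3s_apply_signs_16(uint16_t signs, const int8_t * q8, int8_t * out) {
    for (int j = 0; j < 16; ++j) {
        const int8_t m = ((signs >> j) & 1) ? (int8_t)0xFF : (int8_t)0x00;
        out[j] = (int8_t)((q8[j] ^ m) - m);
    }
}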
3304
3305
0
void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
3306
0
    assert(n % QK_K == 0);
3307
0
    assert(nrc == 1);
3308
0
    UNUSED(nrc);
3309
0
    UNUSED(bx);
3310
0
    UNUSED(by);
3311
0
    UNUSED(bs);
3312
3313
0
    const block_iq1_s * GGML_RESTRICT x = vx;
3314
0
    const block_q8_K  * GGML_RESTRICT y = vy;
3315
3316
0
    const int nb = n / QK_K;
3317
3318
0
#if defined __AVX2__
3319
3320
0
    __m256 accum = _mm256_setzero_ps();
3321
0
    float accum1 = 0;
3322
0
    for (int i = 0; i < nb; ++i) {
3323
3324
0
        const int8_t   * q8 = y[i].qs;
3325
0
        const uint8_t  * qs = x[i].qs;
3326
0
        const uint16_t * qh = x[i].qh;
3327
3328
0
        __m256i sumi = _mm256_setzero_si256();
3329
0
        int sumi1 = 0;
3330
0
        for (int ib = 0; ib < QK_K/32; ib += 2) {
3331
0
#ifdef __BMI2__
3332
0
            const uint64_t packed_idx1 = _pdep_u64(*(const uint32_t *)qs, 0x00ff00ff00ff00ffULL) | _pdep_u64(qh[ib], 0x700070007000700ULL);
3333
0
            const uint64_t packed_idx2 = _pdep_u64(*(const uint32_t *)(qs + 4), 0x00ff00ff00ff00ffULL) | _pdep_u64(qh[ib + 1], 0x700070007000700ULL);
3334
0
            const uint16_t *idx1 = (const uint16_t *)(&packed_idx1);
3335
0
            const uint16_t *idx2 = (const uint16_t *)(&packed_idx2);
3336
0
            const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[idx1[3]], iq1s_grid[idx1[2]], iq1s_grid[idx1[1]], iq1s_grid[idx1[0]]);
3337
0
            const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[idx2[3]], iq1s_grid[idx2[2]], iq1s_grid[idx2[1]], iq1s_grid[idx2[0]]);
3338
#else
3339
            const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)],
3340
                                                    iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)]);
3341
            const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)],
3342
                                                    iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)]);
3343
#endif
3344
0
            qs += 8;
3345
0
            const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
3346
0
            const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
3347
3348
0
            const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1);
3349
0
            const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2);
3350
0
            const int16_t ls1 = 2*((qh[ib+0] >> 12) & 7) + 1;
3351
0
            const int16_t ls2 = 2*((qh[ib+1] >> 12) & 7) + 1;
3352
0
            const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(ls1));
3353
0
            const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(ls2));
3354
3355
0
            sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p1, p2));
3356
0
            sumi1 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * (qh[ib+0] & 0x8000 ? -1 : 1) * ls1
3357
0
                   + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2;
3358
0
        }
3359
3360
0
        const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
3361
0
        accum = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sumi), accum);
3362
0
        accum1 += d * sumi1;
3363
3364
0
    }
3365
3366
0
    *s = hsum_float_8(accum) + IQ1S_DELTA * accum1;
3367
3368
#elif defined __AVX__
3369
    __m256 accum = _mm256_setzero_ps();
3370
    float accum1 = 0;
3371
    for (int i = 0; i < nb; ++i) {
3372
3373
        const int8_t   * q8 = y[i].qs;
3374
        const uint8_t  * qs = x[i].qs;
3375
        const uint16_t * qh = x[i].qh;
3376
3377
        __m128i sumi1_0 = _mm_setzero_si128();
3378
        __m128i sumi1_1 = _mm_setzero_si128();
3379
        int sumi1 = 0;
3380
        for (int ib = 0; ib < QK_K/32; ib += 2) {
3381
            const __m128i q1b_1_0 = _mm_set_epi64x(iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)]);
3382
            const __m128i q1b_1_1 = _mm_set_epi64x(iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)]);
3383
            const __m128i q1b_2_0 = _mm_set_epi64x(iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)]);
3384
            const __m128i q1b_2_1 = _mm_set_epi64x(iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)]);
3385
            qs += 8;
3386
            const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
3387
            const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
3388
            const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
3389
            const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
3390
3391
            const __m128i dot1_0 = mul_add_epi8_sse(q1b_1_0, q8b_1_0);
3392
            const __m128i dot1_1 = mul_add_epi8_sse(q1b_1_1, q8b_1_1);
3393
            const __m128i dot2_0 = mul_add_epi8_sse(q1b_2_0, q8b_2_0);
3394
            const __m128i dot2_1 = mul_add_epi8_sse(q1b_2_1, q8b_2_1);
3395
            const int16_t ls1 = 2*((qh[ib+0] >> 12) & 7) + 1;
3396
            const int16_t ls2 = 2*((qh[ib+1] >> 12) & 7) + 1;
3397
            const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(ls1));
3398
            const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(ls1));
3399
            const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(ls2));
3400
            const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(ls2));
3401
3402
            sumi1_0 = _mm_add_epi32(sumi1_0, _mm_add_epi32(p1_0, p2_0));
3403
            sumi1_1 = _mm_add_epi32(sumi1_1, _mm_add_epi32(p1_1, p2_1));
3404
            sumi1 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * (qh[ib+0] & 0x8000 ? -1 : 1) * ls1
3405
                   + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2;
3406
        }
3407
3408
        const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
3409
        accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum);
3410
        accum1 += d * sumi1;
3411
3412
    }
3413
3414
    *s = hsum_float_8(accum) + IQ1S_DELTA * accum1;
3415
3416
#else
3417
    UNUSED(x);
3418
    UNUSED(y);
3419
    UNUSED(nb);
3420
    ggml_vec_dot_iq1_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
3421
#endif
3422
0
}
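Editor's note (not part of the covered file): for iq1_s, the BMI2 _pdep_u64 path and the portable path above assemble the same four 11-bit grid indices per 32-weight sub-block: 8 low bits from a qs byte plus 3 high bits from the packed qh word, whose top nibble also carries the 3-bit block scale (mapped to 2*s+1) and the sign of the IQ1S_DELTA correction. A hedged scalar sketch of that decode; the function and parameter names are illustrative, not the file's API.

#include <stdint.h>

// Scalar model of the index/scale unpacking above for one 32-weight sub-block.
// grid_idx[k] selects a row of iq1s_grid; the kernel scales the dot product by
// `scale` and adds delta_sign * scale * sum(q8) (the bsums term) to the
// accumulator that is finally weighted by IQ1S_DELTA.
static void iq1s_decode_subblock(const uint8_t * qs /* 4 bytes */, uint16_t qh_word,
                                 uint16_t grid_idx[4], int * scale, int * delta_sign) {
    for (int k = 0; k < 4; ++k) {
        grid_idx[k] = (uint16_t)(qs[k] | (((qh_word >> (3*k)) & 7) << 8));
    }
    *scale      = 2*((qh_word >> 12) & 7) + 1;     // odd scales 1, 3, ..., 15
    *delta_sign = (qh_word & 0x8000) ? -1 : +1;    // sign of the delta term
    // The BMI2 path computes the same four indices in one shot:
    //   _pdep_u64(qs32, 0x00ff00ff00ff00ff) | _pdep_u64(qh_word, 0x0700070007000700)
    // scatters qs byte k into bits [16k, 16k+7] and 3 qh bits into bits [16k+8, 16k+10].
}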
3423
3424
0
void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
3425
0
    assert(n % QK_K == 0);
3426
0
    assert(nrc == 1);
3427
0
    UNUSED(nrc);
3428
0
    UNUSED(bx);
3429
0
    UNUSED(by);
3430
0
    UNUSED(bs);
3431
3432
0
    const block_iq1_m * GGML_RESTRICT x = vx;
3433
0
    const block_q8_K  * GGML_RESTRICT y = vy;
3434
3435
0
    const int nb = n / QK_K;
3436
3437
0
    iq1m_scale_t scale;
3438
3439
0
#if defined __AVX2__
3440
3441
0
    const __m256i mask = _mm256_set1_epi16(0x7);
3442
0
    const __m256i mone = _mm256_set1_epi16(1);
3443
0
    const __m256i mone8 = _mm256_set1_epi8(1);
3444
0
    const __m256i mtwo8 = _mm256_set1_epi8(2);
3445
    // VPSHUFB cannot cross 128-bit lanes, so the odd shifts go to the upper half.
3446
0
    const __m256i scales_shift = _mm256_set_epi64x(9, 3, 6, 0);
3447
3448
0
    __m256 accum1 = _mm256_setzero_ps();
3449
0
    __m256 accum2 = _mm256_setzero_ps();
3450
0
    for (int i = 0; i < nb; ++i) {
3451
3452
0
        const int8_t   * q8 = y[i].qs;
3453
0
        const uint8_t  * qs = x[i].qs;
3454
0
        const uint8_t  * qh = x[i].qh;
3455
0
        const uint16_t * sc = (const uint16_t *)x[i].scales;
3456
3457
0
        scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
3458
        // Extract 3-bit scales (16 values)
3459
0
        __m256i scales = _mm256_set1_epi64x(*(const uint64_t*)sc);
3460
0
        scales = _mm256_srlv_epi64(scales, scales_shift);
3461
0
        scales = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scales, mask), 1), mone);
3462
3463
        // Indices to repeat each scale 8 times.
3464
0
        __m256i scales_idx1 = _mm256_set1_epi16(0x0100);
3465
0
        __m256i scales_idx2 = _mm256_add_epi8(scales_idx1, _mm256_set1_epi8(8));
3466
3467
0
        __m256i sumi1 = _mm256_setzero_si256();
3468
0
        __m256i sumi2 = _mm256_setzero_si256();
3469
0
        for (int ib = 0; ib < QK_K/32; ib += 2) {
3470
0
#ifdef __BMI2__
3471
0
            const uint64_t packed_idx1 = _pdep_u64(*(const uint32_t *)qs, 0x00ff00ff00ff00ffULL)
3472
0
                                       | _pdep_u64(*(const uint16_t*)(qh) & 0x7777, 0xf000f000f000f00ULL);
3473
0
            const uint64_t packed_idx2 = _pdep_u64(*(const uint32_t *)(qs + 4), 0x00ff00ff00ff00ffULL)
3474
0
                                       | _pdep_u64(*(const uint16_t*)(qh + 2) & 0x7777, 0xf000f000f000f00ULL);
3475
0
            const uint16_t *idx1 = (const uint16_t *)(&packed_idx1);
3476
0
            const uint16_t *idx2 = (const uint16_t *)(&packed_idx2);
3477
0
            const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[idx1[3]], iq1s_grid[idx1[2]], iq1s_grid[idx1[1]], iq1s_grid[idx1[0]]);
3478
0
            const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[idx2[3]], iq1s_grid[idx2[2]], iq1s_grid[idx2[1]], iq1s_grid[idx2[0]]);
3479
3480
            // Convert signs to bytes 0x81 (negative) or 0x01 (positive)
3481
0
            const uint64_t delta_sign = _pdep_u64(*(const uint32_t*)(qh) & 0x88888888, 0xf0f0f0f0f0f0f0f0ULL);
3482
0
            const __m256i delta1 = _mm256_or_si256(mone8, _mm256_cvtepi8_epi64(_mm_set1_epi32(delta_sign)));
3483
0
            const __m256i delta2 = _mm256_or_si256(mone8, _mm256_cvtepi8_epi64(_mm_set1_epi32(delta_sign >> 32)));
3484
#else
3485
            const __m256i q1b_1 = _mm256_set_epi64x(
3486
                    iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)],
3487
                    iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)]
3488
            );
3489
            const __m256i q1b_2 = _mm256_set_epi64x(
3490
                    iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)],
3491
                    iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)]
3492
            );
3493
3494
            const __m256i delta1 = _mm256_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
3495
                                                     qh[1] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101,
3496
                                                     qh[0] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
3497
                                                     qh[0] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
3498
            const __m256i delta2 = _mm256_set_epi64x(qh[3] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
3499
                                                     qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101,
3500
                                                     qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
3501
                                                     qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
3502
#endif
3503
0
            const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
3504
0
            const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
3505
3506
0
            const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1);
3507
0
            const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2);
3508
0
            const __m256i dot3 = _mm256_maddubs_epi16(mone8, _mm256_sign_epi8(q8b_1, delta1));
3509
0
            const __m256i dot4 = _mm256_maddubs_epi16(mone8, _mm256_sign_epi8(q8b_2, delta2));
3510
3511
0
            __m256i scale1 = _mm256_shuffle_epi8(scales, scales_idx1);
3512
0
            __m256i scale2 = _mm256_shuffle_epi8(scales, scales_idx2);
3513
3514
0
            scales_idx1 = _mm256_add_epi8(scales_idx1, mtwo8);
3515
0
            scales_idx2 = _mm256_add_epi8(scales_idx2, mtwo8);
3516
3517
0
            const __m256i p1 = _mm256_madd_epi16(dot1, scale1);
3518
0
            const __m256i p2 = _mm256_madd_epi16(dot2, scale2);
3519
0
            const __m256i p3 = _mm256_madd_epi16(dot3, scale1);
3520
0
            const __m256i p4 = _mm256_madd_epi16(dot4, scale2);
3521
3522
0
            sumi1 = _mm256_add_epi32(sumi1, _mm256_add_epi32(p1, p2));
3523
0
            sumi2 = _mm256_add_epi32(sumi2, _mm256_add_epi32(p3, p4));
3524
3525
0
            qs += 8; qh += 4;
3526
0
        }
3527
3528
0
        const __m256 d = _mm256_set1_ps(y[i].d * GGML_CPU_FP16_TO_FP32(scale.f16));
3529
3530
0
        accum1 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi1), accum1);
3531
0
        accum2 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi2), accum2);
3532
0
    }
3533
3534
0
    *s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2);
3535
3536
#elif defined __AVX__
3537
    const __m128i mask = _mm_set1_epi16(0x7);
3538
    const __m128i mone = _mm_set1_epi16(1);
3539
3540
    __m256 accum1 = _mm256_setzero_ps();
3541
    __m256 accum2 = _mm256_setzero_ps();
3542
    for (int i = 0; i < nb; ++i) {
3543
3544
        const int8_t   * q8 = y[i].qs;
3545
        const uint8_t  * qs = x[i].qs;
3546
        const uint8_t  * qh = x[i].qh;
3547
        const uint16_t * sc = (const uint16_t *)x[i].scales;
3548
3549
        scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
3550
3551
        __m128i sumi1_0 = _mm_setzero_si128();
3552
        __m128i sumi1_1 = _mm_setzero_si128();
3553
        __m128i sumi2_0 = _mm_setzero_si128();
3554
        __m128i sumi2_1 = _mm_setzero_si128();
3555
        for (int ib = 0; ib < QK_K/32; ib += 2) {
3556
            const __m128i q1b_1_0 = _mm_set_epi64x(
3557
                    iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)]);
3558
            const __m128i q1b_1_1 = _mm_set_epi64x(
3559
                    iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)]);
3560
            const __m128i q1b_2_0 = _mm_set_epi64x(
3561
                    iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)]);
3562
            const __m128i q1b_2_1 = _mm_set_epi64x(
3563
                    iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)]);
3564
            const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
3565
            const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
3566
            const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
3567
            const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
3568
3569
            const __m128i dot1_0 = mul_add_epi8_sse(q1b_1_0, q8b_1_0);
3570
            const __m128i dot1_1 = mul_add_epi8_sse(q1b_1_1, q8b_1_1);
3571
            const __m128i dot2_0 = mul_add_epi8_sse(q1b_2_0, q8b_2_0);
3572
            const __m128i dot2_1 = mul_add_epi8_sse(q1b_2_1, q8b_2_1);
3573
3574
            const __m128i delta1_0 = _mm_set_epi64x(qh[0] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
3575
                                                     qh[0] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
3576
            const __m128i delta1_1 = _mm_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
3577
                                                     qh[1] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
3578
            const __m128i delta2_0 = _mm_set_epi64x(qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
3579
                                                     qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
3580
            const __m128i delta2_1 = _mm_set_epi64x(qh[3] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
3581
                                                     qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
3582
3583
            const __m128i dot3_0 = mul_add_epi8_sse(delta1_0, q8b_1_0);
3584
            const __m128i dot3_1 = mul_add_epi8_sse(delta1_1, q8b_1_1);
3585
            const __m128i dot4_0 = mul_add_epi8_sse(delta2_0, q8b_2_0);
3586
            const __m128i dot4_1 = mul_add_epi8_sse(delta2_1, q8b_2_1);
3587
3588
            __m128i scale1_0 = _mm_set1_epi16(sc[ib/2] >> 0);
3589
            __m128i scale1_1 = _mm_set1_epi16(sc[ib/2] >> 3);
3590
            __m128i scale2_0 = _mm_set1_epi16(sc[ib/2] >> 6);
3591
            __m128i scale2_1 = _mm_set1_epi16(sc[ib/2] >> 9);
3592
3593
            scale1_0 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale1_0, mask), 1), mone);
3594
            scale1_1 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale1_1, mask), 1), mone);
3595
            scale2_0 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale2_0, mask), 1), mone);
3596
            scale2_1 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale2_1, mask), 1), mone);
3597
            const __m128i p1_0 = _mm_madd_epi16(dot1_0, scale1_0);
3598
            const __m128i p1_1 = _mm_madd_epi16(dot1_1, scale1_1);
3599
            const __m128i p2_0 = _mm_madd_epi16(dot2_0, scale2_0);
3600
            const __m128i p2_1 = _mm_madd_epi16(dot2_1, scale2_1);
3601
            const __m128i p3_0 = _mm_madd_epi16(dot3_0, scale1_0);
3602
            const __m128i p3_1 = _mm_madd_epi16(dot3_1, scale1_1);
3603
            const __m128i p4_0 = _mm_madd_epi16(dot4_0, scale2_0);
3604
            const __m128i p4_1 = _mm_madd_epi16(dot4_1, scale2_1);
3605
3606
            sumi1_0 = _mm_add_epi32(sumi1_0, _mm_add_epi32(p1_0, p2_0));
3607
            sumi1_1 = _mm_add_epi32(sumi1_1, _mm_add_epi32(p1_1, p2_1));
3608
            sumi2_0 = _mm_add_epi32(sumi2_0, _mm_add_epi32(p3_0, p4_0));
3609
            sumi2_1 = _mm_add_epi32(sumi2_1, _mm_add_epi32(p3_1, p4_1));
3610
3611
            qs += 8; qh += 4;
3612
        }
3613
3614
        const __m256 d = _mm256_set1_ps(y[i].d * GGML_CPU_FP16_TO_FP32(scale.f16));
3615
3616
        accum1 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum1);
3617
        accum2 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi2_1, sumi2_0))), accum2);
3618
    }
3619
3620
    *s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2);
3621
3622
#else
3623
    UNUSED(x);
3624
    UNUSED(y);
3625
    UNUSED(nb);
3626
    UNUSED(scale);
3627
    ggml_vec_dot_iq1_m_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
3628
#endif
3629
0
}
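Editor's note (not part of the covered file): in iq1_m the per-block fp16 super-scale is not stored contiguously. Its four nibbles sit in the top 4 bits of the four 16-bit scale words, and the remaining 12 bits of each word hold four 3-bit sub-scales that the kernel maps to 2*s+1. A minimal scalar sketch of that unpacking, assuming the same layout the code above reads; helper names are illustrative.

#include <stdint.h>

// Reassemble the 16 bits that the kernel reinterprets as fp16 via iq1m_scale_t.
static uint16_t iq1m_superscale_bits(const uint16_t sc[4]) {
    return (uint16_t)((sc[0] >> 12)            | ((sc[1] >> 8) & 0x00f0) |
                      ((sc[2] >>  4) & 0x0f00) | ( sc[3]       & 0xf000));
}

// 3-bit sub-scale for 16-weight group g (0..15), mapped to the odd value 2*s + 1.
static int iq1m_subscale(const uint16_t sc[4], int g) {
    return 2*((sc[g/4] >> (3*(g%4))) & 7) + 1;
}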
3630
3631
0
void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
3632
0
    assert(nrc == 1);
3633
0
    UNUSED(nrc);
3634
0
    UNUSED(bx);
3635
0
    UNUSED(by);
3636
0
    UNUSED(bs);
3637
0
    assert(n % QK4_NL == 0);
3638
0
    static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same");
3639
3640
0
    const block_iq4_nl * GGML_RESTRICT x = vx;
3641
0
    const block_q8_0   * GGML_RESTRICT y = vy;
3642
3643
0
    const int nb = n / QK4_NL;
3644
3645
0
    int ib = 0;
3646
0
    float sumf = 0;
3647
3648
0
#if defined __AVX2__
3649
3650
0
    const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
3651
0
    const __m128i m4b  = _mm_set1_epi8(0x0f);
3652
0
    const __m256i mone = _mm256_set1_epi16(1);
3653
3654
0
    __m256 accum1 = _mm256_setzero_ps();
3655
0
    __m256 accum2 = _mm256_setzero_ps();
3656
0
    for (; ib + 1 < nb; ib += 2) {
3657
0
        const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)x[ib + 0].qs);
3658
0
        const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[ib + 1].qs);
3659
0
        const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[ib + 0].qs);
3660
0
        const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[ib + 1].qs);
3661
0
        const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
3662
0
                                              _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
3663
0
        const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
3664
0
                                              _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
3665
0
        const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
3666
0
        const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
3667
0
        const __m256i p_1 = _mm256_madd_epi16(p16_1, mone);
3668
0
        const __m256i p_2 = _mm256_madd_epi16(p16_2, mone);
3669
0
        accum1 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 0].d)*GGML_CPU_FP16_TO_FP32(x[ib + 0].d)),
3670
0
                _mm256_cvtepi32_ps(p_1), accum1);
3671
0
        accum2 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 1].d)*GGML_CPU_FP16_TO_FP32(x[ib + 1].d)),
3672
0
                _mm256_cvtepi32_ps(p_2), accum2);
3673
0
    }
3674
3675
0
    sumf = hsum_float_8(_mm256_add_ps(accum1, accum2));
3676
3677
#elif defined __AVX__
3678
    const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
3679
    const __m128i m4b  = _mm_set1_epi8(0x0f);
3680
3681
    __m256 accum = _mm256_setzero_ps();
3682
    for (; ib + 1 < nb; ib += 2) {
3683
        const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs);
3684
        const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
3685
        const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs);
3686
        const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs + 1);
3687
        const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs);
3688
        const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1);
3689
3690
        const __m128i q4b_1_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b));
3691
        const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b));
3692
        const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b));
3693
        const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b));
3694
3695
        const __m256 p = mul_sum_i8_quad_float(q4b_1_0, q4b_1_1, q4b_2_0, q4b_2_1, q8b_1_0, q8b_1_1, q8b_2_0, q8b_2_1);
3696
        const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d);
3697
        accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum);
3698
    }
3699
3700
    sumf = hsum_float_8(accum);
3701
3702
#endif
3703
0
    for (; ib < nb; ++ib) {
3704
0
        const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
3705
0
        int sumi1 = 0, sumi2 = 0;
3706
0
        for (int j = 0; j < QK4_NL/2; ++j) {
3707
0
            sumi1 += y[ib].qs[j+       0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
3708
0
            sumi2 += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >>  4];
3709
0
        }
3710
0
        sumf += d * (sumi1 + sumi2);
3711
0
    }
3712
0
    *s = sumf;
3713
0
}
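Editor's note (not part of the covered file): iq4_nl stores plain 4-bit codes, and the _mm_shuffle_epi8 against values128 above is simply a 16-entry table lookup that maps each code to its non-linear level from kvalues_iq4nl. A scalar sketch of the same dequantization; the table is passed in as a parameter rather than reproduced here, and the helper name is illustrative.

#include <stdint.h>

// Scalar model of the nibble -> codebook lookup done with _mm_shuffle_epi8 above.
// `values` stands in for kvalues_iq4nl. The low nibbles map to the first 16
// weights of a block and the high nibbles to the last 16, matching the tail loop.
static void iq4nl_dequant_block(const uint8_t qs[16], const int8_t values[16],
                                int8_t out[32]) {
    for (int j = 0; j < 16; ++j) {
        out[j     ] = values[qs[j] & 0x0f];
        out[j + 16] = values[qs[j] >>   4];
    }
}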
3714
3715
0
void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
3716
0
    assert(nrc == 1);
3717
0
    UNUSED(nrc);
3718
0
    UNUSED(bx);
3719
0
    UNUSED(by);
3720
0
    UNUSED(bs);
3721
0
    assert(n % QK_K == 0);
3722
3723
0
    const block_iq4_xs * GGML_RESTRICT x = vx;
3724
0
    const block_q8_K   * GGML_RESTRICT y = vy;
3725
3726
0
    const int nb = n / QK_K;
3727
3728
0
#if defined __AVX2__
3729
3730
0
    const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
3731
0
    const __m128i m4b  = _mm_set1_epi8(0x0f);
3732
3733
0
    __m256 accum = _mm256_setzero_ps();
3734
0
    for (int ibl = 0; ibl < nb; ++ibl) {
3735
0
        const uint8_t * qs = x[ibl].qs;
3736
0
        const int8_t  * q8 = y[ibl].qs;
3737
0
        uint16_t sh = x[ibl].scales_h;
3738
0
        __m256i sumi1 = _mm256_setzero_si256();
3739
0
        __m256i sumi2 = _mm256_setzero_si256();
3740
0
        for (int ib = 0; ib < QK_K/32; ib += 2) {
3741
0
            const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)qs);  qs += 16;
3742
0
            const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)qs);  qs += 16;
3743
0
            const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
3744
0
            const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
3745
0
            const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
3746
0
                                                  _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
3747
0
            const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
3748
0
                                                  _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
3749
0
            const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
3750
0
            const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
3751
0
            const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32;
3752
0
            const int16_t ls2 = ((x[ibl].scales_l[ib/2] >>  4) | ((sh << 2) & 0x30)) - 32;
3753
0
            sh >>= 4;
3754
0
            const __m256i p_1 = _mm256_madd_epi16(p16_1, _mm256_set1_epi16(ls1));
3755
0
            const __m256i p_2 = _mm256_madd_epi16(p16_2, _mm256_set1_epi16(ls2));
3756
0
            sumi1 = _mm256_add_epi32(p_1, sumi1);
3757
0
            sumi2 = _mm256_add_epi32(p_2, sumi2);
3758
0
        }
3759
0
        accum = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
3760
0
                _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accum);
3761
0
    }
3762
3763
0
    *s = hsum_float_8(accum);
3764
3765
#elif defined __AVX__
3766
    const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
3767
    const __m128i m4b  = _mm_set1_epi8(0x0f);
3768
3769
    __m256 accum = _mm256_setzero_ps();
3770
    for (int ibl = 0; ibl < nb; ++ibl) {
3771
        const uint8_t * qs = x[ibl].qs;
3772
        const int8_t  * q8 = y[ibl].qs;
3773
        uint16_t sh = x[ibl].scales_h;
3774
        __m128i sumi1_0 = _mm_setzero_si128();
3775
        __m128i sumi1_1 = _mm_setzero_si128();
3776
        __m128i sumi2_0 = _mm_setzero_si128();
3777
        __m128i sumi2_1 = _mm_setzero_si128();
3778
        for (int ib = 0; ib < QK_K/32; ib += 2) {
3779
            const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)qs); qs += 16;
3780
            const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)qs); qs += 16;
3781
            const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
3782
            const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
3783
            const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
3784
            const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
3785
            const __m128i q4b_1_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b));
3786
            const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b));
3787
            const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b));
3788
            const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b));
3789
            const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0);
3790
            const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
3791
            const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
3792
            const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
3793
            const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32;
3794
            const int16_t ls2 = ((x[ibl].scales_l[ib/2] >>  4) | ((sh << 2) & 0x30)) - 32;
3795
            sh >>= 4;
3796
            const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, _mm_set1_epi16(ls1));
3797
            const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, _mm_set1_epi16(ls1));
3798
            const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, _mm_set1_epi16(ls2));
3799
            const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, _mm_set1_epi16(ls2));
3800
            sumi1_0 = _mm_add_epi32(p_1_0, sumi1_0);
3801
            sumi1_1 = _mm_add_epi32(p_1_1, sumi1_1);
3802
            sumi2_0 = _mm_add_epi32(p_2_0, sumi2_0);
3803
            sumi2_1 = _mm_add_epi32(p_2_1, sumi2_1);
3804
        }
3805
        __m128i sumi12_0 = _mm_add_epi32(sumi1_0, sumi2_0);
3806
        __m128i sumi12_1 = _mm_add_epi32(sumi1_1, sumi2_1);
3807
        accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
3808
                _mm256_cvtepi32_ps(MM256_SET_M128I(sumi12_1, sumi12_0))), accum);
3809
    }
3810
3811
    *s = hsum_float_8(accum);
3812
3813
#else
3814
    UNUSED(x);
3815
    UNUSED(y);
3816
    UNUSED(nb);
3817
    ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
3818
#endif
3819
0
}
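Editor's note (not part of the covered file): iq4_xs uses the same codebook lookup as iq4_nl but adds per-sub-block scales. Each of the eight 32-weight sub-blocks gets a 6-bit scale built from a nibble of scales_l plus two bits peeled off scales_h, biased by -32, which is what the ((sh << 4) & 0x30) / sh >>= 4 sequence above computes incrementally. A scalar sketch in closed form; the helper name is illustrative.

#include <stdint.h>

// Scalar form of the ls1/ls2 computation above for sub-block ib (0..7):
// 4 low bits from a nibble of scales_l, 2 high bits from scales_h, minus 32.
static int iq4xs_subblock_scale(const uint8_t scales_l[4], uint16_t scales_h, int ib) {
    const int lo = (ib & 1) ? (scales_l[ib/2] >> 4) : (scales_l[ib/2] & 0x0f);
    const int hi = (scales_h >> (2*ib)) & 3;
    return ((hi << 4) | lo) - 32;
}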
3820