Coverage Report

Created: 2026-01-10 06:24

/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h
#pragma once

#include "ggml-cpu-impl.h"

#ifdef __ARM_FEATURE_SVE
#include <arm_sve.h>
#endif // __ARM_FEATURE_SVE

#if defined(__ARM_NEON) && !defined(__CUDACC__) && !defined(__MUSACC__)
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
//
//   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
//
#include <arm_neon.h>
#endif

#if defined(__riscv_v_intrinsic)
#include <riscv_vector.h>
#endif

#ifdef __cplusplus
extern "C" {
#endif

//
// simd mappings
//

// FP16 to FP32 conversion

// 16-bit float
// on Arm, we use __fp16
// on x86, we use uint16_t
//
// for old CUDA compilers (<= 11), we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/10616
// for     MUSA compilers        , we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/11843
//
#if defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__)
    #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) neon_compute_fp16_to_fp32(x)
    #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) neon_compute_fp32_to_fp16(x)

    #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)

    static inline float neon_compute_fp16_to_fp32(ggml_fp16_t h) {
        __fp16 tmp;
        memcpy(&tmp, &h, sizeof(ggml_fp16_t));
        return (float)tmp;
    }

    static inline ggml_fp16_t neon_compute_fp32_to_fp16(float f) {
        ggml_fp16_t res;
        __fp16 tmp = f;
        memcpy(&res, &tmp, sizeof(ggml_fp16_t));
        return res;
    }
#elif defined(__F16C__)
    #ifdef _MSC_VER
        #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
        #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
    #else
        #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
        #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
    #endif
#elif defined(__POWER9_VECTOR__)
    #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) power_compute_fp16_to_fp32(x)
    #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) power_compute_fp32_to_fp16(x)
    /* the inline asm below is about 12% faster than the lookup method */
    #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
    #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)

    static inline float power_compute_fp16_to_fp32(ggml_fp16_t h) {
        float f;
        double d;
        __asm__(
            "mtfprd %0,%2\n"
            "xscvhpdp %0,%0\n"
            "frsp %1,%0\n" :
            /* temp */ "=d"(d),
            /* out */  "=f"(f):
            /* in */   "r"(h));
        return f;
    }

    static inline ggml_fp16_t power_compute_fp32_to_fp16(float f) {
        double d;
        ggml_fp16_t r;
        __asm__( /* xscvdphp can work on double or single precision */
            "xscvdphp %0,%2\n"
            "mffprd %1,%0\n" :
            /* temp */ "=d"(d),
            /* out */  "=r"(r):
            /* in */   "f"(f));
        return r;
    }
#elif defined(__riscv) && defined(__riscv_zfhmin)
    static inline float riscv_compute_fp16_to_fp32(ggml_fp16_t h) {
        _Float16 hf;
        memcpy(&hf, &h, sizeof(ggml_fp16_t));
        return hf;
    }

    static inline ggml_fp16_t riscv_compute_fp32_to_fp16(float f) {
        ggml_fp16_t res;
        _Float16 hf = (_Float16)f;
        memcpy(&res, &hf, sizeof(ggml_fp16_t));
        return res;
    }

    #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) riscv_compute_fp16_to_fp32(x)
    #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) riscv_compute_fp32_to_fp16(x)
    #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
    #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
#endif

// precomputed f32 table for f16 (256 KB)
// defined in ggml-cpu.c, initialized in ggml_cpu_init()
extern float ggml_table_f32_f16[1 << 16];

// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
// so we define GGML_CPU_FP16_TO_FP32 and GGML_CPU_FP32_TO_FP16 elsewhere for NEON.
// This is also true for POWER9.
#if !defined(GGML_CPU_FP16_TO_FP32)
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
    uint16_t s;
    memcpy(&s, &f, sizeof(uint16_t));
    return ggml_table_f32_f16[s];
}

#define GGML_CPU_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
#endif
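
// Illustrative sketch (not part of the upstream header): ggml_cpu_init()
// fills ggml_table_f32_f16 by converting each of the 65536 possible bit
// patterns once, which is what makes the lookup above a single indexed load.
// The helper name is hypothetical and GGML_COMPUTE_FP16_TO_FP32 is assumed
// to be the scalar conversion macro from ggml-impl.h.
#ifdef GGML_COMPUTE_FP16_TO_FP32
static inline void ggml_build_f16_table_sketch(void) {
    for (uint32_t i = 0; i < (1u << 16); ++i) {
        const uint16_t u = (uint16_t) i;
        ggml_fp16_t h;
        memcpy(&h, &u, sizeof(h)); // reinterpret the raw bit pattern as fp16
        ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(h);
    }
}
#endif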

#if !defined(GGML_CPU_FP32_TO_FP16)
#define GGML_CPU_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
#endif
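
// Illustrative sketch (not part of the upstream header): at this point every
// build has a scalar GGML_CPU_FP16_TO_FP32 / GGML_CPU_FP32_TO_FP16 pair, so a
// round trip looks the same on all architectures. The helper name is
// hypothetical; note that fp32 -> fp16 is lossy, so the round trip is exact
// only for values already representable in half precision.
static inline float ggml_fp16_round_trip_sketch(float f) {
    const ggml_fp16_t h = GGML_CPU_FP32_TO_FP16(f); // narrow to 16 bits
    return GGML_CPU_FP16_TO_FP32(h);                // widen back to 32 bits
}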


// we define a common set of C macros which map to specific intrinsics based on the current architecture
// we then implement the fundamental computation operations below using only these macros
// adding support for new architectures requires defining the corresponding SIMD macros
//
// GGML_F32_STEP / GGML_F16_STEP
//   number of elements to process in a single step
//
// GGML_F32_EPR / GGML_F16_EPR
//   number of elements to fit in a single register
//
// a usage sketch showing how these macros compose into a kernel appears at
// the end of this header

#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_FMA)

#define GGML_SIMD

// F32 SVE
#define GGML_F32_EPR 8
#define DEFAULT_PG svptrue_b32()

#define GGML_F32xt                        svfloat32_t
#define GGML_F32xt_ZERO                   svdup_n_f32(0.0f)
#define GGML_F32xt_SET1(x)                svdup_n_f32(x)
#define GGML_F32xt_LOAD_IMPL(pg, a)       svld1_f32(pg, a)
#define GGML_F32xt_LOAD(a)                GGML_F32xt_LOAD_IMPL(DEFAULT_PG, a)
#define GGML_F32xt_STORE_IMPL(pg, a, b)   svst1_f32(pg, a, b)
#define GGML_F32xt_STORE(a, b)            GGML_F32xt_STORE_IMPL(DEFAULT_PG, a, b)
#define GGML_F32xt_FMA_IMPL(pg, a, b, c)  svmad_f32_m(pg, b, c, a)
#define GGML_F32xt_FMA(a, b, c)           GGML_F32xt_FMA_IMPL(DEFAULT_PG, a, b, c)
#define GGML_F32xt_ADD_IMPL(pg, a, b)     svadd_f32_m(pg, a, b)
#define GGML_F32xt_ADD(a, b)              GGML_F32xt_ADD_IMPL(DEFAULT_PG, a, b)
#define GGML_F32xt_MUL_IMPL(pg, a, b)     svmul_f32_m(pg, a, b)
#define GGML_F32xt_MUL(a, b)              GGML_F32xt_MUL_IMPL(DEFAULT_PG, a, b)
#define GGML_F32xt_REDUCE_ONE_IMPL(pg, a) svaddv(pg, a)
#define GGML_F32xt_REDUCE_ONE(a)          GGML_F32xt_REDUCE_ONE_IMPL(DEFAULT_PG, a)
#define GGML_F32xt_REDUCE_IMPL(pg, res, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8)  \
{                                                      \
    sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum2);        \
    sum3 = svadd_f32_m(DEFAULT_PG, sum3, sum4);        \
    sum5 = svadd_f32_m(DEFAULT_PG, sum5, sum6);        \
    sum7 = svadd_f32_m(DEFAULT_PG, sum7, sum8);        \
    sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum3);        \
    sum5 = svadd_f32_m(DEFAULT_PG, sum5, sum7);        \
    sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum5);        \
    (res) = (ggml_float) GGML_F32xt_REDUCE_ONE(sum1);  \
}
#define GGML_F32xt_REDUCE(res, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8)  \
        GGML_F32xt_REDUCE_IMPL(DEFAULT_PG, res, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8)

#define GGML_F32_VEC        GGML_F32xt
#define GGML_F32_VEC_ZERO   GGML_F32xt_ZERO
#define GGML_F32_VEC_SET1   GGML_F32xt_SET1
#define GGML_F32_VEC_LOAD   GGML_F32xt_LOAD
#define GGML_F32_VEC_STORE  GGML_F32xt_STORE
#define GGML_F32_VEC_FMA    GGML_F32xt_FMA
#define GGML_F32_VEC_ADD    GGML_F32xt_ADD
#define GGML_F32_VEC_MUL    GGML_F32xt_MUL
#define GGML_F32_VEC_REDUCE GGML_F32xt_REDUCE
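
// Illustrative sketch (not part of the upstream header): a dot product
// written against the SVE mappings above, keeping eight independent
// accumulators to match GGML_F32xt_REDUCE. The function name is
// hypothetical; it assumes n is a multiple of 8*GGML_F32_EPR and, like the
// hard-coded GGML_F32_EPR of 8, a 256-bit SVE vector length.
static inline ggml_float ggml_sve_dot_f32_sketch(int n, const float * x, const float * y) {
    svfloat32_t sum1 = GGML_F32xt_ZERO; svfloat32_t sum2 = GGML_F32xt_ZERO;
    svfloat32_t sum3 = GGML_F32xt_ZERO; svfloat32_t sum4 = GGML_F32xt_ZERO;
    svfloat32_t sum5 = GGML_F32xt_ZERO; svfloat32_t sum6 = GGML_F32xt_ZERO;
    svfloat32_t sum7 = GGML_F32xt_ZERO; svfloat32_t sum8 = GGML_F32xt_ZERO;
    for (int i = 0; i < n; i += 8*GGML_F32_EPR) {
        // GGML_F32xt_FMA(a, b, c) computes b*c + a under the default predicate
        sum1 = GGML_F32xt_FMA(sum1, GGML_F32xt_LOAD(x + i + 0*GGML_F32_EPR), GGML_F32xt_LOAD(y + i + 0*GGML_F32_EPR));
        sum2 = GGML_F32xt_FMA(sum2, GGML_F32xt_LOAD(x + i + 1*GGML_F32_EPR), GGML_F32xt_LOAD(y + i + 1*GGML_F32_EPR));
        sum3 = GGML_F32xt_FMA(sum3, GGML_F32xt_LOAD(x + i + 2*GGML_F32_EPR), GGML_F32xt_LOAD(y + i + 2*GGML_F32_EPR));
        sum4 = GGML_F32xt_FMA(sum4, GGML_F32xt_LOAD(x + i + 3*GGML_F32_EPR), GGML_F32xt_LOAD(y + i + 3*GGML_F32_EPR));
        sum5 = GGML_F32xt_FMA(sum5, GGML_F32xt_LOAD(x + i + 4*GGML_F32_EPR), GGML_F32xt_LOAD(y + i + 4*GGML_F32_EPR));
        sum6 = GGML_F32xt_FMA(sum6, GGML_F32xt_LOAD(x + i + 5*GGML_F32_EPR), GGML_F32xt_LOAD(y + i + 5*GGML_F32_EPR));
        sum7 = GGML_F32xt_FMA(sum7, GGML_F32xt_LOAD(x + i + 6*GGML_F32_EPR), GGML_F32xt_LOAD(y + i + 6*GGML_F32_EPR));
        sum8 = GGML_F32xt_FMA(sum8, GGML_F32xt_LOAD(x + i + 7*GGML_F32_EPR), GGML_F32xt_LOAD(y + i + 7*GGML_F32_EPR));
    }
    ggml_float res;
    GGML_F32xt_REDUCE(res, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8);
    return res;
}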

// F16 SVE
#define DEFAULT_PG32    svptrue_b32()
#define DEFAULT_PG16    svptrue_b16()

#define GGML_F32Cxt                         svfloat16_t
#define GGML_F32Cxt_ZERO                    svdup_n_f16(0.0f)
#define GGML_F32Cxt_SET1(x)                 svdup_n_f16(x)
#define GGML_F32Cxt_LOAD(p)                 svld1_f16(DEFAULT_PG16, (const __fp16 *)(p))
#define GGML_F32Cxt_STORE(dst_ptr, src_vec) svst1_f16(DEFAULT_PG16, (__fp16 *)(dst_ptr), (src_vec))

#define GGML_F32Cxt_FMA_IMPL(pg, a, b, c)   svmad_f16_x(pg, b, c, a)
#define GGML_F32Cxt_FMA(a, b, c)            GGML_F32Cxt_FMA_IMPL(DEFAULT_PG16, a, b, c)
#define GGML_F32Cxt_ADD_IMPL(pg, a, b)      svadd_f16_x(pg, a, b)
#define GGML_F32Cxt_ADD(a, b)               GGML_F32Cxt_ADD_IMPL(DEFAULT_PG16, a, b)
#define GGML_F32Cxt_MUL_IMPL(pg, a, b)      svmul_f16_x(pg, a, b)
#define GGML_F32Cxt_MUL(a, b)               GGML_F32Cxt_MUL_IMPL(DEFAULT_PG16, a, b)
#define GGML_F32Cxt_REDUCE                  GGML_F16xt_REDUCE_MIXED

#define GGML_F16x_VEC                GGML_F32Cxt
#define GGML_F16x_VEC_ZERO           GGML_F32Cxt_ZERO
#define GGML_F16x_VEC_SET1           GGML_F32Cxt_SET1
#define GGML_F16x_VEC_LOAD(p, i)     GGML_F32Cxt_LOAD(p)
#define GGML_F16x_VEC_STORE(p, r, i) GGML_F32Cxt_STORE((__fp16 *)(p), r)
#define GGML_F16x_VEC_FMA            GGML_F32Cxt_FMA
#define GGML_F16x_VEC_ADD            GGML_F32Cxt_ADD
#define GGML_F16x_VEC_MUL            GGML_F32Cxt_MUL
#define GGML_F16x_VEC_REDUCE         GGML_F32Cxt_REDUCE

#define GGML_F16xt_REDUCE_ONE_IMPL(pg, a) svaddv_f16(pg, a)
#define GGML_F16xt_REDUCE_ONE(a)          GGML_F16xt_REDUCE_ONE_IMPL(DEFAULT_PG16, a)

#define GGML_F16xt_REDUCE_MIXED_IMPL(pg16, res, sum1, sum2, sum3, sum4)  \
{                                                      \
    sum1 = svadd_f16_x(pg16, sum1, sum2);              \
    sum3 = svadd_f16_x(pg16, sum3, sum4);              \
    sum1 = svadd_f16_x(pg16, sum1, sum3);              \
    __fp16 sum_f16 = svaddv_f16(pg16, sum1);           \
    (res) = (ggml_float) sum_f16;                      \
}
#define GGML_F16xt_REDUCE_MIXED(res, sum1, sum2, sum3, sum4)  \
        GGML_F16xt_REDUCE_MIXED_IMPL(DEFAULT_PG16, res, sum1, sum2, sum3, sum4)

// F16 NEON

#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
    #define GGML_F16_STEP 32
    #define GGML_F16_EPR  8

    #define GGML_F16x8              float16x8_t
    #define GGML_F16x8_ZERO         vdupq_n_f16(0.0f)
    #define GGML_F16x8_SET1(x)      vdupq_n_f16(x)
    #define GGML_F16x8_LOAD(x)      vld1q_f16((const __fp16 *)(x))
    #define GGML_F16x8_STORE        vst1q_f16
    #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
    #define GGML_F16x8_ADD          vaddq_f16
    #define GGML_F16x8_MUL          vmulq_f16
    #define GGML_F16x8_REDUCE(res, x)                               \
    do {                                                            \
        int offset = GGML_F16_ARR >> 1;                             \
        for (int i = 0; i < offset; ++i) {                          \
            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);              \
        }                                                           \
        offset >>= 1;                                               \
        for (int i = 0; i < offset; ++i) {                          \
            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);              \
        }                                                           \
        offset >>= 1;                                               \
        for (int i = 0; i < offset; ++i) {                          \
            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);              \
        }                                                           \
        const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \
        const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \
        (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1));         \
    } while (0)

    #define GGML_F16_VEC                GGML_F16x8
    #define GGML_F16_VEC_ZERO           GGML_F16x8_ZERO
    #define GGML_F16_VEC_SET1           GGML_F16x8_SET1
    #define GGML_F16_VEC_LOAD(p, i)     GGML_F16x8_LOAD(p)
    #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((__fp16 *)(p), (r)[i])
    #define GGML_F16_VEC_FMA            GGML_F16x8_FMA
    #define GGML_F16_VEC_ADD            GGML_F16x8_ADD
    #define GGML_F16_VEC_MUL            GGML_F16x8_MUL
    #define GGML_F16_VEC_REDUCE         GGML_F16x8_REDUCE
#else
    // if FP16 vector arithmetic is not supported, we use FP32 instead
    // and take advantage of the vcvt_ functions to convert to/from FP16

    #define GGML_F16_STEP 16
    #define GGML_F16_EPR  4

    #define GGML_F32Cx4              float32x4_t
    #define GGML_F32Cx4_ZERO         vdupq_n_f32(0.0f)
    #define GGML_F32Cx4_SET1(x)      vdupq_n_f32(x)
    #define GGML_F32Cx4_LOAD(x)      vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
    #define GGML_F32Cx4_STORE(x, y)  vst1_f16(x, vcvt_f16_f32(y))
    #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
    #define GGML_F32Cx4_ADD          vaddq_f32
    #define GGML_F32Cx4_MUL          vmulq_f32
    #define GGML_F32Cx4_REDUCE       GGML_F32x4_REDUCE

    #define GGML_F16_VEC                GGML_F32Cx4
    #define GGML_F16_VEC_ZERO           GGML_F32Cx4_ZERO
    #define GGML_F16_VEC_SET1           GGML_F32Cx4_SET1
    #define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx4_LOAD(p)
    #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((__fp16 *)(p), r[i])
    #define GGML_F16_VEC_FMA            GGML_F32Cx4_FMA
    #define GGML_F16_VEC_ADD            GGML_F32Cx4_ADD
    #define GGML_F16_VEC_MUL            GGML_F32Cx4_MUL
    #define GGML_F16_VEC_REDUCE         GGML_F32Cx4_REDUCE
#endif

#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)

#define GGML_SIMD

// F32 NEON

#define GGML_F32_STEP 16
#define GGML_F32_EPR  4

#define GGML_F32x4              float32x4_t
#define GGML_F32x4_ZERO         vdupq_n_f32(0.0f)
#define GGML_F32x4_SET1(x)      vdupq_n_f32(x)
#define GGML_F32x4_LOAD         vld1q_f32
#define GGML_F32x4_STORE        vst1q_f32
#define GGML_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c)
#define GGML_F32x4_ADD          vaddq_f32
#define GGML_F32x4_MUL          vmulq_f32
#define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
#define GGML_F32x4_REDUCE(res, x)                       \
{                                                       \
    int offset = GGML_F32_ARR >> 1;                     \
    for (int i = 0; i < offset; ++i) {                  \
        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]);      \
    }                                                   \
    offset >>= 1;                                       \
    for (int i = 0; i < offset; ++i) {                  \
        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]);      \
    }                                                   \
    offset >>= 1;                                       \
    for (int i = 0; i < offset; ++i) {                  \
        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]);      \
    }                                                   \
    (res) = (ggml_float) GGML_F32x4_REDUCE_ONE((x)[0]); \
}

#define GGML_F32_VEC        GGML_F32x4
#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE

// F16 NEON

#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
    #define GGML_F16_STEP 32
    #define GGML_F16_EPR  8

    #define GGML_F16x8              float16x8_t
    #define GGML_F16x8_ZERO         vdupq_n_f16(0.0f)
    #define GGML_F16x8_SET1(x)      vdupq_n_f16(x)
    #define GGML_F16x8_LOAD(x)      vld1q_f16((const __fp16 *)(x))
    #define GGML_F16x8_STORE        vst1q_f16
    #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
    #define GGML_F16x8_ADD          vaddq_f16
    #define GGML_F16x8_MUL          vmulq_f16
    #define GGML_F16x8_REDUCE(res, x)                               \
    do {                                                            \
        int offset = GGML_F16_ARR >> 1;                             \
        for (int i = 0; i < offset; ++i) {                          \
            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);              \
        }                                                           \
        offset >>= 1;                                               \
        for (int i = 0; i < offset; ++i) {                          \
            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);              \
        }                                                           \
        offset >>= 1;                                               \
        for (int i = 0; i < offset; ++i) {                          \
            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);              \
        }                                                           \
        const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \
        const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \
        (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1));         \
    } while (0)

    #define GGML_F16_VEC                GGML_F16x8
    #define GGML_F16_VEC_ZERO           GGML_F16x8_ZERO
    #define GGML_F16_VEC_SET1           GGML_F16x8_SET1
    #define GGML_F16_VEC_LOAD(p, i)     GGML_F16x8_LOAD(p)
    #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((__fp16 *)(p), (r)[i])
    #define GGML_F16_VEC_FMA            GGML_F16x8_FMA
    #define GGML_F16_VEC_ADD            GGML_F16x8_ADD
    #define GGML_F16_VEC_MUL            GGML_F16x8_MUL
    #define GGML_F16_VEC_REDUCE         GGML_F16x8_REDUCE
#else
    // if FP16 vector arithmetic is not supported, we use FP32 instead
    // and take advantage of the vcvt_ functions to convert to/from FP16

    #define GGML_F16_STEP 16
    #define GGML_F16_EPR  4

    #define GGML_F32Cx4              float32x4_t
    #define GGML_F32Cx4_ZERO         vdupq_n_f32(0.0f)
    #define GGML_F32Cx4_SET1(x)      vdupq_n_f32(x)
    #define GGML_F32Cx4_LOAD(x)      vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
    #define GGML_F32Cx4_STORE(x, y)  vst1_f16(x, vcvt_f16_f32(y))
    #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
    #define GGML_F32Cx4_ADD          vaddq_f32
    #define GGML_F32Cx4_MUL          vmulq_f32
    #define GGML_F32Cx4_REDUCE       GGML_F32x4_REDUCE

    #define GGML_F16_VEC                GGML_F32Cx4
    #define GGML_F16_VEC_ZERO           GGML_F32Cx4_ZERO
    #define GGML_F16_VEC_SET1           GGML_F32Cx4_SET1
    #define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx4_LOAD(p)
    #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((__fp16 *)(p), r[i])
    #define GGML_F16_VEC_FMA            GGML_F32Cx4_FMA
    #define GGML_F16_VEC_ADD            GGML_F32Cx4_ADD
    #define GGML_F16_VEC_MUL            GGML_F32Cx4_MUL
    #define GGML_F16_VEC_REDUCE         GGML_F32Cx4_REDUCE
#endif
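
// Illustrative sketch (not part of the upstream header): the same source
// scales an fp16 buffer regardless of which sub-branch above was taken — on
// native-FP16 hardware GGML_F16_VEC is float16x8_t, otherwise it is a
// float32x4_t converted on load/store. The function name is hypothetical
// and the leftover tail is left to scalar code for brevity.
static inline void ggml_f16_scale_sketch(int n, ggml_fp16_t * y, float s) {
    const GGML_F16_VEC vs = GGML_F16_VEC_SET1(s);
    GGML_F16_VEC vy[1];
    for (int i = 0; i + GGML_F16_EPR <= n; i += GGML_F16_EPR) {
        vy[0] = GGML_F16_VEC_LOAD(y + i, 0);  // load (and widen, if needed)
        vy[0] = GGML_F16_VEC_MUL(vy[0], vs);  // scale in registers
        GGML_F16_VEC_STORE(y + i, vy, 0);     // store (and narrow, if needed)
    }
    // the remaining n % GGML_F16_EPR elements would use the scalar
    // GGML_CPU_FP16_TO_FP32 / GGML_CPU_FP32_TO_FP16 conversions
}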

#elif defined(__AVX512F__)

#define GGML_SIMD

// F32 AVX512

#define GGML_F32_STEP 64
#define GGML_F32_EPR  16

#define GGML_F32x16         __m512
#define GGML_F32x16_ZERO    _mm512_setzero_ps()
#define GGML_F32x16_SET1(x) _mm512_set1_ps(x)
#define GGML_F32x16_LOAD    _mm512_loadu_ps
#define GGML_F32x16_STORE   _mm512_storeu_ps
// _mm512_fmadd_ps is defined in AVX512F so no guard is required
#define GGML_F32x16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
#define GGML_F32x16_ADD     _mm512_add_ps
#define GGML_F32x16_MUL     _mm512_mul_ps
#define GGML_F32x16_REDUCE(res, x)                                    \
do {                                                                  \
    int offset = GGML_F32_ARR >> 1;                                   \
    for (int i = 0; i < offset; ++i) {                                \
        x[i] = _mm512_add_ps(x[i], x[offset+i]);                      \
    }                                                                 \
    offset >>= 1;                                                     \
    for (int i = 0; i < offset; ++i) {                                \
        x[i] = _mm512_add_ps(x[i], x[offset+i]);                      \
    }                                                                 \
    offset >>= 1;                                                     \
    for (int i = 0; i < offset; ++i) {                                \
        x[i] = _mm512_add_ps(x[i], x[offset+i]);                      \
    }                                                                 \
    res = (ggml_float) _mm512_reduce_add_ps(x[0]);                    \
} while (0)

// TODO: is this optimal ?

#define GGML_F32_VEC        GGML_F32x16
#define GGML_F32_VEC_ZERO   GGML_F32x16_ZERO
#define GGML_F32_VEC_SET1   GGML_F32x16_SET1
#define GGML_F32_VEC_LOAD   GGML_F32x16_LOAD
#define GGML_F32_VEC_STORE  GGML_F32x16_STORE
#define GGML_F32_VEC_FMA    GGML_F32x16_FMA
#define GGML_F32_VEC_ADD    GGML_F32x16_ADD
#define GGML_F32_VEC_MUL    GGML_F32x16_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x16_REDUCE

// F16 AVX512

#define GGML_F16_STEP 64
#define GGML_F16_EPR  16

// AVX512 has FP16 extension (AVX512_FP16) but I don't have it on my machine so I use FP32 instead

#define GGML_F32Cx16             __m512
#define GGML_F32Cx16_ZERO        _mm512_setzero_ps()
#define GGML_F32Cx16_SET1(x)     _mm512_set1_ps(x)

// unlike the _mm256_cvt intrinsics that require F16C, _mm512_cvt is defined in AVX512F
// so no F16C guard is required
#define GGML_F32Cx16_LOAD(x)     _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(x)))
#define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0))

#define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
#define GGML_F32Cx16_ADD         _mm512_add_ps
#define GGML_F32Cx16_MUL         _mm512_mul_ps
#define GGML_F32Cx16_REDUCE(res, x)                               \
do {                                                              \
    int offset = GGML_F32_ARR >> 1;                               \
    for (int i = 0; i < offset; ++i) {                            \
        x[i] = _mm512_add_ps(x[i], x[offset+i]);                  \
    }                                                             \
    offset >>= 1;                                                 \
    for (int i = 0; i < offset; ++i) {                            \
        x[i] = _mm512_add_ps(x[i], x[offset+i]);                  \
    }                                                             \
    offset >>= 1;                                                 \
    for (int i = 0; i < offset; ++i) {                            \
        x[i] = _mm512_add_ps(x[i], x[offset+i]);                  \
    }                                                             \
    res = (ggml_float) _mm512_reduce_add_ps(x[0]);                \
} while (0)

#define GGML_F16_VEC                GGML_F32Cx16
#define GGML_F16_VEC_ZERO           GGML_F32Cx16_ZERO
#define GGML_F16_VEC_SET1           GGML_F32Cx16_SET1
#define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx16_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx16_STORE(p, r[i])
#define GGML_F16_VEC_FMA            GGML_F32Cx16_FMA
#define GGML_F16_VEC_ADD            GGML_F32Cx16_ADD
#define GGML_F16_VEC_MUL            GGML_F32Cx16_MUL

#define GGML_F16_VEC_REDUCE         GGML_F32Cx16_REDUCE
#elif defined(__AVX__)

#define GGML_SIMD

// F32 AVX

#define GGML_F32_STEP 32
#define GGML_F32_EPR  8

#define GGML_F32x8         __m256
#define GGML_F32x8_ZERO    _mm256_setzero_ps()
#define GGML_F32x8_SET1(x) _mm256_set1_ps(x)
#define GGML_F32x8_LOAD    _mm256_loadu_ps
#define GGML_F32x8_STORE   _mm256_storeu_ps
#if defined(__FMA__)
    #define GGML_F32x8_FMA(a, b, c) _mm256_fmadd_ps(b, c, a)
#else
    #define GGML_F32x8_FMA(a, b, c) _mm256_add_ps(_mm256_mul_ps(b, c), a)
#endif
#define GGML_F32x8_ADD     _mm256_add_ps
#define GGML_F32x8_MUL     _mm256_mul_ps
#define GGML_F32x8_REDUCE(res, x)                                 \
do {                                                              \
    int offset = GGML_F32_ARR >> 1;                               \
    for (int i = 0; i < offset; ++i) {                            \
        x[i] = _mm256_add_ps(x[i], x[offset+i]);                  \
    }                                                             \
    offset >>= 1;                                                 \
    for (int i = 0; i < offset; ++i) {                            \
        x[i] = _mm256_add_ps(x[i], x[offset+i]);                  \
    }                                                             \
    offset >>= 1;                                                 \
    for (int i = 0; i < offset; ++i) {                            \
        x[i] = _mm256_add_ps(x[i], x[offset+i]);                  \
    }                                                             \
    const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]),    \
                                 _mm256_extractf128_ps(x[0], 1)); \
    const __m128 t1 = _mm_hadd_ps(t0, t0);                        \
    res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t1, t1));        \
} while (0)
// TODO: is this optimal ?

#define GGML_F32_VEC        GGML_F32x8
#define GGML_F32_VEC_ZERO   GGML_F32x8_ZERO
#define GGML_F32_VEC_SET1   GGML_F32x8_SET1
#define GGML_F32_VEC_LOAD   GGML_F32x8_LOAD
#define GGML_F32_VEC_STORE  GGML_F32x8_STORE
#define GGML_F32_VEC_FMA    GGML_F32x8_FMA
#define GGML_F32_VEC_ADD    GGML_F32x8_ADD
#define GGML_F32_VEC_MUL    GGML_F32x8_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE

// F16 AVX

#define GGML_F16_STEP 32
#define GGML_F16_EPR  8

// F16 arithmetic is not supported by AVX, so we use F32 instead

#define GGML_F32Cx8             __m256
#define GGML_F32Cx8_ZERO        _mm256_setzero_ps()
#define GGML_F32Cx8_SET1(x)     _mm256_set1_ps(x)

#if defined(__F16C__)
// the _mm256_cvt intrinsics require F16C
#define GGML_F32Cx8_LOAD(x)     _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x)))
#define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
#else
static inline __m256 __avx_f32cx8_load(const ggml_fp16_t * x) {
    float tmp[8];

    for (int i = 0; i < 8; i++) {
        tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]);
    }

    return _mm256_loadu_ps(tmp);
}
static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
    float arr[8];

    _mm256_storeu_ps(arr, y);

    for (int i = 0; i < 8; i++)
        x[i] = GGML_CPU_FP32_TO_FP16(arr[i]);
}
#define GGML_F32Cx8_LOAD(x)     __avx_f32cx8_load(x)
#define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y)
#endif

#define GGML_F32Cx8_FMA         GGML_F32x8_FMA
#define GGML_F32Cx8_ADD         _mm256_add_ps
#define GGML_F32Cx8_MUL         _mm256_mul_ps
#define GGML_F32Cx8_REDUCE      GGML_F32x8_REDUCE

#define GGML_F16_VEC                GGML_F32Cx8
#define GGML_F16_VEC_ZERO           GGML_F32Cx8_ZERO
#define GGML_F16_VEC_SET1           GGML_F32Cx8_SET1
#define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx8_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i])
#define GGML_F16_VEC_FMA            GGML_F32Cx8_FMA
#define GGML_F16_VEC_ADD            GGML_F32Cx8_ADD
#define GGML_F16_VEC_MUL            GGML_F32Cx8_MUL
#define GGML_F16_VEC_REDUCE         GGML_F32Cx8_REDUCE

#elif defined(__POWER9_VECTOR__)

#define GGML_SIMD

// F32 POWER9

#define GGML_F32_STEP 32
#define GGML_F32_EPR  4

#define GGML_F32x4              vector float
#define GGML_F32x4_ZERO         {0.0f}
#define GGML_F32x4_SET1         vec_splats
#define GGML_F32x4_LOAD(p)      vec_xl(0, p)
#define GGML_F32x4_STORE(p, r)  vec_xst(r, 0, p)
#define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
#define GGML_F32x4_ADD          vec_add
#define GGML_F32x4_MUL          vec_mul
#define GGML_F32x4_REDUCE(res, x)              \
{                                              \
    int offset = GGML_F32_ARR >> 1;            \
    for (int i = 0; i < offset; ++i) {         \
        x[i] = vec_add(x[i], x[offset+i]);     \
    }                                          \
    offset >>= 1;                              \
    for (int i = 0; i < offset; ++i) {         \
        x[i] = vec_add(x[i], x[offset+i]);     \
    }                                          \
    offset >>= 1;                              \
    for (int i = 0; i < offset; ++i) {         \
        x[i] = vec_add(x[i], x[offset+i]);     \
    }                                          \
    res = vec_extract(x[0], 0) +               \
          vec_extract(x[0], 1) +               \
          vec_extract(x[0], 2) +               \
          vec_extract(x[0], 3);                \
}

#define GGML_F32_VEC        GGML_F32x4
#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE

// F16 POWER9
#define GGML_F16_STEP       GGML_F32_STEP
#define GGML_F16_EPR        GGML_F32_EPR
#define GGML_F16_VEC        GGML_F32x4
#define GGML_F16_VEC_ZERO   GGML_F32x4_ZERO
#define GGML_F16_VEC_SET1   GGML_F32x4_SET1
#define GGML_F16_VEC_FMA    GGML_F32x4_FMA
#define GGML_F16_VEC_ADD    GGML_F32x4_ADD
#define GGML_F16_VEC_MUL    GGML_F32x4_MUL
#define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
// Use vec_xl, not vec_ld, in case the load address is not aligned.
#define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ?                   \
  vec_extract_fp32_from_shorth(vec_xl(0, p - GGML_F16_EPR)) : \
  vec_extract_fp32_from_shortl(vec_xl(0, p))
static inline unsigned char ggml_endian_byte(int i) {
    uint16_t tmp_val = 1;
    return ((unsigned char *)&tmp_val)[i];
}
#define GGML_ENDIAN_BYTE(i) ggml_endian_byte(i)
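// Note (editorial): ggml_endian_byte(i) returns byte i of a uint16_t holding
// the value 1, i.e. 0 or 1 depending on host byte order; the store macro
// below uses it to swap which of the two registers gets packed first, so the
// packed halves land in memory in the right order on either endianness.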
#define GGML_F16_VEC_STORE(p, r, i)                             \
  if (i & 0x1)                                                  \
    vec_xst(vec_pack_to_short_fp32(r[i - GGML_ENDIAN_BYTE(1)],  \
                                   r[i - GGML_ENDIAN_BYTE(0)]), \
            0, p - GGML_F16_EPR)

#elif defined(__wasm_simd128__)

#define GGML_SIMD

// F32 WASM

#define GGML_F32_STEP 16
#define GGML_F32_EPR  4

#define GGML_F32x4              v128_t
#define GGML_F32x4_ZERO         wasm_f32x4_splat(0.0f)
#define GGML_F32x4_SET1(x)      wasm_f32x4_splat(x)
#define GGML_F32x4_LOAD         wasm_v128_load
#define GGML_F32x4_STORE        wasm_v128_store
#define GGML_F32x4_FMA(a, b, c) wasm_f32x4_add(wasm_f32x4_mul(b, c), a)
#define GGML_F32x4_ADD          wasm_f32x4_add
#define GGML_F32x4_MUL          wasm_f32x4_mul
#define GGML_F32x4_REDUCE(res, x)                  \
{                                                  \
    int offset = GGML_F32_ARR >> 1;                \
    for (int i = 0; i < offset; ++i) {             \
        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
    }                                              \
    offset >>= 1;                                  \
    for (int i = 0; i < offset; ++i) {             \
        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
    }                                              \
    offset >>= 1;                                  \
    for (int i = 0; i < offset; ++i) {             \
        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
    }                                              \
    res = wasm_f32x4_extract_lane(x[0], 0) +       \
          wasm_f32x4_extract_lane(x[0], 1) +       \
          wasm_f32x4_extract_lane(x[0], 2) +       \
          wasm_f32x4_extract_lane(x[0], 3);        \
}

#define GGML_F32_VEC        GGML_F32x4
#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE

// F16 WASM

#define GGML_F16_STEP 16
#define GGML_F16_EPR  4

inline static v128_t __wasm_f16x4_load(const ggml_fp16_t * p) {
    float tmp[4];

    tmp[0] = GGML_CPU_FP16_TO_FP32(p[0]);
    tmp[1] = GGML_CPU_FP16_TO_FP32(p[1]);
    tmp[2] = GGML_CPU_FP16_TO_FP32(p[2]);
    tmp[3] = GGML_CPU_FP16_TO_FP32(p[3]);

    return wasm_v128_load(tmp);
}

inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
    float tmp[4];

    wasm_v128_store(tmp, x);

    p[0] = GGML_CPU_FP32_TO_FP16(tmp[0]);
    p[1] = GGML_CPU_FP32_TO_FP16(tmp[1]);
    p[2] = GGML_CPU_FP32_TO_FP16(tmp[2]);
    p[3] = GGML_CPU_FP32_TO_FP16(tmp[3]);
}

#define GGML_F16x4             v128_t
#define GGML_F16x4_ZERO        wasm_f32x4_splat(0.0f)
#define GGML_F16x4_SET1(x)     wasm_f32x4_splat(x)
#define GGML_F16x4_LOAD(x)     __wasm_f16x4_load(x)
#define GGML_F16x4_STORE(x, y) __wasm_f16x4_store(x, y)
#define GGML_F16x4_FMA         GGML_F32x4_FMA
#define GGML_F16x4_ADD         wasm_f32x4_add
#define GGML_F16x4_MUL         wasm_f32x4_mul
#define GGML_F16x4_REDUCE(res, x)                           \
{                                                           \
    int offset = GGML_F16_ARR >> 1;                         \
    for (int i = 0; i < offset; ++i) {                      \
        x[i] = wasm_f32x4_add(x[i], x[offset+i]);           \
    }                                                       \
    offset >>= 1;                                           \
    for (int i = 0; i < offset; ++i) {                      \
        x[i] = wasm_f32x4_add(x[i], x[offset+i]);           \
    }                                                       \
    offset >>= 1;                                           \
    for (int i = 0; i < offset; ++i) {                      \
        x[i] = wasm_f32x4_add(x[i], x[offset+i]);           \
    }                                                       \
    res = (ggml_float) (wasm_f32x4_extract_lane(x[0], 0) +  \
          wasm_f32x4_extract_lane(x[0], 1) +                \
          wasm_f32x4_extract_lane(x[0], 2) +                \
          wasm_f32x4_extract_lane(x[0], 3));                \
}

#define GGML_F16_VEC                GGML_F16x4
#define GGML_F16_VEC_ZERO           GGML_F16x4_ZERO
#define GGML_F16_VEC_SET1           GGML_F16x4_SET1
#define GGML_F16_VEC_LOAD(p, i)     GGML_F16x4_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x4_STORE(p, r[i])
#define GGML_F16_VEC_FMA            GGML_F16x4_FMA
#define GGML_F16_VEC_ADD            GGML_F16x4_ADD
#define GGML_F16_VEC_MUL            GGML_F16x4_MUL
#define GGML_F16_VEC_REDUCE         GGML_F16x4_REDUCE

#elif defined(__SSE3__)

#define GGML_SIMD

// F32 SSE

#define GGML_F32_STEP 32
#define GGML_F32_EPR  4

#define GGML_F32x4         __m128
#define GGML_F32x4_ZERO    _mm_setzero_ps()
#define GGML_F32x4_SET1(x) _mm_set1_ps(x)
#define GGML_F32x4_LOAD    _mm_loadu_ps
#define GGML_F32x4_STORE   _mm_storeu_ps
#if defined(__FMA__)
    // TODO: Does this work?
    #define GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a)
#else
    #define GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a)
#endif
#define GGML_F32x4_ADD     _mm_add_ps
#define GGML_F32x4_MUL     _mm_mul_ps
#define GGML_F32x4_REDUCE(res, x)                                 \
{                                                                 \
    int offset = GGML_F32_ARR >> 1;                               \
    for (int i = 0; i < offset; ++i) {                            \
        x[i] = _mm_add_ps(x[i], x[offset+i]);                     \
    }                                                             \
    offset >>= 1;                                                 \
    for (int i = 0; i < offset; ++i) {                            \
        x[i] = _mm_add_ps(x[i], x[offset+i]);                     \
    }                                                             \
    offset >>= 1;                                                 \
    for (int i = 0; i < offset; ++i) {                            \
        x[i] = _mm_add_ps(x[i], x[offset+i]);                     \
    }                                                             \
    const __m128 t0 = _mm_hadd_ps(x[0], x[0]);                    \
    res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t0, t0));        \
}
// TODO: is this optimal ?

#define GGML_F32_VEC        GGML_F32x4
#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE

// F16 SSE

#define GGML_F16_STEP 32
#define GGML_F16_EPR  4

static inline __m128 __sse_f16x4_load(const ggml_fp16_t * x) {
    float tmp[4];

    tmp[0] = GGML_CPU_FP16_TO_FP32(x[0]);
    tmp[1] = GGML_CPU_FP16_TO_FP32(x[1]);
    tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]);
    tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]);

    return _mm_loadu_ps(tmp);
}

static inline void __sse_f16x4_store(ggml_fp16_t * x, __m128 y) {
    float arr[4];

    _mm_storeu_ps(arr, y);

    x[0] = GGML_CPU_FP32_TO_FP16(arr[0]);
    x[1] = GGML_CPU_FP32_TO_FP16(arr[1]);
    x[2] = GGML_CPU_FP32_TO_FP16(arr[2]);
    x[3] = GGML_CPU_FP32_TO_FP16(arr[3]);
}

#define GGML_F32Cx4             __m128
#define GGML_F32Cx4_ZERO        _mm_setzero_ps()
#define GGML_F32Cx4_SET1(x)     _mm_set1_ps(x)
#define GGML_F32Cx4_LOAD(x)     __sse_f16x4_load(x)
#define GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y)
#define GGML_F32Cx4_FMA         GGML_F32x4_FMA
#define GGML_F32Cx4_ADD         _mm_add_ps
#define GGML_F32Cx4_MUL         _mm_mul_ps
#define GGML_F32Cx4_REDUCE      GGML_F32x4_REDUCE

#define GGML_F16_VEC                 GGML_F32Cx4
#define GGML_F16_VEC_ZERO            GGML_F32Cx4_ZERO
#define GGML_F16_VEC_SET1            GGML_F32Cx4_SET1
#define GGML_F16_VEC_LOAD(p, i)      GGML_F32Cx4_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i)  GGML_F32Cx4_STORE(p, r[i])
#define GGML_F16_VEC_FMA             GGML_F32Cx4_FMA
#define GGML_F16_VEC_ADD             GGML_F32Cx4_ADD
#define GGML_F16_VEC_MUL             GGML_F32Cx4_MUL
#define GGML_F16_VEC_REDUCE          GGML_F32Cx4_REDUCE

#elif defined(__loongarch_asx)

#define GGML_SIMD

// F32 LASX
#define GGML_F32_STEP 32
#define GGML_F32_EPR  8

#define GGML_F32x8         __m256
#define GGML_F32x8_ZERO    (__m256)__lasx_xvldi(0)
#define GGML_F32x8_SET1(x) (__m256)__lasx_xvreplfr2vr_s((x))
#define GGML_F32x8_LOAD(x) (__m256)__lasx_xvld((x), 0)
#define GGML_F32x8_STORE(x, y)  __lasx_xvst((y), (x), 0)
#define GGML_F32x8_FMA(a, b, c) __lasx_xvfmadd_s(b, c, a)
#define GGML_F32x8_ADD     __lasx_xvfadd_s
#define GGML_F32x8_MUL     __lasx_xvfmul_s
#define GGML_F32x8_REDUCE(res, x)                                 \
do {                                                              \
    int offset = GGML_F32_ARR >> 1;                               \
    for (int i = 0; i < offset; ++i) {                            \
        x[i] = __lasx_xvfadd_s(x[i], x[offset+i]);                \
    }                                                             \
    offset >>= 1;                                                 \
    for (int i = 0; i < offset; ++i) {                            \
        x[i] = __lasx_xvfadd_s(x[i], x[offset+i]);                \
    }                                                             \
    offset >>= 1;                                                 \
    for (int i = 0; i < offset; ++i) {                            \
        x[i] = __lasx_xvfadd_s(x[i], x[offset+i]);                \
    }                                                             \
    float *tmp_p = (float *)&x[0];                                \
    res = tmp_p[0] + tmp_p[1] + tmp_p[2] + tmp_p[3] + tmp_p[4] + tmp_p[5] + tmp_p[6] + tmp_p[7];  \
} while (0)
// TODO: is this optimal ?

#define GGML_F32_VEC        GGML_F32x8
#define GGML_F32_VEC_ZERO   GGML_F32x8_ZERO
#define GGML_F32_VEC_SET1   GGML_F32x8_SET1
#define GGML_F32_VEC_LOAD   GGML_F32x8_LOAD
#define GGML_F32_VEC_STORE  GGML_F32x8_STORE
#define GGML_F32_VEC_FMA    GGML_F32x8_FMA
#define GGML_F32_VEC_ADD    GGML_F32x8_ADD
#define GGML_F32_VEC_MUL    GGML_F32x8_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE

// F16 LASX

#define GGML_F16_STEP 32
#define GGML_F16_EPR  8

// F16 arithmetic is not supported by LASX, so we use F32 instead

#define GGML_F32Cx8         __m256
#define GGML_F32Cx8_ZERO    (__m256)__lasx_xvldi(0)
#define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplfr2vr_s((x))

static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
    __m256i a;
    memcpy(&a, x, sizeof(ggml_fp16_t) * 8);
    a = __lasx_xvpermi_d(a, 0 | (1 << 4));
    return __lasx_xvfcvtl_s_h(a);
}

static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
    __m256i a = __lasx_xvfcvt_h_s(y, y);
    a = __lasx_xvpermi_d(a, 0 | (2 << 2));
    memcpy(x, &a, sizeof(ggml_fp16_t) * 8);
}
#define GGML_F32Cx8_LOAD(x)     __lasx_f32cx8_load(x)
#define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y)

#define GGML_F32Cx8_FMA         GGML_F32x8_FMA
#define GGML_F32Cx8_ADD         __lasx_xvfadd_s
#define GGML_F32Cx8_MUL         __lasx_xvfmul_s
#define GGML_F32Cx8_REDUCE      GGML_F32x8_REDUCE

#define GGML_F16_VEC                GGML_F32Cx8
#define GGML_F16_VEC_ZERO           GGML_F32Cx8_ZERO
#define GGML_F16_VEC_SET1           GGML_F32Cx8_SET1
#define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx8_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i])
#define GGML_F16_VEC_FMA            GGML_F32Cx8_FMA
#define GGML_F16_VEC_ADD            GGML_F32Cx8_ADD
#define GGML_F16_VEC_MUL            GGML_F32Cx8_MUL
#define GGML_F16_VEC_REDUCE         GGML_F32Cx8_REDUCE

#elif defined(__loongarch_sx)

#define GGML_SIMD

// F32 LSX

#define GGML_F32_STEP 32
#define GGML_F32_EPR  4

#define GGML_F32x4         __m128
#define GGML_F32x4_ZERO    (__m128)__lsx_vldi(0)
#define GGML_F32x4_SET1(x) (__m128)__lsx_vreplfr2vr_s((x))
#define GGML_F32x4_LOAD(x) (__m128)__lsx_vld((x), 0)
#define GGML_F32x4_STORE(x, y)  __lsx_vst(y, x, 0)
#define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
#define GGML_F32x4_ADD     __lsx_vfadd_s
#define GGML_F32x4_MUL     __lsx_vfmul_s

#define GGML_F32x4_REDUCE(res, x)                               \
{                                                               \
    int offset = GGML_F32_ARR >> 1;                             \
    for (int i = 0; i < offset; ++i) {                          \
        x[i] = __lsx_vfadd_s(x[i], x[offset+i]);                \
    }                                                           \
    offset >>= 1;                                               \
    for (int i = 0; i < offset; ++i) {                          \
        x[i] = __lsx_vfadd_s(x[i], x[offset+i]);                \
    }                                                           \
    offset >>= 1;                                               \
    for (int i = 0; i < offset; ++i) {                          \
        x[i] = __lsx_vfadd_s(x[i], x[offset+i]);                \
    }                                                           \
    __m128i t0 = __lsx_vpickev_w((__m128i)x[0], (__m128i)x[0]); \
    __m128i t1 = __lsx_vpickod_w((__m128i)x[0], (__m128i)x[0]); \
    __m128 t2 = __lsx_vfadd_s((__m128)t0, (__m128)t1);          \
    __m128i t3 = __lsx_vpickev_w((__m128i)t2, (__m128i)t2);     \
    __m128i t4 = __lsx_vpickod_w((__m128i)t2, (__m128i)t2);     \
    __m128 t5 = __lsx_vfadd_s((__m128)t3, (__m128)t4);          \
    res = (ggml_float) ((v4f32)t5)[0];                          \
}

#define GGML_F32_VEC        GGML_F32x4
#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE

// F16 LSX

#define GGML_F16_STEP 32
#define GGML_F16_EPR  4

static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) {
    float tmp[4];

    tmp[0] = GGML_CPU_FP16_TO_FP32(x[0]);
    tmp[1] = GGML_CPU_FP16_TO_FP32(x[1]);
    tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]);
    tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]);

    return (__m128)__lsx_vld(tmp, 0);
}

static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
    float arr[4];

    __lsx_vst(y, arr, 0);

    x[0] = GGML_CPU_FP32_TO_FP16(arr[0]);
    x[1] = GGML_CPU_FP32_TO_FP16(arr[1]);
    x[2] = GGML_CPU_FP32_TO_FP16(arr[2]);
    x[3] = GGML_CPU_FP32_TO_FP16(arr[3]);
}

#define GGML_F32Cx4             __m128
#define GGML_F32Cx4_ZERO        (__m128)__lsx_vldi(0)
#define GGML_F32Cx4_SET1(x)     (__m128)__lsx_vreplfr2vr_s((x))
#define GGML_F32Cx4_LOAD(x)     (__m128)__lsx_f16x4_load(x)
#define GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y)
#define GGML_F32Cx4_FMA         GGML_F32x4_FMA
#define GGML_F32Cx4_ADD         __lsx_vfadd_s
#define GGML_F32Cx4_MUL         __lsx_vfmul_s
#define GGML_F32Cx4_REDUCE      GGML_F32x4_REDUCE

#define GGML_F16_VEC                 GGML_F32Cx4
#define GGML_F16_VEC_ZERO            GGML_F32Cx4_ZERO
#define GGML_F16_VEC_SET1            GGML_F32Cx4_SET1
#define GGML_F16_VEC_LOAD(p, i)      GGML_F32Cx4_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i)  GGML_F32Cx4_STORE(p, r[i])
#define GGML_F16_VEC_FMA             GGML_F32Cx4_FMA
#define GGML_F16_VEC_ADD             GGML_F32Cx4_ADD
#define GGML_F16_VEC_MUL             GGML_F32Cx4_MUL
#define GGML_F16_VEC_REDUCE          GGML_F32Cx4_REDUCE

#elif defined(__VXE__) || defined(__VXE2__)

#define GGML_SIMD

// F32 s390x

#define GGML_F32_STEP 32
#define GGML_F32_EPR  4

#define GGML_F32x4              float32x4_t
#define GGML_F32x4_ZERO         vec_splats(0.0f)
#define GGML_F32x4_SET1         vec_splats
#define GGML_F32x4_LOAD(p)      vec_xl(0, p)
#define GGML_F32x4_STORE(p, r)  vec_xst(r, 0, p)
#define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
#define GGML_F32x4_ADD          vec_add
#define GGML_F32x4_MUL          vec_mul
#define GGML_F32x4_REDUCE(res, x)                   \
1105
{                                                   \
1106
    int offset = GGML_F32_ARR >> 1;                 \
1107
    for (int i = 0; i < offset; ++i) {              \
1108
        x[i] = vec_add(x[i], x[offset + i]);        \
1109
    }                                               \
1110
    offset >>= 1;                                   \
1111
    for (int i = 0; i < offset; ++i) {              \
1112
        x[i] = vec_add(x[i], x[offset + i]);        \
1113
    }                                               \
1114
    offset >>= 1;                                   \
1115
    for (int i = 0; i < offset; ++i) {              \
1116
        x[i] = vec_add(x[i], x[offset + i]);        \
1117
    }                                               \
1118
    float32x4_t tmp = x[0] + vec_reve(x[0]);        \
1119
    res = tmp[0] + tmp[1];                          \
1120
}
1121
1122
#define GGML_F32_VEC        GGML_F32x4
1123
#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
1124
#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
1125
#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
1126
#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
1127
#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
1128
#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
1129
#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
1130
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
1131
1132
// F16 s390x
1133
#define GGML_F16_STEP GGML_F32_STEP
1134
#define GGML_F16_EPR  GGML_F32_EPR
1135
1136
static inline float32x4_t __lzs_f16cx4_load(const ggml_fp16_t * x) {
1137
    float tmp[4];
1138
1139
    for (int i = 0; i < 4; i++) {
1140
        tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]);
1141
    }
1142
1143
    // note: keep type-cast here to prevent compiler bugs
1144
    // see: https://github.com/ggml-org/llama.cpp/issues/12846
1145
    return vec_xl(0, (const float *)(tmp));
1146
}
1147
1148
static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) {
1149
    float arr[4];
1150
1151
    // note: keep type-cast here to prevent compiler bugs
1152
    // see: https://github.com/ggml-org/llama.cpp/issues/12846
1153
    vec_xst(v_y, 0, (float *)(arr));
1154
1155
    for (int i = 0; i < 4; i++) {
1156
        x[i] = GGML_CPU_FP32_TO_FP16(arr[i]);
1157
    }
1158
}
1159
1160
#define GGML_F16_VEC                GGML_F32x4
1161
#define GGML_F16_VEC_ZERO           GGML_F32x4_ZERO
1162
#define GGML_F16_VEC_SET1           GGML_F32x4_SET1
1163
#define GGML_F16_VEC_LOAD(p, i)     __lzs_f16cx4_load(p)
1164
#define GGML_F16_VEC_STORE(p, r, i) __lzs_f16cx4_store(p, r[i])
1165
#define GGML_F16_VEC_FMA            GGML_F32x4_FMA
1166
#define GGML_F16_VEC_ADD            GGML_F32x4_ADD
1167
#define GGML_F16_VEC_MUL            GGML_F32x4_MUL
1168
#define GGML_F16_VEC_REDUCE         GGML_F32x4_REDUCE
1169
1170
#elif defined(__riscv_v_intrinsic)

// compatible with vlen >= 128

#define GGML_SIMD

// F32

#define GGML_F32_STEP 16
#define GGML_F32_EPR  4

#define GGML_F32x4              vfloat32m1_t
#define GGML_F32x4_ZERO         __riscv_vfmv_v_f_f32m1(0.0f, GGML_F32_EPR)
#define GGML_F32x4_SET1(x)      __riscv_vfmv_v_f_f32m1(x, GGML_F32_EPR)
#define GGML_F32x4_LOAD(x)      __riscv_vle32_v_f32m1(x, GGML_F32_EPR)
#define GGML_F32x4_STORE(b, v)  __riscv_vse32_v_f32m1(b, v, GGML_F32_EPR)
#define GGML_F32x4_FMA(a, b, c) __riscv_vfmacc_vv_f32m1(a, b, c, GGML_F32_EPR)
#define GGML_F32x4_ADD(a, b)    __riscv_vfadd_vv_f32m1(a, b, GGML_F32_EPR)
#define GGML_F32x4_MUL(a, b)    __riscv_vfmul_vv_f32m1(a, b, GGML_F32_EPR)
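
// Note (editorial): the mappings below route GGML_F32_VEC_REDUCE to
// GGML_F32x4_REDUCE, but no definition of it appears in this listing. A
// minimal sketch (an assumption, not upstream code) would sum the four
// accumulators and finish with an RVV reduction intrinsic:
//
//   #define GGML_F32x4_REDUCE(res, x)                                         \
//   do {                                                                      \
//       vfloat32m1_t v0 = __riscv_vfadd_vv_f32m1(x[0], x[1], GGML_F32_EPR);   \
//       vfloat32m1_t v1 = __riscv_vfadd_vv_f32m1(x[2], x[3], GGML_F32_EPR);   \
//       v0 = __riscv_vfadd_vv_f32m1(v0, v1, GGML_F32_EPR);                    \
//       vfloat32m1_t z  = __riscv_vfmv_v_f_f32m1(0.0f, GGML_F32_EPR);         \
//       v0 = __riscv_vfredusum_vs_f32m1_f32m1(v0, z, GGML_F32_EPR);           \
//       (res) = (ggml_float) __riscv_vfmv_f_s_f32m1_f32(v0);                  \
//   } while (0)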

#define GGML_F32_VEC        GGML_F32x4
#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE

#endif

// GGML_F32_ARR / GGML_F16_ARR
//   number of registers to use per step
#ifdef GGML_SIMD
#define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR)
#define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
#endif
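
// Illustrative usage sketch (not part of the upstream header), referenced
// from the comment that introduces the macro set above: a kernel keeps
// GGML_F32_ARR independent accumulators, consumes GGML_F32_STEP elements per
// iteration, and reduces once at the end. The function name is hypothetical
// and the tail (n % GGML_F32_STEP elements) is omitted for brevity. Guarded
// out on SVE, whose reduce takes eight named accumulators instead of an
// array (see the sketch in the SVE section), and on the RISC-V branch, where
// no reduce definition appears in this listing.
#if defined(GGML_SIMD) && !defined(__ARM_FEATURE_SVE) && !defined(__riscv_v_intrinsic)
static inline ggml_float ggml_vec_dot_f32_sketch(int n, const float * x, const float * y) {
    GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
    GGML_F32_VEC ax[GGML_F32_ARR];
    GGML_F32_VEC ay[GGML_F32_ARR];
    const int np = (n & ~(GGML_F32_STEP - 1)); // largest multiple of GGML_F32_STEP
    for (int i = 0; i < np; i += GGML_F32_STEP) {
        for (int j = 0; j < GGML_F32_ARR; j++) {
            ax[j]  = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
            ay[j]  = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
            sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
        }
    }
    ggml_float res;
    GGML_F32_VEC_REDUCE(res, sum);
    return res;
}
#endif // GGML_SIMD && !SVE && !RVV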

#ifdef __cplusplus
}
#endif