Coverage Report

Created: 2026-03-21 06:49

/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h
#pragma once

#include "ggml-cpu-impl.h"

#ifdef __ARM_FEATURE_SVE
#include <arm_sve.h>
#endif // __ARM_FEATURE_SVE

#if defined(__ARM_NEON) && !defined(__CUDACC__) && !defined(__MUSACC__)
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
//
//   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
//
#include <arm_neon.h>
#endif

#if defined(__riscv_v_intrinsic)
#include <riscv_vector.h>
#endif

#ifdef __cplusplus
extern "C" {
#endif

//
// simd mappings
//

// FP16 to FP32 conversion

// 16-bit float
// on Arm, we use __fp16
// on x86, we use uint16_t
//
// for old CUDA compilers (<= 11), we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/10616
// for     MUSA compilers        , we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/11843
//
#if defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__)
    #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) neon_compute_fp16_to_fp32(x)
    #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) neon_compute_fp32_to_fp16(x)

    #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)

    static inline float neon_compute_fp16_to_fp32(ggml_fp16_t h) {
        __fp16 tmp;
        memcpy(&tmp, &h, sizeof(ggml_fp16_t));
        return (float)tmp;
    }

    static inline ggml_fp16_t neon_compute_fp32_to_fp16(float f) {
        ggml_fp16_t res;
        __fp16 tmp = f;
        memcpy(&res, &tmp, sizeof(ggml_fp16_t));
        return res;
    }
#elif defined(__F16C__)
    #ifdef _MSC_VER
        #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
        #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
    #else
        #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
        #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
    #endif
#elif defined(__POWER9_VECTOR__)
    #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) power_compute_fp16_to_fp32(x)
    #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) power_compute_fp32_to_fp16(x)
    /* the inline asm below is about 12% faster than the lookup method */
    #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
    #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)

    static inline float power_compute_fp16_to_fp32(ggml_fp16_t h) {
        float f;
        double d;
        __asm__(
            "mtfprd %0,%2\n"
            "xscvhpdp %0,%0\n"
            "frsp %1,%0\n" :
            /* temp */ "=d"(d),
            /* out */  "=f"(f):
            /* in */   "r"(h));
        return f;
    }

    static inline ggml_fp16_t power_compute_fp32_to_fp16(float f) {
        double d;
        ggml_fp16_t r;
        __asm__( /* xscvdphp can work on double or single precision */
            "xscvdphp %0,%2\n"
            "mffprd %1,%0\n" :
            /* temp */ "=d"(d),
            /* out */  "=r"(r):
            /* in */   "f"(f));
        return r;
    }
#elif defined(__riscv) && defined(__riscv_zfhmin)
    static inline float riscv_compute_fp16_to_fp32(ggml_fp16_t h) {
        _Float16 hf;
        memcpy(&hf, &h, sizeof(ggml_fp16_t));
        return hf;
    }

    static inline ggml_fp16_t riscv_compute_fp32_to_fp16(float f) {
        ggml_fp16_t res;
        _Float16 hf = (_Float16)f;
        memcpy(&res, &hf, sizeof(ggml_fp16_t));
        return res;
    }

    #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) riscv_compute_fp16_to_fp32(x)
    #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) riscv_compute_fp32_to_fp16(x)
    #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
    #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
#endif
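
// --- illustrative sketch (added for orientation, not from the original header) ---
// A minimal round trip through the compute macros above, assuming one of the
// architecture branches was taken so the GGML_CPU_COMPUTE_* macros are defined:
//
//     float       f  = 1.5f;                                // exactly representable in fp16
//     ggml_fp16_t h  = GGML_CPU_COMPUTE_FP32_TO_FP16(f);
//     float       f2 = GGML_CPU_COMPUTE_FP16_TO_FP32(h);    // f2 == 1.5f
//
// Note that only some branches also define the GGML_CPU_FP16_TO_FP32 /
// GGML_CPU_FP32_TO_FP16 wrappers here; where they are left undefined, the
// lookup-table and generic fallbacks further down supply them.
// --- end sketch ---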

// precomputed f32 table for f16 (256 KB)
// defined in ggml-cpu.c, initialized in ggml_cpu_init()
extern float ggml_table_f32_f16[1 << 16];

// precomputed f32 table for e8m0 half (1 KB)
// defined in ggml-cpu.c, initialized in ggml_cpu_init()
extern float ggml_table_f32_e8m0_half[1 << 8];

// Use lookup table for E8M0 on x86 (faster than bit manipulation)
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
#define GGML_CPU_E8M0_TO_FP32_HALF(x) ggml_table_f32_e8m0_half[(uint8_t)(x)]   // coverage: 0
#else
#define GGML_CPU_E8M0_TO_FP32_HALF(x) GGML_E8M0_TO_FP32_HALF(x)
#endif
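
// --- illustrative sketch (added for orientation, not from the original header) ---
// The tables above are filled once at startup; ggml_cpu_init() presumably does
// something along these lines (hypothetical loop, shown only to illustrate the
// table layout):
//
//     for (uint32_t i = 0; i < (1 << 16); ++i) {
//         uint16_t u = (uint16_t) i;
//         ggml_fp16_t h;
//         memcpy(&h, &u, sizeof(h));   // reinterpret the bit pattern, not the value
//         ggml_table_f32_f16[i] = GGML_CPU_COMPUTE_FP16_TO_FP32(h);
//     }
//     for (uint32_t i = 0; i < (1 << 8); ++i) {
//         ggml_table_f32_e8m0_half[i] = GGML_E8M0_TO_FP32_HALF((uint8_t) i);
//     }
//
// --- end sketch ---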

// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
// so we define GGML_CPU_FP16_TO_FP32 and GGML_CPU_FP32_TO_FP16 elsewhere for NEON.
// This is also true for POWER9.
#if !defined(GGML_CPU_FP16_TO_FP32)
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
    uint16_t s;
    memcpy(&s, &f, sizeof(uint16_t));
    return ggml_table_f32_f16[s];
}
// coverage: 0; unexecuted in every instantiation (repack.cpp, ggml-cpu.c, quants.c,
// binary-ops.cpp, unary-ops.cpp, vec.cpp, ops.cpp, sgemm.cpp)

#define GGML_CPU_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)   // coverage: 0
#endif

#if !defined(GGML_CPU_FP32_TO_FP16)
#define GGML_CPU_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)  // coverage: 131k executions
#endif


// we define a common set of C macros which map to specific intrinsics based on the current architecture
// we then implement the fundamental computation operations below using only these macros
// adding support for new architectures requires defining the corresponding SIMD macros
//
// GGML_F32_STEP / GGML_F16_STEP
//   number of elements to process in a single step
//
// GGML_F32_EPR / GGML_F16_EPR
//   number of elements to fit in a single register
//

#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_FMA)

#define GGML_SIMD

// F32 SVE
#define GGML_F32_EPR 8
#define DEFAULT_PG svptrue_b32()

#define GGML_F32xt                        svfloat32_t
#define GGML_F32xt_ZERO                   svdup_n_f32(0.0f)
#define GGML_F32xt_SET1(x)                svdup_n_f32(x)
#define GGML_F32xt_LOAD_IMPL(pg, a)       svld1_f32(pg, a)
#define GGML_F32xt_LOAD(a)                GGML_F32xt_LOAD_IMPL(DEFAULT_PG, a)
#define GGML_F32xt_STORE_IMPL(pg, a, b)   svst1_f32(pg, a, b)
#define GGML_F32xt_STORE(a, b)            GGML_F32xt_STORE_IMPL(DEFAULT_PG, a, b)
#define GGML_F32xt_FMA_IMPL(pg, a, b, c)  svmad_f32_m(pg, b, c, a)
#define GGML_F32xt_FMA(a, b, c)           GGML_F32xt_FMA_IMPL(DEFAULT_PG, a, b, c)
#define GGML_F32xt_ADD_IMPL(pg, a, b)     svadd_f32_m(pg, a, b)
#define GGML_F32xt_ADD(a, b)              GGML_F32xt_ADD_IMPL(DEFAULT_PG, a, b)
#define GGML_F32xt_MUL_IMPL(pg, a, b)     svmul_f32_m(pg, a, b)
#define GGML_F32xt_MUL(a, b)              GGML_F32xt_MUL_IMPL(DEFAULT_PG, a, b)
#define GGML_F32xt_REDUCE_ONE_IMPL(pg, a) svaddv(pg, a)
#define GGML_F32xt_REDUCE_ONE(a)          GGML_F32xt_REDUCE_ONE_IMPL(DEFAULT_PG, a)
#define GGML_F32xt_REDUCE_IMPL(pg, res, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8)  \
{                                                      \
    sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum2);        \
    sum3 = svadd_f32_m(DEFAULT_PG, sum3, sum4);        \
    sum5 = svadd_f32_m(DEFAULT_PG, sum5, sum6);        \
    sum7 = svadd_f32_m(DEFAULT_PG, sum7, sum8);        \
    sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum3);        \
    sum5 = svadd_f32_m(DEFAULT_PG, sum5, sum7);        \
    sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum5);        \
    (res) = (ggml_float) GGML_F32xt_REDUCE_ONE(sum1);  \
}
#define GGML_F32xt_REDUCE(res, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8)  \
        GGML_F32xt_REDUCE_IMPL(DEFAULT_PG, res, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8)
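
// --- illustrative sketch (added for orientation, not from the original header) ---
// The eight-way reduce above pairs with a dot-product loop that keeps eight
// independent partial sums in flight to hide FMA latency; roughly (assuming
// GGML_F32_EPR matches the hardware vector length and n is a multiple of
// 8*GGML_F32_EPR):
//
//     GGML_F32xt s1 = GGML_F32xt_ZERO, s2 = GGML_F32xt_ZERO, /* ... */ s8 = GGML_F32xt_ZERO;
//     for (int i = 0; i < n; i += 8*GGML_F32_EPR) {
//         s1 = GGML_F32xt_FMA(s1, GGML_F32xt_LOAD(x + i + 0*GGML_F32_EPR),
//                                 GGML_F32xt_LOAD(y + i + 0*GGML_F32_EPR));
//         // ... s2..s7 likewise ...
//         s8 = GGML_F32xt_FMA(s8, GGML_F32xt_LOAD(x + i + 7*GGML_F32_EPR),
//                                 GGML_F32xt_LOAD(y + i + 7*GGML_F32_EPR));
//     }
//     ggml_float sum;
//     GGML_F32xt_REDUCE(sum, s1, s2, s3, s4, s5, s6, s7, s8);
//
// --- end sketch ---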

#define GGML_F32_VEC        GGML_F32xt
#define GGML_F32_VEC_ZERO   GGML_F32xt_ZERO
#define GGML_F32_VEC_SET1   GGML_F32xt_SET1
#define GGML_F32_VEC_LOAD   GGML_F32xt_LOAD
#define GGML_F32_VEC_STORE  GGML_F32xt_STORE
#define GGML_F32_VEC_FMA    GGML_F32xt_FMA
#define GGML_F32_VEC_ADD    GGML_F32xt_ADD
#define GGML_F32_VEC_MUL    GGML_F32xt_MUL
#define GGML_F32_VEC_REDUCE GGML_F32xt_REDUCE

// F16 SVE
#define DEFAULT_PG32    svptrue_b32()
#define DEFAULT_PG16    svptrue_b16()

#define GGML_F32Cxt                         svfloat16_t
#define GGML_F32Cxt_ZERO                    svdup_n_f16(0.0f)
#define GGML_F32Cxt_SET1(x)                 svdup_n_f16(x)
#define GGML_F32Cxt_LOAD(p)                 svld1_f16(DEFAULT_PG16, (const __fp16 *)(p))
#define GGML_F32Cxt_STORE(dst_ptr, src_vec) svst1_f16(DEFAULT_PG16, (__fp16 *)(dst_ptr), (src_vec))

#define GGML_F32Cxt_FMA_IMPL(pg, a, b, c)   svmad_f16_x(pg, b, c, a)
#define GGML_F32Cxt_FMA(a, b, c)            GGML_F32Cxt_FMA_IMPL(DEFAULT_PG16, a, b, c)
#define GGML_F32Cxt_ADD_IMPL(pg, a, b)      svadd_f16_x(pg, a, b)
#define GGML_F32Cxt_ADD(a, b)               GGML_F32Cxt_ADD_IMPL(DEFAULT_PG16, a, b)
#define GGML_F32Cxt_MUL_IMPL(pg, a, b)      svmul_f16_x(pg, a, b)
#define GGML_F32Cxt_MUL(a, b)               GGML_F32Cxt_MUL_IMPL(DEFAULT_PG16, a, b)
#define GGML_F32Cxt_REDUCE                  GGML_F16xt_REDUCE_MIXED

#define GGML_F16x_VEC                GGML_F32Cxt
#define GGML_F16x_VEC_ZERO           GGML_F32Cxt_ZERO
#define GGML_F16x_VEC_SET1           GGML_F32Cxt_SET1
#define GGML_F16x_VEC_LOAD(p, i)     GGML_F32Cxt_LOAD(p)
#define GGML_F16x_VEC_STORE(p, r, i) GGML_F32Cxt_STORE((__fp16 *)(p), r)
#define GGML_F16x_VEC_FMA            GGML_F32Cxt_FMA
#define GGML_F16x_VEC_ADD            GGML_F32Cxt_ADD
#define GGML_F16x_VEC_MUL            GGML_F32Cxt_MUL
#define GGML_F16x_VEC_REDUCE         GGML_F32Cxt_REDUCE

#define GGML_F16xt_REDUCE_ONE_IMPL(pg, a) svaddv_f16(pg, a)
#define GGML_F16xt_REDUCE_ONE(a)          GGML_F16xt_REDUCE_ONE_IMPL(DEFAULT_PG16, a)

#define GGML_F16xt_REDUCE_MIXED_IMPL(pg16, res, sum1, sum2, sum3, sum4)  \
{                                                      \
    sum1 = svadd_f16_x(pg16, sum1, sum2);              \
    sum3 = svadd_f16_x(pg16, sum3, sum4);              \
    sum1 = svadd_f16_x(pg16, sum1, sum3);              \
    __fp16 sum_f16 = svaddv_f16(pg16, sum1);           \
    (res) = (ggml_float) sum_f16;                      \
}
#define GGML_F16xt_REDUCE_MIXED(res, sum1, sum2, sum3, sum4)  \
        GGML_F16xt_REDUCE_MIXED_IMPL(DEFAULT_PG16, res, sum1, sum2, sum3, sum4)
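
// note: GGML_F16xt_REDUCE_MIXED accumulates entirely in f16 and widens to
// ggml_float only after the final horizontal add (svaddv_f16), trading some
// precision on long sums for fewer conversions inside the hot loop.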

// F16 NEON

#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
    #define GGML_F16_STEP 32
    #define GGML_F16_EPR  8

    #define GGML_F16x8              float16x8_t
    #define GGML_F16x8_ZERO         vdupq_n_f16(0.0f)
    #define GGML_F16x8_SET1(x)      vdupq_n_f16(x)
    #define GGML_F16x8_LOAD(x)      vld1q_f16((const __fp16 *)(x))
    #define GGML_F16x8_STORE        vst1q_f16
    #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
    #define GGML_F16x8_ADD          vaddq_f16
    #define GGML_F16x8_MUL          vmulq_f16
    #define GGML_F16x8_REDUCE(res, x)                               \
    do {                                                            \
        int offset = GGML_F16_ARR >> 1;                             \
        for (int i = 0; i < offset; ++i) {                          \
            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);              \
        }                                                           \
        offset >>= 1;                                               \
        for (int i = 0; i < offset; ++i) {                          \
            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);              \
        }                                                           \
        offset >>= 1;                                               \
        for (int i = 0; i < offset; ++i) {                          \
            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);              \
        }                                                           \
        const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \
        const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \
        (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1));         \
    } while (0)

    #define GGML_F16_VEC                GGML_F16x8
    #define GGML_F16_VEC_ZERO           GGML_F16x8_ZERO
    #define GGML_F16_VEC_SET1           GGML_F16x8_SET1
    #define GGML_F16_VEC_LOAD(p, i)     GGML_F16x8_LOAD(p)
    #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((__fp16 *)(p), (r)[i])
    #define GGML_F16_VEC_FMA            GGML_F16x8_FMA
    #define GGML_F16_VEC_ADD            GGML_F16x8_ADD
    #define GGML_F16_VEC_MUL            GGML_F16x8_MUL
    #define GGML_F16_VEC_REDUCE         GGML_F16x8_REDUCE
#else
    // if FP16 vector arithmetic is not supported, we use FP32 instead
    // and take advantage of the vcvt_ functions to convert to/from FP16

    #define GGML_F16_STEP 16
    #define GGML_F16_EPR  4

    #define GGML_F32Cx4              float32x4_t
    #define GGML_F32Cx4_ZERO         vdupq_n_f32(0.0f)
    #define GGML_F32Cx4_SET1(x)      vdupq_n_f32(x)
    #define GGML_F32Cx4_LOAD(x)      vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
    #define GGML_F32Cx4_STORE(x, y)  vst1_f16(x, vcvt_f16_f32(y))
    #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
    #define GGML_F32Cx4_ADD          vaddq_f32
    #define GGML_F32Cx4_MUL          vmulq_f32
    #define GGML_F32Cx4_REDUCE       GGML_F32x4_REDUCE

    #define GGML_F16_VEC                GGML_F32Cx4
    #define GGML_F16_VEC_ZERO           GGML_F32Cx4_ZERO
    #define GGML_F16_VEC_SET1           GGML_F32Cx4_SET1
    #define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx4_LOAD(p)
    #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((__fp16 *)(p), r[i])
    #define GGML_F16_VEC_FMA            GGML_F32Cx4_FMA
    #define GGML_F16_VEC_ADD            GGML_F32Cx4_ADD
    #define GGML_F16_VEC_MUL            GGML_F32Cx4_MUL
    #define GGML_F16_VEC_REDUCE         GGML_F32Cx4_REDUCE
#endif

#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)

#define GGML_SIMD

// F32 NEON

#define GGML_F32_STEP 16
#define GGML_F32_EPR  4

#define GGML_F32x4              float32x4_t
#define GGML_F32x4_ZERO         vdupq_n_f32(0.0f)
#define GGML_F32x4_SET1(x)      vdupq_n_f32(x)
#define GGML_F32x4_LOAD         vld1q_f32
#define GGML_F32x4_STORE        vst1q_f32
#define GGML_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c)
#define GGML_F32x4_ADD          vaddq_f32
#define GGML_F32x4_MUL          vmulq_f32
#define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
#define GGML_F32x4_REDUCE(res, x)                       \
{                                                       \
    int offset = GGML_F32_ARR >> 1;                     \
    for (int i = 0; i < offset; ++i) {                  \
        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]);      \
    }                                                   \
    offset >>= 1;                                       \
    for (int i = 0; i < offset; ++i) {                  \
        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]);      \
    }                                                   \
    offset >>= 1;                                       \
    for (int i = 0; i < offset; ++i) {                  \
        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]);      \
    }                                                   \
    (res) = (ggml_float) GGML_F32x4_REDUCE_ONE((x)[0]); \
}

#define GGML_F32_VEC        GGML_F32x4
#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
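
// --- illustrative sketch (added for orientation, not from the original header) ---
// How STEP/EPR/ARR fit together in the generic kernels, in the style of the
// f32 dot product in vec.cpp (sketch only; assumes n is a multiple of
// GGML_F32_STEP):
//
//     GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
//     GGML_F32_VEC ax, ay;
//
//     for (int i = 0; i < n; i += GGML_F32_STEP) {          // one "step" ...
//         for (int j = 0; j < GGML_F32_ARR; j++) {          // ... is ARR registers
//             ax     = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
//             ay     = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
//             sum[j] = GGML_F32_VEC_FMA(sum[j], ax, ay);
//         }
//     }
//
//     ggml_float res;
//     GGML_F32_VEC_REDUCE(res, sum);   // tree-add the ARR partial sums
//
// --- end sketch ---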

// F16 NEON

#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
    #define GGML_F16_STEP 32
    #define GGML_F16_EPR  8

    #define GGML_F16x8              float16x8_t
    #define GGML_F16x8_ZERO         vdupq_n_f16(0.0f)
    #define GGML_F16x8_SET1(x)      vdupq_n_f16(x)
    #define GGML_F16x8_LOAD(x)      vld1q_f16((const __fp16 *)(x))
    #define GGML_F16x8_STORE        vst1q_f16
    #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
    #define GGML_F16x8_ADD          vaddq_f16
    #define GGML_F16x8_MUL          vmulq_f16
    #define GGML_F16x8_REDUCE(res, x)                               \
    do {                                                            \
        int offset = GGML_F16_ARR >> 1;                             \
        for (int i = 0; i < offset; ++i) {                          \
            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);              \
        }                                                           \
        offset >>= 1;                                               \
        for (int i = 0; i < offset; ++i) {                          \
            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);              \
        }                                                           \
        offset >>= 1;                                               \
        for (int i = 0; i < offset; ++i) {                          \
            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);              \
        }                                                           \
        const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \
        const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \
        (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1));         \
    } while (0)

    #define GGML_F16_VEC                GGML_F16x8
    #define GGML_F16_VEC_ZERO           GGML_F16x8_ZERO
    #define GGML_F16_VEC_SET1           GGML_F16x8_SET1
    #define GGML_F16_VEC_LOAD(p, i)     GGML_F16x8_LOAD(p)
    #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((__fp16 *)(p), (r)[i])
    #define GGML_F16_VEC_FMA            GGML_F16x8_FMA
    #define GGML_F16_VEC_ADD            GGML_F16x8_ADD
    #define GGML_F16_VEC_MUL            GGML_F16x8_MUL
    #define GGML_F16_VEC_REDUCE         GGML_F16x8_REDUCE
#else
    // if FP16 vector arithmetic is not supported, we use FP32 instead
    // and take advantage of the vcvt_ functions to convert to/from FP16

    #define GGML_F16_STEP 16
    #define GGML_F16_EPR  4

    #define GGML_F32Cx4              float32x4_t
    #define GGML_F32Cx4_ZERO         vdupq_n_f32(0.0f)
    #define GGML_F32Cx4_SET1(x)      vdupq_n_f32(x)
    #define GGML_F32Cx4_LOAD(x)      vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
    #define GGML_F32Cx4_STORE(x, y)  vst1_f16(x, vcvt_f16_f32(y))
    #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
    #define GGML_F32Cx4_ADD          vaddq_f32
    #define GGML_F32Cx4_MUL          vmulq_f32
    #define GGML_F32Cx4_REDUCE       GGML_F32x4_REDUCE

    #define GGML_F16_VEC                GGML_F32Cx4
    #define GGML_F16_VEC_ZERO           GGML_F32Cx4_ZERO
    #define GGML_F16_VEC_SET1           GGML_F32Cx4_SET1
    #define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx4_LOAD(p)
    #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((__fp16 *)(p), r[i])
    #define GGML_F16_VEC_FMA            GGML_F32Cx4_FMA
    #define GGML_F16_VEC_ADD            GGML_F32Cx4_ADD
    #define GGML_F16_VEC_MUL            GGML_F32Cx4_MUL
    #define GGML_F16_VEC_REDUCE         GGML_F32Cx4_REDUCE
#endif

#elif defined(__AVX512F__)

#define GGML_SIMD

// F32 AVX512

#define GGML_F32_STEP 64
#define GGML_F32_EPR  16

#define GGML_F32x16         __m512
#define GGML_F32x16_ZERO    _mm512_setzero_ps()
#define GGML_F32x16_SET1(x) _mm512_set1_ps(x)
#define GGML_F32x16_LOAD    _mm512_loadu_ps
#define GGML_F32x16_STORE   _mm512_storeu_ps
// _mm512_fmadd_ps is defined in AVX512F so no guard is required
#define GGML_F32x16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
#define GGML_F32x16_ADD     _mm512_add_ps
#define GGML_F32x16_MUL     _mm512_mul_ps
#define GGML_F32x16_REDUCE(res, x)                                    \
do {                                                                  \
    int offset = GGML_F32_ARR >> 1;                                   \
    for (int i = 0; i < offset; ++i) {                                \
        x[i] = _mm512_add_ps(x[i], x[offset+i]);                      \
    }                                                                 \
    offset >>= 1;                                                     \
    for (int i = 0; i < offset; ++i) {                                \
        x[i] = _mm512_add_ps(x[i], x[offset+i]);                      \
    }                                                                 \
    offset >>= 1;                                                     \
    for (int i = 0; i < offset; ++i) {                                \
        x[i] = _mm512_add_ps(x[i], x[offset+i]);                      \
    }                                                                 \
    res = (ggml_float) _mm512_reduce_add_ps(x[0]);                    \
} while (0)

// TODO: is this optimal ?

#define GGML_F32_VEC        GGML_F32x16
#define GGML_F32_VEC_ZERO   GGML_F32x16_ZERO
#define GGML_F32_VEC_SET1   GGML_F32x16_SET1
#define GGML_F32_VEC_LOAD   GGML_F32x16_LOAD
#define GGML_F32_VEC_STORE  GGML_F32x16_STORE
#define GGML_F32_VEC_FMA    GGML_F32x16_FMA
#define GGML_F32_VEC_ADD    GGML_F32x16_ADD
#define GGML_F32_VEC_MUL    GGML_F32x16_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x16_REDUCE

// F16 AVX512

#if defined(__AVX512FP16__)

#define GGML_F16_STEP 128
#define GGML_F16_EPR  32

#define GGML_F16x32              __m512h
#define GGML_F16x32_ZERO         _mm512_setzero_ph()
#define GGML_F16x32_SET1(x)      _mm512_set1_ph(__extension__(_Float16)(x))
#define GGML_F16x32_LOAD(x)      _mm512_loadu_ph(x)
#define GGML_F16x32_STORE(x, y)  _mm512_storeu_ph(x, y)
#define GGML_F16x32_FMA(a, b, c) _mm512_fmadd_ph(b, c, a)
#define GGML_F16x32_ADD          _mm512_add_ph
#define GGML_F16x32_MUL          _mm512_mul_ph
#define GGML_F16x32_REDUCE(res, x)                                     \
do {                                                                   \
    int offset = GGML_F16_ARR >> 1;                                    \
    for (int i = 0; i < offset; ++i) {                                 \
        x[i] = _mm512_add_ph(x[i], x[offset+i]);                       \
    }                                                                  \
    offset >>= 1;                                                      \
    for (int i = 0; i < offset; ++i) {                                 \
        x[i] = _mm512_add_ph(x[i], x[offset+i]);                       \
    }                                                                  \
    offset >>= 1;                                                      \
    for (int i = 0; i < offset; ++i) {                                 \
        x[i] = _mm512_add_ph(x[i], x[offset+i]);                       \
    }                                                                  \
    res = (ggml_float) _mm512_reduce_add_ph(x[0]);                     \
} while (0)

#define GGML_F16_VEC                GGML_F16x32
#define GGML_F16_VEC_ZERO           GGML_F16x32_ZERO
#define GGML_F16_VEC_SET1           GGML_F16x32_SET1
#define GGML_F16_VEC_LOAD(p, i)     GGML_F16x32_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x32_STORE(p, r[i])
#define GGML_F16_VEC_FMA            GGML_F16x32_FMA
#define GGML_F16_VEC_ADD            GGML_F16x32_ADD
#define GGML_F16_VEC_MUL            GGML_F16x32_MUL
#define GGML_F16_VEC_REDUCE         GGML_F16x32_REDUCE

#else // Fallback FP16 <-> FP32

#define GGML_F16_STEP 64
#define GGML_F16_EPR  16

#define GGML_F32Cx16             __m512
#define GGML_F32Cx16_ZERO        _mm512_setzero_ps()
#define GGML_F32Cx16_SET1(x)     _mm512_set1_ps(x)

// unlike _mm256_cvt intrinsics that require F16C, _mm512_cvt is defined in AVX512F
// so F16C guard isn't required
#define GGML_F32Cx16_LOAD(x)     _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(x)))
#define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0))

#define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
#define GGML_F32Cx16_ADD         _mm512_add_ps
#define GGML_F32Cx16_MUL         _mm512_mul_ps
#define GGML_F32Cx16_REDUCE(res, x)                               \
do {                                                              \
    int offset = GGML_F32_ARR >> 1;                               \
    for (int i = 0; i < offset; ++i) {                            \
        x[i] = _mm512_add_ps(x[i], x[offset+i]);                  \
    }                                                             \
    offset >>= 1;                                                 \
    for (int i = 0; i < offset; ++i) {                            \
        x[i] = _mm512_add_ps(x[i], x[offset+i]);                  \
    }                                                             \
    offset >>= 1;                                                 \
    for (int i = 0; i < offset; ++i) {                            \
        x[i] = _mm512_add_ps(x[i], x[offset+i]);                  \
    }                                                             \
    res = (ggml_float) _mm512_reduce_add_ps(x[0]);                \
} while (0)

#define GGML_F16_VEC                GGML_F32Cx16
#define GGML_F16_VEC_ZERO           GGML_F32Cx16_ZERO
#define GGML_F16_VEC_SET1           GGML_F32Cx16_SET1
#define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx16_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx16_STORE(p, r[i])
#define GGML_F16_VEC_FMA            GGML_F32Cx16_FMA
#define GGML_F16_VEC_ADD            GGML_F32Cx16_ADD
#define GGML_F16_VEC_MUL            GGML_F32Cx16_MUL

#define GGML_F16_VEC_REDUCE         GGML_F32Cx16_REDUCE

#endif // __AVX512FP16__
#elif defined(__AVX__)
// coverage: 0 on the instrumented lines of this AVX branch (not executed in this run)

#define GGML_SIMD

// F32 AVX

#define GGML_F32_STEP 32
#define GGML_F32_EPR  8

#define GGML_F32x8         __m256
#define GGML_F32x8_ZERO    _mm256_setzero_ps()
#define GGML_F32x8_SET1(x) _mm256_set1_ps(x)
#define GGML_F32x8_LOAD    _mm256_loadu_ps
#define GGML_F32x8_STORE   _mm256_storeu_ps
#if defined(__FMA__)
    #define GGML_F32x8_FMA(a, b, c) _mm256_fmadd_ps(b, c, a)
#else
    #define GGML_F32x8_FMA(a, b, c) _mm256_add_ps(_mm256_mul_ps(b, c), a)
#endif
#define GGML_F32x8_ADD     _mm256_add_ps
#define GGML_F32x8_MUL     _mm256_mul_ps
#define GGML_F32x8_REDUCE(res, x)                                 \
do {                                                              \
    int offset = GGML_F32_ARR >> 1;                               \
    for (int i = 0; i < offset; ++i) {                            \
        x[i] = _mm256_add_ps(x[i], x[offset+i]);                  \
    }                                                             \
    offset >>= 1;                                                 \
    for (int i = 0; i < offset; ++i) {                            \
        x[i] = _mm256_add_ps(x[i], x[offset+i]);                  \
    }                                                             \
    offset >>= 1;                                                 \
    for (int i = 0; i < offset; ++i) {                            \
        x[i] = _mm256_add_ps(x[i], x[offset+i]);                  \
    }                                                             \
    const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]),    \
                                 _mm256_extractf128_ps(x[0], 1)); \
    const __m128 t1 = _mm_hadd_ps(t0, t0);                        \
    res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t1, t1));        \
} while (0)
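// note: the tail of GGML_F32x8_REDUCE collapses one __m256 by adding its low
// and high 128-bit halves, then horizontal-adding twice to fold the remaining
// four lanes into lane 0.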
// TODO: is this optimal ?

#define GGML_F32_VEC        GGML_F32x8
#define GGML_F32_VEC_ZERO   GGML_F32x8_ZERO
#define GGML_F32_VEC_SET1   GGML_F32x8_SET1
#define GGML_F32_VEC_LOAD   GGML_F32x8_LOAD
#define GGML_F32_VEC_STORE  GGML_F32x8_STORE
#define GGML_F32_VEC_FMA    GGML_F32x8_FMA
#define GGML_F32_VEC_ADD    GGML_F32x8_ADD
#define GGML_F32_VEC_MUL    GGML_F32x8_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE

// F16 AVX

#define GGML_F16_STEP 32
#define GGML_F16_EPR  8

// F16 arithmetic is not supported by AVX, so we use F32 instead

#define GGML_F32Cx8             __m256
#define GGML_F32Cx8_ZERO        _mm256_setzero_ps()
#define GGML_F32Cx8_SET1(x)     _mm256_set1_ps(x)

#if defined(__F16C__)
// the _mm256_cvt intrinsics require F16C
#define GGML_F32Cx8_LOAD(x)     _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x)))
#define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
#else
static inline __m256 __avx_f32cx8_load(const ggml_fp16_t * x) {
    float tmp[8];

    for (int i = 0; i < 8; i++) {
        tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]);
    }

    return _mm256_loadu_ps(tmp);
}
static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
    float arr[8];

    _mm256_storeu_ps(arr, y);

    for (int i = 0; i < 8; i++)
        x[i] = GGML_CPU_FP32_TO_FP16(arr[i]);
}
#define GGML_F32Cx8_LOAD(x)     __avx_f32cx8_load(x)
#define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y)
#endif

#define GGML_F32Cx8_FMA         GGML_F32x8_FMA
#define GGML_F32Cx8_ADD         _mm256_add_ps
#define GGML_F32Cx8_MUL         _mm256_mul_ps
#define GGML_F32Cx8_REDUCE      GGML_F32x8_REDUCE

#define GGML_F16_VEC                GGML_F32Cx8
#define GGML_F16_VEC_ZERO           GGML_F32Cx8_ZERO
#define GGML_F16_VEC_SET1           GGML_F32Cx8_SET1
#define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx8_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i])
#define GGML_F16_VEC_FMA            GGML_F32Cx8_FMA
#define GGML_F16_VEC_ADD            GGML_F32Cx8_ADD
#define GGML_F16_VEC_MUL            GGML_F32Cx8_MUL
#define GGML_F16_VEC_REDUCE         GGML_F32Cx8_REDUCE

#elif defined(__POWER9_VECTOR__)

#define GGML_SIMD

// F32 POWER9

#define GGML_F32_STEP 32
#define GGML_F32_EPR  4

#define GGML_F32x4              vector float
#define GGML_F32x4_ZERO         {0.0f}
#define GGML_F32x4_SET1         vec_splats
#define GGML_F32x4_LOAD(p)      vec_xl(0, p)
#define GGML_F32x4_STORE(p, r)  vec_xst(r, 0, p)
#define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
#define GGML_F32x4_ADD          vec_add
#define GGML_F32x4_MUL          vec_mul
#define GGML_F32x4_REDUCE(res, x)              \
{                                              \
    int offset = GGML_F32_ARR >> 1;            \
    for (int i = 0; i < offset; ++i) {         \
        x[i] = vec_add(x[i], x[offset+i]);     \
    }                                          \
    offset >>= 1;                              \
    for (int i = 0; i < offset; ++i) {         \
        x[i] = vec_add(x[i], x[offset+i]);     \
    }                                          \
    offset >>= 1;                              \
    for (int i = 0; i < offset; ++i) {         \
        x[i] = vec_add(x[i], x[offset+i]);     \
    }                                          \
    res = vec_extract(x[0], 0) +               \
          vec_extract(x[0], 1) +               \
          vec_extract(x[0], 2) +               \
          vec_extract(x[0], 3);                \
}
#define GGML_F32x4_REDUCE_4(res, s0, s1, s2, s3)        \
{                                                       \
    vector float v = vec_add(vec_add(s0, s1),           \
                             vec_add(s2, s3));          \
    v = vec_add(v, vec_sld(v, v, 8));                   \
    v = vec_add(v, vec_sld(v, v, 4));                   \
    res += (ggml_float) vec_extract(v, 0);              \
}

#define GGML_F32_VEC        GGML_F32x4
#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE

// F16 POWER9
#define GGML_F16_STEP       GGML_F32_STEP
#define GGML_F16_EPR        GGML_F32_EPR
#define GGML_F16_VEC        GGML_F32x4
#define GGML_F16_VEC_ZERO   GGML_F32x4_ZERO
#define GGML_F16_VEC_SET1   GGML_F32x4_SET1
#define GGML_F16_VEC_FMA    GGML_F32x4_FMA
#define GGML_F16_VEC_ADD    GGML_F32x4_ADD
#define GGML_F16_VEC_MUL    GGML_F32x4_MUL
#define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
// Use vec_xl, not vec_ld, in case the load address is not aligned.
#define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ?                   \
  vec_extract_fp32_from_shorth(vec_xl(0, p - GGML_F16_EPR)) : \
  vec_extract_fp32_from_shortl(vec_xl(0, p))
static inline unsigned char ggml_endian_byte(int i) {
    uint16_t tmp_val = 1;
    return ((unsigned char *)&tmp_val)[i];
}
#define GGML_ENDIAN_BYTE(i) ggml_endian_byte(i)
#define GGML_F16_VEC_STORE(p, r, i)                             \
  if (i & 0x1)                                                  \
    vec_xst(vec_pack_to_short_fp32(r[i - GGML_ENDIAN_BYTE(1)],  \
                                   r[i - GGML_ENDIAN_BYTE(0)]), \
            0, p - GGML_F16_EPR)
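// note: F16 shares the 4-lane F32 vectors here, but one 128-bit load of halves
// brings in 8 values, so the macros work in odd/even pairs: an even index i
// converts the low half of the 8-element load, an odd index the high half,
// and the store packs two F32 vectors back into 8 halves on odd indices only.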

// BF16 POWER9
#define GGML_BF16_STEP 16
#define GGML_BF16_EPR  8

#define GGML_BF16x8         vector unsigned short
#define GGML_BF16x8_ZERO    vec_splats((unsigned short)0)
#define GGML_BF16x8_LOAD(p) vec_xl(0, (const unsigned short *)(p))

#define GGML_BF16_VEC          GGML_BF16x8
#define GGML_BF16_VEC_ZERO     GGML_BF16x8_ZERO
#define GGML_BF16_VEC_LOAD     GGML_BF16x8_LOAD
#if defined(__LITTLE_ENDIAN__)
#define GGML_BF16_TO_F32_LO(v) ((vector float) vec_mergel(GGML_BF16_VEC_ZERO, (v)))
#define GGML_BF16_TO_F32_HI(v) ((vector float) vec_mergeh(GGML_BF16_VEC_ZERO, (v)))
#else
#define GGML_BF16_TO_F32_LO(v) ((vector float) vec_mergel((v), GGML_BF16_VEC_ZERO))
#define GGML_BF16_TO_F32_HI(v) ((vector float) vec_mergeh((v), GGML_BF16_VEC_ZERO))
#endif
#define GGML_BF16_FMA_LO(acc, x, y) \
    (acc) = GGML_F32x4_FMA((acc), GGML_BF16_TO_F32_LO(x), GGML_BF16_TO_F32_LO(y))
#define GGML_BF16_FMA_HI(acc, x, y) \
    (acc) = GGML_F32x4_FMA((acc), GGML_BF16_TO_F32_HI(x), GGML_BF16_TO_F32_HI(y))
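// note: a bf16 value is the top 16 bits of the corresponding f32, so widening
// needs no arithmetic: merging with a vector of zero shorts places each 16-bit
// element in the high half of a 32-bit lane and zero-fills the low half, which
// is exactly the f32 bit pattern.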

#elif defined(__wasm_simd128__)

#define GGML_SIMD

// F32 WASM

#define GGML_F32_STEP 16
#define GGML_F32_EPR  4

#define GGML_F32x4              v128_t
#define GGML_F32x4_ZERO         wasm_f32x4_splat(0.0f)
#define GGML_F32x4_SET1(x)      wasm_f32x4_splat(x)
#define GGML_F32x4_LOAD         wasm_v128_load
#define GGML_F32x4_STORE        wasm_v128_store
#define GGML_F32x4_FMA(a, b, c) wasm_f32x4_add(wasm_f32x4_mul(b, c), a)
#define GGML_F32x4_ADD          wasm_f32x4_add
#define GGML_F32x4_MUL          wasm_f32x4_mul
#define GGML_F32x4_REDUCE(res, x)                  \
{                                                  \
    int offset = GGML_F32_ARR >> 1;                \
    for (int i = 0; i < offset; ++i) {             \
        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
    }                                              \
    offset >>= 1;                                  \
    for (int i = 0; i < offset; ++i) {             \
        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
    }                                              \
    offset >>= 1;                                  \
    for (int i = 0; i < offset; ++i) {             \
        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
    }                                              \
    res = wasm_f32x4_extract_lane(x[0], 0) +       \
          wasm_f32x4_extract_lane(x[0], 1) +       \
          wasm_f32x4_extract_lane(x[0], 2) +       \
          wasm_f32x4_extract_lane(x[0], 3);        \
}

#define GGML_F32_VEC        GGML_F32x4
#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE

// F16 WASM

#define GGML_F16_STEP 16
#define GGML_F16_EPR  4

inline static v128_t __wasm_f16x4_load(const ggml_fp16_t * p) {
    float tmp[4];

    tmp[0] = GGML_CPU_FP16_TO_FP32(p[0]);
    tmp[1] = GGML_CPU_FP16_TO_FP32(p[1]);
    tmp[2] = GGML_CPU_FP16_TO_FP32(p[2]);
    tmp[3] = GGML_CPU_FP16_TO_FP32(p[3]);

    return wasm_v128_load(tmp);
}

inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
    float tmp[4];

    wasm_v128_store(tmp, x);

    p[0] = GGML_CPU_FP32_TO_FP16(tmp[0]);
    p[1] = GGML_CPU_FP32_TO_FP16(tmp[1]);
    p[2] = GGML_CPU_FP32_TO_FP16(tmp[2]);
    p[3] = GGML_CPU_FP32_TO_FP16(tmp[3]);
}

#define GGML_F16x4             v128_t
#define GGML_F16x4_ZERO        wasm_f32x4_splat(0.0f)
#define GGML_F16x4_SET1(x)     wasm_f32x4_splat(x)
#define GGML_F16x4_LOAD(x)     __wasm_f16x4_load(x)
#define GGML_F16x4_STORE(x, y) __wasm_f16x4_store(x, y)
#define GGML_F16x4_FMA         GGML_F32x4_FMA
#define GGML_F16x4_ADD         wasm_f32x4_add
#define GGML_F16x4_MUL         wasm_f32x4_mul
#define GGML_F16x4_REDUCE(res, x)                           \
{                                                           \
    int offset = GGML_F16_ARR >> 1;                         \
    for (int i = 0; i < offset; ++i) {                      \
        x[i] = wasm_f32x4_add(x[i], x[offset+i]);           \
    }                                                       \
    offset >>= 1;                                           \
    for (int i = 0; i < offset; ++i) {                      \
        x[i] = wasm_f32x4_add(x[i], x[offset+i]);           \
    }                                                       \
    offset >>= 1;                                           \
    for (int i = 0; i < offset; ++i) {                      \
        x[i] = wasm_f32x4_add(x[i], x[offset+i]);           \
    }                                                       \
    res = (ggml_float) (wasm_f32x4_extract_lane(x[0], 0) +  \
          wasm_f32x4_extract_lane(x[0], 1) +                \
          wasm_f32x4_extract_lane(x[0], 2) +                \
          wasm_f32x4_extract_lane(x[0], 3));                \
}

#define GGML_F16_VEC                GGML_F16x4
#define GGML_F16_VEC_ZERO           GGML_F16x4_ZERO
#define GGML_F16_VEC_SET1           GGML_F16x4_SET1
#define GGML_F16_VEC_LOAD(p, i)     GGML_F16x4_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x4_STORE(p, r[i])
#define GGML_F16_VEC_FMA            GGML_F16x4_FMA
#define GGML_F16_VEC_ADD            GGML_F16x4_ADD
#define GGML_F16_VEC_MUL            GGML_F16x4_MUL
#define GGML_F16_VEC_REDUCE         GGML_F16x4_REDUCE

#elif defined(__SSE3__)

#define GGML_SIMD

// F32 SSE

#define GGML_F32_STEP 32
#define GGML_F32_EPR  4

#define GGML_F32x4         __m128
#define GGML_F32x4_ZERO    _mm_setzero_ps()
#define GGML_F32x4_SET1(x) _mm_set1_ps(x)
#define GGML_F32x4_LOAD    _mm_loadu_ps
#define GGML_F32x4_STORE   _mm_storeu_ps
#if defined(__FMA__)
    // TODO: Does this work?
    #define GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a)
#else
    #define GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a)
#endif
#define GGML_F32x4_ADD     _mm_add_ps
#define GGML_F32x4_MUL     _mm_mul_ps
#define GGML_F32x4_REDUCE(res, x)                                 \
{                                                                 \
    int offset = GGML_F32_ARR >> 1;                               \
    for (int i = 0; i < offset; ++i) {                            \
        x[i] = _mm_add_ps(x[i], x[offset+i]);                     \
    }                                                             \
    offset >>= 1;                                                 \
    for (int i = 0; i < offset; ++i) {                            \
        x[i] = _mm_add_ps(x[i], x[offset+i]);                     \
    }                                                             \
    offset >>= 1;                                                 \
    for (int i = 0; i < offset; ++i) {                            \
        x[i] = _mm_add_ps(x[i], x[offset+i]);                     \
    }                                                             \
    const __m128 t0 = _mm_hadd_ps(x[0], x[0]);                    \
    res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t0, t0));        \
}
// TODO: is this optimal ?

#define GGML_F32_VEC        GGML_F32x4
#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE

// F16 SSE

#define GGML_F16_STEP 32
#define GGML_F16_EPR  4

static inline __m128 __sse_f16x4_load(const ggml_fp16_t * x) {
    float tmp[4];

    tmp[0] = GGML_CPU_FP16_TO_FP32(x[0]);
    tmp[1] = GGML_CPU_FP16_TO_FP32(x[1]);
    tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]);
    tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]);

    return _mm_loadu_ps(tmp);
}

static inline void __sse_f16x4_store(ggml_fp16_t * x, __m128 y) {
    float arr[4];

    _mm_storeu_ps(arr, y);

    x[0] = GGML_CPU_FP32_TO_FP16(arr[0]);
    x[1] = GGML_CPU_FP32_TO_FP16(arr[1]);
    x[2] = GGML_CPU_FP32_TO_FP16(arr[2]);
    x[3] = GGML_CPU_FP32_TO_FP16(arr[3]);
}

#define GGML_F32Cx4             __m128
#define GGML_F32Cx4_ZERO        _mm_setzero_ps()
#define GGML_F32Cx4_SET1(x)     _mm_set1_ps(x)
#define GGML_F32Cx4_LOAD(x)     __sse_f16x4_load(x)
#define GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y)
#define GGML_F32Cx4_FMA         GGML_F32x4_FMA
#define GGML_F32Cx4_ADD         _mm_add_ps
#define GGML_F32Cx4_MUL         _mm_mul_ps
#define GGML_F32Cx4_REDUCE      GGML_F32x4_REDUCE

#define GGML_F16_VEC                 GGML_F32Cx4
#define GGML_F16_VEC_ZERO            GGML_F32Cx4_ZERO
#define GGML_F16_VEC_SET1            GGML_F32Cx4_SET1
#define GGML_F16_VEC_LOAD(p, i)      GGML_F32Cx4_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i)  GGML_F32Cx4_STORE(p, r[i])
#define GGML_F16_VEC_FMA             GGML_F32Cx4_FMA
#define GGML_F16_VEC_ADD             GGML_F32Cx4_ADD
#define GGML_F16_VEC_MUL             GGML_F32Cx4_MUL
#define GGML_F16_VEC_REDUCE          GGML_F32Cx4_REDUCE

#elif defined(__loongarch_asx)

#define GGML_SIMD

// F32 LASX
#define GGML_F32_STEP 32
#define GGML_F32_EPR  8

#define GGML_F32x8         __m256
#define GGML_F32x8_ZERO    (__m256)__lasx_xvldi(0)
#define GGML_F32x8_SET1(x) (__m256)__lasx_xvreplfr2vr_s((x))
#define GGML_F32x8_LOAD(x) (__m256)__lasx_xvld((x), 0)
#define GGML_F32x8_STORE(x, y)  __lasx_xvst((y), (x), 0)
#define GGML_F32x8_FMA(a, b, c) __lasx_xvfmadd_s(b, c, a)
#define GGML_F32x8_ADD     __lasx_xvfadd_s
#define GGML_F32x8_MUL     __lasx_xvfmul_s
#define GGML_F32x8_REDUCE(res, x)                                 \
do {                                                              \
    int offset = GGML_F32_ARR >> 1;                               \
    for (int i = 0; i < offset; ++i) {                            \
        x[i] = __lasx_xvfadd_s(x[i], x[offset+i]);                \
    }                                                             \
    offset >>= 1;                                                 \
    for (int i = 0; i < offset; ++i) {                            \
        x[i] = __lasx_xvfadd_s(x[i], x[offset+i]);                \
    }                                                             \
    offset >>= 1;                                                 \
    for (int i = 0; i < offset; ++i) {                            \
        x[i] = __lasx_xvfadd_s(x[i], x[offset+i]);                \
    }                                                             \
    float *tmp_p = (float *)&x[0];                                \
    res = tmp_p[0] + tmp_p[1] + tmp_p[2] + tmp_p[3] +             \
          tmp_p[4] + tmp_p[5] + tmp_p[6] + tmp_p[7];              \
} while (0)
// TODO: is this optimal ?

#define GGML_F32_VEC        GGML_F32x8
#define GGML_F32_VEC_ZERO   GGML_F32x8_ZERO
#define GGML_F32_VEC_SET1   GGML_F32x8_SET1
#define GGML_F32_VEC_LOAD   GGML_F32x8_LOAD
#define GGML_F32_VEC_STORE  GGML_F32x8_STORE
#define GGML_F32_VEC_FMA    GGML_F32x8_FMA
#define GGML_F32_VEC_ADD    GGML_F32x8_ADD
#define GGML_F32_VEC_MUL    GGML_F32x8_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE

// F16 LASX

#define GGML_F16_STEP 32
#define GGML_F16_EPR  8

// F16 arithmetic is not supported by LASX, so we use F32 instead

#define GGML_F32Cx8         __m256
#define GGML_F32Cx8_ZERO    (__m256)__lasx_xvldi(0)
#define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplfr2vr_s((x))

static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
    __m256i a;
    memcpy(&a, x, sizeof(ggml_fp16_t) * 8);
    a = __lasx_xvpermi_d(a, 0 | (1 << 4));
    return __lasx_xvfcvtl_s_h(a);
}

static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
    __m256i a = __lasx_xvfcvt_h_s(y, y);
    a = __lasx_xvpermi_d(a, 0 | (2 << 2));
    memcpy(x, &a, sizeof(ggml_fp16_t) * 8);
}
#define GGML_F32Cx8_LOAD(x)     __lasx_f32cx8_load(x)
#define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y)

#define GGML_F32Cx8_FMA         GGML_F32x8_FMA
#define GGML_F32Cx8_ADD         __lasx_xvfadd_s
#define GGML_F32Cx8_MUL         __lasx_xvfmul_s
#define GGML_F32Cx8_REDUCE      GGML_F32x8_REDUCE

#define GGML_F16_VEC                GGML_F32Cx8
#define GGML_F16_VEC_ZERO           GGML_F32Cx8_ZERO
#define GGML_F16_VEC_SET1           GGML_F32Cx8_SET1
#define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx8_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i])
#define GGML_F16_VEC_FMA            GGML_F32Cx8_FMA
#define GGML_F16_VEC_ADD            GGML_F32Cx8_ADD
#define GGML_F16_VEC_MUL            GGML_F32Cx8_MUL
#define GGML_F16_VEC_REDUCE         GGML_F32Cx8_REDUCE

#elif defined(__loongarch_sx)

#define GGML_SIMD

// F32 LSX

#define GGML_F32_STEP 32
#define GGML_F32_EPR  4

#define GGML_F32x4         __m128
#define GGML_F32x4_ZERO    (__m128)__lsx_vldi(0)
#define GGML_F32x4_SET1(x) (__m128)__lsx_vreplfr2vr_s((x))
#define GGML_F32x4_LOAD(x) (__m128)__lsx_vld((x), 0)
#define GGML_F32x4_STORE(x, y)  __lsx_vst(y, x, 0)
#define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
#define GGML_F32x4_ADD     __lsx_vfadd_s
#define GGML_F32x4_MUL     __lsx_vfmul_s

#define GGML_F32x4_REDUCE(res, x)                               \
{                                                               \
    int offset = GGML_F32_ARR >> 1;                             \
    for (int i = 0; i < offset; ++i) {                          \
        x[i] = __lsx_vfadd_s(x[i], x[offset+i]);                \
    }                                                           \
    offset >>= 1;                                               \
    for (int i = 0; i < offset; ++i) {                          \
        x[i] = __lsx_vfadd_s(x[i], x[offset+i]);                \
    }                                                           \
    offset >>= 1;                                               \
    for (int i = 0; i < offset; ++i) {                          \
        x[i] = __lsx_vfadd_s(x[i], x[offset+i]);                \
    }                                                           \
    __m128i t0 = __lsx_vpickev_w((__m128i)x[0], (__m128i)x[0]); \
    __m128i t1 = __lsx_vpickod_w((__m128i)x[0], (__m128i)x[0]); \
    __m128 t2 = __lsx_vfadd_s((__m128)t0, (__m128)t1);          \
    __m128i t3 = __lsx_vpickev_w((__m128i)t2, (__m128i)t2);     \
    __m128i t4 = __lsx_vpickod_w((__m128i)t2, (__m128i)t2);     \
    __m128 t5 = __lsx_vfadd_s((__m128)t3, (__m128)t4);          \
    res = (ggml_float) ((v4f32)t5)[0];                          \
}
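// note: the LSX reduce tail sums four lanes with two shuffle-add rounds:
// __lsx_vpickev_w/__lsx_vpickod_w gather the even and odd 32-bit lanes, and
// adding the two halves the live element count each round until the total
// sits in lane 0.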

#define GGML_F32_VEC        GGML_F32x4
#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE

// F16 LSX

#define GGML_F16_STEP 32
#define GGML_F16_EPR  4

static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) {
    float tmp[4];

    tmp[0] = GGML_CPU_FP16_TO_FP32(x[0]);
    tmp[1] = GGML_CPU_FP16_TO_FP32(x[1]);
    tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]);
    tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]);

    return (__m128)__lsx_vld(tmp, 0);
}

static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
    float arr[4];

    __lsx_vst(y, arr, 0);

    x[0] = GGML_CPU_FP32_TO_FP16(arr[0]);
    x[1] = GGML_CPU_FP32_TO_FP16(arr[1]);
    x[2] = GGML_CPU_FP32_TO_FP16(arr[2]);
    x[3] = GGML_CPU_FP32_TO_FP16(arr[3]);
}

#define GGML_F32Cx4             __m128
#define GGML_F32Cx4_ZERO        (__m128)__lsx_vldi(0)
#define GGML_F32Cx4_SET1(x)     (__m128)__lsx_vreplfr2vr_s((x))
#define GGML_F32Cx4_LOAD(x)     (__m128)__lsx_f16x4_load(x)
#define GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y)
#define GGML_F32Cx4_FMA         GGML_F32x4_FMA
#define GGML_F32Cx4_ADD         __lsx_vfadd_s
#define GGML_F32Cx4_MUL         __lsx_vfmul_s
#define GGML_F32Cx4_REDUCE      GGML_F32x4_REDUCE

#define GGML_F16_VEC                 GGML_F32Cx4
#define GGML_F16_VEC_ZERO            GGML_F32Cx4_ZERO
#define GGML_F16_VEC_SET1            GGML_F32Cx4_SET1
#define GGML_F16_VEC_LOAD(p, i)      GGML_F32Cx4_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i)  GGML_F32Cx4_STORE(p, r[i])
#define GGML_F16_VEC_FMA             GGML_F32Cx4_FMA
#define GGML_F16_VEC_ADD             GGML_F32Cx4_ADD
#define GGML_F16_VEC_MUL             GGML_F32Cx4_MUL
#define GGML_F16_VEC_REDUCE          GGML_F32Cx4_REDUCE

#elif defined(__VXE__) || defined(__VXE2__)

#define GGML_SIMD

// F32 s390x

#define GGML_F32_STEP 32
#define GGML_F32_EPR  4

#define GGML_F32x4              float32x4_t
#define GGML_F32x4_ZERO         vec_splats(0.0f)
#define GGML_F32x4_SET1         vec_splats
#define GGML_F32x4_LOAD(p)      vec_xl(0, p)
#define GGML_F32x4_STORE(p, r)  vec_xst(r, 0, p)
#define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
#define GGML_F32x4_ADD          vec_add
#define GGML_F32x4_MUL          vec_mul
#define GGML_F32x4_REDUCE(res, x)                   \
{                                                   \
    int offset = GGML_F32_ARR >> 1;                 \
    for (int i = 0; i < offset; ++i) {              \
        x[i] = vec_add(x[i], x[offset + i]);        \
    }                                               \
    offset >>= 1;                                   \
    for (int i = 0; i < offset; ++i) {              \
        x[i] = vec_add(x[i], x[offset + i]);        \
    }                                               \
    offset >>= 1;                                   \
    for (int i = 0; i < offset; ++i) {              \
        x[i] = vec_add(x[i], x[offset + i]);        \
    }                                               \
    float32x4_t tmp = x[0] + vec_reve(x[0]);        \
    res = tmp[0] + tmp[1];                          \
}
#define GGML_F32x4_REDUCE_4(res, s0, s1, s2, s3) \
{                                                \
    float32x4_t v = vec_add(vec_add(s0, s1),     \
                            vec_add(s2, s3));    \
    v = vec_add(v, vec_sld(v, v, 8));            \
    v = vec_add(v, vec_sld(v, v, 4));            \
    res += (ggml_float)vec_extract(v, 0);        \
}

#define GGML_F32_VEC        GGML_F32x4
#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE

// F16 s390x
#define GGML_F16_STEP GGML_F32_STEP
#define GGML_F16_EPR  GGML_F32_EPR

static inline float32x4_t __lzs_f16cx4_load(const ggml_fp16_t * x) {
    float tmp[4];

    for (int i = 0; i < 4; i++) {
        tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]);
    }

    // note: keep type-cast here to prevent compiler bugs
    // see: https://github.com/ggml-org/llama.cpp/issues/12846
    return vec_xl(0, (const float *)(tmp));
}

static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) {
    float arr[4];

    // note: keep type-cast here to prevent compiler bugs
    // see: https://github.com/ggml-org/llama.cpp/issues/12846
    vec_xst(v_y, 0, (float *)(arr));

    for (int i = 0; i < 4; i++) {
        x[i] = GGML_CPU_FP32_TO_FP16(arr[i]);
    }
}

#define GGML_F16_VEC                GGML_F32x4
#define GGML_F16_VEC_ZERO           GGML_F32x4_ZERO
#define GGML_F16_VEC_SET1           GGML_F32x4_SET1
#define GGML_F16_VEC_LOAD(p, i)     __lzs_f16cx4_load(p)
#define GGML_F16_VEC_STORE(p, r, i) __lzs_f16cx4_store(p, r[i])
#define GGML_F16_VEC_FMA            GGML_F32x4_FMA
#define GGML_F16_VEC_ADD            GGML_F32x4_ADD
#define GGML_F16_VEC_MUL            GGML_F32x4_MUL
#define GGML_F16_VEC_REDUCE         GGML_F32x4_REDUCE

// BF16 s390x
#define GGML_BF16_STEP 16
#define GGML_BF16_EPR  8

#define GGML_BF16x8         __vector unsigned short
#define GGML_BF16x8_ZERO    vec_splats((unsigned short)0)
#define GGML_BF16x8_LOAD(p) vec_xl(0, (const unsigned short *)(p))

#define GGML_BF16_VEC      GGML_BF16x8
#define GGML_BF16_VEC_ZERO GGML_BF16x8_ZERO
#define GGML_BF16_VEC_LOAD GGML_BF16x8_LOAD
#define GGML_BF16_TO_F32_LO(v) ((float32x4_t) vec_mergel((v), GGML_BF16_VEC_ZERO))
#define GGML_BF16_TO_F32_HI(v) ((float32x4_t) vec_mergeh((v), GGML_BF16_VEC_ZERO))
#define GGML_BF16_FMA_LO(acc, x, y) \
    (acc) = GGML_F32x4_FMA((acc), GGML_BF16_TO_F32_LO(x), GGML_BF16_TO_F32_LO(y))
#define GGML_BF16_FMA_HI(acc, x, y) \
    (acc) = GGML_F32x4_FMA((acc), GGML_BF16_TO_F32_HI(x), GGML_BF16_TO_F32_HI(y))

#elif defined(__riscv_v_intrinsic)

// compatible with vlen >= 128

#define GGML_SIMD

// F32

#define GGML_F32_STEP 16
#define GGML_F32_EPR  4

#define GGML_F32x4              vfloat32m1_t
#define GGML_F32x4_ZERO         __riscv_vfmv_v_f_f32m1(0.0f, GGML_F32_EPR)
#define GGML_F32x4_SET1(x)      __riscv_vfmv_v_f_f32m1(x, GGML_F32_EPR)
#define GGML_F32x4_LOAD(x)      __riscv_vle32_v_f32m1(x, GGML_F32_EPR)
#define GGML_F32x4_STORE(b, v)  __riscv_vse32_v_f32m1(b, v, GGML_F32_EPR)
#define GGML_F32x4_FMA(a, b, c) __riscv_vfmacc_vv_f32m1(a, b, c, GGML_F32_EPR)
#define GGML_F32x4_ADD(a, b)    __riscv_vfadd_vv_f32m1(a, b, GGML_F32_EPR)
#define GGML_F32x4_MUL(a, b)    __riscv_vfmul_vv_f32m1(a, b, GGML_F32_EPR)

#define GGML_F32_VEC        GGML_F32x4
#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE

#endif

// GGML_F32_ARR / GGML_F16_ARR
//   number of registers to use per step
#ifdef GGML_SIMD
#define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR)   // coverage: 0
#define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)   // coverage: 0
#endif
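// note: with the NEON mapping above (STEP 16, EPR 4) GGML_F32_ARR is 4; the
// three halving rounds in the REDUCE macros are sized for up to 8 registers
// per step (e.g. SSE: STEP 32 / EPR 4 gives ARR = 8), and extra rounds simply
// execute zero iterations when ARR is smaller.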

#ifdef __cplusplus
}
#endif