Coverage Report

Created: 2025-12-28 06:25

/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp
Line | Count | Source
   1 |       | #include "vec.h"
   2 |       |
   3 |       | #include <cassert>
   4 |       |
   5 |       | // precomputed gelu table for f16 (128 KB)
   6 |       | ggml_fp16_t ggml_table_gelu_f16[1 << 16];
   7 |       |
   8 |       | // precomputed quick gelu table for f16 (128 KB)
   9 |       | ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
  10 |       |
  11 |     0 | void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc) {
  12 |     0 |     assert(nrc == 1);
  13 |     0 |     GGML_UNUSED(nrc);
  14 |     0 |     GGML_UNUSED(bx);
  15 |     0 |     GGML_UNUSED(by);
  16 |     0 |     GGML_UNUSED(bs);
  17 |       |
  18 |     0 | #if defined(GGML_SIMD)
  19 |     0 |     float sumf = 0.0f;
  20 |       |
  21 |       |     #if defined(__ARM_FEATURE_SVE)
  22 |       |         const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
  23 |       |         const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
  24 |       |         const int ggml_f32_step = 8 * ggml_f32_epr; // choose 8 SVE registers
  25 |       |
  26 |       |         const int np = (n & ~(ggml_f32_step - 1));
  27 |       |         svfloat32_t sum1 = svdup_n_f32(0.0f);
  28 |       |         svfloat32_t sum2 = svdup_n_f32(0.0f);
  29 |       |         svfloat32_t sum3 = svdup_n_f32(0.0f);
  30 |       |         svfloat32_t sum4 = svdup_n_f32(0.0f);
  31 |       |         svfloat32_t sum5 = svdup_n_f32(0.0f);
  32 |       |         svfloat32_t sum6 = svdup_n_f32(0.0f);
  33 |       |         svfloat32_t sum7 = svdup_n_f32(0.0f);
  34 |       |         svfloat32_t sum8 = svdup_n_f32(0.0f);
  35 |       |         svfloat32_t ax1,ax2,ax3,ax4,ax5,ax6,ax7,ax8;
  36 |       |         svfloat32_t ay1,ay2,ay3,ay4,ay5,ay6,ay7,ay8;
  37 |       |         for (int i = 0; i < np; i += ggml_f32_step) {
  38 |       |             ax1 = GGML_F32_VEC_LOAD(x + i);
  39 |       |             ay1 = GGML_F32_VEC_LOAD(y + i);
  40 |       |             sum1 = GGML_F32_VEC_FMA(sum1, ax1, ay1);
  41 |       |
  42 |       |             ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
  43 |       |             ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
  44 |       |             sum2 = GGML_F32_VEC_FMA(sum2, ax2, ay2);
  45 |       |
  46 |       |             ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
  47 |       |             ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
  48 |       |             sum3 = GGML_F32_VEC_FMA(sum3, ax3, ay3);
  49 |       |
  50 |       |             ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
  51 |       |             ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
  52 |       |             sum4 = GGML_F32_VEC_FMA(sum4, ax4, ay4);
  53 |       |
  54 |       |             ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
  55 |       |             ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
  56 |       |             sum5 = GGML_F32_VEC_FMA(sum5, ax5, ay5);
  57 |       |
  58 |       |             ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
  59 |       |             ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
  60 |       |             sum6 = GGML_F32_VEC_FMA(sum6, ax6, ay6);
  61 |       |
  62 |       |             ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
  63 |       |             ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
  64 |       |             sum7 = GGML_F32_VEC_FMA(sum7, ax7, ay7);
  65 |       |
  66 |       |             ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
  67 |       |             ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
  68 |       |             sum8 = GGML_F32_VEC_FMA(sum8, ax8, ay8);
  69 |       |         }
  70 |       |         // leftovers
  71 |       |         // Since 8 unrolls are done in above loop, leftovers lie in range [0, ggml_f32_step] which is handled in below loop
  72 |       |         const int np2 = (n & ~(ggml_f32_epr - 1));
  73 |       |         for (int i = np; i < np2; i += ggml_f32_epr) {
  74 |       |             ax1 = GGML_F32_VEC_LOAD(x + i);
  75 |       |             ay1 = GGML_F32_VEC_LOAD(y + i);
  76 |       |             sum1 = GGML_F32_VEC_FMA(sum1, ax1, ay1);
  77 |       |         }
  78 |       |         // maximum number of leftover elements will be less than ggml_f32_epr. Apply predicated svmad on available elements only
  79 |       |         if (np2 < n) {
  80 |       |             svbool_t pg = svwhilelt_b32(np2, n);
  81 |       |             ax1 = svld1_f32(pg, x + np2);
  82 |       |             ay1 = svld1_f32(pg, y + np2);
  83 |       |             sum1 = svmad_f32_m(pg, ax1, ay1, sum1);
  84 |       |         }
  85 |       |         // reduce sum1,sum2 to sum1
  86 |       |         GGML_F32_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8);
  87 |       |     #elif defined(__riscv_v_intrinsic)
  88 |       |         int vl = __riscv_vsetvlmax_e32m8();
  89 |       |         vfloat32m1_t vs = __riscv_vfmv_v_f_f32m1(0.0f, 1);
  90 |       |         vfloat32m8_t vsum;
  91 |       |         vfloat32m8_t ax;
  92 |       |         vfloat32m8_t ay;
  93 |       |         vsum = __riscv_vfmv_v_f_f32m8_tu(vsum, 0.0f, vl);
  94 |       |         for (int i = 0; i < n; i += vl) {
  95 |       |             vl = __riscv_vsetvl_e32m8(n - i);
  96 |       |             ax = __riscv_vle32_v_f32m8_tu(ax, &x[i], vl);
  97 |       |             ay = __riscv_vle32_v_f32m8_tu(ay, &y[i], vl);
  98 |       |             vsum = __riscv_vfmacc_vv_f32m8_tu(vsum, ax, ay, vl);
  99 |       |         }
 100 |       |         vl = __riscv_vsetvlmax_e32m8();
 101 |       |         vs = __riscv_vfredusum_vs_f32m8_f32m1(vsum, vs, vl);
 102 |       |         sumf += __riscv_vfmv_f_s_f32m1_f32(vs);
 103 |       |     #else
 104 |     0 |         const int np = (n & ~(GGML_F32_STEP - 1));
 105 |       |
 106 |     0 |         GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
 107 |       |
 108 |     0 |         GGML_F32_VEC ax[GGML_F32_ARR];
 109 |     0 |         GGML_F32_VEC ay[GGML_F32_ARR];
 110 |       |
 111 |     0 |         for (int i = 0; i < np; i += GGML_F32_STEP) {
 112 |     0 |             for (int j = 0; j < GGML_F32_ARR; j++) {
 113 |     0 |                 ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
 114 |     0 |                 ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
 115 |       |
 116 |     0 |                 sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
 117 |     0 |             }
 118 |     0 |         }
 119 |       |
 120 |       |         // reduce sum0..sum3 to sum0
 121 |     0 |         GGML_F32_VEC_REDUCE(sumf, sum);
 122 |       |
 123 |       |         // leftovers
 124 |     0 |         for (int i = np; i < n; ++i) {
 125 |     0 |             sumf += x[i]*y[i];
 126 |     0 |         }
 127 |     0 |     #endif
 128 |       | #else
 129 |       |     // scalar
 130 |       |     ggml_float sumf = 0.0;
 131 |       |     for (int i = 0; i < n; ++i) {
 132 |       |         sumf += (ggml_float)(x[i]*y[i]);
 133 |       |     }
 134 |       | #endif
 135 |       |
 136 |     0 |     *s = sumf;
 137 |     0 | }
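
Every instrumented row of ggml_vec_dot_f32 above reports a count of 0, so neither the generic SIMD body nor the leftover loop ran in this test session. A minimal driver that would exercise it might look like the sketch below (assumptions: the internal header ggml-cpu/vec.h is on the include path, the CPU backend is linked, and the helper function name is ours, not part of vec.cpp):

    #include "vec.h"   // internal ggml-cpu header (assumed reachable)
    #include <cassert>
    #include <cmath>

    // Hypothetical helper: drive the uncovered f32 dot product with a tiny input.
    static void cover_vec_dot_f32() {
        const int n = 19;                 // odd size also touches the leftover loop
        float x[19], y[19], ref = 0.0f;
        for (int i = 0; i < n; ++i) {
            x[i] = 0.25f * i;
            y[i] = 1.0f - 0.125f * i;
            ref += x[i] * y[i];
        }
        float s = 0.0f;
        // bs/bx/by are unused when nrc == 1, per the GGML_UNUSED calls above.
        ggml_vec_dot_f32(n, &s, 0, x, 0, y, 0, /*nrc=*/1);
        assert(std::fabs(s - ref) < 1e-4f);
    }
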
 138 |       |
 139 |     0 | void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc) {
 140 |     0 |     assert(nrc == 1);
 141 |     0 |     GGML_UNUSED(nrc);
 142 |     0 |     GGML_UNUSED(bx);
 143 |     0 |     GGML_UNUSED(by);
 144 |     0 |     GGML_UNUSED(bs);
 145 |     0 |     int i = 0;
 146 |     0 |     ggml_float sumf = 0;
 147 |       |
 148 |       | #if defined(__AVX512BF16__)
 149 |       |     __m512 c1 = _mm512_setzero_ps();
 150 |       |     __m512 c2 = _mm512_setzero_ps();
 151 |       |     for (; i + 64 <= n; i += 64) {
 152 |       |         c1 = _mm512_dpbf16_ps(c1, m512bh(_mm512_loadu_si512((x + i))),
 153 |       |                              m512bh(_mm512_loadu_si512((y + i))));
 154 |       |         c2 = _mm512_dpbf16_ps(c2, m512bh(_mm512_loadu_si512((x + i + 32))),
 155 |       |                              m512bh(_mm512_loadu_si512((y + i + 32))));
 156 |       |     }
 157 |       |     sumf += (ggml_float)_mm512_reduce_add_ps(c1);
 158 |       |     sumf += (ggml_float)_mm512_reduce_add_ps(c2);
 159 |       |
 160 |       | #elif defined(__AVX512F__)
 161 |       | #define LOAD(p) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)(p))), 16))
 162 |       |     __m512 c1 = _mm512_setzero_ps();
 163 |       |     __m512 c2 = _mm512_setzero_ps();
 164 |       |     for (; i + 32 <= n; i += 32) {
 165 |       |         c1 = _mm512_add_ps(_mm512_mul_ps(LOAD(x + i), LOAD(y + i)), c1);
 166 |       |         c2 = _mm512_add_ps(_mm512_mul_ps(LOAD(x + i + 16), LOAD(y + i + 16)), c2);
 167 |       |     }
 168 |       |     sumf += (ggml_float)_mm512_reduce_add_ps(c1);
 169 |       |     sumf += (ggml_float)_mm512_reduce_add_ps(c2);
 170 |       |
 171 |       | #undef LOAD
 172 |       | #elif defined(__AVX2__) || defined(__AVX__)
 173 |       | #if defined(__AVX2__)
 174 |     0 | #define LOAD(p) _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(p))), 16))
 175 |       | #else
 176 |       | #define LOAD(p) _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(p))), 16)), (_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_bsrli_si128(_mm_loadu_si128((const __m128i *)(p)), 8)), 16)), 1))
 177 |       | #endif
 178 |     0 |     __m256 c1 = _mm256_setzero_ps();
 179 |     0 |     __m256 c2 = _mm256_setzero_ps();
 180 |     0 |     __m256 c3 = _mm256_setzero_ps();
 181 |     0 |     __m256 c4 = _mm256_setzero_ps();
 182 |     0 |     for (; i + 32 <= n; i += 32) {
 183 |     0 |         c1 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i), LOAD(y + i)), c1);
 184 |     0 |         c2 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 8), LOAD(y + i + 8)), c2);
 185 |     0 |         c3 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 16), LOAD(y + i + 16)), c3);
 186 |     0 |         c4 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 24), LOAD(y + i + 24)), c4);
 187 |     0 |     }
 188 |     0 |     __m128 g;
 189 |     0 |     c1 = _mm256_add_ps(_mm256_add_ps(c1, c3),
 190 |     0 |                        _mm256_add_ps(c2, c4));
 191 |     0 |     g = _mm_add_ps(_mm256_extractf128_ps(c1, 1),
 192 |     0 |                    _mm256_castps256_ps128(c1));
 193 |     0 |     g = _mm_add_ps(g, _mm_movehl_ps(g, g));
 194 |     0 |     g = _mm_add_ss(g, _mm_movehdup_ps(g));
 195 |     0 |     sumf += (ggml_float)_mm_cvtss_f32(g);
 196 |       |
 197 |     0 | #undef LOAD
 198 |       | #elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfbfwma)
 199 |       |     size_t vl = __riscv_vsetvlmax_e32m4();
 200 |       |
 201 |       |     // initialize accumulators to all zeroes
 202 |       |     vfloat32m4_t vsum0 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
 203 |       |     vfloat32m4_t vsum1 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
 204 |       |
 205 |       |     // calculate step size
 206 |       |     const size_t epr = __riscv_vsetvlmax_e16m2();
 207 |       |     const size_t step = epr * 2;
 208 |       |     const int np = (n & ~(step - 1));
 209 |       |
 210 |       |     // unroll by 2
 211 |       |     for (; i < np; i += step) {
 212 |       |         vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2((const __bf16 *)&x[i], epr);
 213 |       |         vbfloat16m2_t ay0 = __riscv_vle16_v_bf16m2((const __bf16 *)&y[i], epr);
 214 |       |         vsum0 = __riscv_vfwmaccbf16_vv_f32m4(vsum0, ax0, ay0, epr);
 215 |       |         __asm__ __volatile__ ("" ::: "memory");
 216 |       |
 217 |       |         vbfloat16m2_t ax1 = __riscv_vle16_v_bf16m2((const __bf16 *)&x[i + epr], epr);
 218 |       |         vbfloat16m2_t ay1 = __riscv_vle16_v_bf16m2((const __bf16 *)&y[i + epr], epr);
 219 |       |         vsum1 = __riscv_vfwmaccbf16_vv_f32m4(vsum1, ax1, ay1, epr);
 220 |       |         __asm__ __volatile__ ("" ::: "memory");
 221 |       |     }
 222 |       |
 223 |       |     // accumulate in 1 register
 224 |       |     vsum0 = __riscv_vfadd_vv_f32m4(vsum0, vsum1, vl);
 225 |       |
 226 |       |     // leftovers
 227 |       |     for (i = np; i < n; i += vl) {
 228 |       |         vl = __riscv_vsetvl_e16m2(n - i);
 229 |       |         vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2((const __bf16 *)&x[i], vl);
 230 |       |         vbfloat16m2_t ay0 = __riscv_vle16_v_bf16m2((const __bf16 *)&y[i], vl);
 231 |       |         vsum0 = __riscv_vfwmaccbf16_vv_f32m4(vsum0, ax0, ay0, vl);
 232 |       |     }
 233 |       |
 234 |       |     // reduce
 235 |       |     vl = __riscv_vsetvlmax_e32m4();
 236 |       |     vfloat32m1_t redsum = __riscv_vfredusum_vs_f32m4_f32m1(vsum0, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
 237 |       |     sumf += __riscv_vfmv_f_s_f32m1_f32(redsum);
 238 |       |
 239 |       | #endif
 240 |     0 |     for (; i < n; ++i) {
 241 |     0 |         sumf += (ggml_float)(GGML_BF16_TO_FP32(x[i]) *
 242 |     0 |                              GGML_BF16_TO_FP32(y[i]));
 243 |     0 |     }
 244 |     0 |     *s = sumf;
 245 |     0 | }
 246 |       |
 247 |     0 | void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc) {
 248 |     0 |     assert(nrc == 1);
 249 |     0 |     GGML_UNUSED(nrc);
 250 |     0 |     GGML_UNUSED(bx);
 251 |     0 |     GGML_UNUSED(by);
 252 |     0 |     GGML_UNUSED(bs);
 253 |       |
 254 |     0 |     ggml_float sumf = 0.0;
 255 |       |
 256 |       |
 257 |     0 | #if defined(GGML_SIMD)
 258 |       |     #if defined(__ARM_FEATURE_SVE)
 259 |       |         const int sve_register_length = svcntb() * 8; //get vector length
 260 |       |         const int ggml_f16_epr = sve_register_length / 16; // running when 16
 261 |       |         const int ggml_f16_step = 8 * ggml_f16_epr; // choose 8 SVE registers
 262 |       |
 263 |       |         const int np= (n & ~(ggml_f16_step - 1));
 264 |       |         svfloat16_t sum1 = svdup_n_f16(0.0f);
 265 |       |         svfloat16_t sum2 = svdup_n_f16(0.0f);
 266 |       |         svfloat16_t sum3 = svdup_n_f16(0.0f);
 267 |       |         svfloat16_t sum4 = svdup_n_f16(0.0f);
 268 |       |
 269 |       |         svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
 270 |       |         svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
 271 |       |         for (int i = 0; i < np; i += ggml_f16_step) {
 272 |       |             ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0);
 273 |       |             ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0);
 274 |       |             sum1 = GGML_F16x_VEC_FMA(sum1, ax1, ay1);
 275 |       |
 276 |       |             ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1);
 277 |       |             ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1);
 278 |       |             sum2 = GGML_F16x_VEC_FMA(sum2, ax2, ay2);
 279 |       |
 280 |       |             ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2);
 281 |       |             ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
 282 |       |             sum3 = GGML_F16x_VEC_FMA(sum3, ax3, ay3);
 283 |       |
 284 |       |             ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3);
 285 |       |             ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
 286 |       |             sum4 = GGML_F16x_VEC_FMA(sum4, ax4, ay4);
 287 |       |
 288 |       |             ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4);
 289 |       |             ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
 290 |       |             sum1 = GGML_F16x_VEC_FMA(sum1, ax5, ay5);
 291 |       |
 292 |       |             ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5);
 293 |       |             ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
 294 |       |             sum2 = GGML_F16x_VEC_FMA(sum2, ax6, ay6);
 295 |       |
 296 |       |             ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6);
 297 |       |             ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
 298 |       |             sum3 = GGML_F16x_VEC_FMA(sum3, ax7, ay7);
 299 |       |
 300 |       |             ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7);
 301 |       |             ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
 302 |       |             sum4 = GGML_F16x_VEC_FMA(sum4, ax8, ay8);
 303 |       |         }
 304 |       |
 305 |       |         const int np2 = (n & ~(ggml_f16_epr - 1)); // round down to multiple of 8
 306 |       |         for (int k = np; k < np2; k += ggml_f16_epr) {
 307 |       |             svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0);
 308 |       |             svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
 309 |       |             sum1 = GGML_F16x_VEC_FMA(sum1, rx, ry);
 310 |       |         }
 311 |       |
 312 |       |         if (np2 < n) {
 313 |       |             svbool_t pg = svwhilelt_b16(np2, n);
 314 |       |             svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
 315 |       |             svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
 316 |       |
 317 |       |             sum1 = svmad_f16_x(pg, hx, hy, sum1);
 318 |       |         }
 319 |       |         GGML_F16x_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4);
 320 |       |     #elif defined(__riscv_v_intrinsic)
 321 |       |         #if defined(__riscv_zvfh)
 322 |       |             int vl = __riscv_vsetvlmax_e32m2();
 323 |       |             vfloat32m1_t vs = __riscv_vfmv_v_f_f32m1(0.0f, 1);
 324 |       |             vfloat32m2_t vsum;
 325 |       |             vfloat16m1_t ax;
 326 |       |             vfloat16m1_t ay;
 327 |       |             vsum = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vmv_v_x_u32m2(0, vl));
 328 |       |             for (int i = 0; i < n; i += vl) {
 329 |       |                 vl = __riscv_vsetvl_e16m1(n - i);
 330 |       |                 ax = __riscv_vle16_v_f16m1_tu(ax, (const _Float16 *)&x[i], vl);
 331 |       |                 ay = __riscv_vle16_v_f16m1_tu(ay, (const _Float16 *)&y[i], vl);
 332 |       |                 vsum = __riscv_vfwmacc_vv_f32m2_tu(vsum, ax, ay, vl);
 333 |       |             }
 334 |       |             vl = __riscv_vsetvlmax_e32m1();
 335 |       |             vfloat32m1_t ac0 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(vsum, 0), __riscv_vget_v_f32m2_f32m1(vsum, 1), vl);
 336 |       |             vs = __riscv_vfredusum_vs_f32m1_f32m1(ac0, vs, vl);
 337 |       |             sumf += __riscv_vfmv_f_s_f32m1_f32(vs);
 338 |       |         #else
 339 |       |             for (int i = 0; i < n; ++i) {
 340 |       |                 sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
 341 |       |             }
 342 |       |         #endif // __riscv_zvfh
 343 |       |     #else
 344 |     0 |         const int np = (n & ~(GGML_F16_STEP - 1));
 345 |       |
 346 |     0 |         GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO };
 347 |       |
 348 |     0 |         GGML_F16_VEC ax[GGML_F16_ARR];
 349 |     0 |         GGML_F16_VEC ay[GGML_F16_ARR];
 350 |       |
 351 |     0 |         for (int i = 0; i < np; i += GGML_F16_STEP) {
 352 |     0 |             for (int j = 0; j < GGML_F16_ARR; j++) {
 353 |     0 |                 ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
 354 |     0 |                 ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
 355 |       |
 356 |     0 |                 sum[j] = GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]);
 357 |     0 |             }
 358 |     0 |         }
 359 |       |
 360 |       |         // reduce sum0..sum3 to sum0
 361 |     0 |         GGML_F16_VEC_REDUCE(sumf, sum);
 362 |       |
 363 |       |         // leftovers
 364 |     0 |         for (int i = np; i < n; ++i) {
 365 |     0 |             sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
 366 |     0 |         }
 367 |       |         // if you hit this, you are likely running outside the FP range
 368 |     0 |         assert(!isnan(sumf) && !isinf(sumf));
 369 |     0 |     #endif
 370 |       | #else
 371 |       |     for (int i = 0; i < n; ++i) {
 372 |       |         sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
 373 |       |     }
 374 |       | #endif // GGML_SIMD
 375 |       |
 376 |     0 |     *s = sumf;
 377 |     0 | }
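
ggml_vec_dot_bf16 and ggml_vec_dot_f16 above are likewise fully uncovered: every instrumented row reports 0. A sketch of a call that would cover the generic fp16 path, assuming the public ggml.h conversion helper ggml_fp32_to_fp16() is available and using values that are exactly representable in fp16 so no tolerance games are needed (the helper name is ours):

    #include "vec.h"
    #include "ggml.h"  // assumed available for ggml_fp16_t conversions
    #include <cassert>
    #include <cmath>

    static void cover_vec_dot_f16() {
        const int n = 33;                     // not a multiple of the SIMD step
        ggml_fp16_t x[33], y[33];
        float ref = 0.0f;
        for (int i = 0; i < n; ++i) {
            const float xf = 0.25f * (i % 8); // exactly representable in fp16
            const float yf = 0.5f;
            x[i] = ggml_fp32_to_fp16(xf);
            y[i] = ggml_fp32_to_fp16(yf);
            ref += xf * yf;
        }
        float s = 0.0f;
        ggml_vec_dot_f16(n, &s, 0, x, 0, y, 0, /*nrc=*/1);
        assert(std::fabs(s - ref) < 1e-3f);
    }
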
 378 |       |
 379 |     0 | void ggml_vec_silu_f32(const int n, float * y, const float * x) {
 380 |     0 |     int i = 0;
 381 |       | #if defined(__AVX512F__) && defined(__AVX512DQ__)
 382 |       |     for (; i + 15 < n; i += 16) {
 383 |       |         _mm512_storeu_ps(y + i, ggml_v_silu(_mm512_loadu_ps(x + i)));
 384 |       |     }
 385 |       | #elif defined(__AVX2__) && defined(__FMA__)
 386 |     0 |     for (; i + 7 < n; i += 8) {
 387 |     0 |         _mm256_storeu_ps(y + i, ggml_v_silu(_mm256_loadu_ps(x + i)));
 388 |     0 |     }
 389 |       | #elif defined(__SSE2__)
 390 |       |     for (; i + 3 < n; i += 4) {
 391 |       |         _mm_storeu_ps(y + i, ggml_v_silu(_mm_loadu_ps(x + i)));
 392 |       |     }
 393 |       | #elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
 394 |       |     const int vlen = svcntw();
 395 |       |     for (; i < n; i += vlen) {
 396 |       |         const svbool_t pg = svwhilelt_b32_s32(i, n);
 397 |       |         svst1_f32(pg, y + i, ggml_v_silu(pg, svld1_f32(pg, x + i)));
 398 |       |     }
 399 |       | #elif defined(__ARM_NEON) && defined(__aarch64__)
 400 |       |     for (; i + 3 < n; i += 4) {
 401 |       |         vst1q_f32(y + i, ggml_v_silu(vld1q_f32(x + i)));
 402 |       |     }
 403 |       | #elif defined(__riscv_v_intrinsic)
 404 |       |     for (int vl; i < n; i += vl) {
 405 |       |         vl = __riscv_vsetvl_e32m2(n - i);
 406 |       |         vfloat32m2_t vx = __riscv_vle32_v_f32m2(&x[i], vl);
 407 |       |         vfloat32m2_t vy = ggml_v_silu_m2(vx, vl);
 408 |       |         __riscv_vse32_v_f32m2(&y[i], vy, vl);
 409 |       |     }
 410 |       | #endif
 411 |     0 |     for (; i < n; ++i) {
 412 |     0 |         y[i] = ggml_silu_f32(x[i]);
 413 |     0 |     }
 414 |     0 | }
 415 |       |
 416 |     0 | void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float * g) {
 417 |     0 |     int i = 0;
 418 |       | #if defined(__AVX512F__) && defined(__AVX512DQ__)
 419 |       |     for (; i + 15 < n; i += 16) {
 420 |       |         _mm512_storeu_ps(y + i, _mm512_mul_ps(ggml_v_silu(_mm512_loadu_ps(x + i)), _mm512_loadu_ps(g + i)));
 421 |       |     }
 422 |       | #elif defined(__AVX2__) && defined(__FMA__)
 423 |     0 |     for (; i + 7 < n; i += 8) {
 424 |     0 |         _mm256_storeu_ps(y + i, _mm256_mul_ps(ggml_v_silu(_mm256_loadu_ps(x + i)), _mm256_loadu_ps(g + i)));
 425 |     0 |     }
 426 |       | #elif defined(__SSE2__)
 427 |       |     for (; i + 3 < n; i += 4) {
 428 |       |         _mm_storeu_ps(y + i, _mm_mul_ps(ggml_v_silu(_mm_loadu_ps(x + i)), _mm_loadu_ps(g + i)));
 429 |       |     }
 430 |       | #elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
 431 |       |     const int vlen = svcntw();
 432 |       |     for (; i < n; i += vlen) {
 433 |       |         const svbool_t pg = svwhilelt_b32_s32(i, n);
 434 |       |         svst1_f32(pg, y + i, svmul_f32_x(pg, ggml_v_silu(pg, svld1_f32(pg, x + i)), svld1_f32(pg, g + i)));
 435 |       |     }
 436 |       | #elif defined(__ARM_NEON) && defined(__aarch64__)
 437 |       |     for (; i + 3 < n; i += 4) {
 438 |       |         vst1q_f32(y + i, vmulq_f32(ggml_v_silu(vld1q_f32(x + i)), vld1q_f32(g + i)));
 439 |       |     }
 440 |       | #elif defined(__riscv_v_intrinsic)
 441 |       |     for (int vl; i < n; i += vl) {
 442 |       |         vl = __riscv_vsetvl_e32m2(n - i);
 443 |       |         vfloat32m2_t vx = __riscv_vle32_v_f32m2(&x[i], vl);
 444 |       |         vfloat32m2_t vg = __riscv_vle32_v_f32m2(&g[i], vl);
 445 |       |         vfloat32m2_t vy = __riscv_vfmul_vv_f32m2(ggml_v_silu_m2(vx, vl), vg, vl);
 446 |       |         __riscv_vse32_v_f32m2(&y[i], vy, vl);
 447 |       |     }
 448 |       | #endif
 449 |     0 |     for (; i < n; ++i) {
 450 |     0 |         y[i] = ggml_silu_f32(x[i]) * g[i];
 451 |     0 |     }
 452 |     0 | }
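
ggml_vec_silu_f32 and ggml_vec_swiglu_f32 compute y[i] = silu(x[i]) and y[i] = silu(x[i]) * g[i]; on this build only their AVX2 region and scalar tail were instrumented, and neither ran. A hedged sketch of a call that checks the swiglu output against the scalar definition silu(x) = x / (1 + exp(-x)) (the helper is ours, not part of vec.cpp):

    #include "vec.h"
    #include <cassert>
    #include <cmath>

    static void cover_swiglu() {
        const int n = 11;                 // 8-wide AVX2 loop plus a scalar tail
        float x[11], g[11], y[11];
        for (int i = 0; i < n; ++i) {
            x[i] = -2.0f + 0.4f * i;
            g[i] = 1.0f + 0.1f * i;
        }
        ggml_vec_swiglu_f32(n, y, x, g);
        for (int i = 0; i < n; ++i) {
            const float silu = x[i] / (1.0f + std::exp(-x[i]));
            assert(std::fabs(y[i] - silu * g[i]) < 1e-4f);
        }
    }
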
 453 |       |
 454 |     0 | ggml_float ggml_vec_cvar_f32(const int n, float * y, const float * x, const float mean) {
 455 |     0 |     int i = 0;
 456 |     0 |     ggml_float sum = 0;
 457 |       | // TODO: optimize to process the remaining elements in groups using the smaller vector sizes from AVX2 and SSE
 458 |       | // ref: https://github.com/ggml-org/llama.cpp/pull/15953#pullrequestreview-3310928344
 459 |       | #if defined(__AVX512F__) && defined(__AVX512DQ__)
 460 |       |     for (; i + 15 < n; i += 16) {
 461 |       |         __m512 val = _mm512_sub_ps(_mm512_loadu_ps(x + i),
 462 |       |                                    _mm512_set1_ps(mean));
 463 |       |         _mm512_storeu_ps(y + i, val);
 464 |       |         sum += (ggml_float)_mm512_reduce_add_ps(_mm512_mul_ps(val, val));
 465 |       |     }
 466 |       | #elif defined(__AVX2__) && defined(__FMA__)
 467 |     0 |     for (; i + 7 < n; i += 8) {
 468 |     0 |         __m256 val = _mm256_sub_ps(_mm256_loadu_ps(x + i),
 469 |     0 |                                    _mm256_set1_ps(mean));
 470 |     0 |         _mm256_storeu_ps(y + i, val);
 471 |     0 |         val = _mm256_mul_ps(val,val);
 472 |     0 |         __m128 val2 = _mm_add_ps(_mm256_extractf128_ps(val, 1),
 473 |     0 |                                  _mm256_castps256_ps128(val));
 474 |     0 |         val2 = _mm_add_ps(val2, _mm_movehl_ps(val2, val2));
 475 |     0 |         val2 = _mm_add_ss(val2, _mm_movehdup_ps(val2));
 476 |     0 |         sum += (ggml_float)_mm_cvtss_f32(val2);
 477 |     0 |     }
 478 |       | #elif defined(__SSE2__)
 479 |       |     for (; i + 3 < n; i += 4) {
 480 |       |         __m128 val = _mm_sub_ps(_mm_loadu_ps(x + i),
 481 |       |                                 _mm_set1_ps(mean));
 482 |       |         _mm_storeu_ps(y + i, val);
 483 |       |         val = _mm_mul_ps(val, val);
 484 |       | #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
 485 |       |         val = _mm_add_ps(val, _mm_movehl_ps(val, val));
 486 |       |         val = _mm_add_ss(val, _mm_movehdup_ps(val));
 487 |       | #else
 488 |       |         __m128 tmp = _mm_shuffle_ps(val, val, _MM_SHUFFLE(2, 3, 0, 1));
 489 |       |         val = _mm_add_ps(val, tmp);
 490 |       |         tmp = _mm_movehl_ps(tmp, val);
 491 |       |         val = _mm_add_ss(val, tmp);
 492 |       | #endif  // __AVX__ || __AVX2__ || __AVX512F__
 493 |       |         sum += (ggml_float)_mm_cvtss_f32(val);
 494 |       |     }
 495 |       | #elif defined(__ARM_NEON) && defined(__aarch64__)
 496 |       |     for (; i + 3 < n; i += 4) {
 497 |       |         float32x4_t val = vsubq_f32(vld1q_f32(x + i),
 498 |       |                                     vdupq_n_f32(mean));
 499 |       |         vst1q_f32(y + i, val);
 500 |       |         val = vmulq_f32(val, val);
 501 |       |         sum += (ggml_float)vaddvq_f32(val);
 502 |       |     }
 503 |       | #elif defined(__VXE__) || defined(__VXE2__)
 504 |       |     for (; i + 3 < n; i += 4) {
 505 |       |         float32x4_t val = vec_sub(vec_xl(0, x + i), vec_splats(mean));
 506 |       |         vec_xst(val, 0, y + i);
 507 |       |         val = vec_mul(val, val);
 508 |       |         sum += (ggml_float)vec_hsum_f32x4(val);
 509 |       |     }
 510 |       | #elif defined(__riscv_v_intrinsic)
 511 |       |     vfloat64m1_t vsum = __riscv_vfmv_v_f_f64m1(0, 1);
 512 |       |     for (int vl; i < n; i += vl) {
 513 |       |         vl = __riscv_vsetvl_e32m2(n - i);
 514 |       |         vfloat32m2_t val = __riscv_vfsub_vf_f32m2(__riscv_vle32_v_f32m2(&x[i], vl), mean, vl);
 515 |       |         __riscv_vse32_v_f32m2(&y[i], val, vl);
 516 |       |         val = __riscv_vfmul_vv_f32m2(val, val, vl);
 517 |       |         vsum = __riscv_vfwredusum_vs_f32m2_f64m1(val, vsum, vl);
 518 |       |     }
 519 |       |     sum = (ggml_float)__riscv_vfmv_f_s_f64m1_f64(vsum);
 520 |       | #endif
 521 |     0 |     for (; i < n; ++i) {
 522 |     0 |         float val = x[i] - mean;
 523 |     0 |         y[i] = val;
 524 |     0 |         val *= val;
 525 |     0 |         sum += (ggml_float)val;
 526 |     0 |     }
 527 |     0 |     return sum/n;
 528 |     0 | }
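
ggml_vec_cvar_f32 writes the mean-centered values x[i] - mean into y and returns their mean square, i.e. the variance of x around the supplied mean; every instrumented row above is at count 0. A sketch of a direct call, with the expected value worked out by hand for a 4-element input (the helper is ours):

    #include "vec.h"
    #include <cassert>
    #include <cmath>

    static void cover_cvar() {
        // x has mean 2.5; deviations are -1.5, -0.5, +0.5, +1.5,
        // so the variance is (2.25 + 0.25 + 0.25 + 2.25) / 4 = 1.25.
        const float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
        float y[4];
        const ggml_float var = ggml_vec_cvar_f32(4, y, x, 2.5f);
        assert(std::fabs((float)var - 1.25f) < 1e-6f);
        assert(std::fabs(y[0] + 1.5f) < 1e-6f);   // y holds the centered values
    }
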
 529 |       |
 530 |     0 | ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max) {
 531 |     0 |     int i = 0;
 532 |     0 |     ggml_float sum = 0;
 533 |       | #if defined(__AVX512F__) && defined(__AVX512DQ__)
 534 |       |     for (; i + 15 < n; i += 16) {
 535 |       |         __m512 val = ggml_v_expf(_mm512_sub_ps(_mm512_loadu_ps(x + i),
 536 |       |                                                _mm512_set1_ps(max)));
 537 |       |         _mm512_storeu_ps(y + i, val);
 538 |       |         sum += (ggml_float)_mm512_reduce_add_ps(val);
 539 |       |     }
 540 |       | #elif defined(__AVX2__) && defined(__FMA__)
 541 |     0 |     for (; i + 7 < n; i += 8) {
 542 |     0 |         __m256 val = ggml_v_expf(_mm256_sub_ps(_mm256_loadu_ps(x + i),
 543 |     0 |                                                _mm256_set1_ps(max)));
 544 |     0 |         _mm256_storeu_ps(y + i, val);
 545 |     0 |         __m128 val2 = _mm_add_ps(_mm256_extractf128_ps(val, 1),
 546 |     0 |                                  _mm256_castps256_ps128(val));
 547 |     0 |         val2 = _mm_add_ps(val2, _mm_movehl_ps(val2, val2));
 548 |     0 |         val2 = _mm_add_ss(val2, _mm_movehdup_ps(val2));
 549 |     0 |         sum += (ggml_float)_mm_cvtss_f32(val2);
 550 |     0 |     }
 551 |       | #elif defined(__SSE2__)
 552 |       |     for (; i + 3 < n; i += 4) {
 553 |       |         __m128 val = ggml_v_expf(_mm_sub_ps(_mm_loadu_ps(x + i),
 554 |       |                                             _mm_set1_ps(max)));
 555 |       |         _mm_storeu_ps(y + i, val);
 556 |       | #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
 557 |       |         val = _mm_add_ps(val, _mm_movehl_ps(val, val));
 558 |       |         val = _mm_add_ss(val, _mm_movehdup_ps(val));
 559 |       | #else
 560 |       |         __m128 tmp = _mm_shuffle_ps(val, val, _MM_SHUFFLE(2, 3, 0, 1));
 561 |       |         val = _mm_add_ps(val, tmp);
 562 |       |         tmp = _mm_movehl_ps(tmp, val);
 563 |       |         val = _mm_add_ss(val, tmp);
 564 |       | #endif
 565 |       |         sum += (ggml_float)_mm_cvtss_f32(val);
 566 |       |     }
 567 |       | #elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
 568 |       |     const int vlen = svcntw();
 569 |       |     for (; i < n; i += vlen) {
 570 |       |         const svbool_t pg = svwhilelt_b32_s32(i, n);
 571 |       |         svfloat32_t val = ggml_v_expf(pg, svsub_f32_x(pg, svld1_f32(pg, x + i),
 572 |       |                                                 svdup_n_f32_x(pg, max)));
 573 |       |         svst1_f32(pg, y + i, val);
 574 |       |         sum += (ggml_float)svaddv_f32(pg, val);
 575 |       |     }
 576 |       | #elif defined(__ARM_NEON) && defined(__aarch64__)
 577 |       |     for (; i + 3 < n; i += 4) {
 578 |       |         float32x4_t val = ggml_v_expf(vsubq_f32(vld1q_f32(x + i),
 579 |       |                                                 vdupq_n_f32(max)));
 580 |       |         vst1q_f32(y + i, val);
 581 |       |         sum += (ggml_float)vaddvq_f32(val);
 582 |       |     }
 583 |       | #elif defined(__riscv_v_intrinsic)
 584 |       |     vfloat64m1_t vsum = __riscv_vfmv_v_f_f64m1(0, 1);
 585 |       |     for (int avl; i < n; i += avl) {
 586 |       |         avl = __riscv_vsetvl_e32m2(n - i);
 587 |       |         vfloat32m2_t val = ggml_v_expf_m2(__riscv_vfsub_vf_f32m2(__riscv_vle32_v_f32m2(&x[i], avl), max, avl), avl);
 588 |       |         __riscv_vse32_v_f32m2(&y[i], val, avl);
 589 |       |         vsum = __riscv_vfwredusum_vs_f32m2_f64m1(val, vsum, avl);
 590 |       |     }
 591 |       |     return (ggml_float)__riscv_vfmv_f_s_f64m1_f64(vsum);
 592 |       | #endif
 593 |     0 |     for (; i < n; ++i) {
 594 |     0 |         float val = expf(x[i] - max);
 595 |     0 |         sum += (ggml_float)val;
 596 |     0 |         y[i] = val;
 597 |     0 |     }
 598 |     0 |     return sum;
 599 |     0 | }
 600 |       |
 601 |     0 | ggml_float ggml_vec_log_soft_max_f32(const int n, float * y, const float * x, float max) {
 602 |       |     // log(soft_max_i) = log(exp(logit_i - max) / soft_max_sum) = (logit_i - max) - log(soft_max_sum)
 603 |       |
 604 |     0 |     int i = 0;
 605 |     0 |     ggml_float sum = 0;
 606 |     0 |     for (; i < n; ++i) {
 607 |     0 |         float val = x[i] - max;
 608 |     0 |         y[i] = val;
 609 |     0 |         sum += (ggml_float)expf(val);
 610 |     0 |     }
 611 |     0 |     return sum = (ggml_float)logf(sum);
 612 |     0 | }
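
ggml_vec_soft_max_f32 stores exp(x[i] - max) into y and returns the sum of those exponentials, while ggml_vec_log_soft_max_f32 stores x[i] - max and returns the log of the same sum; the caller finishes the normalization. A hedged sketch of a call that exercises both uncovered functions and checks that their results agree (the normalization loop is ours, not part of vec.cpp):

    #include "vec.h"
    #include <cassert>
    #include <cmath>

    static void cover_soft_max() {
        const int n = 5;
        const float x[5] = {0.1f, 1.2f, -0.3f, 2.0f, 0.7f};
        const float max = 2.0f;               // caller normally passes max(x)
        float p[5], lp[5];

        const ggml_float sum  = ggml_vec_soft_max_f32(n, p, x, max);
        const ggml_float lsum = ggml_vec_log_soft_max_f32(n, lp, x, max);

        // softmax_i = exp(x_i - max) / sum; log_softmax_i = (x_i - max) - log(sum)
        for (int i = 0; i < n; ++i) {
            const float soft = p[i] / (float)sum;
            assert(std::fabs(std::log(soft) - (lp[i] - (float)lsum)) < 1e-4f);
        }
    }
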