Coverage Report

Created: 2026-06-22 06:47

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/llama.cpp/ggml/src/ggml-cpu/vec.h
Line
Count
Source
1
// Vectorized functions for fundamental operations
2
3
#pragma once
4
5
#include "ggml-impl.h"
6
#include "simd-mappings.h"
7
#include "ggml.h"
8
#include "ggml-cpu.h"
9
10
#if defined(GGML_USE_ACCELERATE)
11
#include <Accelerate/Accelerate.h>
12
#endif
13
14
// floating point type used to accumulate sums
15
typedef double ggml_float;
16
17
#if defined(__ARM_FEATURE_SVE)
18
// Multiply two f16 SVE vectors element-wise and accumulate the products into
// two f32 accumulators that are updated in place through the pointers.
//   acc_lo - accumulator for the even-indexed f16 lanes of x and y
//   acc_hi - accumulator for the odd-indexed f16 lanes of x and y
// With SVE2 this maps onto the svmlalb/svmlalt intrinsics; otherwise the
// even/odd lanes are separated with svtrn1/svtrn2, converted to f32 and
// accumulated with a regular f32 multiply-add.
inline static void ggml_sve_f16_fma_widened(
19
        svfloat32_t * acc_lo,
20
        svfloat32_t * acc_hi,
21
        svfloat16_t x,
22
        svfloat16_t y) {
23
#if defined(__ARM_FEATURE_SVE2)
24
    *acc_lo = svmlalb_f32(*acc_lo, x, y);
25
    *acc_hi = svmlalt_f32(*acc_hi, x, y);
26
#else
27
    // Plain SVE fallback path if SVE2 instructions not available
28
    svfloat16_t x_even = svtrn1_f16(x, x);
29
    svfloat16_t x_odd = svtrn2_f16(x, x);
30
31
    svfloat16_t y_even = svtrn1_f16(y, y);
32
    svfloat16_t y_odd = svtrn2_f16(y, y);
33
34
    // all 32-bit lanes active for the conversions and the multiply-add
    svbool_t pg = svptrue_b32();
35
36
    *acc_lo = svmla_f32_x(pg, *acc_lo, svcvt_f32_f16_x(pg, x_even), svcvt_f32_f16_x(pg, y_even));
37
    *acc_hi = svmla_f32_x(pg, *acc_hi, svcvt_f32_f16_x(pg, x_odd), svcvt_f32_f16_x(pg, y_odd));
38
#endif
39
}
40
41
// Horizontally add all lanes of sum_lo and of sum_hi and return the total.
// The two lane sums are added in float; only the final result is converted
// to ggml_float.
inline static ggml_float ggml_sve_sum_f32x2(svfloat32_t sum_lo, svfloat32_t sum_hi) {
42
    return (ggml_float) (svaddv_f32(svptrue_b32(), sum_lo) + svaddv_f32(svptrue_b32(), sum_hi));
43
}
44
#endif
45
46
#define GGML_GELU_FP16
47
#define GGML_GELU_QUICK_FP16
48
49
0
#define GGML_SOFT_MAX_UNROLL 4
50
#define GGML_VEC_DOT_UNROLL  2
51
0
#define GGML_VEC_MAD_UNROLL  32
52
53
#ifdef __cplusplus
54
extern "C" {
55
#endif
56
57
//
58
// global data
59
//
60
61
// precomputed gelu table for f16 (128 KB)
62
extern ggml_fp16_t ggml_table_gelu_f16[1 << 16];
63
64
// precomputed quick gelu table for f16 (128 KB)
65
extern ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
66
67
//
68
// fundamental operations
69
//
70
71
void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc);
72
void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc);
73
void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc);
74
75
void ggml_vec_silu_f32(const int n, float * y, const float * x);
76
ggml_float ggml_vec_cvar_f32(const int n, float * y, const float * x, const float mean); //it will also center y ( y = y - mean )
77
ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max);
78
ggml_float ggml_vec_log_soft_max_f32(const int n, float * y, const float * x, float max);
79
80
0
// Store v into x[0..n-1]; does nothing when n <= 0.
inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) {
    int idx = 0;
    while (idx < n) {
        x[idx] = v;
        ++idx;
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_set_i8
Unexecuted instantiation: vec.cpp:ggml_vec_set_i8(int, signed char*, signed char)
Unexecuted instantiation: ops.cpp:ggml_vec_set_i8(int, signed char*, signed char)
81
0
// Store v into x[0..n-1]; does nothing when n <= 0.
inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) {
    int idx = 0;
    while (idx < n) {
        x[idx] = v;
        ++idx;
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_set_i16
Unexecuted instantiation: vec.cpp:ggml_vec_set_i16(int, short*, short)
Unexecuted instantiation: ops.cpp:ggml_vec_set_i16(int, short*, short)
82
83
0
// Store v into x[0..n-1]; does nothing when n <= 0.
inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) {
    int idx = 0;
    while (idx < n) {
        x[idx] = v;
        ++idx;
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_set_i32
Unexecuted instantiation: vec.cpp:ggml_vec_set_i32(int, int*, int)
Unexecuted instantiation: ops.cpp:ggml_vec_set_i32(int, int*, int)
84
0
// Copy x[0..n-1] into y[0..n-1] in ascending index order; no-op when n <= 0.
inline static void ggml_vec_cpy_i32(const int n, int32_t * y, const int32_t * x) {
    int idx = 0;
    while (idx < n) {
        y[idx] = x[idx];
        ++idx;
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_cpy_i32
Unexecuted instantiation: vec.cpp:ggml_vec_cpy_i32(int, int*, int const*)
Unexecuted instantiation: ops.cpp:ggml_vec_cpy_i32(int, int*, int const*)
85
86
0
// Store the f16 value v into x[0..n-1]; does nothing when n <= 0.
inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const ggml_fp16_t v) {
    int idx = 0;
    while (idx < n) {
        x[idx] = v;
        ++idx;
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_set_f16
Unexecuted instantiation: vec.cpp:ggml_vec_set_f16(int, unsigned short*, unsigned short)
Unexecuted instantiation: ops.cpp:ggml_vec_set_f16(int, unsigned short*, unsigned short)
87
0
// Store the bf16 value v into x[0..n-1]; does nothing when n <= 0.
inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) {
    int idx = 0;
    while (idx < n) {
        x[idx] = v;
        ++idx;
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_set_bf16
Unexecuted instantiation: vec.cpp:ggml_vec_set_bf16(int, ggml_bf16_t*, ggml_bf16_t)
Unexecuted instantiation: ops.cpp:ggml_vec_set_bf16(int, ggml_bf16_t*, ggml_bf16_t)
88
89
0
// Element-wise addition: z[i] = x[i] + y[i] for i in [0, n).
// When built with AVX2, elements are processed eight at a time with unaligned
// loads/stores; the scalar loop below then finishes whatever is left (or does
// all the work when AVX2 is not available).
inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) {
90
0
    int i = 0;
91
0
#if defined(__AVX2__)
92
0
    // i + 7 < n guarantees a full group of 8 floats is still available
    for (; i + 7 < n; i += 8) {
93
0
        __m256 vx = _mm256_loadu_ps(x + i);
94
0
        __m256 vy = _mm256_loadu_ps(y + i);
95
0
        __m256 vz = _mm256_add_ps(vx, vy);
96
0
        _mm256_storeu_ps(z + i, vz);
97
0
    }
98
0
#endif
99
0
    // scalar tail; continues from wherever the vector loop stopped
    for (; i < n; ++i) {
100
0
        z[i] = x[i] + y[i];
101
0
    }
102
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_add_f32
Unexecuted instantiation: vec.cpp:ggml_vec_add_f32(int, float*, float const*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_add_f32(int, float*, float const*, float const*)
103
104
0
// Element-wise f16 addition: z[i] = x[i] + y[i] for i in [0, n).
// Each operand is converted to f32, added, and the sum converted back to f16.
inline static void ggml_vec_add_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
105
0
    for (int i = 0; i < n; ++i) {
106
0
        z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) + GGML_CPU_FP16_TO_FP32(y[i]));
107
0
    }
108
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_add_f16
Unexecuted instantiation: vec.cpp:ggml_vec_add_f16(int, unsigned short*, unsigned short const*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_add_f16(int, unsigned short*, unsigned short const*, unsigned short const*)
109
0
// Add the scalar v to every element: z[i] = x[i] + v for i in [0, n).
inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) {
    int idx = 0;
    while (idx < n) {
        z[idx] = x[idx] + v;
        ++idx;
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_add1_f32
Unexecuted instantiation: vec.cpp:ggml_vec_add1_f32(int, float*, float const*, float)
Unexecuted instantiation: ops.cpp:ggml_vec_add1_f32(int, float*, float const*, float)
110
0
// Accumulate x into y in place: y[i] += x[i] for i in [0, n).
inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) {
    int idx = 0;
    while (idx < n) {
        y[idx] += x[idx];
        ++idx;
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_acc_f32
Unexecuted instantiation: vec.cpp:ggml_vec_acc_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_acc_f32(int, float*, float const*)
111
0
// Add the scalar v to every element in place: y[i] += v for i in [0, n).
inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) {
    int idx = 0;
    while (idx < n) {
        y[idx] += v;
        ++idx;
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_acc1_f32
Unexecuted instantiation: vec.cpp:ggml_vec_acc1_f32(int, float*, float)
Unexecuted instantiation: ops.cpp:ggml_vec_acc1_f32(int, float*, float)
112
0
// Element-wise subtraction: z[i] = x[i] - y[i] for i in [0, n).
inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) {
    int idx = 0;
    while (idx < n) {
        z[idx] = x[idx] - y[idx];
        ++idx;
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_sub_f32
Unexecuted instantiation: vec.cpp:ggml_vec_sub_f32(int, float*, float const*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_sub_f32(int, float*, float const*, float const*)
113
0
// Element-wise f16 subtraction: z[i] = x[i] - y[i] for i in [0, n).
// Each operand is converted to f32, subtracted, and the result converted back to f16.
inline static void ggml_vec_sub_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
114
0
    for (int i = 0; i < n; ++i) {
115
0
        z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) - GGML_CPU_FP16_TO_FP32(y[i]));
116
0
    }
117
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_sub_f16
Unexecuted instantiation: vec.cpp:ggml_vec_sub_f16(int, unsigned short*, unsigned short const*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_sub_f16(int, unsigned short*, unsigned short const*, unsigned short const*)
118
0
// Store v into x[0..n-1]; does nothing when n <= 0.
inline static void ggml_vec_set_f32 (const int n, float * x, const float v) {
    int idx = 0;
    while (idx < n) {
        x[idx] = v;
        ++idx;
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_set_f32
Unexecuted instantiation: vec.cpp:ggml_vec_set_f32(int, float*, float)
Unexecuted instantiation: ops.cpp:ggml_vec_set_f32(int, float*, float)
119
0
// Copy x[0..n-1] into y[0..n-1] in ascending index order; no-op when n <= 0.
inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x) {
    int idx = 0;
    while (idx < n) {
        y[idx] = x[idx];
        ++idx;
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_cpy_f32
Unexecuted instantiation: vec.cpp:ggml_vec_cpy_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_cpy_f32(int, float*, float const*)
120
0
// Element-wise negation: y[i] = -x[i] for i in [0, n).
inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) {
    int idx = 0;
    while (idx < n) {
        y[idx] = -x[idx];
        ++idx;
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_neg_f32
Unexecuted instantiation: vec.cpp:ggml_vec_neg_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_neg_f32(int, float*, float const*)
121
0
// Element-wise f16 negation: y[i] = -x[i] for i in [0, n).
// The value is converted to f32, negated, and converted back to f16.
inline static void ggml_vec_neg_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
122
0
    for (int i = 0; i < n; ++i) {
123
0
        y[i] = GGML_CPU_FP32_TO_FP16(-GGML_CPU_FP16_TO_FP32(x[i]));
124
0
    }
125
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_neg_f16
Unexecuted instantiation: vec.cpp:ggml_vec_neg_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_neg_f16(int, unsigned short*, unsigned short const*)
126
127
0
// Element-wise multiplication: z[i] = x[i] * y[i] for i in [0, n).
inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) {
    int idx = 0;
    while (idx < n) {
        z[idx] = x[idx]*y[idx];
        ++idx;
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_mul_f32
Unexecuted instantiation: vec.cpp:ggml_vec_mul_f32(int, float*, float const*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_mul_f32(int, float*, float const*, float const*)
128
0
// Element-wise f16 multiplication: z[i] = x[i] * y[i] for i in [0, n).
// Each operand is converted to f32, multiplied, and the product converted back to f16.
inline static void ggml_vec_mul_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
129
0
    for (int i = 0; i < n; ++i) {
130
0
        z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) * GGML_CPU_FP16_TO_FP32(y[i]));
131
0
    }
132
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_mul_f16
Unexecuted instantiation: vec.cpp:ggml_vec_mul_f16(int, unsigned short*, unsigned short const*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_mul_f16(int, unsigned short*, unsigned short const*, unsigned short const*)
133
0
// Element-wise division: z[i] = x[i] / y[i] for i in [0, n).
// No check is made for zero divisors.
inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) {
    int idx = 0;
    while (idx < n) {
        z[idx] = x[idx]/y[idx];
        ++idx;
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_div_f32
Unexecuted instantiation: vec.cpp:ggml_vec_div_f32(int, float*, float const*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_div_f32(int, float*, float const*, float const*)
134
0
// Element-wise f16 division: z[i] = x[i] / y[i] for i in [0, n).
// Each operand is converted to f32, divided, and the quotient converted back to f16.
inline static void ggml_vec_div_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
135
0
    for (int i = 0; i < n; ++i) {
136
0
        z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) / GGML_CPU_FP16_TO_FP32(y[i]));
137
0
    }
138
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_div_f16
Unexecuted instantiation: vec.cpp:ggml_vec_div_f16(int, unsigned short*, unsigned short const*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_div_f16(int, unsigned short*, unsigned short const*, unsigned short const*)
139
140
// compute GGML_VEC_DOT_UNROLL dot products at once
141
// xs - x row stride in bytes
142
0
// Compute GGML_VEC_DOT_UNROLL f16 dot products against the same vector y.
//   n  - number of elements per row
//   xs - byte stride between consecutive rows of xv
//   s  - output; s[k] receives dot(row k of xv, y) as float
//   xv - base address of the first row; row k starts at (char *) xv + k*xs
//   y  - shared right-hand vector
// Sums are kept in ggml_float and narrowed to float only when written to s.
// NOTE: the SVE and RISC-V branches address x[0]/x[1] and sumf[0]/sumf[1]
// explicitly, i.e. they are written for GGML_VEC_DOT_UNROLL == 2.
inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GGML_RESTRICT s, void * GGML_RESTRICT xv, ggml_fp16_t * GGML_RESTRICT y) {
143
0
    ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 };
144
0
145
0
    ggml_fp16_t * GGML_RESTRICT x[GGML_VEC_DOT_UNROLL];
146
0
147
0
    // resolve the row pointers from the base address and the byte stride
    for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
148
0
        x[i] = (ggml_fp16_t *) ((char *) xv + i*xs);
149
0
    }
150
0
151
0
#if defined(GGML_SIMD)
152
0
    #if defined(__ARM_FEATURE_SVE)
153
0
154
0
        const int ggml_f16_epr = svcnth();
155
0
        const int ggml_f16_step = 2 * ggml_f16_epr;
156
0
        // np:  end of the part covered by the 2x-unrolled loop
        // np2: end of the part covered by whole single vectors
        int np = n - (n % ggml_f16_step);
157
0
        int np2 = n - (n % ggml_f16_epr);
158
0
159
0
        // accumulators are named sum_<row>_<unroll slot>_<lo|hi>
        svfloat32_t sum_0_0_lo = svdup_n_f32(0.0f);
160
0
        svfloat32_t sum_0_0_hi = svdup_n_f32(0.0f);
161
0
        svfloat32_t sum_0_1_lo = svdup_n_f32(0.0f);
162
0
        svfloat32_t sum_0_1_hi = svdup_n_f32(0.0f);
163
0
        svfloat32_t sum_1_0_lo = svdup_n_f32(0.0f);
164
0
        svfloat32_t sum_1_0_hi = svdup_n_f32(0.0f);
165
0
        svfloat32_t sum_1_1_lo = svdup_n_f32(0.0f);
166
0
        svfloat32_t sum_1_1_hi = svdup_n_f32(0.0f);
167
0
168
0
        for (int i = 0; i < np; i += ggml_f16_step) {
169
0
            const svfloat16_t ay0 = GGML_F16x_VEC_LOAD(y + i, 0);
170
0
            const svfloat16_t ax00 = GGML_F16x_VEC_LOAD(x[0] + i, 0);
171
0
            const svfloat16_t ax01 = GGML_F16x_VEC_LOAD(x[1] + i, 0);
172
0
173
0
            ggml_sve_f16_fma_widened(&sum_0_0_lo, &sum_0_0_hi, ax00, ay0);
174
0
            ggml_sve_f16_fma_widened(&sum_1_0_lo, &sum_1_0_hi, ax01, ay0);
175
0
176
0
            const svfloat16_t ay1 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 0);
177
0
            const svfloat16_t ax10 = GGML_F16x_VEC_LOAD(x[0] + i + 1 * ggml_f16_epr, 0);
178
0
            const svfloat16_t ax11 = GGML_F16x_VEC_LOAD(x[1] + i + 1 * ggml_f16_epr, 0);
179
0
180
0
            ggml_sve_f16_fma_widened(&sum_0_1_lo, &sum_0_1_hi, ax10, ay1);
181
0
            ggml_sve_f16_fma_widened(&sum_1_1_lo, &sum_1_1_hi, ax11, ay1);
182
0
        }
183
0
184
0
        // remaining whole vectors (fewer than one unrolled step)
        for (int i = np; i < np2; i += ggml_f16_epr) {
185
0
            const svfloat16_t ry = GGML_F16x_VEC_LOAD(y + i, 0);
186
0
            const svfloat16_t rx0 = GGML_F16x_VEC_LOAD(x[0] + i, 0);
187
0
            const svfloat16_t rx1 = GGML_F16x_VEC_LOAD(x[1] + i, 0);
188
0
189
0
            ggml_sve_f16_fma_widened(&sum_0_0_lo, &sum_0_0_hi, rx0, ry);
190
0
            ggml_sve_f16_fma_widened(&sum_1_0_lo, &sum_1_0_hi, rx1, ry);
191
0
        }
192
0
193
0
        // final partial vector: loads are predicated on the elements in [np2, n)
        if (np2 < n) {
194
0
            const svbool_t pg = svwhilelt_b16(np2, n);
195
0
            const svfloat16_t ay = svld1_f16(pg, (const __fp16 *)(y + np2));
196
0
            const svfloat16_t ax0 = svld1_f16(pg, (const __fp16 *)(x[0] + np2));
197
0
            const svfloat16_t ax1 = svld1_f16(pg, (const __fp16 *)(x[1] + np2));
198
0
199
0
            ggml_sve_f16_fma_widened(&sum_0_0_lo, &sum_0_0_hi, ax0, ay);
200
0
            ggml_sve_f16_fma_widened(&sum_1_0_lo, &sum_1_0_hi, ax1, ay);
201
0
        }
202
0
203
0
        svfloat32_t sum_0_lo = svadd_f32_x(DEFAULT_PG32, sum_0_0_lo, sum_0_1_lo);
204
0
        svfloat32_t sum_0_hi = svadd_f32_x(DEFAULT_PG32, sum_0_0_hi, sum_0_1_hi);
205
0
        svfloat32_t sum_1_lo = svadd_f32_x(DEFAULT_PG32, sum_1_0_lo, sum_1_1_lo);
206
0
        svfloat32_t sum_1_hi = svadd_f32_x(DEFAULT_PG32, sum_1_0_hi, sum_1_1_hi);
207
0
        sumf[0] = ggml_sve_sum_f32x2(sum_0_lo, sum_0_hi);
208
0
        sumf[1] = ggml_sve_sum_f32x2(sum_1_lo, sum_1_hi);
209
0
        // everything was handled above, so the scalar loop at the end is skipped
        np = n;
210
0
    #elif defined(__riscv_v_intrinsic)
211
0
        #if defined(__riscv_zvfh)
212
0
            size_t vl = __riscv_vsetvlmax_e32m4();
213
0
214
0
            // initialize accumulators to all zeroes
215
0
            vfloat32m4_t vsum0_0 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
216
0
            vfloat32m4_t vsum0_1 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
217
0
            vfloat32m4_t vsum1_0 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
218
0
            vfloat32m4_t vsum1_1 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
219
0
220
0
            // calculate step size
221
0
            const size_t epr = __riscv_vsetvlmax_e16m2();
222
0
            const size_t step = epr * 2;
223
0
            int np = (n & ~(step - 1));
224
0
225
0
            // unroll by 2 along the row dimension
226
0
            for (int i = 0; i < np; i += step) {
227
0
                vfloat16m2_t ay0 = __riscv_vle16_v_f16m2((const _Float16 *)(y + i), epr);
228
0
                vfloat16m2_t ax0_0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i), epr);
229
0
                vfloat16m2_t ax1_0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i), epr);
230
0
                vsum0_0 = __riscv_vfwmacc_vv_f32m4(vsum0_0, ax0_0, ay0, epr);
231
0
                vsum1_0 = __riscv_vfwmacc_vv_f32m4(vsum1_0, ax1_0, ay0, epr);
232
0
233
0
                vfloat16m2_t ay1 = __riscv_vle16_v_f16m2((const _Float16 *)(y + i + epr), epr);
234
0
                vfloat16m2_t ax0_1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i + epr), epr);
235
0
                vfloat16m2_t ax1_1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i + epr), epr);
236
0
                vsum0_1 = __riscv_vfwmacc_vv_f32m4(vsum0_1, ax0_1, ay1, epr);
237
0
                vsum1_1 = __riscv_vfwmacc_vv_f32m4(vsum1_1, ax1_1, ay1, epr);
238
0
            }
239
0
240
0
            vfloat32m4_t vsum0 = __riscv_vfadd_vv_f32m4(vsum0_0, vsum0_1, vl);
241
0
            vfloat32m4_t vsum1 = __riscv_vfadd_vv_f32m4(vsum1_0, vsum1_1, vl);
242
0
243
0
            // leftovers
244
0
            for (int i = np; i < n; i += vl) {
245
0
                vl = __riscv_vsetvl_e16m2(n - i);
246
0
                vfloat16m2_t ay = __riscv_vle16_v_f16m2((const _Float16 *)(y + i), vl);
247
0
                vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i), vl);
248
0
                vfloat16m2_t ax1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i), vl);
249
0
250
0
                vsum0 = __riscv_vfwmacc_vv_f32m4(vsum0, ax0, ay, vl);
251
0
                vsum1 = __riscv_vfwmacc_vv_f32m4(vsum1, ax1, ay, vl);
252
0
            }
253
0
254
0
            // reduce
255
0
            vl = __riscv_vsetvlmax_e32m2();
256
0
            vfloat32m2_t acc0_0 = __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(vsum0, 0),
257
0
                                        __riscv_vget_v_f32m4_f32m2(vsum0, 1), vl);
258
0
            vl = __riscv_vsetvlmax_e32m1();
259
0
            vfloat32m1_t acc0_1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(acc0_0, 0),
260
0
            __riscv_vget_v_f32m2_f32m1(acc0_0, 1), vl);
261
0
            vfloat32m1_t redsum0 = __riscv_vfredusum_vs_f32m1_f32m1(
262
0
                                        acc0_1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
263
0
264
0
            vl = __riscv_vsetvlmax_e32m2();
265
0
            vfloat32m2_t acc1_0 = __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(vsum1, 0),
266
0
                                        __riscv_vget_v_f32m4_f32m2(vsum1, 1), vl);
267
0
            vl = __riscv_vsetvlmax_e32m1();
268
0
            vfloat32m1_t acc1_1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(acc1_0, 0),
269
0
                                        __riscv_vget_v_f32m2_f32m1(acc1_0, 1), vl);
270
0
            vfloat32m1_t redsum1 = __riscv_vfredusum_vs_f32m1_f32m1(
271
0
                                        acc1_1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
272
0
            sumf[0] = __riscv_vfmv_f_s_f32m1_f32(redsum0);
273
0
            sumf[1] = __riscv_vfmv_f_s_f32m1_f32(redsum1);
274
0
            // everything was handled above, so the scalar loop at the end is skipped
            np = n;
275
0
        #else
276
0
            // no zvfh: np = 0 sends every element through the scalar loop below
            const int np = 0;
277
0
        #endif
278
0
    #else
279
0
        const int np = (n & ~(GGML_F16_STEP - 1));
280
0
281
0
        GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } };
282
0
283
0
        GGML_F16_VEC ax[GGML_F16_ARR];
284
0
        GGML_F16_VEC ay[GGML_F16_ARR];
285
0
286
0
        for (int i = 0; i < np; i += GGML_F16_STEP) {
287
0
            for (int j = 0; j < GGML_F16_ARR; j++) {
288
0
                // y is loaded once per j and reused for every row k
                ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
289
0
290
0
                for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
291
0
                    ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j);
292
0
293
0
                    sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]);
294
0
                }
295
0
            }
296
0
        }
297
0
298
0
        // reduce sum0..sum3 to sum0
299
0
        for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
300
0
            GGML_F16_VEC_REDUCE(sumf[k], sum[k]);
301
0
        }
302
0
    #endif
303
0
#else
304
0
    // scalar path
305
0
    const int np = 0;
306
0
#endif
307
0
    // scalar and leftovers
308
0
    for (int i = np; i < n; ++i) {
309
0
        for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
310
0
            sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
311
0
        }
312
0
    }
313
0
314
0
    // narrow the accumulated sums to float for the caller
    for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
315
0
        s[i] = (float)sumf[i];
316
0
    }
317
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_dot_f16_unroll
Unexecuted instantiation: vec.cpp:ggml_vec_dot_f16_unroll(int, int, float*, void*, unsigned short*)
Unexecuted instantiation: ops.cpp:ggml_vec_dot_f16_unroll(int, int, float*, void*, unsigned short*)
318
319
0
// Multiply-add with a scalar, in place: y[i] += x[i]*v for i in [0, n).
//   n - number of elements
//   y - accumulator vector, read and written
//   x - input vector
//   v - scalar multiplier
// Each SIMD branch handles the bulk with vector FMAs and deals with its own
// leftovers; without GGML_SIMD a plain scalar loop is used.
inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) {
320
0
#if defined(GGML_SIMD)
321
    #if defined(__ARM_FEATURE_SVE)
322
323
        const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
324
        const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
325
        const int ggml_f32_step = 8 * ggml_f32_epr; // choose 8 SVE registers
326
        GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
327
328
        const int np = (n & ~(ggml_f32_step - 1));
329
        svfloat32_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
330
        svfloat32_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
331
        // main loop: 8 vectors per iteration, each loaded, fused and stored back
        for (int i = 0; i < np; i += ggml_f32_step) {
332
333
            ax1 = GGML_F32_VEC_LOAD(x + i);
334
            ay1 = GGML_F32_VEC_LOAD(y + i);
335
            ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx);
336
337
            GGML_F32_VEC_STORE(y + i, ay1);
338
339
            ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
340
            ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
341
            ay2 = GGML_F32_VEC_FMA(ay2, ax2, vx);
342
343
            GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);
344
345
            ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
346
            ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
347
            ay3 = GGML_F32_VEC_FMA(ay3, ax3, vx);
348
349
            GGML_F32_VEC_STORE(y + i + 2*ggml_f32_epr, ay3);
350
351
            ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
352
            ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
353
            ay4 = GGML_F32_VEC_FMA(ay4, ax4, vx);
354
355
            GGML_F32_VEC_STORE(y + i + 3*ggml_f32_epr, ay4);
356
357
            ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
358
            ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
359
            ay5 = GGML_F32_VEC_FMA(ay5, ax5, vx);
360
361
            GGML_F32_VEC_STORE(y + i + 4*ggml_f32_epr, ay5);
362
363
            ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
364
            ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
365
            ay6 = GGML_F32_VEC_FMA(ay6, ax6, vx);
366
367
            GGML_F32_VEC_STORE(y + i + 5*ggml_f32_epr, ay6);
368
369
            ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
370
            ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
371
            ay7 = GGML_F32_VEC_FMA(ay7, ax7, vx);
372
373
            GGML_F32_VEC_STORE(y + i + 6*ggml_f32_epr, ay7);
374
375
            ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
376
            ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
377
            ay8 = GGML_F32_VEC_FMA(ay8, ax8, vx);
378
379
            GGML_F32_VEC_STORE(y + i + 7*ggml_f32_epr, ay8);
380
        }
381
        // leftovers
382
        // Since 8 unrolls are done in above loop, leftovers lie in range [0, ggml_f32_step] which is handled in below loop
383
        const int np2 = (n & ~(ggml_f32_epr - 1));
384
        for (int i = np; i < np2; i += ggml_f32_epr) {
385
            ax1 = GGML_F32_VEC_LOAD(x + i);
386
            ay1 = GGML_F32_VEC_LOAD(y + i);
387
            ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx);
388
389
            GGML_F32_VEC_STORE(y + i, ay1);
390
        }
391
        // maximum number of leftover elements will be less than ggml_f32_epr. Apply predicated svmad on available elements only
392
        if (np2 < n) {
393
            svbool_t pg =svwhilelt_b32(np2, n);
394
            ax1 = svld1_f32(pg, x + np2);
395
            ay1 = svld1_f32(pg, y + np2);
396
            ay1 = svmad_f32_m(pg, ax1, vx, ay1);
397
398
            svst1_f32(pg, y + np2, ay1);
399
        }
400
    #elif defined(__riscv_v_intrinsic)
401
        // vsetvl picks the element count per iteration, so no separate tail loop is needed
        for (int i = 0, avl; i < n; i += avl) {
402
            avl = __riscv_vsetvl_e32m8(n - i);
403
            vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
404
            vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
405
            vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, v, ay, avl);
406
            __riscv_vse32_v_f32m8(&y[i], ny, avl);
407
        }
408
    #else
409
0
        // np: largest multiple of GGML_F32_STEP not exceeding n
        const int np = (n & ~(GGML_F32_STEP - 1));
410
411
0
        GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
412
413
0
        GGML_F32_VEC ax[GGML_F32_ARR];
414
0
        GGML_F32_VEC ay[GGML_F32_ARR];
415
416
0
        for (int i = 0; i < np; i += GGML_F32_STEP) {
417
0
            for (int j = 0; j < GGML_F32_ARR; j++) {
418
0
                ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
419
0
                ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
420
0
                ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx);
421
422
0
                GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
423
0
            }
424
0
        }
425
426
        // leftovers
427
0
        for (int i = np; i < n; ++i) {
428
0
            y[i] += x[i]*v;
429
0
        }
430
0
    #endif
431
#else
432
    // scalar
433
    for (int i = 0; i < n; ++i) {
434
        y[i] += x[i]*v;
435
    }
436
#endif
437
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_mad_f32
Unexecuted instantiation: vec.cpp:ggml_vec_mad_f32(int, float*, float const*, float)
Unexecuted instantiation: ops.cpp:ggml_vec_mad_f32(int, float*, float const*, float)
438
439
0
// Multiply-add with a scalar on f16 data, in place: y[i] = y[i] + x[i]*v.
//   n - number of elements
//   y - f16 accumulator vector, read and written
//   x - f16 input vector
//   v - scalar multiplier, passed as float
// Every branch defines np, the index at which the scalar loop at the bottom
// starts. Branches that already handled all elements set np = n so that loop
// does nothing; branches without a vector path set np = 0.
inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) {
440
#if defined(GGML_SIMD) && defined(__ARM_FEATURE_SVE)
441
    const int sve_register_length = svcntb() * 8;
442
    const int ggml_f16_epr = sve_register_length / 16;
443
    const int ggml_f16_step = 8 * ggml_f16_epr;
444
445
    GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
446
447
    int np = (n & ~(ggml_f16_step - 1));
448
449
    svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
450
    svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
451
    // main loop: 8 vectors per iteration, each loaded, fused and stored back
    for (int i = 0; i < np; i += ggml_f16_step) {
452
        ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0);
453
        ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0);
454
        ay1 = GGML_F16x_VEC_FMA(ay1, ax1, vx);
455
456
        GGML_F16x_VEC_STORE(y + i + 0 * ggml_f16_epr, ay1, 0);
457
458
        ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1);
459
        ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1);
460
        ay2 = GGML_F16x_VEC_FMA(ay2, ax2, vx);
461
462
        GGML_F16x_VEC_STORE(y + i + 1 * ggml_f16_epr, ay2, 1);
463
464
        ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2);
465
        ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
466
        ay3 = GGML_F16x_VEC_FMA(ay3, ax3, vx);
467
468
        GGML_F16x_VEC_STORE(y + i + 2 * ggml_f16_epr, ay3, 2);
469
470
        ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3);
471
        ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
472
        ay4 = GGML_F16x_VEC_FMA(ay4, ax4, vx);
473
474
        GGML_F16x_VEC_STORE(y + i + 3 * ggml_f16_epr, ay4, 3);
475
476
        ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4);
477
        ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
478
        ay5 = GGML_F16x_VEC_FMA(ay5, ax5, vx);
479
480
        GGML_F16x_VEC_STORE(y + i + 4 * ggml_f16_epr, ay5, 4);
481
482
        ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5);
483
        ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
484
        ay6 = GGML_F16x_VEC_FMA(ay6, ax6, vx);
485
486
        GGML_F16x_VEC_STORE(y + i + 5 * ggml_f16_epr, ay6, 5);
487
488
        ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6);
489
        ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
490
        ay7 = GGML_F16x_VEC_FMA(ay7, ax7, vx);
491
492
        GGML_F16x_VEC_STORE(y + i + 6 * ggml_f16_epr, ay7, 6);
493
494
        ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7);
495
        ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
496
        ay8 = GGML_F16x_VEC_FMA(ay8, ax8, vx);
497
498
        GGML_F16x_VEC_STORE(y + i + 7 * ggml_f16_epr, ay8, 7);
499
    }
500
    // remaining whole vectors (fewer than one unrolled step)
    const int np2 = (n & ~(ggml_f16_epr - 1));
501
    for (int k = np; k < np2; k += ggml_f16_epr) {
502
        svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0);
503
        svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
504
        ry = GGML_F16x_VEC_FMA(ry, rx, vx);
505
506
        GGML_F16x_VEC_STORE(y + k, ry, 0);
507
    }
508
509
    // final partial vector: load, multiply-add and store are predicated on [np2, n)
    if (np2 < n) {
510
        svbool_t pg = svwhilelt_b16(np2, n);
511
        svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
512
        svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
513
        hy = svmad_f16_x(pg, hx, vx, hy);
514
        svst1_f16(pg, (__fp16 *)(y + np2), hy);
515
    }
516
    // all elements processed; skip the scalar loop at the bottom
    np = n;
517
#elif defined(__riscv_v_intrinsic) // implies __riscv_v_intrinsic
518
    #if defined (__riscv_zvfh)
519
        // convert the float multiplier to f16 and reinterpret its bits as _Float16
        const ggml_fp16_t s = GGML_CPU_FP32_TO_FP16(v);
520
        const _Float16 scale = *(const _Float16*)(&s);
521
522
        // calculate step size
523
        const int epr = __riscv_vsetvlmax_e16m4();
524
        const int step = epr * 2;
525
        int np = (n & ~(step - 1));
526
527
        // unroll by 2
528
        for (int i = 0; i < np; i += step) {
529
            vfloat16m4_t ax0 = __riscv_vle16_v_f16m4((const _Float16*)x + i, epr);
530
            vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, epr);
531
            ay0 = __riscv_vfmacc_vf_f16m4(ay0, scale, ax0, epr);
532
            __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, epr);
533
            // empty asm with a "memory" clobber: a compiler-level barrier between the two halves
            __asm__ __volatile__ ("" ::: "memory");
534
535
            vfloat16m4_t ax1 = __riscv_vle16_v_f16m4((const _Float16*)x + i + epr, epr);
536
            vfloat16m4_t ay1 = __riscv_vle16_v_f16m4((const _Float16*)y + i + epr, epr);
537
            ay1 = __riscv_vfmacc_vf_f16m4(ay1, scale, ax1, epr);
538
            __riscv_vse16_v_f16m4((_Float16*)y + i + epr, ay1, epr);
539
            __asm__ __volatile__ ("" ::: "memory");
540
        }
541
542
        // leftovers
543
        int vl;
544
        for (int i = np; i < n; i += vl) {
545
            vl = __riscv_vsetvl_e16m4(n - i);
546
            vfloat16m4_t ax0 = __riscv_vle16_v_f16m4((const _Float16*)x + i, vl);
547
            vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, vl);
548
            ay0 = __riscv_vfmacc_vf_f16m4(ay0, scale, ax0, vl);
549
            __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, vl);
550
        }
551
        // all elements processed; skip the scalar loop at the bottom
        np = n;
552
    #else
553
        // fall to scalar path
554
        const int np = 0;
555
    #endif
556
#elif defined(GGML_SIMD)
557
0
    // np: largest multiple of GGML_F16_STEP not exceeding n; the rest goes to the scalar loop
    const int np = (n & ~(GGML_F16_STEP - 1));
558
559
0
    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
560
561
0
    GGML_F16_VEC ax[GGML_F16_ARR];
562
0
    GGML_F16_VEC ay[GGML_F16_ARR];
563
564
0
    for (int i = 0; i < np; i += GGML_F16_STEP) {
565
0
        for (int j = 0; j < GGML_F16_ARR; j++) {
566
0
            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
567
0
            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
568
0
            ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
569
570
0
            // the store macro is given the whole ay array plus the index j
            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
571
0
        }
572
0
    }
573
#else
574
    // scalar path
575
    const int np = 0;
576
#endif
577
578
    // scalar and leftovers
579
0
    for (int i = np; i < n; ++i) {
580
0
        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
581
0
    }
582
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_mad_f16
Unexecuted instantiation: vec.cpp:ggml_vec_mad_f16(int, unsigned short*, unsigned short const*, float)
Unexecuted instantiation: ops.cpp:ggml_vec_mad_f16(int, unsigned short*, unsigned short const*, float)
583
584
// xs and vs are byte strides of x and v
585
0
// Accumulates GGML_VEC_MAD_UNROLL scaled rows into y:
//   y[i] += x_k[i] * v_k[0]   for every k in [0, GGML_VEC_MAD_UNROLL) and i in [0, n)
// Row k of x starts k*xs bytes after xv; scalar k of v is read k*vs bytes after vv.
inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * GGML_RESTRICT y, const float * GGML_RESTRICT xv, const float * GGML_RESTRICT vv) {

    // resolve the per-row base pointers from the byte strides
    const float * GGML_RESTRICT x[GGML_VEC_MAD_UNROLL];
    const float * GGML_RESTRICT v[GGML_VEC_MAD_UNROLL];

    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
        x[k] = (const float *) ((const char *) xv + k*xs);
        v[k] = (const float *) ((const char *) vv + k*vs);
    }

#if defined(GGML_SIMD)
    #if defined(__ARM_FEATURE_SVE)
        // no SVE kernel here, use the scalar accumulation       //TODO: Write SVE code
        for (int r = 0; r < GGML_VEC_MAD_UNROLL; ++r) {
            for (int c = 0; c < n; ++c) {
                y[c] += x[r][c]*v[r][0];
            }
        }
    #elif defined(__riscv_v_intrinsic)
        // strip-mine over y; each strip folds in all rows before being written back
        int pos = 0;
        while (pos < n) {
            const int avl = __riscv_vsetvl_e32m8(n - pos);
            vfloat32m8_t acc = __riscv_vle32_v_f32m8(&y[pos], avl);
            for (int r = 0; r < GGML_VEC_MAD_UNROLL; r++) {
                vfloat32m8_t row = __riscv_vle32_v_f32m8(&x[r][pos], avl);
                acc = __riscv_vfmadd_vf_f32m8(row, v[r][0], acc, avl);
            }
            __riscv_vse32_v_f32m8(&y[pos], acc, avl);
            pos += avl;
        }
    #else
        // largest multiple of GGML_F32_STEP not exceeding n
        const int n_vec = (n & ~(GGML_F32_STEP - 1));

        // broadcast every row's scalar once, outside the main loop
        GGML_F32_VEC vscale[GGML_VEC_MAD_UNROLL];

        for (int r = 0; r < GGML_VEC_MAD_UNROLL; ++r) {
            vscale[r] = GGML_F32_VEC_SET1(v[r][0]);
        }

        for (int base = 0; base < n_vec; base += GGML_F32_STEP) {
            for (int j = 0; j < GGML_F32_ARR; j++) {
                const int off = base + j*GGML_F32_EPR;

                GGML_F32_VEC acc = GGML_F32_VEC_LOAD(y + off);

                for (int r = 0; r < GGML_VEC_MAD_UNROLL; ++r) {
                    GGML_F32_VEC row = GGML_F32_VEC_LOAD(x[r] + off);
                    acc = GGML_F32_VEC_FMA(acc, row, vscale[r]);
                }

                GGML_F32_VEC_STORE(y + off, acc);
            }
        }

        // leftovers: elements past the last full vector step
        for (int r = 0; r < GGML_VEC_MAD_UNROLL; ++r) {
            for (int c = n_vec; c < n; ++c) {
                y[c] += x[r][c]*v[r][0];
            }
        }
    #endif
#else
    // scalar
    for (int r = 0; r < GGML_VEC_MAD_UNROLL; ++r) {
        for (int c = 0; c < n; ++c) {
            y[c] += x[r][c]*v[r][0];
        }
    }
#endif
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_mad_f32_unroll
Unexecuted instantiation: vec.cpp:ggml_vec_mad_f32_unroll(int, int, int, float*, float const*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_mad_f32_unroll(int, int, int, float*, float const*, float const*)
654
655
0
// Affine map of a vector: y[i] = x[i]*s + b for i in [0, n)
inline static void ggml_vec_mad1_f32(const int n, float * y, const float * x, const float s, const float b) {
#if defined(GGML_USE_ACCELERATE)
    vDSP_vsmsa(x, 1, &s, &b, y, 1, n);
#elif defined(GGML_SIMD)
    #if defined(__ARM_FEATURE_SVE)
        // scalar ; TODO: Write SVE code
        for (int k = 0; k < n; ++k) {
            y[k] = x[k]*s + b;
        }
    #elif defined(__riscv_v_intrinsic)
        // strip-mine: each pass handles as many elements as vsetvl grants
        int pos = 0;
        while (pos < n) {
            const int avl = __riscv_vsetvl_e32m8(n - pos);
            vfloat32m8_t vin   = __riscv_vle32_v_f32m8(&x[pos], avl);
            vfloat32m8_t vbias = __riscv_vfmv_v_f_f32m8(b, avl);
            vfloat32m8_t vout  = __riscv_vfmadd_vf_f32m8(vin, s, vbias, avl);
            __riscv_vse32_v_f32m8(&y[pos], vout, avl);
            pos += avl;
        }
    #else
        // largest multiple of GGML_F32_STEP not exceeding n
        const int n_vec = (n & ~(GGML_F32_STEP - 1));

        GGML_F32_VEC vscale = GGML_F32_VEC_SET1(s);
        GGML_F32_VEC vbias  = GGML_F32_VEC_SET1(b);

        for (int base = 0; base < n_vec; base += GGML_F32_STEP) {
            for (int j = 0; j < GGML_F32_ARR; j++) {
                const int off = base + j*GGML_F32_EPR;

                GGML_F32_VEC acc = GGML_F32_VEC_LOAD(x + off);
                acc = GGML_F32_VEC_FMA(vbias, acc, vscale);

                GGML_F32_VEC_STORE(y + off, acc);
            }
        }

        // leftovers
        for (int k = n_vec; k < n; ++k) {
            y[k] = x[k]*s + b;
        }
    #endif
#else
    // scalar
    for (int k = 0; k < n; ++k) {
        y[k] = x[k]*s + b;
    }
#endif
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_mad1_f32
Unexecuted instantiation: vec.cpp:ggml_vec_mad1_f32(int, float*, float const*, float, float)
Unexecuted instantiation: ops.cpp:ggml_vec_mad1_f32(int, float*, float const*, float, float)
701
702
//inline static void ggml_vec_scale_f32(const int n, float * y, const float   v) { for (int i = 0; i < n; ++i) y[i] *= v;          }
703
0
// In-place scaling of a float vector: y[i] *= v for i in [0, n)
inline static void ggml_vec_scale_f32(const int n, float * y, const float   v) {
#if defined(GGML_USE_ACCELERATE)
    vDSP_vsmul(y, 1, &v, y, 1, n);
#elif defined(GGML_SIMD)
    #if defined(__ARM_FEATURE_SVE)
        const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
        const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
        const int ggml_f32_step = 2 * ggml_f32_epr;

        GGML_F32_VEC vscale = GGML_F32_VEC_SET1(v);
        const int n_vec = (n & ~(ggml_f32_step - 1));

        // main loop: two full vectors per iteration
        for (int base = 0; base < n_vec; base += ggml_f32_step) {
            svfloat32_t lo = GGML_F32_VEC_LOAD(y + base);
            lo = GGML_F32_VEC_MUL(lo, vscale);
            GGML_F32_VEC_STORE(y + base, lo);

            svfloat32_t hi = GGML_F32_VEC_LOAD(y + base + 1*ggml_f32_epr);
            hi = GGML_F32_VEC_MUL(hi, vscale);
            GGML_F32_VEC_STORE(y + base + 1*ggml_f32_epr, hi);
        }
        // leftovers
        // fewer than ggml_f32_step elements remain; handle them one predicated vector at a time (svmul on active lanes only)
        for (int base = n_vec; base < n; base += ggml_f32_epr) {
            svbool_t pg = svwhilelt_b32(base, n);
            svfloat32_t part = svld1_f32(pg, y + base);
            part = svmul_f32_m(pg, part, vscale);
            svst1_f32(pg, y + base, part);
        }
    #elif defined(__riscv_v_intrinsic)
        // strip-mine: each pass handles as many elements as vsetvl grants
        int pos = 0;
        while (pos < n) {
            const int avl = __riscv_vsetvl_e32m8(n - pos);
            vfloat32m8_t vin  = __riscv_vle32_v_f32m8(&y[pos], avl);
            vfloat32m8_t vout = __riscv_vfmul_vf_f32m8(vin, v, avl);
            __riscv_vse32_v_f32m8(&y[pos], vout, avl);
            pos += avl;
        }
    #else
        // largest multiple of GGML_F32_STEP not exceeding n
        const int n_vec = (n & ~(GGML_F32_STEP - 1));

        GGML_F32_VEC vscale = GGML_F32_VEC_SET1(v);

        for (int base = 0; base < n_vec; base += GGML_F32_STEP) {
            for (int j = 0; j < GGML_F32_ARR; j++) {
                const int off = base + j*GGML_F32_EPR;

                GGML_F32_VEC chunk = GGML_F32_VEC_LOAD(y + off);
                chunk = GGML_F32_VEC_MUL(chunk, vscale);

                GGML_F32_VEC_STORE(y + off, chunk);
            }
        }

        // leftovers
        for (int k = n_vec; k < n; ++k) {
            y[k] *= v;
        }
    #endif
#else
    // scalar
    for (int k = 0; k < n; ++k) {
        y[k] *= v;
    }
#endif
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_scale_f32
Unexecuted instantiation: vec.cpp:ggml_vec_scale_f32(int, float*, float)
Unexecuted instantiation: ops.cpp:ggml_vec_scale_f32(int, float*, float)
768
769
0
// In-place scaling of an fp16 vector: y[i] = fp16(float(y[i]) * v) for i in [0, n).
// Each architecture branch sets np to the number of elements it has already
// handled; the scalar loop at the end covers [np, n).
inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) {
#if defined(GGML_SIMD) && defined(__ARM_FEATURE_SVE)
    const int sve_register_length = svcntb() * 8;
    const int ggml_f16_epr = sve_register_length / 16;
    const int ggml_f16_step = 2 * ggml_f16_epr;

    GGML_F16x_VEC vx =  GGML_F16x_VEC_SET1(v);
    int np = (n & ~(ggml_f16_step - 1));
    svfloat16_t ay1, ay2;

    // main loop: two full vectors per iteration
    for (int i = 0; i < np; i += ggml_f16_step) {
        ay1 = GGML_F16x_VEC_LOAD(y + i + 0*ggml_f16_epr, 0);
        ay1 = GGML_F16x_VEC_MUL(ay1, vx);
        GGML_F16x_VEC_STORE(y + i + 0*ggml_f16_epr, ay1, 0);

        ay2 = GGML_F16x_VEC_LOAD(y + i + 1*ggml_f16_epr, 1);
        ay2 = GGML_F16x_VEC_MUL(ay2, vx);
        GGML_F16x_VEC_STORE(y + i + 1*ggml_f16_epr, ay2, 1);
    }
    // leftovers
    // Up to ggml_f16_step - 1 elements can remain, which may exceed one vector
    // (ggml_f16_epr lanes). Walk them one predicated vector at a time so that
    // elements past the first vector are not skipped before np is set to n.
    for (int i = np; i < n; i += ggml_f16_epr) {
        svbool_t pg = svwhilelt_b16(i, n);
        svfloat16_t hy = svld1_f16(pg, (__fp16 *)(y + i));
        svfloat16_t out = svmul_f16_m(pg, hy, vx);
        svst1_f16(pg, (__fp16 *)(y + i), out);
    }
    np = n;
#elif defined(__riscv_v_intrinsic)
    #if defined(__riscv_zvfh)
        // reinterpret the fp16 bit pattern as _Float16 via memcpy (no pointer-cast type punning)
        const ggml_fp16_t s = GGML_CPU_FP32_TO_FP16(v);
        _Float16 scale;
        memcpy(&scale, &s, sizeof(scale));

        // calculate step size
        const int epr = __riscv_vsetvlmax_e16m4();
        const int step = epr * 2;
        int np = (n & ~(step - 1));

        // unroll by 2
        for (int i = 0; i < np; i += step) {
            vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, epr);
            ay0 = __riscv_vfmul_vf_f16m4(ay0, scale, epr);
            __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, epr);
            __asm__ __volatile__ ("" ::: "memory");

            vfloat16m4_t ay1 = __riscv_vle16_v_f16m4((const _Float16*)y + i + epr, epr);
            ay1 = __riscv_vfmul_vf_f16m4(ay1, scale, epr);
            __riscv_vse16_v_f16m4((_Float16*)y + i + epr, ay1, epr);
            __asm__ __volatile__ ("" ::: "memory");
        }

        // leftovers
        int vl;
        for (int i = np; i < n; i += vl) {
            vl = __riscv_vsetvl_e16m4(n - i);
            vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, vl);
            ay0 = __riscv_vfmul_vf_f16m4(ay0, scale, vl);
            __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, vl);
        }
        np = n;
    #else
        // fall to scalar path
        const int np = 0;
    #endif
#elif defined(GGML_SIMD)
    const int np = (n & ~(GGML_F16_STEP - 1));

    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);

    GGML_F16_VEC ay[GGML_F16_ARR];

    for (int i = 0; i < np; i += GGML_F16_STEP) {
        for (int j = 0; j < GGML_F16_ARR; j++) {
            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
            ay[j] = GGML_F16_VEC_MUL(ay[j], vx);

            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
        }
    }
#else
    // scalar path
    const int np = 0;
#endif
    // scalar and leftovers
    for (int i = np; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_scale_f16
Unexecuted instantiation: vec.cpp:ggml_vec_scale_f16(int, unsigned short*, float)
Unexecuted instantiation: ops.cpp:ggml_vec_scale_f16(int, unsigned short*, float)
857
858
0
// Writes sqrtf of the dot product of x with itself (computed by ggml_vec_dot_f32) to *s
inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) {
    ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1);
    *s = sqrtf(*s);
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_norm_f32
Unexecuted instantiation: vec.cpp:ggml_vec_norm_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_norm_f32(int, float*, float const*)
859
0
// Element-wise square: y[i] = x[i]*x[i] for i in [0, n)
inline static void ggml_vec_sqr_f32  (const int n, float * y, const float * x) {
    for (int k = 0; k < n; ++k) {
        const float xk = x[k];
        y[k] = xk*xk;
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_sqr_f32
Unexecuted instantiation: vec.cpp:ggml_vec_sqr_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_sqr_f32(int, float*, float const*)
860
0
// Element-wise square for fp16 data: each value is widened to float, squared, and narrowed back
inline static void ggml_vec_sqr_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int k = 0; k < n; ++k) {
        const float xf = GGML_CPU_FP16_TO_FP32(x[k]);
        const float sq = xf*xf;
        y[k] = GGML_CPU_FP32_TO_FP16(sq);
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_sqr_f16
Unexecuted instantiation: vec.cpp:ggml_vec_sqr_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_sqr_f16(int, unsigned short*, unsigned short const*)
866
0
inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_sqrt_f32
Unexecuted instantiation: vec.cpp:ggml_vec_sqrt_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_sqrt_f32(int, float*, float const*)
867
0
// Element-wise square root for fp16 data, evaluated in float via sqrtf
inline static void ggml_vec_sqrt_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int k = 0; k < n; ++k) {
        const float xf = GGML_CPU_FP16_TO_FP32(x[k]);
        y[k] = GGML_CPU_FP32_TO_FP16(sqrtf(xf));
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_sqrt_f16
Unexecuted instantiation: vec.cpp:ggml_vec_sqrt_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_sqrt_f16(int, unsigned short*, unsigned short const*)
872
0
inline static void ggml_vec_log_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]);  }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_log_f32
Unexecuted instantiation: vec.cpp:ggml_vec_log_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_log_f32(int, float*, float const*)
873
0
// Element-wise natural logarithm for fp16 data, evaluated in float via logf
inline static void ggml_vec_log_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int k = 0; k < n; ++k) {
        const float xf = GGML_CPU_FP16_TO_FP32(x[k]);
        y[k] = GGML_CPU_FP32_TO_FP16(logf(xf));
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_log_f16
Unexecuted instantiation: vec.cpp:ggml_vec_log_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_log_f16(int, unsigned short*, unsigned short const*)
878
0
inline static void ggml_vec_sin_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sinf(x[i]);  }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_sin_f32
Unexecuted instantiation: vec.cpp:ggml_vec_sin_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_sin_f32(int, float*, float const*)
879
0
// Element-wise sine for fp16 data, evaluated in float via sinf
inline static void ggml_vec_sin_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int k = 0; k < n; ++k) {
        const float xf = GGML_CPU_FP16_TO_FP32(x[k]);
        y[k] = GGML_CPU_FP32_TO_FP16(sinf(xf));
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_sin_f16
Unexecuted instantiation: vec.cpp:ggml_vec_sin_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_sin_f16(int, unsigned short*, unsigned short const*)
884
0
inline static void ggml_vec_cos_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = cosf(x[i]);  }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_cos_f32
Unexecuted instantiation: vec.cpp:ggml_vec_cos_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_cos_f32(int, float*, float const*)
885
0
// Element-wise cosine for fp16 data, evaluated in float via cosf
inline static void ggml_vec_cos_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int k = 0; k < n; ++k) {
        const float xf = GGML_CPU_FP16_TO_FP32(x[k]);
        y[k] = GGML_CPU_FP32_TO_FP16(cosf(xf));
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_cos_f16
Unexecuted instantiation: vec.cpp:ggml_vec_cos_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_cos_f16(int, unsigned short*, unsigned short const*)
890
0
// Element-wise absolute value: y[i] = fabsf(x[i]) for i in [0, n)
inline static void ggml_vec_abs_f32  (const int n, float * y, const float * x) {
    for (int k = 0; k < n; ++k) {
        y[k] = fabsf(x[k]);
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_abs_f32
Unexecuted instantiation: vec.cpp:ggml_vec_abs_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_abs_f32(int, float*, float const*)
891
0
// Element-wise absolute value for fp16 data, evaluated in float via fabsf
inline static void ggml_vec_abs_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int k = 0; k < n; ++k) {
        const float xf = GGML_CPU_FP16_TO_FP32(x[k]);
        y[k] = GGML_CPU_FP32_TO_FP16(fabsf(xf));
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_abs_f16
Unexecuted instantiation: vec.cpp:ggml_vec_abs_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_abs_f16(int, unsigned short*, unsigned short const*)
896
0
// Element-wise sign: y[i] is 1 for x[i] > 0, -1 for x[i] < 0, and 0 otherwise
inline static void ggml_vec_sgn_f32  (const int n, float * y, const float * x) {
    for (int k = 0; k < n; ++k) {
        const float xk = x[k];
        if (xk > 0.f) {
            y[k] = 1.f;
        } else if (xk < 0.f) {
            y[k] = -1.f;
        } else {
            y[k] = 0.f;
        }
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_sgn_f32
Unexecuted instantiation: vec.cpp:ggml_vec_sgn_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_sgn_f32(int, float*, float const*)
897
0
// Element-wise sign for fp16 data: 1 for positive, -1 for negative, 0 otherwise (compared in float)
inline static void ggml_vec_sgn_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int k = 0; k < n; ++k) {
        const float xf = GGML_CPU_FP16_TO_FP32(x[k]);
        float sign;
        if (xf > 0.f) {
            sign = 1.f;
        } else if (xf < 0.f) {
            sign = -1.f;
        } else {
            sign = 0.f;
        }
        y[k] = GGML_CPU_FP32_TO_FP16(sign);
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_sgn_f16
Unexecuted instantiation: vec.cpp:ggml_vec_sgn_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_sgn_f16(int, unsigned short*, unsigned short const*)
903
0
// Element-wise step: y[i] is 1 where x[i] > 0 and 0 elsewhere
inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) {
    for (int k = 0; k < n; ++k) {
        if (x[k] > 0.f) {
            y[k] = 1.f;
        } else {
            y[k] = 0.f;
        }
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_step_f32
Unexecuted instantiation: vec.cpp:ggml_vec_step_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_step_f32(int, float*, float const*)
904
0
// Element-wise step for fp16 data: 1 where the float value is > 0, 0 elsewhere
inline static void ggml_vec_step_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int k = 0; k < n; ++k) {
        const float xf = GGML_CPU_FP16_TO_FP32(x[k]);
        const float on = (xf > 0.f) ? 1.f : 0.f;
        y[k] = GGML_CPU_FP32_TO_FP16(on);
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_step_f16
Unexecuted instantiation: vec.cpp:ggml_vec_step_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_step_f16(int, unsigned short*, unsigned short const*)
909
0
inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]);  }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_tanh_f32
Unexecuted instantiation: vec.cpp:ggml_vec_tanh_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_tanh_f32(int, float*, float const*)
910
0
// Element-wise hyperbolic tangent for fp16 data, evaluated in float via tanhf
inline static void ggml_vec_tanh_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int k = 0; k < n; ++k) {
        const float xf = GGML_CPU_FP16_TO_FP32(x[k]);
        y[k] = GGML_CPU_FP32_TO_FP16(tanhf(xf));
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_tanh_f16
Unexecuted instantiation: vec.cpp:ggml_vec_tanh_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_tanh_f16(int, unsigned short*, unsigned short const*)
915
0
inline static void ggml_vec_elu_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_elu_f32
Unexecuted instantiation: vec.cpp:ggml_vec_elu_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_elu_f32(int, float*, float const*)
916
0
// Element-wise ELU for fp16 data: identity for positive values, expm1f otherwise (computed in float)
inline static void ggml_vec_elu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int k = 0; k < n; ++k) {
        const float xf = GGML_CPU_FP16_TO_FP32(x[k]);
        float out;
        if (xf > 0.f) {
            out = xf;
        } else {
            out = expm1f(xf);
        }
        y[k] = GGML_CPU_FP32_TO_FP16(out);
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_elu_f16
Unexecuted instantiation: vec.cpp:ggml_vec_elu_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_elu_f16(int, unsigned short*, unsigned short const*)
922
0
// Element-wise ReLU: y[i] = x[i] when x[i] > 0, otherwise 0
inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) {
    for (int k = 0; k < n; ++k) {
        const float xk = x[k];
        if (xk > 0.f) {
            y[k] = xk;
        } else {
            y[k] = 0.f;
        }
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_relu_f32
Unexecuted instantiation: vec.cpp:ggml_vec_relu_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_relu_f32(int, float*, float const*)
923
0
// Element-wise ReLU for fp16 data: positive values pass through, everything else becomes 0
inline static void ggml_vec_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int k = 0; k < n; ++k) {
        const float xf = GGML_CPU_FP16_TO_FP32(x[k]);
        const float out = (xf > 0.f) ? xf : 0.f;
        y[k] = GGML_CPU_FP32_TO_FP16(out);
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_relu_f16
Unexecuted instantiation: vec.cpp:ggml_vec_relu_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_relu_f16(int, unsigned short*, unsigned short const*)
929
0
// Element-wise leaky ReLU: positive part of x[i] plus ns times the negative part of x[i]
inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) {
    for (int k = 0; k < n; ++k) {
        const float xk  = x[k];
        const float pos = (xk > 0.f) ? xk : 0.f;
        const float neg = (xk < 0.0f) ? xk : 0.f;
        y[k] = pos + ns * neg;
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_leaky_relu_f32
Unexecuted instantiation: vec.cpp:ggml_vec_leaky_relu_f32(int, float*, float const*, float)
Unexecuted instantiation: ops.cpp:ggml_vec_leaky_relu_f32(int, float*, float const*, float)
930
0
// Element-wise leaky ReLU for fp16 data: positive part plus ns times the negative part (computed in float)
inline static void ggml_vec_leaky_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const float ns) {
    for (int k = 0; k < n; ++k) {
        const float xf  = GGML_CPU_FP16_TO_FP32(x[k]);
        const float pos = (xf > 0.f) ? xf : 0.f;
        const float neg = (xf < 0.0f) ? xf : 0.f;
        y[k] = GGML_CPU_FP32_TO_FP16(pos + ns * neg);
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_leaky_relu_f16
Unexecuted instantiation: vec.cpp:ggml_vec_leaky_relu_f16(int, unsigned short*, unsigned short const*, float)
Unexecuted instantiation: ops.cpp:ggml_vec_leaky_relu_f16(int, unsigned short*, unsigned short const*, float)
936
0
inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_sigmoid_f32
Unexecuted instantiation: vec.cpp:ggml_vec_sigmoid_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_sigmoid_f32(int, float*, float const*)
937
0
// Element-wise logistic sigmoid for fp16 data: 1 / (1 + expf(-x)) computed in float
inline static void ggml_vec_sigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int k = 0; k < n; ++k) {
        const float e = expf(-GGML_CPU_FP16_TO_FP32(x[k]));
        y[k] = GGML_CPU_FP32_TO_FP16(1.f / (1.f + e));
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_sigmoid_f16
Unexecuted instantiation: vec.cpp:ggml_vec_sigmoid_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_sigmoid_f16(int, unsigned short*, unsigned short const*)
942
// TODO: optimize performance
943
0
inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_hardswish_f32
Unexecuted instantiation: vec.cpp:ggml_vec_hardswish_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_hardswish_f32(int, float*, float const*)
944
0
// Element-wise hard-swish for fp16 data: x * clamp((x + 3) / 6, 0, 1), computed in float
inline static void ggml_vec_hardswish_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int k = 0; k < n; ++k) {
        const float xf   = GGML_CPU_FP16_TO_FP32(x[k]);
        const float gate = fminf(1.0f, fmaxf(0.0f, (xf + 3.0f) / 6.0f));
        y[k] = GGML_CPU_FP32_TO_FP16(xf * gate);
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_hardswish_f16
Unexecuted instantiation: vec.cpp:ggml_vec_hardswish_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_hardswish_f16(int, unsigned short*, unsigned short const*)
950
0
inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_hardsigmoid_f32
Unexecuted instantiation: vec.cpp:ggml_vec_hardsigmoid_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_hardsigmoid_f32(int, float*, float const*)
951
0
// Element-wise hard-sigmoid for fp16 data: clamp((x + 3) / 6, 0, 1), computed in float
inline static void ggml_vec_hardsigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int k = 0; k < n; ++k) {
        const float ramp = (GGML_CPU_FP16_TO_FP32(x[k]) + 3.0f) / 6.0f;
        y[k] = GGML_CPU_FP32_TO_FP16(fminf(1.0f, fmaxf(0.0f, ramp)));
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_hardsigmoid_f16
Unexecuted instantiation: vec.cpp:ggml_vec_hardsigmoid_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_hardsigmoid_f16(int, unsigned short*, unsigned short const*)
956
0
inline static void ggml_vec_exp_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = expf(x[i]); }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_exp_f32
Unexecuted instantiation: vec.cpp:ggml_vec_exp_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_exp_f32(int, float*, float const*)
957
0
// Element-wise exponential for fp16 data, evaluated in float via expf
inline static void ggml_vec_exp_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int k = 0; k < n; ++k) {
        const float xf = GGML_CPU_FP16_TO_FP32(x[k]);
        y[k] = GGML_CPU_FP32_TO_FP16(expf(xf));
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_exp_f16
Unexecuted instantiation: vec.cpp:ggml_vec_exp_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_exp_f16(int, unsigned short*, unsigned short const*)
962
963
// coefficient of the x*x term inside the tanh argument of ggml_gelu_f32
static const float GELU_COEF_A     = 0.044715f;
964
// factor applied to x inside expf() in ggml_gelu_quick_f32
static const float GELU_QUICK_COEF = -1.702f;
965
// sqrt(2/pi): scales the tanh argument in ggml_gelu_f32
static const float SQRT_2_OVER_PI  = 0.79788456080286535587989211986876f;
966
// 1/sqrt(2): scales the erff argument in the erf-based GELU routines
static const float SQRT_2_INV      = 0.70710678118654752440084436210484f;
967
968
65.5k
inline static float ggml_gelu_f32(float x) {
969
65.5k
    return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
970
65.5k
}
ggml-cpu.c:ggml_gelu_f32
Line
Count
Source
968
65.5k
inline static float ggml_gelu_f32(float x) {
969
65.5k
    return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
970
65.5k
}
Unexecuted instantiation: vec.cpp:ggml_gelu_f32(float)
Unexecuted instantiation: ops.cpp:ggml_gelu_f32(float)
971
972
0
// GELU for fp16 data by table lookup: the 16-bit pattern of x[i] indexes ggml_table_gelu_f16
inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    const uint16_t * bits = (const uint16_t *) x;
    for (int k = 0; k < n; ++k) {
        const uint16_t idx = bits[k];
        y[k] = ggml_table_gelu_f16[idx];
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_gelu_f16
Unexecuted instantiation: vec.cpp:ggml_vec_gelu_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_gelu_f16(int, unsigned short*, unsigned short const*)
978
979
0
// erf-based GELU for fp16 data: 0.5 * x * (1 + erff(x * SQRT_2_INV)), computed in float
inline static void ggml_vec_gelu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int k = 0; k < n; ++k) {
        const float xf = GGML_CPU_FP16_TO_FP32(x[k]);
        const float e  = erff(xf*SQRT_2_INV);
        y[k] = GGML_CPU_FP32_TO_FP16(0.5f*xf*(1.0f + e));
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_gelu_erf_f16
Unexecuted instantiation: vec.cpp:ggml_vec_gelu_erf_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_gelu_erf_f16(int, unsigned short*, unsigned short const*)
986
987
#ifdef GGML_GELU_FP16
988
0
inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
989
0
    uint16_t t;
990
0
    for (int i = 0; i < n; ++i) {
991
0
        if (x[i] <= -10.0f) {
992
0
            y[i] = 0.0f;
993
0
        } else if (x[i] >= 10.0f) {
994
0
            y[i] = x[i];
995
0
        } else {
996
0
            ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
997
0
            memcpy(&t, &fp16, sizeof(uint16_t));
998
0
            y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[t]);
999
0
        }
1000
0
    }
1001
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_gelu_f32
Unexecuted instantiation: vec.cpp:ggml_vec_gelu_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_gelu_f32(int, float*, float const*)
1002
#else
1003
// GELU over a float vector, evaluating ggml_gelu_f32 on every element (no lookup table)
inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
    for (int k = 0; k < n; ++k) {
        const float xk = x[k];
        y[k] = ggml_gelu_f32(xk);
    }
}
1008
#endif
1009
1010
0
inline static void ggml_vec_gelu_erf_f32(const int n, float * y, const float * x) {
1011
0
    for (int i = 0; i < n; ++i) {
1012
0
        float xi = x[i];
1013
0
        y[i] = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV));
1014
0
    }
1015
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_gelu_erf_f32
Unexecuted instantiation: vec.cpp:ggml_vec_gelu_erf_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_gelu_erf_f32(int, float*, float const*)
1016
1017
65.5k
inline static float ggml_gelu_quick_f32(float x) {
1018
65.5k
    return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
1019
65.5k
}
ggml-cpu.c:ggml_gelu_quick_f32
Line
Count
Source
1017
65.5k
inline static float ggml_gelu_quick_f32(float x) {
1018
65.5k
    return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
1019
65.5k
}
Unexecuted instantiation: vec.cpp:ggml_gelu_quick_f32(float)
Unexecuted instantiation: ops.cpp:ggml_gelu_quick_f32(float)
1020
1021
0
// Quick GELU for fp16 data by table lookup: the 16-bit pattern of x[i] indexes ggml_table_gelu_quick_f16
inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    const uint16_t * bits = (const uint16_t *) x;
    for (int k = 0; k < n; ++k) {
        const uint16_t idx = bits[k];
        y[k] = ggml_table_gelu_quick_f16[idx];
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_gelu_quick_f16
Unexecuted instantiation: vec.cpp:ggml_vec_gelu_quick_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_gelu_quick_f16(int, unsigned short*, unsigned short const*)
1027
1028
#ifdef GGML_GELU_QUICK_FP16
1029
0
inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
1030
0
    uint16_t t;
1031
0
    for (int i = 0; i < n; ++i) {
1032
0
        ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
1033
0
        memcpy(&t, &fp16, sizeof(uint16_t));
1034
0
        y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]);
1035
0
    }
1036
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_gelu_quick_f32
Unexecuted instantiation: vec.cpp:ggml_vec_gelu_quick_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_gelu_quick_f32(int, float*, float const*)
1037
#else
1038
// Element-wise quick GELU without a lookup table: y[i] = ggml_gelu_quick_f32(x[i]).
inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = ggml_gelu_quick_f32(x[i]);
    }
}
1043
#endif
1044
1045
// Sigmoid Linear Unit (SiLU) function
1046
0
inline static float ggml_silu_f32(float x) {
1047
0
    return x/(1.0f + expf(-x));
1048
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_silu_f32
Unexecuted instantiation: vec.cpp:ggml_silu_f32(float)
Unexecuted instantiation: ops.cpp:ggml_silu_f32(float)
1049
0
// SiLU for a single f16 value: widen to f32, compute v / (1 + exp(-v)), round back to f16.
inline static ggml_fp16_t ggml_silu_f16(ggml_fp16_t x) {
    const float v = GGML_CPU_FP16_TO_FP32(x);
    return GGML_CPU_FP32_TO_FP16(v/(1.0f + expf(-v)));
}
Unexecuted instantiation: ggml-cpu.c:ggml_silu_f16
Unexecuted instantiation: vec.cpp:ggml_silu_f16(unsigned short)
Unexecuted instantiation: ops.cpp:ggml_silu_f16(unsigned short)
1053
1054
#if __FINITE_MATH_ONLY__
1055
#error "some routines in ggml.c require non-finite math arithmetics -- pass -fno-finite-math-only to the compiler to fix"
1056
#error "ref: https://github.com/ggml-org/llama.cpp/pull/7154#issuecomment-2143844461"
1057
#endif
1058
1059
/* The function below was borrowed from the OpenVINO GitHub repository:
   https://github.com/openvinotoolkit/openvino/blob/master/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp */
1061
#if defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
1062
    // Vectorized single-precision exp(src) for SVE, built around svexpa (FEXPA).
    //   pg  - governing predicate; every arithmetic step uses the merging (_m) form with pg
    //   src - input vector
    // Steps: y = src * log2(e); n = floor(y); b = (y - n) + 1; svexpa on the bits of b
    // shifted right by 17 gives a coarse estimate that is scaled by 2^n; the low 17 bits
    // of b that the shift dropped (z) are corrected with a quadratic in z.
    inline static svfloat32_t exp_ps_sve(svbool_t pg, svfloat32_t src) {
        // Constants
        const svfloat32_t log2_e = svdup_n_f32(1.4426950409f);
        const svfloat32_t ln2 = svdup_n_f32(0.6931473921f);
        const svfloat32_t half_ln2_sq = svdup_n_f32(0.2413862043f);
        const svuint32_t not_mask17 = svdup_n_u32(~((1u << 17) - 1));
        const svfloat32_t one = svdup_n_f32(1.0f);
        const svfloat32_t inactive1 = svdup_n_f32(0.0f);
        const svint32_t inactive2 = svdup_n_s32(0);

        // Algorithm starts here
        svfloat32_t t0 = svmul_f32_m(pg, src, log2_e);  // y = x * log2(e)
        svfloat32_t t1 = svrintm_f32_m(inactive1, pg, t0);         // round down to an integer (kept as float)
        svint32_t t2 = svcvt_s32_f32_m(inactive2, pg, t1);         // n

        t1 = svsub_f32_m(pg, t0, t1);   // a = y - floor(y)
        t1 = svadd_f32_m(pg, t1, one);  // b = a + 1

        svuint32_t t3 = svlsr_n_u32_m(pg, svreinterpret_u32_f32(t1), 17);  // v = b >> 17 (u32)
        svfloat32_t t4 = svexpa_f32(t3);                                   // c = fexpa(v)
        t4 = svscale_f32_m(pg, t4, t2);                                    // fexpa(v) * 2^(n)

        // and_(t2.d, t1.d, not_mask17.d)
        svfloat32_t t5 = svreinterpret_f32_u32(svand_u32_m(pg, svreinterpret_u32_f32(t1), not_mask17));
        t5 = svsub_f32_m(pg, t1, t5);                // z
        t0 = svmla_f32_m(pg, ln2, t5, half_ln2_sq);  // ln2 + half_ln2_sq * z
        t0 = svmla_f32_m(pg, one, t5, t0);           // 1 + (ln2 * z) + (half_ln2_sq * z * z)
        t0 = svmul_f32_m(pg, t0, t4);                // Final result

        return t0;
    }
1093
#endif
1094
1095
#if defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
1096
1097
// Vectorized single-precision exp(x) for SVE, evaluated under predicate pg.
// Scheme:
//   n = round(x * log2(e))                 (via the 0x1.8p23f add/subtract trick)
//   b = x - n*ln(2)                        (ln(2) split into a high and a low constant)
//   j ~= exp(b) - 1                        (degree-5 polynomial in b)
//   result = 2^n * (1 + j)
// Lanes with |n| > 126 apply the 2^n scale in two factors (s2, then s1); lanes with
// |n| > 192 return s1*s1 directly. All three candidates are computed and selected
// per lane, without an early-out.
inline static svfloat32_t ggml_v_expf(svbool_t pg, svfloat32_t x) {
    const svfloat32_t r = svdup_n_f32_x(pg, 0x1.8p23f);
    // z = r + x * log2(e); the rounded integer lands in the low mantissa bits of z
    const svfloat32_t z = svmla_n_f32_x(pg, r, x, 0x1.715476p+0f);
    const svfloat32_t n = svsub_f32_x(pg, z, r);
    // reduced argument b = x - n*ln2_hi - n*ln2_lo
    const svfloat32_t b = svmls_n_f32_x(pg, svmls_n_f32_x(pg, x, n, 0x1.62e4p-1f), n, 0x1.7f7d1cp-20f);
    // shift the integer held in z's low bits into the exponent field
    const svuint32_t e = svlsl_n_u32_x(pg, svreinterpret_u32_f32(z), 23);
    // k = 2^n: exponent bits added to the bit pattern of 1.0f
    const svfloat32_t k = svreinterpret_f32_u32(svadd_u32_x(pg, e, svreinterpret_u32_f32(svdup_n_f32_x(pg, 1))));
    // c: lanes where |n| > 126
    const svbool_t c = svacgt_n_f32(pg, n, 126);
    const svfloat32_t u = svmul_f32_x(pg, b, b);
    const svfloat32_t j = svmla_f32_x(pg,
        svmul_n_f32_x(pg, b, 0x1.ffffecp-1f),
        svmla_f32_x(pg, svmla_f32_x(pg, svdup_n_f32_x(pg, 0x1.fffdb6p-2f), svdup_n_f32_x(pg, 0x1.555e66p-3f), b),
                        svmla_f32_x(pg, svdup_n_f32_x(pg, 0x1.573e2ep-5f), svdup_n_f32_x(pg, 0x1.0e4020p-7f), b), u), u);
    // d = 0x82000000 where n <= 0, otherwise 0: exponent bias split between s1 and s2
    const svuint32_t d = svdup_n_u32_z(svcmple_n_f32(pg, n, 0.0), 0x82000000);
    const svfloat32_t s1 = svreinterpret_f32_u32(svadd_n_u32_x(pg, d, 0x7f000000));
    const svfloat32_t s2 = svreinterpret_f32_u32(svsub_u32_x(pg, e, d));
    return svsel_f32(svacgt_f32(pg, n, svdup_n_f32_x(pg, 192)), svmul_f32_x(pg, s1, s1),
                     svsel_f32(c, svmul_f32_x(pg, svmla_f32_x(pg, s2, s2, j), s1), svmla_f32_x(pg, k, k, j)));
}
1116
1117
// computes silu x/(1+exp(-x)) in single precision vector
1118
inline static svfloat32_t ggml_v_silu(svbool_t pg, svfloat32_t x) {
1119
    const svfloat32_t one = svdup_n_f32_x(pg, 1.0f);
1120
    const svfloat32_t zero = svdup_n_f32_x(pg, 0.0f);
1121
    const svfloat32_t neg_x = svsub_f32_x(pg, zero, x);
1122
    const svfloat32_t exp_neg_x = ggml_v_expf(pg, neg_x);
1123
    const svfloat32_t one_plus_exp_neg_x = svadd_f32_x(pg, one, exp_neg_x);
1124
    return svdiv_f32_x(pg, x, one_plus_exp_neg_x);
1125
}
1126
1127
#elif defined(__ARM_NEON) && defined(__aarch64__)
1128
1129
// adapted from arm limited optimized routine
1130
// the maximum error is 1.45358 plus 0.5 ulps
1131
// numbers above 88.38 will flush to infinity
1132
// numbers beneath -103.97 will flush to zero
1133
inline static float32x4_t ggml_v_expf(float32x4_t x) {
1134
    const float32x4_t r = vdupq_n_f32(0x1.8p23f);
1135
    const float32x4_t z = vfmaq_f32(r, x, vdupq_n_f32(0x1.715476p+0f));
1136
    const float32x4_t n = vsubq_f32(z, r);
1137
    const float32x4_t b = vfmsq_f32(vfmsq_f32(x, n, vdupq_n_f32(0x1.62e4p-1f)), n,
1138
                                    vdupq_n_f32(0x1.7f7d1cp-20f));
1139
    const uint32x4_t e = vshlq_n_u32(vreinterpretq_u32_f32(z), 23);
1140
    const float32x4_t k = vreinterpretq_f32_u32(vaddq_u32(e, vreinterpretq_u32_f32(vdupq_n_f32(1))));
1141
    const uint32x4_t c = vcagtq_f32(n, vdupq_n_f32(126));
1142
    const float32x4_t u = vmulq_f32(b, b);
1143
    const float32x4_t j = vfmaq_f32(
1144
        vmulq_f32(vdupq_n_f32(0x1.ffffecp-1f), b),
1145
        vfmaq_f32(vfmaq_f32(vdupq_n_f32(0x1.fffdb6p-2f), vdupq_n_f32(0x1.555e66p-3f), b),
1146
                  vfmaq_f32(vdupq_n_f32(0x1.573e2ep-5f), vdupq_n_f32(0x1.0e4020p-7f), b), u), u);
1147
    if (!vpaddd_u64(vreinterpretq_u64_u32(c)))
1148
        return vfmaq_f32(k, j, k);
1149
    const uint32x4_t d = vandq_u32(vclezq_f32(n), vdupq_n_u32(0x82000000));
1150
    const float32x4_t s1 = vreinterpretq_f32_u32(vaddq_u32(d, vdupq_n_u32(0x7f000000)));
1151
    const float32x4_t s2 = vreinterpretq_f32_u32(vsubq_u32(e, d));
1152
    return vbslq_f32(vcagtq_f32(n, vdupq_n_f32(192)), vmulq_f32(s1, s1),
1153
                     vbslq_f32(c, vmulq_f32(vfmaq_f32(s2, s2, j), s1), vfmaq_f32(k, k, j)));
1154
}
1155
1156
// computes silu x/(1+exp(-x)) in single precision vector
1157
inline static float32x4_t ggml_v_silu(float32x4_t x) {
1158
    const float32x4_t one = vdupq_n_f32(1.0f);
1159
    const float32x4_t zero = vdupq_n_f32(0.0f);
1160
    const float32x4_t neg_x = vsubq_f32(zero, x);
1161
    const float32x4_t exp_neg_x = ggml_v_expf(neg_x);
1162
    const float32x4_t one_plus_exp_neg_x = vaddq_f32(one, exp_neg_x);
1163
    return vdivq_f32(x, one_plus_exp_neg_x);
1164
}
1165
1166
#elif defined(__AVX512F__) && defined(__AVX512DQ__)
1167
1168
// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
//
// Scheme (16 lanes, AVX-512):
//   n = round(x * log2(e))                 (via the 0x1.8p23f add/subtract trick)
//   b = x - n*ln(2)                        (ln(2) split into a high and a low constant)
//   j ~= exp(b)                            (degree-5 polynomial in b, constant term 1)
//   res = j * 2^n                          (_mm512_scalef_ps)
// Lanes with |n| > 192 are replaced by 0 (n <= 0) or +INFINITY (n > 0).
inline static __m512 ggml_v_expf(__m512 x) {
  const __m512 r = _mm512_set1_ps(0x1.8p23f);
  // z = x * log2(e) + r; n = z - r is the rounded integer as a float
  const __m512 z = _mm512_fmadd_ps(x, _mm512_set1_ps(0x1.715476p+0f), r);
  const __m512 n = _mm512_sub_ps(z, r);
  // reduced argument b = x - n*ln2_hi - n*ln2_lo
  const __m512 b =
      _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f),
                       _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x));
  // d: lanes where |n| > 192
  const __mmask16 d =
      _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(192), _CMP_GT_OQ);
  const __m512 u = _mm512_mul_ps(b, b);
  const __m512 j = _mm512_fmadd_ps(
      _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b,
                                      _mm512_set1_ps(0x1.573e2ep-5f)),
                      u,
                      _mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b,
                                      _mm512_set1_ps(0x1.fffdb6p-2f))),
      u,
      _mm512_fmadd_ps(_mm512_set1_ps(0x1.ffffecp-1f), b, _mm512_set1_ps(1.0F)));
  const __m512 res = _mm512_scalef_ps(j, n);
  // fast path: no lane is out of range
  if (_mm512_kortestz(d, d)) {
    return res;
  }
  const __m512 zero = _mm512_setzero_ps();
  // alt = 0 where n <= 0, +INFINITY otherwise
  const __m512 alt = _mm512_mask_blend_ps(
      _mm512_cmp_ps_mask(n, zero, _CMP_LE_OQ), _mm512_set1_ps(INFINITY), zero);
  return _mm512_mask_blend_ps(d, res, alt);
}
1198
1199
// computes silu x/(1+exp(-x)) in single precision vector
1200
inline static __m512 ggml_v_silu(__m512 x) {
1201
    const __m512 one = _mm512_set1_ps(1);
1202
    const __m512 zero = _mm512_setzero_ps();
1203
    const __m512 neg_x = _mm512_sub_ps(zero, x);
1204
    const __m512 exp_neg_x = ggml_v_expf(neg_x);
1205
    const __m512 one_plus_exp_neg_x = _mm512_add_ps(one, exp_neg_x);
1206
    return _mm512_div_ps(x, one_plus_exp_neg_x);
1207
}
1208
1209
#elif defined(__AVX2__) && defined(__FMA__)
1210
1211
// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
//
// Scheme (8 lanes, AVX2 + FMA):
//   n = round(x * log2(e))                 (via the 0x1.8p23f add/subtract trick)
//   b = x - n*ln(2)                        (ln(2) split into a high and a low constant)
//   j ~= exp(b) - 1                        (degree-5 polynomial in b)
//   result = 2^n * (1 + j)
// If no lane has |n| > 126 the fast result is returned immediately; otherwise the
// 2^n scale is applied in two factors (s2, then s1) and lanes with |n| > 192
// return s1*s1.
inline static __m256 ggml_v_expf(__m256 x) {
  const __m256 r = _mm256_set1_ps(0x1.8p23f);
  // z = x * log2(e) + r; the rounded integer lands in the low mantissa bits of z
  const __m256 z = _mm256_fmadd_ps(x, _mm256_set1_ps(0x1.715476p+0f), r);
  const __m256 n = _mm256_sub_ps(z, r);
  // reduced argument b = x - n*ln2_hi - n*ln2_lo
  const __m256 b = _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.7f7d1cp-20f),
                                    _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.62e4p-1f), x));
  // shift the integer held in z's low bits into the exponent field
  const __m256i e = _mm256_slli_epi32(_mm256_castps_si256(z), 23);
  // k = 2^n: exponent bits added to the bit pattern of 1.0f
  const __m256 k = _mm256_castsi256_ps(
      _mm256_add_epi32(e, _mm256_castps_si256(_mm256_set1_ps(1))));
  // c: lanes where |n| > 126 (andnot with -0.f clears the sign bit)
  const __m256i c = _mm256_castps_si256(
      _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
                    _mm256_set1_ps(126), _CMP_GT_OQ));
  const __m256 u = _mm256_mul_ps(b, b);
  const __m256 j = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_set1_ps(0x1.0e4020p-7f), b,
                                                                   _mm256_set1_ps(0x1.573e2ep-5f)), u,
                                                   _mm256_fmadd_ps(_mm256_set1_ps(0x1.555e66p-3f), b,
                                                                   _mm256_set1_ps(0x1.fffdb6p-2f))),
                                   u, _mm256_mul_ps(_mm256_set1_ps(0x1.ffffecp-1f), b));
  // fast path: no lane needs special scaling
  if (!_mm256_movemask_ps(_mm256_castsi256_ps(c))) {
    return _mm256_fmadd_ps(j, k, k);
  }
  // g = 0x82000000 where n <= 0, otherwise 0: exponent bias split between s1 and s2
  const __m256i g = _mm256_and_si256(
      _mm256_castps_si256(_mm256_cmp_ps(n, _mm256_setzero_ps(), _CMP_LE_OQ)),
      _mm256_set1_epi32(0x82000000u));
  const __m256 s1 =
      _mm256_castsi256_ps(_mm256_add_epi32(g, _mm256_set1_epi32(0x7f000000u)));
  const __m256 s2 = _mm256_castsi256_ps(_mm256_sub_epi32(e, g));
  // d: lanes where |n| > 192
  const __m256i d = _mm256_castps_si256(
      _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
                    _mm256_set1_ps(192), _CMP_GT_OQ));
  // per-lane select: d -> s1*s1, else c -> (s2 + s2*j)*s1, else k + k*j
  return _mm256_or_ps(
      _mm256_and_ps(_mm256_castsi256_ps(d), _mm256_mul_ps(s1, s1)),
      _mm256_andnot_ps(
          _mm256_castsi256_ps(d),
          _mm256_or_ps(
              _mm256_and_ps(_mm256_castsi256_ps(c),
                            _mm256_mul_ps(_mm256_fmadd_ps(s2, j, s2), s1)),
              _mm256_andnot_ps(_mm256_castsi256_ps(c), _mm256_fmadd_ps(k, j, k)))));
}
Unexecuted instantiation: ggml-cpu.c:ggml_v_expf
Unexecuted instantiation: vec.cpp:ggml_v_expf(float __vector(8))
Unexecuted instantiation: ops.cpp:ggml_v_expf(float __vector(8))
1253
1254
// computes silu x/(1+exp(-x)) in single precision vector
1255
0
inline static __m256 ggml_v_silu(__m256 x) {
1256
0
    const __m256 one = _mm256_set1_ps(1);
1257
0
    const __m256 zero = _mm256_setzero_ps();
1258
0
    const __m256 neg_x = _mm256_sub_ps(zero, x);
1259
0
    const __m256 exp_neg_x = ggml_v_expf(neg_x);
1260
0
    const __m256 one_plus_exp_neg_x = _mm256_add_ps(one, exp_neg_x);
1261
0
    return _mm256_div_ps(x, one_plus_exp_neg_x);
1262
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_v_silu
Unexecuted instantiation: vec.cpp:ggml_v_silu(float __vector(8))
Unexecuted instantiation: ops.cpp:ggml_v_silu(float __vector(8))
1263
1264
#elif defined(__SSE2__) // __AVX2__ / __ARM_NEON
1265
1266
// 128-bit multiply-add helpers:
//   MADD128(x, y, z)  =   x*y + z
//   NMADD128(x, y, z) = -(x*y) + z
// With __FMA__ they map to fused instructions; otherwise to a separate multiply and add/sub.
#if defined(__FMA__)
#define MADD128(x, y, z) _mm_fmadd_ps(x, y, z)
#define NMADD128(x, y, z) _mm_fnmadd_ps(x, y, z)
#else
#define MADD128(x, y, z) _mm_add_ps(_mm_mul_ps(x, y), z)
#define NMADD128(x, y, z) _mm_sub_ps(z, _mm_mul_ps(x, y))
#endif
1273
1274
// adapted from arm limited optimized routine
1275
// the maximum error is 1.45358 plus 0.5 ulps
1276
// numbers above 88.38 will flush to infinity
1277
// numbers beneath -103.97 will flush to zero
1278
inline static __m128 ggml_v_expf(__m128 x) {
1279
    const __m128 r = _mm_set1_ps(0x1.8p23f);
1280
    const __m128 z = MADD128(x, _mm_set1_ps(0x1.715476p+0f), r);
1281
    const __m128 n = _mm_sub_ps(z, r);
1282
    const __m128 b =
1283
        NMADD128(n, _mm_set1_ps(0x1.7f7d1cp-20f), NMADD128(n, _mm_set1_ps(0x1.62e4p-1f), x));
1284
    const __m128i e = _mm_slli_epi32(_mm_castps_si128(z), 23);
1285
    const __m128 k = _mm_castsi128_ps(_mm_add_epi32(e, _mm_castps_si128(_mm_set1_ps(1))));
1286
    const __m128i c =
1287
        _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(126)));
1288
    const __m128 u = _mm_mul_ps(b, b);
1289
    const __m128 j =
1290
        MADD128(MADD128(MADD128(_mm_set1_ps(0x1.0e4020p-7f), b, _mm_set1_ps(0x1.573e2ep-5f)), u,
1291
                        MADD128(_mm_set1_ps(0x1.555e66p-3f), b, _mm_set1_ps(0x1.fffdb6p-2f))),
1292
                u, _mm_mul_ps(_mm_set1_ps(0x1.ffffecp-1f), b));
1293
    if (!_mm_movemask_epi8(c))
1294
        return MADD128(j, k, k);
1295
    const __m128i g = _mm_and_si128(_mm_castps_si128(_mm_cmple_ps(n, _mm_setzero_ps())),
1296
                                    _mm_set1_epi32(0x82000000u));
1297
    const __m128 s1 = _mm_castsi128_ps(_mm_add_epi32(g, _mm_set1_epi32(0x7f000000u)));
1298
    const __m128 s2 = _mm_castsi128_ps(_mm_sub_epi32(e, g));
1299
    const __m128i d =
1300
        _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(192)));
1301
    return _mm_or_ps(
1302
        _mm_and_ps(_mm_castsi128_ps(d), _mm_mul_ps(s1, s1)),
1303
        _mm_andnot_ps(_mm_castsi128_ps(d),
1304
                      _mm_or_ps(_mm_and_ps(_mm_castsi128_ps(c), _mm_mul_ps(MADD128(s2, j, s2), s1)),
1305
                                _mm_andnot_ps(_mm_castsi128_ps(c), MADD128(k, j, k)))));
1306
}
1307
1308
// computes silu x/(1+exp(-x)) in single precision vector
1309
inline static __m128 ggml_v_silu(__m128 x) {
1310
    const __m128 one = _mm_set1_ps(1);
1311
    const __m128 zero = _mm_setzero_ps();
1312
    const __m128 neg_x = _mm_sub_ps(zero, x);
1313
    const __m128 exp_neg_x = ggml_v_expf(neg_x);
1314
    const __m128 one_plus_exp_neg_x = _mm_add_ps(one, exp_neg_x);
1315
    return _mm_div_ps(x, one_plus_exp_neg_x);
1316
}
1317
1318
#elif defined(__riscv_v_intrinsic)
1319
1320
// adapted from arm limited optimized routine
1321
// the maximum error is 1.45358 plus 0.5 ulps
1322
// numbers above 88.38 will flush to infinity
1323
// numbers beneath -103.97 will flush to zero
1324
inline static vfloat32m2_t ggml_v_expf_m2(vfloat32m2_t x, int vl) {
1325
    const vfloat32m2_t r = __riscv_vfmv_v_f_f32m2(0x1.8p23f, vl);
1326
#ifdef __riscv_xtheadvector
1327
    // workaround for compiler bug (gcc 14.3.0: Error: unrecognized opcode `th.vmv1r.v v2,v4')
1328
    vfloat32m2_t z = __riscv_vfadd_vf_f32m2(r, 0.0f, vl);
1329
    z = __riscv_vfmacc_vf_f32m2(z, 0x1.715476p+0f, x, vl);
1330
#else
1331
    const vfloat32m2_t z = __riscv_vfmacc_vf_f32m2(r, 0x1.715476p+0f, x, vl);
1332
#endif
1333
    const vfloat32m2_t n = __riscv_vfsub_vv_f32m2(z, r, vl);
1334
    const vfloat32m2_t b = __riscv_vfnmsac_vf_f32m2(__riscv_vfnmsac_vf_f32m2(x, 0x1.62e4p-1f, n, vl),
1335
                                                    0x1.7f7d1cp-20f, n, vl);
1336
    const vuint32m2_t e = __riscv_vsll_vx_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(z), 23, vl);
1337
    const vfloat32m2_t k = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vadd_vx_u32m2(e, 0x3f800000, vl)); // 1.0f
1338
    const vbool16_t c = __riscv_vmfgt_vf_f32m2_b16(__riscv_vfabs_v_f32m2(n, vl), 126.0f, vl);
1339
    const vfloat32m2_t u = __riscv_vfmul_vv_f32m2(b, b, vl);
1340
    const vfloat32m2_t j = __riscv_vfmacc_vv_f32m2(
1341
        __riscv_vfmul_vf_f32m2(b, 0x1.ffffecp-1f, vl),
1342
        __riscv_vfmacc_vv_f32m2(
1343
            __riscv_vfmacc_vf_f32m2(__riscv_vfmv_v_f_f32m2(0x1.fffdb6p-2f, vl), 0x1.555e66p-3f, b, vl),
1344
            __riscv_vfmacc_vf_f32m2(__riscv_vfmv_v_f_f32m2(0x1.573e2ep-5f, vl), 0x1.0e4020p-7f, b, vl),
1345
            u, vl), u, vl);
1346
    if (!__riscv_vcpop_m_b16(c, vl))
1347
        return __riscv_vfmacc_vv_f32m2(k, j, k, vl);
1348
    const vbool16_t  dm = __riscv_vmfle_vf_f32m2_b16(n, 0.0f, vl);
1349
    const vuint32m2_t d = __riscv_vmerge_vxm_u32m2(__riscv_vmv_v_x_u32m2(0, vl), 0x82000000, dm, vl);
1350
    const vfloat32m2_t s1 = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vadd_vx_u32m2(d, 0x7f000000, vl));
1351
    const vfloat32m2_t s2 = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vsub_vv_u32m2(e, d, vl));
1352
    const vfloat32m2_t r1 = __riscv_vmerge_vvm_f32m2(
1353
        __riscv_vfmacc_vv_f32m2(k, k, j, vl),
1354
        __riscv_vfmul_vv_f32m2(__riscv_vfmacc_vv_f32m2(s2, s2, j, vl), s1, vl),
1355
        c, vl);
1356
    return __riscv_vmerge_vvm_f32m2(
1357
        r1, __riscv_vfmul_vv_f32m2(s1, s1, vl),
1358
        __riscv_vmfgt_vf_f32m2_b16(__riscv_vfabs_v_f32m2(n, vl), 192.0f, vl),
1359
        vl);
1360
}
1361
1362
// computes silu x/(1+exp(-x)) in single precision vector
1363
inline static vfloat32m2_t ggml_v_silu_m2(vfloat32m2_t x, int vl) {
1364
    const vfloat32m2_t neg_x = __riscv_vfneg_v_f32m2(x, vl);
1365
    const vfloat32m2_t exp_neg_x = ggml_v_expf_m2(neg_x, vl);
1366
    const vfloat32m2_t one_plus_exp_neg_x = __riscv_vfadd_vf_f32m2(exp_neg_x, 1.0f, vl);
1367
    return __riscv_vfdiv_vv_f32m2(x, one_plus_exp_neg_x, vl);
1368
}
1369
1370
#endif // __ARM_NEON / __AVX2__ / __SSE2__ / __riscv_v_intrinsic
1371
1372
0
// Element-wise SiLU on f16 data: y[i] = ggml_silu_f16(x[i]) for i in [0, n).
inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = ggml_silu_f16(x[i]);
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_silu_f16
Unexecuted instantiation: vec.cpp:ggml_vec_silu_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_silu_f16(int, unsigned short*, unsigned short const*)
1377
1378
0
inline static float ggml_silu_backward_f32(float x, float dy) {
1379
0
    const float s = 1.0f/(1.0f + expf(-x));
1380
0
    return dy*s*(1.0f + x*(1.0f - s));
1381
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_silu_backward_f32
Unexecuted instantiation: vec.cpp:ggml_silu_backward_f32(float, float)
Unexecuted instantiation: ops.cpp:ggml_silu_backward_f32(float, float)
1382
1383
0
// Backward pass of SiLU for one f16 element; the arithmetic is done in f32.
// With v = f32(x) and s = 1/(1 + exp(-v)), returns f16(f32(dy) * s * (1 + v*(1 - s))).
inline static ggml_fp16_t ggml_silu_backward_f16(ggml_fp16_t x, ggml_fp16_t dy) {
    const float v = GGML_CPU_FP16_TO_FP32(x);
    const float s = 1.0f/(1.0f + expf(-v));
    return GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(dy)*s*(1.0f + v*(1.0f - s)));
}
Unexecuted instantiation: ggml-cpu.c:ggml_silu_backward_f16
Unexecuted instantiation: vec.cpp:ggml_silu_backward_f16(unsigned short, unsigned short)
Unexecuted instantiation: ops.cpp:ggml_silu_backward_f16(unsigned short, unsigned short)
1388
1389
0
// Element-wise SiLU backward: dx[i] = ggml_silu_backward_f32(x[i], dy[i]) for i in [0, n).
inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
    for (int i = 0; i < n; ++i) {
        dx[i] = ggml_silu_backward_f32(x[i], dy[i]);
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_silu_backward_f32
Unexecuted instantiation: vec.cpp:ggml_vec_silu_backward_f32(int, float*, float const*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_silu_backward_f32(int, float*, float const*, float const*)
1394
1395
0
// Element-wise SiLU backward on f16 data: dx[i] = ggml_silu_backward_f16(x[i], dy[i]) for i in [0, n).
inline static void ggml_vec_silu_backward_f16(const int n, ggml_fp16_t * dx, const ggml_fp16_t * x, const ggml_fp16_t * dy) {
    for (int i = 0; i < n; ++i) {
        dx[i] = ggml_silu_backward_f16(x[i], dy[i]);
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_silu_backward_f16
Unexecuted instantiation: vec.cpp:ggml_vec_silu_backward_f16(int, unsigned short*, unsigned short const*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_silu_backward_f16(int, unsigned short*, unsigned short const*, unsigned short const*)
1400
1401
0
// ReGLU: y[i] = relu(x[i]) * g[i], i.e. x[i]*g[i] when x[i] > 0 and 0 otherwise.
// A non-positive (or NaN) x[i] yields exactly 0.f regardless of g[i].
inline static void ggml_vec_reglu_f32 (const int n, float * y, const float * x, const float * g) {
    for (int i = 0; i < n; ++i) {
        const float xi = x[i];
        y[i] = (xi > 0.f) ? xi * g[i] : 0.f;
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_reglu_f32
Unexecuted instantiation: vec.cpp:ggml_vec_reglu_f32(int, float*, float const*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_reglu_f32(int, float*, float const*, float const*)
1406
1407
0
// ReGLU on f16 data, computed in f32: y[i] = f16(v > 0 ? v * f32(g[i]) : 0) with v = f32(x[i]).
inline static void ggml_vec_reglu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
    for (int i = 0; i < n; ++i) {
        const float v = GGML_CPU_FP16_TO_FP32(x[i]);
        y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v * GGML_CPU_FP16_TO_FP32(g[i]) : 0.f);
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_reglu_f16
Unexecuted instantiation: vec.cpp:ggml_vec_reglu_f16(int, unsigned short*, unsigned short const*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_reglu_f16(int, unsigned short*, unsigned short const*, unsigned short const*)
1413
1414
#ifdef GGML_GELU_FP16
1415
0
inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x, const float * g) {
1416
0
    uint16_t t;
1417
0
    for (int i = 0; i < n; ++i) {
1418
0
        if (x[i] <= -10.0f) {
1419
0
            y[i] = 0.0f;
1420
0
        } else if (x[i] >= 10.0f) {
1421
0
            y[i] = x[i] * g[i];
1422
0
        } else {
1423
0
            ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
1424
0
            memcpy(&t, &fp16, sizeof(uint16_t));
1425
0
            y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[t]) * g[i];
1426
0
        }
1427
0
    }
1428
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_geglu_f32
Unexecuted instantiation: vec.cpp:ggml_vec_geglu_f32(int, float*, float const*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_geglu_f32(int, float*, float const*, float const*)
1429
#else
1430
// GEGLU on f32 data without a lookup table: y[i] = ggml_gelu_f32(x[i]) * g[i].
inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x, const float * g) {
    for (int i = 0; i < n; ++i) {
        y[i] = ggml_gelu_f32(x[i]) * g[i];
    }
}
1435
#endif
1436
1437
0
// GEGLU on f16 data: the raw bits of x[i] index ggml_table_gelu_f16; the table value is
// multiplied by g[i] in f32 and the product is rounded back to f16.
inline static void ggml_vec_geglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
    const uint16_t * i16 = (const uint16_t *) x;
    for (int i = 0; i < n; ++i) {
        const float v = GGML_CPU_FP16_TO_FP32(g[i]);
        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[i16[i]]) * v);
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_geglu_f16
Unexecuted instantiation: vec.cpp:ggml_vec_geglu_f16(int, unsigned short*, unsigned short const*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_geglu_f16(int, unsigned short*, unsigned short const*, unsigned short const*)
1444
1445
void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float * g);
1446
1447
0
// SwiGLU on f16 data, computed in f32: y[i] = f16(silu(xi) * gi) with silu(xi) = xi / (1 + exp(-xi)).
inline static void ggml_vec_swiglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
    for (int i = 0; i < n; ++i) {
        const float xi = GGML_CPU_FP16_TO_FP32(x[i]);
        const float gi = GGML_CPU_FP16_TO_FP32(g[i]);
        y[i] = GGML_CPU_FP32_TO_FP16((xi/(1.0f + expf(-xi))) * gi);
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_swiglu_f16
Unexecuted instantiation: vec.cpp:ggml_vec_swiglu_f16(int, unsigned short*, unsigned short const*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_swiglu_f16(int, unsigned short*, unsigned short const*, unsigned short const*)
1454
1455
0
inline static void ggml_vec_geglu_erf_f32(const int n, float * y, const float * x, const float * g) {
1456
0
    for (int i = 0; i < n; ++i) {
1457
0
        float xi = x[i];
1458
0
        y[i] = 0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * g[i];
1459
0
    }
1460
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_geglu_erf_f32
Unexecuted instantiation: vec.cpp:ggml_vec_geglu_erf_f32(int, float*, float const*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_geglu_erf_f32(int, float*, float const*, float const*)
1461
1462
0
// erf-based GEGLU on f16 data, computed in f32:
//   y[i] = f16(0.5 * xi * (1 + erff(xi * SQRT_2_INV)) * gi)
inline static void ggml_vec_geglu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
    for (int i = 0; i < n; ++i) {
        const float xi = GGML_CPU_FP16_TO_FP32(x[i]);
        const float gi = GGML_CPU_FP16_TO_FP32(g[i]);
        y[i] = GGML_CPU_FP32_TO_FP16(0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * gi);
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_geglu_erf_f16
Unexecuted instantiation: vec.cpp:ggml_vec_geglu_erf_f16(int, unsigned short*, unsigned short const*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_geglu_erf_f16(int, unsigned short*, unsigned short const*, unsigned short const*)
1469
1470
#ifdef GGML_GELU_QUICK_FP16
1471
0
inline static void ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
1472
0
    uint16_t t;
1473
0
    for (int i = 0; i < n; ++i) {
1474
0
        ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
1475
0
        memcpy(&t, &fp16, sizeof(uint16_t));
1476
0
        y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]) * g[i];
1477
0
    }
1478
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_geglu_quick_f32
Unexecuted instantiation: vec.cpp:ggml_vec_geglu_quick_f32(int, float*, float const*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_geglu_quick_f32(int, float*, float const*, float const*)
1479
#else
1480
// Quick-GELU GEGLU on f32 data without a lookup table: y[i] = ggml_gelu_quick_f32(x[i]) * g[i].
inline static void ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
    for (int i = 0; i < n; ++i) {
        y[i] = ggml_gelu_quick_f32(x[i]) * g[i];
    }
}
1485
#endif
1486
1487
0
// Quick-GELU GEGLU on f16 data: the raw bits of x[i] index ggml_table_gelu_quick_f16; the
// table value is multiplied by g[i] in f32 and the product is rounded back to f16.
inline static void ggml_vec_geglu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
    const uint16_t * i16 = (const uint16_t *) x;
    for (int i = 0; i < n; ++i) {
        const float v = GGML_CPU_FP16_TO_FP32(g[i]);
        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[i16[i]]) * v);
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_geglu_quick_f16
Unexecuted instantiation: vec.cpp:ggml_vec_geglu_quick_f16(int, unsigned short*, unsigned short const*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_geglu_quick_f16(int, unsigned short*, unsigned short const*, unsigned short const*)
1494
1495
0
inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
1496
0
#ifndef GGML_USE_ACCELERATE
1497
0
    ggml_float sum = 0.0;
1498
0
    for (int i = 0; i < n; ++i) {
1499
0
        sum += (ggml_float)x[i];
1500
0
    }
1501
0
    *s = (float)sum;
1502
#else
1503
    vDSP_sve(x, 1, s, n);
1504
#endif
1505
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_sum_f32
Unexecuted instantiation: vec.cpp:ggml_vec_sum_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_sum_f32(int, float*, float const*)
1506
1507
0
// Inclusive prefix sum: y[0] = x[0], y[i] = y[i-1] + x[i] for 0 < i < n.
// Nothing is written when n <= 0.
inline static void ggml_vec_cumsum_f32(const int n, float * y, const float * x) {
    if (n <= 0) {
        return;
    }
    // first element handled outside the loop so the loop body carries no i == 0 branch
    y[0] = x[0];
    for (int i = 1; i < n; ++i) {
        y[i] = y[i - 1] + x[i];
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_cumsum_f32
Unexecuted instantiation: vec.cpp:ggml_vec_cumsum_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_cumsum_f32(int, float*, float const*)
1516
1517
0
// Sum of x[0..n), accumulated and returned in ggml_float precision through *s.
inline static void ggml_vec_sum_f32_ggf(const int n, ggml_float * s, const float * x) {
    ggml_float sum = 0.0;
    for (int i = 0; i < n; ++i) {
        sum += (ggml_float)x[i];
    }
    *s = sum;
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_sum_f32_ggf
Unexecuted instantiation: vec.cpp:ggml_vec_sum_f32_ggf(int, double*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_sum_f32_ggf(int, double*, float const*)
1524
1525
0
// Sum of f16 values x[0..n), each widened to f32; written to *s.
// Note: despite the _ggf suffix the accumulator here is a plain float.
inline static void ggml_vec_sum_f16_ggf(const int n, float * s, const ggml_fp16_t * x) {
    float sum = 0.0f;
    for (int i = 0; i < n; ++i) {
        sum += GGML_CPU_FP16_TO_FP32(x[i]);
    }
    *s = sum;
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_sum_f16_ggf
Unexecuted instantiation: vec.cpp:ggml_vec_sum_f16_ggf(int, float*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_sum_f16_ggf(int, float*, unsigned short const*)
1532
1533
0
// Sum of bf16 values x[0..n), each widened to f32; written to *s.
// Note: despite the _ggf suffix the accumulator here is a plain float.
inline static void ggml_vec_sum_bf16_ggf(const int n, float * s, const ggml_bf16_t * x) {
    float sum = 0.0f;
    for (int i = 0; i < n; ++i) {
        sum += GGML_BF16_TO_FP32(x[i]);
    }
    *s = sum;
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_sum_bf16_ggf
Unexecuted instantiation: vec.cpp:ggml_vec_sum_bf16_ggf(int, float*, ggml_bf16_t const*)
Unexecuted instantiation: ops.cpp:ggml_vec_sum_bf16_ggf(int, float*, ggml_bf16_t const*)
1540
1541
0
// Maximum of x[0..n) written to *s.
// Without GGML_USE_ACCELERATE the scan starts from -INFINITY, so n <= 0 yields -INFINITY;
// with it, the work is delegated to vDSP_maxv.
inline static void ggml_vec_max_f32(const int n, float * s, const float * x) {
#ifndef GGML_USE_ACCELERATE
    float max = -INFINITY;
    for (int i = 0; i < n; ++i) {
        max = MAX(max, x[i]);
    }
    *s = max;
#else
    vDSP_maxv(x, 1, s, n);
#endif
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_max_f32
Unexecuted instantiation: vec.cpp:ggml_vec_max_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_max_f32(int, float*, float const*)
1552
1553
0
// Writes to *s the reciprocal of the value ggml_vec_norm_f32(n, s, x) produces.
// There is no guard against that value being zero.
inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x) {
    ggml_vec_norm_f32(n, s, x);
    *s = 1.f/(*s);
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_norm_inv_f32
Unexecuted instantiation: vec.cpp:ggml_vec_norm_inv_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_norm_inv_f32(int, float*, float const*)
1557
1558
0
// Index of the maximum of x[0..n) written to *s.
// The index is refreshed whenever x[i] equals the running maximum, so among equal
// maxima the LAST position is reported. *s is 0 when n <= 0.
inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) {
    float max = -INFINITY;
    int idx = 0;
    for (int i = 0; i < n; ++i) {
        max = MAX(max, x[i]);
        if (max == x[i]) {
            idx = i;
        }
    }
    *s = idx;
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_argmax_f32
Unexecuted instantiation: vec.cpp:ggml_vec_argmax_f32(int, int*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_argmax_f32(int, int*, float const*)
1567
1568
#ifdef __cplusplus
1569
}
1570
#endif