Coverage Report

Created: 2026-01-10 06:25

/src/llama.cpp/ggml/src/ggml-cpu/vec.h
  Line|  Count|Source
     1|       |// Vectorized functions for fundamental operations
     2|       |
     3|       |#pragma once
     4|       |
     5|       |#include "ggml-impl.h"
     6|       |#include "simd-mappings.h"
     7|       |#include "ggml.h"
     8|       |#include "ggml-cpu.h"
     9|       |
    10|       |#if defined(GGML_USE_ACCELERATE)
    11|       |#include <Accelerate/Accelerate.h>
    12|       |#endif
    13|       |
    14|       |// floating point type used to accumulate sums
    15|       |typedef double ggml_float;
    16|       |
    17|       |#define GGML_GELU_FP16
    18|       |#define GGML_GELU_QUICK_FP16
    19|       |
    20|      0|#define GGML_SOFT_MAX_UNROLL 4
    21|       |#define GGML_VEC_DOT_UNROLL  2
    22|      0|#define GGML_VEC_MAD_UNROLL  32
    23|       |
    24|       |#ifdef __cplusplus
    25|       |extern "C" {
    26|       |#endif
    27|       |
    28|       |//
    29|       |// global data
    30|       |//
    31|       |
    32|       |// precomputed gelu table for f16 (128 KB)
    33|       |extern ggml_fp16_t ggml_table_gelu_f16[1 << 16];
    34|       |
    35|       |// precomputed quick gelu table for f16 (128 KB)
    36|       |extern ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
    37|       |
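Each of these tables holds 1 << 16 = 65536 ggml_fp16_t entries of 2 bytes, hence the "128 KB" in the comments: one precomputed output per possible fp16 bit pattern. A minimal sketch of how such a table is typically consulted, assuming (this header does not show it) that the index is the raw bit pattern of the half-precision input:

    #include <stdint.h>
    #include <string.h>

    static inline ggml_fp16_t gelu_f16_lookup(ggml_fp16_t x) {
        uint16_t idx;
        memcpy(&idx, &x, sizeof(idx));    // reinterpret the fp16 bits as a table index
        return ggml_table_gelu_f16[idx];  // one load instead of a tanhf evaluation
    }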
    38|       |//
    39|       |// fundamental operations
    40|       |//
    41|       |
    42|       |void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc);
    43|       |void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc);
    44|       |void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc);
    45|       |
    46|       |void ggml_vec_silu_f32(const int n, float * y, const float * x);
    47|       |ggml_float ggml_vec_cvar_f32(const int n, float * y, const float * x, const float mean); // it will also center y (y = y - mean)
    48|       |ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max);
    49|       |ggml_float ggml_vec_log_soft_max_f32(const int n, float * y, const float * x, float max);
    50|       |
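A minimal usage sketch for ggml_vec_dot_f32, assuming that with nrc == 1 the stride arguments bs, bx, and by are not used and a single n-element dot product is written to *s:

    float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    float y[4] = {4.0f, 3.0f, 2.0f, 1.0f};
    float s    = 0.0f;
    ggml_vec_dot_f32(4, &s, 0, x, 0, y, 0, 1);  // s == 1*4 + 2*3 + 3*2 + 4*1 == 20.0f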
    51|      0|inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_set_i8
Unexecuted instantiation: vec.cpp:ggml_vec_set_i8(int, signed char*, signed char)
Unexecuted instantiation: ops.cpp:ggml_vec_set_i8(int, signed char*, signed char)
    52|      0|inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_set_i16
Unexecuted instantiation: vec.cpp:ggml_vec_set_i16(int, short*, short)
Unexecuted instantiation: ops.cpp:ggml_vec_set_i16(int, short*, short)
    53|       |
    54|      0|inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t   v) { for (int i = 0; i < n; ++i) x[i] = v;    }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_set_i32
Unexecuted instantiation: vec.cpp:ggml_vec_set_i32(int, int*, int)
Unexecuted instantiation: ops.cpp:ggml_vec_set_i32(int, int*, int)
    55|      0|inline static void ggml_vec_cpy_i32(const int n, int32_t * y, const int32_t * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_cpy_i32
Unexecuted instantiation: vec.cpp:ggml_vec_cpy_i32(int, int*, int const*)
Unexecuted instantiation: ops.cpp:ggml_vec_cpy_i32(int, int*, int const*)
    56|       |
    57|      0|inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const ggml_fp16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_set_f16
Unexecuted instantiation: vec.cpp:ggml_vec_set_f16(int, unsigned short*, unsigned short)
Unexecuted instantiation: ops.cpp:ggml_vec_set_f16(int, unsigned short*, unsigned short)
    58|      0|inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_set_bf16
Unexecuted instantiation: vec.cpp:ggml_vec_set_bf16(int, ggml_bf16_t*, ggml_bf16_t)
Unexecuted instantiation: ops.cpp:ggml_vec_set_bf16(int, ggml_bf16_t*, ggml_bf16_t)
    59|       |
    60|      0|inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) {
    61|      0|    int i = 0;
    62|      0|#if defined(__AVX2__)
    63|      0|    for (; i + 7 < n; i += 8) {
    64|      0|        __m256 vx = _mm256_loadu_ps(x + i);
    65|      0|        __m256 vy = _mm256_loadu_ps(y + i);
    66|      0|        __m256 vz = _mm256_add_ps(vx, vy);
    67|      0|        _mm256_storeu_ps(z + i, vz);
    68|      0|    }
    69|      0|#endif
    70|      0|    for (; i < n; ++i) {
    71|      0|        z[i] = x[i] + y[i];
    72|      0|    }
    73|      0|}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_add_f32
Unexecuted instantiation: vec.cpp:ggml_vec_add_f32(int, float*, float const*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_add_f32(int, float*, float const*, float const*)
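ggml_vec_add_f32 above shows the pattern used throughout this header: a SIMD body that consumes full vectors (8 floats per AVX2 iteration) followed by a scalar loop for the tail, so any n produces the same result as pure scalar code. A small self-contained check (a hypothetical harness, not part of vec.h):

    #include <assert.h>

    static void check_vec_add(void) {
        float x[11], y[11], z[11];
        for (int i = 0; i < 11; ++i) { x[i] = (float)i; y[i] = 1.0f; }
        ggml_vec_add_f32(11, z, x, y);  // 8 lanes via AVX2 (if enabled) + 3 scalar iterations
        for (int i = 0; i < 11; ++i) {
            assert(z[i] == (float)i + 1.0f);
        }
    }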
    74|       |
    75|      0|inline static void ggml_vec_add_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
    76|      0|    for (int i = 0; i < n; ++i) {
    77|      0|        z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) + GGML_CPU_FP16_TO_FP32(y[i]));
    78|      0|    }
    79|      0|}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_add_f16
Unexecuted instantiation: vec.cpp:ggml_vec_add_f16(int, unsigned short*, unsigned short const*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_add_f16(int, unsigned short*, unsigned short const*, unsigned short const*)
    80|      0|inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float   v) { for (int i = 0; i < n; ++i) z[i]  = x[i] + v;    }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_add1_f32
Unexecuted instantiation: vec.cpp:ggml_vec_add1_f32(int, float*, float const*, float)
Unexecuted instantiation: ops.cpp:ggml_vec_add1_f32(int, float*, float const*, float)
    81|      0|inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x)                  { for (int i = 0; i < n; ++i) y[i] += x[i];        }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_acc_f32
Unexecuted instantiation: vec.cpp:ggml_vec_acc_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_acc_f32(int, float*, float const*)
    82|      0|inline static void ggml_vec_acc1_f32(const int n, float * y, const float   v)                  { for (int i = 0; i < n; ++i) y[i] += v;           }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_acc1_f32
Unexecuted instantiation: vec.cpp:ggml_vec_acc1_f32(int, float*, float)
Unexecuted instantiation: ops.cpp:ggml_vec_acc1_f32(int, float*, float)
    83|      0|inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i] - y[i]; }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_sub_f32
Unexecuted instantiation: vec.cpp:ggml_vec_sub_f32(int, float*, float const*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_sub_f32(int, float*, float const*, float const*)
    84|      0|inline static void ggml_vec_sub_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
    85|      0|    for (int i = 0; i < n; ++i) {
    86|      0|        z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) - GGML_CPU_FP16_TO_FP32(y[i]));
    87|      0|    }
    88|      0|}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_sub_f16
Unexecuted instantiation: vec.cpp:ggml_vec_sub_f16(int, unsigned short*, unsigned short const*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_sub_f16(int, unsigned short*, unsigned short const*, unsigned short const*)
    89|      0|inline static void ggml_vec_set_f32 (const int n, float * x, const float   v)                  { for (int i = 0; i < n; ++i) x[i]  = v;           }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_set_f32
Unexecuted instantiation: vec.cpp:ggml_vec_set_f32(int, float*, float)
Unexecuted instantiation: ops.cpp:ggml_vec_set_f32(int, float*, float)
    90|      0|inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x)                  { for (int i = 0; i < n; ++i) y[i]  = x[i];        }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_cpy_f32
Unexecuted instantiation: vec.cpp:ggml_vec_cpy_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_cpy_f32(int, float*, float const*)
    91|      0|inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x)                  { for (int i = 0; i < n; ++i) y[i]  = -x[i];       }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_neg_f32
Unexecuted instantiation: vec.cpp:ggml_vec_neg_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_neg_f32(int, float*, float const*)
    92|      0|inline static void ggml_vec_neg_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    93|      0|    for (int i = 0; i < n; ++i) {
    94|      0|        y[i] = GGML_CPU_FP32_TO_FP16(-GGML_CPU_FP16_TO_FP32(x[i]));
    95|      0|    }
    96|      0|}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_neg_f16
Unexecuted instantiation: vec.cpp:ggml_vec_neg_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_neg_f16(int, unsigned short*, unsigned short const*)
    97|       |
    98|      0|inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i]*y[i];   }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_mul_f32
Unexecuted instantiation: vec.cpp:ggml_vec_mul_f32(int, float*, float const*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_mul_f32(int, float*, float const*, float const*)
    99|      0|inline static void ggml_vec_mul_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
   100|      0|    for (int i = 0; i < n; ++i) {
   101|      0|        z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) * GGML_CPU_FP16_TO_FP32(y[i]));
   102|      0|    }
   103|      0|}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_mul_f16
Unexecuted instantiation: vec.cpp:ggml_vec_mul_f16(int, unsigned short*, unsigned short const*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_mul_f16(int, unsigned short*, unsigned short const*, unsigned short const*)
   104|      0|inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i]/y[i];   }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_div_f32
Unexecuted instantiation: vec.cpp:ggml_vec_div_f32(int, float*, float const*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_div_f32(int, float*, float const*, float const*)
   105|      0|inline static void ggml_vec_div_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
   106|      0|    for (int i = 0; i < n; ++i) {
   107|      0|        z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) / GGML_CPU_FP16_TO_FP32(y[i]));
   108|      0|    }
   109|      0|}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_div_f16
Unexecuted instantiation: vec.cpp:ggml_vec_div_f16(int, unsigned short*, unsigned short const*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_div_f16(int, unsigned short*, unsigned short const*, unsigned short const*)
   110|       |
   111|       |// compute GGML_VEC_DOT_UNROLL dot products at once
   112|       |// xs - x row stride in bytes
   113|      0|inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GGML_RESTRICT s, void * GGML_RESTRICT xv, ggml_fp16_t * GGML_RESTRICT y) {
   114|      0|    ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 };
   115|      0|
   116|      0|    ggml_fp16_t * GGML_RESTRICT x[GGML_VEC_DOT_UNROLL];
   117|      0|
   118|      0|    for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
   119|      0|        x[i] = (ggml_fp16_t *) ((char *) xv + i*xs);
   120|      0|    }
   121|      0|
   122|      0|#if defined(GGML_SIMD)
   123|      0|    #if defined(__ARM_FEATURE_SVE)
   124|      0|
   125|      0|        const int sve_register_length = svcntb() * 8;
   126|      0|        const int ggml_f16_epr = sve_register_length / 16; // running when 16
   127|      0|        const int ggml_f16_step = 8 * ggml_f16_epr; // choose 8 SVE registers
   128|      0|
   129|      0|        const int np = (n & ~(ggml_f16_step - 1));
   130|      0|
   131|      0|        svfloat16_t sum_00 = svdup_n_f16(0.0f);
   132|      0|        svfloat16_t sum_01 = svdup_n_f16(0.0f);
   133|      0|        svfloat16_t sum_02 = svdup_n_f16(0.0f);
   134|      0|        svfloat16_t sum_03 = svdup_n_f16(0.0f);
   135|      0|
   136|      0|        svfloat16_t sum_10 = svdup_n_f16(0.0f);
   137|      0|        svfloat16_t sum_11 = svdup_n_f16(0.0f);
   138|      0|        svfloat16_t sum_12 = svdup_n_f16(0.0f);
   139|      0|        svfloat16_t sum_13 = svdup_n_f16(0.0f);
   140|      0|
   141|      0|        svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
   142|      0|        svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
   143|      0|
   144|      0|        for (int i = 0; i < np; i += ggml_f16_step) {
   145|      0|            ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0); // 8 elements
   146|      0|
   147|      0|            ax1 = GGML_F16x_VEC_LOAD(x[0] + i + 0*ggml_f16_epr, 0); // 8 elements
   148|      0|            sum_00 = GGML_F16x_VEC_FMA(sum_00, ax1, ay1);     // sum_00 = sum_00+ax1*ay1
   149|      0|            ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 0*ggml_f16_epr, 0); // 8 elements
   150|      0|            sum_10 = GGML_F16x_VEC_FMA(sum_10, ax1, ay1);
   151|      0|
   152|      0|            ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1); // next 8 elements
   153|      0|
   154|      0|            ax2 = GGML_F16x_VEC_LOAD(x[0] + i + 1*ggml_f16_epr, 1); // next 8 elements
   155|      0|            sum_01 = GGML_F16x_VEC_FMA(sum_01, ax2, ay2);
   156|      0|            ax2 = GGML_F16x_VEC_LOAD(x[1] + i + 1*ggml_f16_epr, 1);
   157|      0|            sum_11 = GGML_F16x_VEC_FMA(sum_11, ax2, ay2);
   158|      0|
   159|      0|            ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
   160|      0|
   161|      0|            ax3 = GGML_F16x_VEC_LOAD(x[0] + i + 2*ggml_f16_epr, 2);
   162|      0|            sum_02 = GGML_F16x_VEC_FMA(sum_02, ax3, ay3);
   163|      0|            ax3 = GGML_F16x_VEC_LOAD(x[1] + i + 2*ggml_f16_epr, 2);
   164|      0|            sum_12 = GGML_F16x_VEC_FMA(sum_12, ax3, ay3);
   165|      0|
   166|      0|            ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
   167|      0|
   168|      0|            ax4 = GGML_F16x_VEC_LOAD(x[0] + i + 3*ggml_f16_epr, 3);
   169|      0|            sum_03 = GGML_F16x_VEC_FMA(sum_03, ax4, ay4);
   170|      0|            ax4 = GGML_F16x_VEC_LOAD(x[1] + i + 3*ggml_f16_epr, 3);
   171|      0|            sum_13 = GGML_F16x_VEC_FMA(sum_13, ax4, ay4);
   172|      0|
   173|      0|            ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
   174|      0|
   175|      0|            ax5 = GGML_F16x_VEC_LOAD(x[0] + i + 4*ggml_f16_epr, 4);
   176|      0|
   177|      0|            sum_00 = GGML_F16x_VEC_FMA(sum_00, ax5, ay5);
   178|      0|            ax5 = GGML_F16x_VEC_LOAD(x[1] + i + 4*ggml_f16_epr, 4);
   179|      0|            sum_10 = GGML_F16x_VEC_FMA(sum_10, ax5, ay5);
   180|      0|
   181|      0|            ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
   182|      0|
   183|      0|            ax6 = GGML_F16x_VEC_LOAD(x[0] + i + 5*ggml_f16_epr, 5);
   184|      0|
   185|      0|            sum_01 = GGML_F16x_VEC_FMA(sum_01, ax6, ay6);
   186|      0|            ax6 = GGML_F16x_VEC_LOAD(x[1] + i + 5*ggml_f16_epr, 5);
   187|      0|            sum_11 = GGML_F16x_VEC_FMA(sum_11, ax6, ay6);
   188|      0|
   189|      0|            ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
   190|      0|
   191|      0|            ax7 = GGML_F16x_VEC_LOAD(x[0] + i + 6*ggml_f16_epr, 6);
   192|      0|
   193|      0|            sum_02 = GGML_F16x_VEC_FMA(sum_02, ax7, ay7);
   194|      0|            ax7 = GGML_F16x_VEC_LOAD(x[1] + i + 6*ggml_f16_epr, 6);
   195|      0|            sum_12 = GGML_F16x_VEC_FMA(sum_12, ax7, ay7);
   196|      0|
   197|      0|            ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
   198|      0|
   199|      0|            ax8 = GGML_F16x_VEC_LOAD(x[0] + i + 7*ggml_f16_epr, 7);
   200|      0|
   201|      0|            sum_03 = GGML_F16x_VEC_FMA(sum_03, ax8, ay8);
   202|      0|            ax8 = GGML_F16x_VEC_LOAD(x[1] + i + 7*ggml_f16_epr, 7);
   203|      0|            sum_13 = GGML_F16x_VEC_FMA(sum_13, ax8, ay8);
   204|      0|        }
   205|      0|
   206|      0|        const int np2 = (n & ~(ggml_f16_epr - 1));
   207|      0|        for (int k = np; k < np2; k += ggml_f16_epr) {
   208|      0|            svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
   209|      0|
   210|      0|            svfloat16_t rx = GGML_F16x_VEC_LOAD(x[0] + k, 0);
   211|      0|            sum_00 = GGML_F16x_VEC_FMA(sum_00, rx, ry);
   212|      0|            rx = GGML_F16x_VEC_LOAD(x[1] + k, 0);
   213|      0|            sum_10 = GGML_F16x_VEC_FMA(sum_10, rx, ry);
   214|      0|        }
   215|      0|
   216|      0|        if (np2 < n) {
   217|      0|            svbool_t pg = svwhilelt_b16(np2, n);
   218|      0|            svfloat16_t hx_0 = svld1_f16(pg, (const __fp16 *)(x[0] + np2));
   219|      0|            svfloat16_t hx_1 = svld1_f16(pg, (const __fp16 *)(x[1] + np2));
   220|      0|            svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
   221|      0|
   222|      0|            sum_00 = svmad_f16_x(pg, hx_0, hy, sum_00);
   223|      0|            sum_10 = svmad_f16_x(pg, hx_1, hy, sum_10);
   224|      0|        }
   225|      0|        GGML_F16x_VEC_REDUCE(sumf[0], sum_00, sum_01, sum_02, sum_03);
   226|      0|        GGML_F16x_VEC_REDUCE(sumf[1], sum_10, sum_11, sum_12, sum_13);
   227|      0|
   228|      0|    #elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfh)
   229|      0|        size_t vl = __riscv_vsetvlmax_e32m4();
   230|      0|
   231|      0|        // initialize accumulators to all zeroes
   232|      0|        vfloat32m4_t vsum0_0 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
   233|      0|        vfloat32m4_t vsum0_1 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
   234|      0|        vfloat32m4_t vsum1_0 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
   235|      0|        vfloat32m4_t vsum1_1 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
   236|      0|
   237|      0|        // calculate step size
   238|      0|        const size_t epr = __riscv_vsetvlmax_e16m2();
   239|      0|        const size_t step = epr * 2;
   240|      0|        const int np = (n & ~(step - 1));
   241|      0|
   242|      0|        // unroll by 2 along the row dimension
   243|      0|        for (int i = 0; i < np; i += step) {
   244|      0|            vfloat16m2_t ay0 = __riscv_vle16_v_f16m2((const _Float16 *)(y + i), epr);
   245|      0|            vfloat16m2_t ax0_0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i), epr);
   246|      0|            vfloat16m2_t ax1_0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i), epr);
   247|      0|            vsum0_0 = __riscv_vfwmacc_vv_f32m4(vsum0_0, ax0_0, ay0, epr);
   248|      0|            vsum1_0 = __riscv_vfwmacc_vv_f32m4(vsum1_0, ax1_0, ay0, epr);
   249|      0|
   250|      0|            vfloat16m2_t ay1 = __riscv_vle16_v_f16m2((const _Float16 *)(y + i + epr), epr);
   251|      0|            vfloat16m2_t ax0_1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i + epr), epr);
   252|      0|            vfloat16m2_t ax1_1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i + epr), epr);
   253|      0|            vsum0_1 = __riscv_vfwmacc_vv_f32m4(vsum0_1, ax0_1, ay1, epr);
   254|      0|            vsum1_1 = __riscv_vfwmacc_vv_f32m4(vsum1_1, ax1_1, ay1, epr);
   255|      0|        }
   256|      0|
   257|      0|        vfloat32m4_t vsum0 = __riscv_vfadd_vv_f32m4(vsum0_0, vsum0_1, vl);
   258|      0|        vfloat32m4_t vsum1 = __riscv_vfadd_vv_f32m4(vsum1_0, vsum1_1, vl);
   259|      0|
   260|      0|        // leftovers
   261|      0|        for (int i = np; i < n; i += vl) {
   262|      0|            vl = __riscv_vsetvl_e16m2(n - i);
   263|      0|            vfloat16m2_t ay = __riscv_vle16_v_f16m2((const _Float16 *)(y + i), vl);
   264|      0|            vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i), vl);
   265|      0|            vfloat16m2_t ax1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i), vl);
   266|      0|
   267|      0|            vsum0 = __riscv_vfwmacc_vv_f32m4(vsum0, ax0, ay, vl);
   268|      0|            vsum1 = __riscv_vfwmacc_vv_f32m4(vsum1, ax1, ay, vl);
   269|      0|        }
   270|      0|
   271|      0|        // reduce
   272|      0|        vl = __riscv_vsetvlmax_e32m2();
   273|      0|        vfloat32m2_t acc0_0 = __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(vsum0, 0),
   274|      0|                                    __riscv_vget_v_f32m4_f32m2(vsum0, 1), vl);
   275|      0|        vl = __riscv_vsetvlmax_e32m1();
   276|      0|        vfloat32m1_t acc0_1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(acc0_0, 0),
   277|      0|                                    __riscv_vget_v_f32m2_f32m1(acc0_0, 1), vl);
   278|      0|        vfloat32m1_t redsum0 = __riscv_vfredusum_vs_f32m1_f32m1(
   279|      0|                                    acc0_1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
   280|      0|
   281|      0|        vl = __riscv_vsetvlmax_e32m2();
   282|      0|        vfloat32m2_t acc1_0 = __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(vsum1, 0),
   283|      0|                                    __riscv_vget_v_f32m4_f32m2(vsum1, 1), vl);
   284|      0|        vl = __riscv_vsetvlmax_e32m1();
   285|      0|        vfloat32m1_t acc1_1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(acc1_0, 0),
   286|      0|                                    __riscv_vget_v_f32m2_f32m1(acc1_0, 1), vl);
   287|      0|        vfloat32m1_t redsum1 = __riscv_vfredusum_vs_f32m1_f32m1(
   288|      0|                                    acc1_1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
   289|      0|        sumf[0] = __riscv_vfmv_f_s_f32m1_f32(redsum0);
   290|      0|        sumf[1] = __riscv_vfmv_f_s_f32m1_f32(redsum1);
   291|      0|
   292|      0|    #else
   293|      0|        const int np = (n & ~(GGML_F16_STEP - 1));
   294|      0|
   295|      0|        GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } };
   296|      0|
   297|      0|        GGML_F16_VEC ax[GGML_F16_ARR];
   298|      0|        GGML_F16_VEC ay[GGML_F16_ARR];
   299|      0|
   300|      0|        for (int i = 0; i < np; i += GGML_F16_STEP) {
   301|      0|            for (int j = 0; j < GGML_F16_ARR; j++) {
   302|      0|                ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
   303|      0|
   304|      0|                for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
   305|      0|                    ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j);
   306|      0|
   307|      0|                    sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]);
   308|      0|                }
   309|      0|            }
   310|      0|        }
   311|      0|
   312|      0|        // reduce sum0..sum3 to sum0
   313|      0|        for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
   314|      0|            GGML_F16_VEC_REDUCE(sumf[k], sum[k]);
   315|      0|        }
   316|      0|
   317|      0|        // leftovers
   318|      0|        for (int i = np; i < n; ++i) {
   319|      0|            for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
   320|      0|                sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
   321|      0|            }
   322|      0|        }
   323|      0|    #endif
   324|      0|#else
   325|      0|    for (int i = 0; i < n; ++i) {
   326|      0|        for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
   327|      0|            sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
   328|      0|        }
   329|      0|    }
   330|      0|#endif
   331|      0|
   332|      0|    for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
   333|      0|        s[i] = (float)sumf[i];
   334|      0|    }
   335|      0|}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_dot_f16_unroll
Unexecuted instantiation: vec.cpp:ggml_vec_dot_f16_unroll(int, int, float*, void*, unsigned short*)
Unexecuted instantiation: ops.cpp:ggml_vec_dot_f16_unroll(int, int, float*, void*, unsigned short*)
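The bound computations above (np, np2) all rely on the same trick: n & ~(step - 1) rounds n down to a multiple of step, valid because step is a power of two. An illustrative sketch with made-up values:

    int n    = 1003;
    int epr  = 8;                 // elements per vector register (example value)
    int step = 8 * epr;           // 8 registers processed per unrolled iteration
    int np   = n & ~(step - 1);   // 960:  bound for the 8-way unrolled loop
    int np2  = n & ~(epr  - 1);   // 1000: bound for the single-register cleanup loop
    // the final n - np2 elements (here 3) are handled with a predicated load/FMA/store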
   336|       |
   337|      0|inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) {
   338|      0|#if defined(GGML_SIMD)
   339|       |    #if defined(__ARM_FEATURE_SVE)
   340|       |
   341|       |        const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
   342|       |        const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
   343|       |        const int ggml_f32_step = 8 * ggml_f32_epr; // choose 8 SVE registers
   344|       |        GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
   345|       |
   346|       |        const int np = (n & ~(ggml_f32_step - 1));
   347|       |        svfloat32_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
   348|       |        svfloat32_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
   349|       |        for (int i = 0; i < np; i += ggml_f32_step) {
   350|       |
   351|       |            ax1 = GGML_F32_VEC_LOAD(x + i);
   352|       |            ay1 = GGML_F32_VEC_LOAD(y + i);
   353|       |            ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx);
   354|       |
   355|       |            GGML_F32_VEC_STORE(y + i, ay1);
   356|       |
   357|       |            ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
   358|       |            ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
   359|       |            ay2 = GGML_F32_VEC_FMA(ay2, ax2, vx);
   360|       |
   361|       |            GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);
   362|       |
   363|       |            ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
   364|       |            ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
   365|       |            ay3 = GGML_F32_VEC_FMA(ay3, ax3, vx);
   366|       |
   367|       |            GGML_F32_VEC_STORE(y + i + 2*ggml_f32_epr, ay3);
   368|       |
   369|       |            ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
   370|       |            ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
   371|       |            ay4 = GGML_F32_VEC_FMA(ay4, ax4, vx);
   372|       |
   373|       |            GGML_F32_VEC_STORE(y + i + 3*ggml_f32_epr, ay4);
   374|       |
   375|       |            ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
   376|       |            ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
   377|       |            ay5 = GGML_F32_VEC_FMA(ay5, ax5, vx);
   378|       |
   379|       |            GGML_F32_VEC_STORE(y + i + 4*ggml_f32_epr, ay5);
   380|       |
   381|       |            ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
   382|       |            ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
   383|       |            ay6 = GGML_F32_VEC_FMA(ay6, ax6, vx);
   384|       |
   385|       |            GGML_F32_VEC_STORE(y + i + 5*ggml_f32_epr, ay6);
   386|       |
   387|       |            ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
   388|       |            ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
   389|       |            ay7 = GGML_F32_VEC_FMA(ay7, ax7, vx);
   390|       |
   391|       |            GGML_F32_VEC_STORE(y + i + 6*ggml_f32_epr, ay7);
   392|       |
   393|       |            ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
   394|       |            ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
   395|       |            ay8 = GGML_F32_VEC_FMA(ay8, ax8, vx);
   396|       |
   397|       |            GGML_F32_VEC_STORE(y + i + 7*ggml_f32_epr, ay8);
   398|       |        }
   399|       |        // leftovers
   400|       |        // since 8 unrolls are done in the loop above, leftovers lie in the range [0, ggml_f32_step] and are handled by the loop below
   401|       |        const int np2 = (n & ~(ggml_f32_epr - 1));
   402|       |        for (int i = np; i < np2; i += ggml_f32_epr) {
   403|       |            ax1 = GGML_F32_VEC_LOAD(x + i);
   404|       |            ay1 = GGML_F32_VEC_LOAD(y + i);
   405|       |            ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx);
   406|       |
   407|       |            GGML_F32_VEC_STORE(y + i, ay1);
   408|       |        }
   409|       |        // the maximum number of leftover elements will be less than ggml_f32_epr; apply a predicated svmad to the available elements only
   410|       |        if (np2 < n) {
   411|       |            svbool_t pg = svwhilelt_b32(np2, n);
   412|       |            ax1 = svld1_f32(pg, x + np2);
   413|       |            ay1 = svld1_f32(pg, y + np2);
   414|       |            ay1 = svmad_f32_m(pg, ax1, vx, ay1);
   415|       |
   416|       |            svst1_f32(pg, y + np2, ay1);
   417|       |        }
   418|       |    #elif defined(__riscv_v_intrinsic)
   419|       |        for (int i = 0, avl; i < n; i += avl) {
   420|       |            avl = __riscv_vsetvl_e32m8(n - i);
   421|       |            vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
   422|       |            vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
   423|       |            vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, v, ay, avl);
   424|       |            __riscv_vse32_v_f32m8(&y[i], ny, avl);
   425|       |        }
   426|       |    #else
   427|      0|        const int np = (n & ~(GGML_F32_STEP - 1));
   428|       |
   429|      0|        GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
   430|       |
   431|      0|        GGML_F32_VEC ax[GGML_F32_ARR];
   432|      0|        GGML_F32_VEC ay[GGML_F32_ARR];
   433|       |
   434|      0|        for (int i = 0; i < np; i += GGML_F32_STEP) {
   435|      0|            for (int j = 0; j < GGML_F32_ARR; j++) {
   436|      0|                ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
   437|      0|                ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
   438|      0|                ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx);
   439|       |
   440|      0|                GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
   441|      0|            }
   442|      0|        }
   443|       |
   444|       |        // leftovers
   445|      0|        for (int i = np; i < n; ++i) {
   446|      0|            y[i] += x[i]*v;
   447|      0|        }
   448|      0|    #endif
   449|       |#else
   450|       |    // scalar
   451|       |    for (int i = 0; i < n; ++i) {
   452|       |        y[i] += x[i]*v;
   453|       |    }
   454|       |#endif
   455|      0|}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_mad_f32
Unexecuted instantiation: vec.cpp:ggml_vec_mad_f32(int, float*, float const*, float)
Unexecuted instantiation: ops.cpp:ggml_vec_mad_f32(int, float*, float const*, float)
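Usage sketch: ggml_vec_mad_f32 is an in-place axpy, y[i] += x[i]*v:

    float y[3] = {1.0f, 1.0f, 1.0f};
    float x[3] = {1.0f, 2.0f, 3.0f};
    ggml_vec_mad_f32(3, y, x, 2.0f);  // y is now {3.0f, 5.0f, 7.0f}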
   456|       |
   457|      0|inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) {
   458|       |#if defined(GGML_SIMD) && defined(__ARM_FEATURE_SVE)
   459|       |    const int sve_register_length = svcntb() * 8;
   460|       |    const int ggml_f16_epr = sve_register_length / 16;
   461|       |    const int ggml_f16_step = 8 * ggml_f16_epr;
   462|       |
   463|       |    GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
   464|       |
   465|       |    int np = (n & ~(ggml_f16_step - 1));
   466|       |
   467|       |    svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
   468|       |    svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
   469|       |    for (int i = 0; i < np; i += ggml_f16_step) {
   470|       |        ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0);
   471|       |        ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0);
   472|       |        ay1 = GGML_F16x_VEC_FMA(ay1, ax1, vx);
   473|       |
   474|       |        GGML_F16x_VEC_STORE(y + i + 0 * ggml_f16_epr, ay1, 0);
   475|       |
   476|       |        ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1);
   477|       |        ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1);
   478|       |        ay2 = GGML_F16x_VEC_FMA(ay2, ax2, vx);
   479|       |
   480|       |        GGML_F16x_VEC_STORE(y + i + 1 * ggml_f16_epr, ay2, 1);
   481|       |
   482|       |        ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2);
   483|       |        ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
   484|       |        ay3 = GGML_F16x_VEC_FMA(ay3, ax3, vx);
   485|       |
   486|       |        GGML_F16x_VEC_STORE(y + i + 2 * ggml_f16_epr, ay3, 2);
   487|       |
   488|       |        ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3);
   489|       |        ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
   490|       |        ay4 = GGML_F16x_VEC_FMA(ay4, ax4, vx);
   491|       |
   492|       |        GGML_F16x_VEC_STORE(y + i + 3 * ggml_f16_epr, ay4, 3);
   493|       |
   494|       |        ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4);
   495|       |        ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
   496|       |        ay5 = GGML_F16x_VEC_FMA(ay5, ax5, vx);
   497|       |
   498|       |        GGML_F16x_VEC_STORE(y + i + 4 * ggml_f16_epr, ay5, 4);
   499|       |
   500|       |        ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5);
   501|       |        ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
   502|       |        ay6 = GGML_F16x_VEC_FMA(ay6, ax6, vx);
   503|       |
   504|       |        GGML_F16x_VEC_STORE(y + i + 5 * ggml_f16_epr, ay6, 5);
   505|       |
   506|       |        ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6);
   507|       |        ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
   508|       |        ay7 = GGML_F16x_VEC_FMA(ay7, ax7, vx);
   509|       |
   510|       |        GGML_F16x_VEC_STORE(y + i + 6 * ggml_f16_epr, ay7, 6);
   511|       |
   512|       |        ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7);
   513|       |        ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
   514|       |        ay8 = GGML_F16x_VEC_FMA(ay8, ax8, vx);
   515|       |
   516|       |        GGML_F16x_VEC_STORE(y + i + 7 * ggml_f16_epr, ay8, 7);
   517|       |    }
   518|       |    const int np2 = (n & ~(ggml_f16_epr - 1));
   519|       |    for (int k = np; k < np2; k += ggml_f16_epr) {
   520|       |        svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0);
   521|       |        svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
   522|       |        ry = GGML_F16x_VEC_FMA(ry, rx, vx);
   523|       |
   524|       |        GGML_F16x_VEC_STORE(y + k, ry, 0);
   525|       |    }
   526|       |
   527|       |    if (np2 < n) {
   528|       |        svbool_t pg = svwhilelt_b16(np2, n);
   529|       |        svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
   530|       |        svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
   531|       |        hy = svmad_f16_x(pg, hx, vx, hy);
   532|       |        svst1_f16(pg, (__fp16 *)(y + np2), hy);
   533|       |    }
   534|       |    np = n;
   535|       |#elif defined(__riscv_zvfh) // implies __riscv_v_intrinsic
   536|       |    const ggml_fp16_t s = GGML_CPU_FP32_TO_FP16(v);
   537|       |    const _Float16 scale = *(const _Float16*)(&s);
   538|       |
   539|       |    // calculate step size
   540|       |    const int epr = __riscv_vsetvlmax_e16m4();
   541|       |    const int step = epr * 2;
   542|       |    int np = (n & ~(step - 1));
   543|       |
   544|       |    // unroll by 2
   545|       |    for (int i = 0; i < np; i += step) {
   546|       |        vfloat16m4_t ax0 = __riscv_vle16_v_f16m4((const _Float16*)x + i, epr);
   547|       |        vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, epr);
   548|       |        ay0 = __riscv_vfmacc_vf_f16m4(ay0, scale, ax0, epr);
   549|       |        __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, epr);
   550|       |        __asm__ __volatile__ ("" ::: "memory");
   551|       |
   552|       |        vfloat16m4_t ax1 = __riscv_vle16_v_f16m4((const _Float16*)x + i + epr, epr);
   553|       |        vfloat16m4_t ay1 = __riscv_vle16_v_f16m4((const _Float16*)y + i + epr, epr);
   554|       |        ay1 = __riscv_vfmacc_vf_f16m4(ay1, scale, ax1, epr);
   555|       |        __riscv_vse16_v_f16m4((_Float16*)y + i + epr, ay1, epr);
   556|       |        __asm__ __volatile__ ("" ::: "memory");
   557|       |    }
   558|       |
   559|       |    // leftovers
   560|       |    int vl;
   561|       |    for (int i = np; i < n; i += vl) {
   562|       |        vl = __riscv_vsetvl_e16m4(n - i);
   563|       |        vfloat16m4_t ax0 = __riscv_vle16_v_f16m4((const _Float16*)x + i, vl);
   564|       |        vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, vl);
   565|       |        ay0 = __riscv_vfmacc_vf_f16m4(ay0, scale, ax0, vl);
   566|       |        __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, vl);
   567|       |    }
   568|       |    np = n;
   569|       |#elif defined(GGML_SIMD)
   570|      0|    const int np = (n & ~(GGML_F16_STEP - 1));
   571|       |
   572|      0|    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
   573|       |
   574|      0|    GGML_F16_VEC ax[GGML_F16_ARR];
   575|      0|    GGML_F16_VEC ay[GGML_F16_ARR];
   576|       |
   577|      0|    for (int i = 0; i < np; i += GGML_F16_STEP) {
   578|      0|        for (int j = 0; j < GGML_F16_ARR; j++) {
   579|      0|            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
   580|      0|            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
   581|      0|            ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
   582|       |
   583|      0|            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
   584|      0|        }
   585|      0|    }
   586|       |#else
   587|       |    const int np = 0;
   588|       |#endif
   589|       |
   590|       |    // leftovers
   591|      0|    for (int i = np; i < n; ++i) {
   592|      0|        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
   593|      0|    }
   594|      0|}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_mad_f16
Unexecuted instantiation: vec.cpp:ggml_vec_mad_f16(int, unsigned short*, unsigned short const*, float)
Unexecuted instantiation: ops.cpp:ggml_vec_mad_f16(int, unsigned short*, unsigned short const*, float)
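The empty __asm__ __volatile__ ("" ::: "memory") statements in the RISC-V branch emit no instruction; they are compiler-level memory barriers, presumably (an assumption, the header does not say) to keep the compiler from reordering or merging the two strip-mined load/store rounds. The idiom in isolation:

    static inline void compiler_barrier(void) {
        // emits no machine code, but memory accesses may not be reordered across this point
        __asm__ __volatile__ ("" ::: "memory");
    }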
   595|       |
   596|       |// xs and vs are byte strides of x and v
   597|      0|inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * GGML_RESTRICT y, const float * GGML_RESTRICT xv, const float * GGML_RESTRICT vv) {
   598|       |
   599|      0|    const float * GGML_RESTRICT x[GGML_VEC_MAD_UNROLL];
   600|      0|    const float * GGML_RESTRICT v[GGML_VEC_MAD_UNROLL];
   601|       |
   602|      0|    for (int i = 0; i < GGML_VEC_MAD_UNROLL; ++i) {
   603|      0|        x[i] = (const float *) ((const char *) xv + i*xs);
   604|      0|        v[i] = (const float *) ((const char *) vv + i*vs);
   605|      0|    }
   606|       |
   607|      0|#if defined(GGML_SIMD)
   608|       |    #if defined(__ARM_FEATURE_SVE)
   609|       |        // route to the scalar implementation // TODO: write SVE code
   610|       |        for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
   611|       |            for (int i = 0; i < n; ++i) {
   612|       |                y[i] += x[k][i]*v[k][0];
   613|       |            }
   614|       |        }
   615|       |    #elif defined(__riscv_v_intrinsic)
   616|       |        for (int i = 0, avl; i < n; i += avl) {
   617|       |            avl = __riscv_vsetvl_e32m8(n - i);
   618|       |            vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
   619|       |            for (int k = 0; k < GGML_VEC_MAD_UNROLL; k++) {
   620|       |                vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[k][i], avl);
   621|       |                ay = __riscv_vfmadd_vf_f32m8(ax, v[k][0], ay, avl);
   622|       |            }
   623|       |            __riscv_vse32_v_f32m8(&y[i], ay, avl);
   624|       |        }
   625|       |    #else
   626|      0|        const int np = (n & ~(GGML_F32_STEP - 1));
   627|       |
   628|      0|        GGML_F32_VEC vx[GGML_VEC_MAD_UNROLL];
   629|       |
   630|      0|        for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
   631|      0|            vx[k] = GGML_F32_VEC_SET1(v[k][0]);
   632|      0|        }
   633|       |
   634|      0|        GGML_F32_VEC ax[GGML_VEC_MAD_UNROLL][GGML_F32_ARR];
   635|      0|        GGML_F32_VEC ay[GGML_F32_ARR];
   636|       |
   637|      0|        for (int i = 0; i < np; i += GGML_F32_STEP) {
   638|      0|            for (int j = 0; j < GGML_F32_ARR; j++) {
   639|      0|                ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
   640|       |
   641|      0|                for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
   642|      0|                    ax[k][j] = GGML_F32_VEC_LOAD(x[k] + i + j*GGML_F32_EPR);
   643|      0|                    ay[j] = GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]);
   644|      0|                }
   645|       |
   646|      0|                GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
   647|      0|            }
   648|      0|        }
   649|       |
   650|       |        // leftovers
   651|      0|        for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
   652|      0|            for (int i = np; i < n; ++i) {
   653|      0|                y[i] += x[k][i]*v[k][0];
   654|      0|            }
   655|      0|        }
   656|      0|    #endif
   657|       |#else
   658|       |    // scalar
   659|       |    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
   660|       |        for (int i = 0; i < n; ++i) {
   661|       |            y[i] += x[k][i]*v[k][0];
   662|       |        }
   663|       |    }
   664|       |#endif
   665|      0|}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_mad_f32_unroll
Unexecuted instantiation: vec.cpp:ggml_vec_mad_f32_unroll(int, int, int, float*, float const*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_mad_f32_unroll(int, int, int, float*, float const*, float const*)
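A usage sketch with hypothetical shapes: the function accumulates GGML_VEC_MAD_UNROLL (32) rows into y in a single call, with xs and vs giving the byte stride from one row (or coefficient) to the next:

    enum { NROWS = 32, NCOLS = 64 };  // NROWS must equal GGML_VEC_MAD_UNROLL
    float xm[NROWS][NCOLS];           // 32 source rows (initialization omitted)
    float vm[NROWS];                  // one scale factor per row (initialization omitted)
    float y[NCOLS] = {0};             // accumulator: y[i] += sum over k of xm[k][i]*vm[k]
    ggml_vec_mad_f32_unroll(NCOLS, (int)(NCOLS * sizeof(float)), (int)sizeof(float), y, &xm[0][0], vm);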
   666|       |
   667|      0|inline static void ggml_vec_mad1_f32(const int n, float * y, const float * x, const float s, const float b) {
   668|       |#if defined(GGML_USE_ACCELERATE)
   669|       |    vDSP_vsmsa(x, 1, &s, &b, y, 1, n);
   670|       |#elif defined(GGML_SIMD)
   671|       |    #if defined(__ARM_FEATURE_SVE)
   672|       |        // scalar; TODO: write SVE code
   673|       |        for (int i = 0; i < n; ++i) {
   674|       |            y[i] = x[i]*s + b;
   675|       |        }
   676|       |    #elif defined(__riscv_v_intrinsic)
   677|       |        for (int i = 0, avl; i < n; i += avl) {
   678|       |            avl = __riscv_vsetvl_e32m8(n - i);
   679|       |            vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
   680|       |            vfloat32m8_t vb = __riscv_vfmv_v_f_f32m8(b, avl);
   681|       |            vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, s, vb, avl);
   682|       |            __riscv_vse32_v_f32m8(&y[i], ny, avl);
   683|       |        }
   684|       |    #else
   685|      0|        const int np = (n & ~(GGML_F32_STEP - 1));
   686|       |
   687|      0|        GGML_F32_VEC vs = GGML_F32_VEC_SET1(s);
   688|      0|        GGML_F32_VEC vb = GGML_F32_VEC_SET1(b);
   689|       |
   690|      0|        GGML_F32_VEC ay[GGML_F32_ARR];
   691|       |
   692|      0|        for (int i = 0; i < np; i += GGML_F32_STEP) {
   693|      0|            for (int j = 0; j < GGML_F32_ARR; j++) {
   694|      0|                ay[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
   695|      0|                ay[j] = GGML_F32_VEC_FMA(vb, ay[j], vs);
   696|       |
   697|      0|                GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
   698|      0|            }
   699|      0|        }
   700|       |
   701|       |        // leftovers
   702|      0|        for (int i = np; i < n; ++i) {
   703|      0|            y[i] = x[i]*s + b;
   704|      0|        }
   705|      0|    #endif
   706|       |#else
   707|       |    // scalar
   708|       |    for (int i = 0; i < n; ++i) {
   709|       |        y[i] = x[i]*s + b;
   710|       |    }
   711|       |#endif
   712|      0|}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_mad1_f32
Unexecuted instantiation: vec.cpp:ggml_vec_mad1_f32(int, float*, float const*, float, float)
Unexecuted instantiation: ops.cpp:ggml_vec_mad1_f32(int, float*, float const*, float, float)
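Usage sketch: a fused scale-and-shift, y[i] = x[i]*s + b (on Apple platforms this lowers to the single vDSP_vsmsa call above):

    float x[3] = {0.0f, 1.0f, 2.0f};
    float y[3];
    ggml_vec_mad1_f32(3, y, x, 2.0f, 1.0f);  // y == {1.0f, 3.0f, 5.0f}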
   713|       |
   714|       |//inline static void ggml_vec_scale_f32(const int n, float * y, const float   v) { for (int i = 0; i < n; ++i) y[i] *= v;          }
   715|      0|inline static void ggml_vec_scale_f32(const int n, float * y, const float   v) {
   716|       |#if defined(GGML_USE_ACCELERATE)
   717|       |    vDSP_vsmul(y, 1, &v, y, 1, n);
   718|       |#elif defined(GGML_SIMD)
   719|       |    #if defined(__ARM_FEATURE_SVE)
   720|       |        const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
   721|       |        const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
   722|       |        const int ggml_f32_step = 2 * ggml_f32_epr;
   723|       |
   724|       |        GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
   725|       |        const int np = (n & ~(ggml_f32_step - 1));
   726|       |        svfloat32_t ay1;
   727|       |        svfloat32_t ay2;
   728|       |        for (int i = 0; i < np; i += ggml_f32_step) {
   729|       |            ay1 = GGML_F32_VEC_LOAD(y + i);
   730|       |            ay1 = GGML_F32_VEC_MUL(ay1, vx);
   731|       |            GGML_F32_VEC_STORE(y + i, ay1);
   732|       |
   733|       |            ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
   734|       |            ay2 = GGML_F32_VEC_MUL(ay2, vx);
   735|       |            GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);
   736|       |        }
   737|       |        // leftovers
   738|       |        // the maximum number of leftover elements will be less than ggml_f32_epr; apply a predicated svmul to the available elements only
   739|       |        for (int i = np; i < n; i += ggml_f32_epr) {
   740|       |            svbool_t pg = svwhilelt_b32(i, n);
   741|       |            ay1 = svld1_f32(pg, y + i);
   742|       |            ay1 = svmul_f32_m(pg, ay1, vx);
   743|       |            svst1_f32(pg, y + i, ay1);
   744|       |        }
   745|       |    #elif defined(__riscv_v_intrinsic)
   746|       |        for (int i = 0, avl; i < n; i += avl) {
   747|       |            avl = __riscv_vsetvl_e32m8(n - i);
   748|       |            vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
   749|       |            vfloat32m8_t ny = __riscv_vfmul_vf_f32m8(ay, v, avl);
   750|       |            __riscv_vse32_v_f32m8(&y[i], ny, avl);
   751|       |        }
   752|       |    #else
   753|      0|        const int np = (n & ~(GGML_F32_STEP - 1));
   754|       |
   755|      0|        GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
   756|       |
   757|      0|        GGML_F32_VEC ay[GGML_F32_ARR];
   758|       |
   759|      0|        for (int i = 0; i < np; i += GGML_F32_STEP) {
   760|      0|            for (int j = 0; j < GGML_F32_ARR; j++) {
   761|      0|                ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
   762|      0|                ay[j] = GGML_F32_VEC_MUL(ay[j], vx);
   763|       |
   764|      0|                GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
   765|      0|            }
   766|      0|        }
   767|       |
   768|       |        // leftovers
   769|      0|        for (int i = np; i < n; ++i) {
   770|      0|            y[i] *= v;
   771|      0|        }
   772|      0|    #endif
   773|       |#else
   774|       |    // scalar
   775|       |    for (int i = 0; i < n; ++i) {
   776|       |        y[i] *= v;
   777|       |    }
   778|       |#endif
   779|      0|}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_scale_f32
Unexecuted instantiation: vec.cpp:ggml_vec_scale_f32(int, float*, float)
Unexecuted instantiation: ops.cpp:ggml_vec_scale_f32(int, float*, float)
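Usage sketch: in-place scaling, y[i] *= v. Combined with ggml_vec_norm_f32 (defined further down in this listing) it normalizes a vector to unit length:

    float y[4] = {3.0f, 0.0f, 4.0f, 0.0f};
    float norm = 0.0f;
    ggml_vec_norm_f32(4, &norm, y);       // norm == sqrtf(9.0f + 16.0f) == 5.0f
    ggml_vec_scale_f32(4, y, 1.0f/norm);  // y now has unit L2 norm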
   780|       |
   781|      0|inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) {
   782|       |#if defined(GGML_SIMD) && defined(__ARM_FEATURE_SVE)
   783|       |    const int sve_register_length = svcntb() * 8;
   784|       |    const int ggml_f16_epr = sve_register_length / 16;
   785|       |    const int ggml_f16_step = 2 * ggml_f16_epr;
   786|       |
   787|       |    GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
   788|       |    const int np = (n & ~(ggml_f16_step - 1));
   789|       |    svfloat16_t ay1, ay2;
   790|       |
   791|       |    for (int i = 0; i < np; i += ggml_f16_step) {
   792|       |        ay1 = GGML_F16x_VEC_LOAD(y + i + 0*ggml_f16_epr, 0);
   793|       |        ay1 = GGML_F16x_VEC_MUL(ay1, vx);
   794|       |        GGML_F16x_VEC_STORE(y + i + 0*ggml_f16_epr, ay1, 0);
   795|       |
   796|       |        ay2 = GGML_F16x_VEC_LOAD(y + i + 1*ggml_f16_epr, 1);
   797|       |        ay2 = GGML_F16x_VEC_MUL(ay2, vx);
   798|       |        GGML_F16x_VEC_STORE(y + i + 1*ggml_f16_epr, ay2, 1);
   799|       |    }
   800|       |    // leftovers
   801|       |    // the maximum number of leftover elements will be less than ggml_f16_epr; apply a predicated svmul to the available elements only
   802|       |    if (np < n) {
   803|       |        svbool_t pg = svwhilelt_b16(np, n);
   804|       |        svfloat16_t hy = svld1_f16(pg, (__fp16 *)(y + np));
   805|       |        svfloat16_t out = svmul_f16_m(pg, hy, vx);
   806|       |        svst1_f16(pg, (__fp16 *)(y + np), out);
   807|       |    }
   808|       |#elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfh)
   809|       |    const ggml_fp16_t s = GGML_CPU_FP32_TO_FP16(v);
   810|       |    const _Float16 scale = *(const _Float16*)(&s);
   811|       |
   812|       |    // calculate step size
   813|       |    const int epr = __riscv_vsetvlmax_e16m4();
   814|       |    const int step = epr * 2;
   815|       |    const int np = (n & ~(step - 1));
   816|       |
   817|       |    // unroll by 2
   818|       |    for (int i = 0; i < np; i += step) {
   819|       |        vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, epr);
   820|       |        ay0 = __riscv_vfmul_vf_f16m4(ay0, scale, epr);
   821|       |        __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, epr);
   822|       |        __asm__ __volatile__ ("" ::: "memory");
   823|       |
   824|       |        vfloat16m4_t ay1 = __riscv_vle16_v_f16m4((const _Float16*)y + i + epr, epr);
   825|       |        ay1 = __riscv_vfmul_vf_f16m4(ay1, scale, epr);
   826|       |        __riscv_vse16_v_f16m4((_Float16*)y + i + epr, ay1, epr);
   827|       |        __asm__ __volatile__ ("" ::: "memory");
   828|       |    }
   829|       |
   830|       |    // leftovers
   831|       |    int vl;
   832|       |    for (int i = np; i < n; i += vl) {
   833|       |        vl = __riscv_vsetvl_e16m4(n - i);
   834|       |        vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, vl);
   835|       |        ay0 = __riscv_vfmul_vf_f16m4(ay0, scale, vl);
   836|       |        __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, vl);
   837|       |    }
   838|       |#elif defined(GGML_SIMD)
   839|      0|    const int np = (n & ~(GGML_F16_STEP - 1));
   840|       |
   841|      0|    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
   842|       |
   843|      0|    GGML_F16_VEC ay[GGML_F16_ARR];
   844|       |
   845|      0|    for (int i = 0; i < np; i += GGML_F16_STEP) {
   846|      0|        for (int j = 0; j < GGML_F16_ARR; j++) {
   847|      0|            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
   848|      0|            ay[j] = GGML_F16_VEC_MUL(ay[j], vx);
   849|       |
   850|      0|            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
   851|      0|        }
   852|      0|    }
   853|       |
   854|       |    // leftovers
   855|      0|    for (int i = np; i < n; ++i) {
   856|      0|        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
   857|      0|    }
   858|       |#else
   859|       |    // scalar
   860|       |    for (int i = 0; i < n; ++i) {
   861|       |        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
   862|       |    }
   863|       |#endif
   864|      0|}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_scale_f16
Unexecuted instantiation: vec.cpp:ggml_vec_scale_f16(int, unsigned short*, float)
Unexecuted instantiation: ops.cpp:ggml_vec_scale_f16(int, unsigned short*, float)
   865|       |
   866|      0|inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s);   }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_norm_f32
Unexecuted instantiation: vec.cpp:ggml_vec_norm_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_norm_f32(int, float*, float const*)
   867|      0|inline static void ggml_vec_sqr_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i];   }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_sqr_f32
Unexecuted instantiation: vec.cpp:ggml_vec_sqr_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_sqr_f32(int, float*, float const*)
   868|      0|inline static void ggml_vec_sqr_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
   869|      0|    for (int i = 0; i < n; ++i) {
   870|      0|        float v = GGML_CPU_FP16_TO_FP32(x[i]);
   871|      0|        y[i] = GGML_CPU_FP32_TO_FP16(v*v);
   872|      0|    }
   873|      0|}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_sqr_f16
Unexecuted instantiation: vec.cpp:ggml_vec_sqr_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_sqr_f16(int, unsigned short*, unsigned short const*)
   874|      0|inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_sqrt_f32
Unexecuted instantiation: vec.cpp:ggml_vec_sqrt_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_sqrt_f32(int, float*, float const*)
   875|      0|inline static void ggml_vec_sqrt_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
   876|      0|    for (int i = 0; i < n; ++i) {
   877|      0|        y[i] = GGML_CPU_FP32_TO_FP16(sqrtf(GGML_CPU_FP16_TO_FP32(x[i])));
   878|      0|    }
   879|      0|}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_sqrt_f16
Unexecuted instantiation: vec.cpp:ggml_vec_sqrt_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_sqrt_f16(int, unsigned short*, unsigned short const*)
   880|      0|inline static void ggml_vec_log_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]);  }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_log_f32
Unexecuted instantiation: vec.cpp:ggml_vec_log_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_log_f32(int, float*, float const*)
   881|      0|inline static void ggml_vec_log_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
   882|      0|    for (int i = 0; i < n; ++i) {
   883|      0|        y[i] = GGML_CPU_FP32_TO_FP16(logf(GGML_CPU_FP16_TO_FP32(x[i])));
   884|      0|    }
   885|      0|}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_log_f16
Unexecuted instantiation: vec.cpp:ggml_vec_log_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_log_f16(int, unsigned short*, unsigned short const*)
   886|      0|inline static void ggml_vec_sin_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sinf(x[i]);  }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_sin_f32
Unexecuted instantiation: vec.cpp:ggml_vec_sin_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_sin_f32(int, float*, float const*)
   887|      0|inline static void ggml_vec_sin_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
   888|      0|    for (int i = 0; i < n; ++i) {
   889|      0|        y[i] = GGML_CPU_FP32_TO_FP16(sinf(GGML_CPU_FP16_TO_FP32(x[i])));
   890|      0|    }
   891|      0|}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_sin_f16
Unexecuted instantiation: vec.cpp:ggml_vec_sin_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_sin_f16(int, unsigned short*, unsigned short const*)
   892|      0|inline static void ggml_vec_cos_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = cosf(x[i]);  }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_cos_f32
Unexecuted instantiation: vec.cpp:ggml_vec_cos_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_cos_f32(int, float*, float const*)
   893|      0|inline static void ggml_vec_cos_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
   894|      0|    for (int i = 0; i < n; ++i) {
   895|      0|        y[i] = GGML_CPU_FP32_TO_FP16(cosf(GGML_CPU_FP16_TO_FP32(x[i])));
   896|      0|    }
   897|      0|}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_cos_f16
Unexecuted instantiation: vec.cpp:ggml_vec_cos_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_cos_f16(int, unsigned short*, unsigned short const*)
   898|      0|inline static void ggml_vec_abs_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_abs_f32
Unexecuted instantiation: vec.cpp:ggml_vec_abs_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_abs_f32(int, float*, float const*)
   899|      0|inline static void ggml_vec_abs_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
   900|      0|    for (int i = 0; i < n; ++i) {
   901|      0|        y[i] = GGML_CPU_FP32_TO_FP16(fabsf(GGML_CPU_FP16_TO_FP32(x[i])));
   902|      0|    }
   903|      0|}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_abs_f16
Unexecuted instantiation: vec.cpp:ggml_vec_abs_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_abs_f16(int, unsigned short*, unsigned short const*)
   904|      0|inline static void ggml_vec_sgn_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_sgn_f32
Unexecuted instantiation: vec.cpp:ggml_vec_sgn_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_sgn_f32(int, float*, float const*)
   905|      0|inline static void ggml_vec_sgn_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
   906|      0|    for (int i = 0; i < n; ++i) {
   907|      0|        float v = GGML_CPU_FP16_TO_FP32(x[i]);
   908|      0|        y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? 1.f : ((v < 0.f) ? -1.f : 0.f));
   909|      0|    }
   910|      0|}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_sgn_f16
Unexecuted instantiation: vec.cpp:ggml_vec_sgn_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_sgn_f16(int, unsigned short*, unsigned short const*)
   911|      0|inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_step_f32
Unexecuted instantiation: vec.cpp:ggml_vec_step_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_step_f32(int, float*, float const*)
   912|      0|inline static void ggml_vec_step_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
   913|      0|    for (int i = 0; i < n; ++i) {
   914|      0|        y[i] = GGML_CPU_FP32_TO_FP16((GGML_CPU_FP16_TO_FP32(x[i]) > 0.f) ? 1.f : 0.f);
   915|      0|    }
   916|      0|}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_step_f16
Unexecuted instantiation: vec.cpp:ggml_vec_step_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_step_f16(int, unsigned short*, unsigned short const*)
   917|      0|inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]);  }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_tanh_f32
Unexecuted instantiation: vec.cpp:ggml_vec_tanh_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_tanh_f32(int, float*, float const*)
   918|      0|inline static void ggml_vec_tanh_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
   919|      0|    for (int i = 0; i < n; ++i) {
   920|      0|        y[i] = GGML_CPU_FP32_TO_FP16(tanhf(GGML_CPU_FP16_TO_FP32(x[i])));
   921|      0|    }
   922|      0|}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_tanh_f16
Unexecuted instantiation: vec.cpp:ggml_vec_tanh_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_tanh_f16(int, unsigned short*, unsigned short const*)
   923|      0|inline static void ggml_vec_elu_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_elu_f32
Unexecuted instantiation: vec.cpp:ggml_vec_elu_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_elu_f32(int, float*, float const*)
   924|      0|inline static void ggml_vec_elu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
   925|      0|    for (int i = 0; i < n; ++i) {
   926|      0|        const float v = GGML_CPU_FP16_TO_FP32(x[i]);
   927|      0|        y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v : expm1f(v));
   928|      0|    }
   929|      0|}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_elu_f16
Unexecuted instantiation: vec.cpp:ggml_vec_elu_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_elu_f16(int, unsigned short*, unsigned short const*)
   930|      0|inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_relu_f32
Unexecuted instantiation: vec.cpp:ggml_vec_relu_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_relu_f32(int, float*, float const*)
   931|      0|inline static void ggml_vec_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
   932|      0|    for (int i = 0; i < n; ++i) {
   933|      0|        float v = GGML_CPU_FP16_TO_FP32(x[i]);
   934|      0|        y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v : 0.f);
   935|      0|    }
   936|      0|}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_relu_f16
Unexecuted instantiation: vec.cpp:ggml_vec_relu_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_relu_f16(int, unsigned short*, unsigned short const*)
   937|      0|inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_leaky_relu_f32
Unexecuted instantiation: vec.cpp:ggml_vec_leaky_relu_f32(int, float*, float const*, float)
Unexecuted instantiation: ops.cpp:ggml_vec_leaky_relu_f32(int, float*, float const*, float)
   938|      0|inline static void ggml_vec_leaky_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const float ns) {
   939|      0|    for (int i = 0; i < n; ++i) {
   940|      0|        float v = GGML_CPU_FP16_TO_FP32(x[i]);
   941|      0|        y[i] = GGML_CPU_FP32_TO_FP16(((v > 0.f) ? v : 0.f) + ns * ((v < 0.0f) ? v : 0.f));
   942|      0|    }
   943|      0|}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_leaky_relu_f16
Unexecuted instantiation: vec.cpp:ggml_vec_leaky_relu_f16(int, unsigned short*, unsigned short const*, float)
Unexecuted instantiation: ops.cpp:ggml_vec_leaky_relu_f16(int, unsigned short*, unsigned short const*, float)
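The leaky-ReLU expression above is a branch-free way of writing the usual piecewise form: for x > 0 the second term vanishes, and for x < 0 the first does. An equivalent reference form (illustration only):

    static inline float leaky_relu_ref(float x, float ns) {
        return x > 0.0f ? x : ns * x;  // matches ((x > 0) ? x : 0) + ns*((x < 0) ? x : 0)
    }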
944
0
inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_sigmoid_f32
Unexecuted instantiation: vec.cpp:ggml_vec_sigmoid_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_sigmoid_f32(int, float*, float const*)
945
0
inline static void ggml_vec_sigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
946
0
    for (int i = 0; i < n; ++i) {
947
0
        y[i] = GGML_CPU_FP32_TO_FP16(1.f / (1.f + expf(-GGML_CPU_FP16_TO_FP32(x[i]))));
948
0
    }
949
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_sigmoid_f16
Unexecuted instantiation: vec.cpp:ggml_vec_sigmoid_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_sigmoid_f16(int, unsigned short*, unsigned short const*)
950
// TODO: optimize performance
951
0
inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_hardswish_f32
Unexecuted instantiation: vec.cpp:ggml_vec_hardswish_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_hardswish_f32(int, float*, float const*)
952
0
inline static void ggml_vec_hardswish_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
953
0
    for (int i = 0; i < n; ++i) {
954
0
        float v = GGML_CPU_FP16_TO_FP32(x[i]);
955
0
        y[i] = GGML_CPU_FP32_TO_FP16(v * fminf(1.0f, fmaxf(0.0f, (v + 3.0f) / 6.0f)));
956
0
    }
957
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_hardswish_f16
Unexecuted instantiation: vec.cpp:ggml_vec_hardswish_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_hardswish_f16(int, unsigned short*, unsigned short const*)
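Hardswish and hardsigmoid (next) share the clamp fminf(1, fmaxf(0, (x + 3) / 6)); hardswish is simply x times that hard sigmoid. As a sketch (reference helpers, not part of vec.h):

#include <math.h>

static float hardsigmoid_ref(float x) {
    return fminf(1.0f, fmaxf(0.0f, (x + 3.0f) / 6.0f));
}

static float hardswish_ref(float x) {
    return x * hardsigmoid_ref(x);  // hardswish(x) = x * hardsigmoid(x)
}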
958
0
inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_hardsigmoid_f32
Unexecuted instantiation: vec.cpp:ggml_vec_hardsigmoid_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_hardsigmoid_f32(int, float*, float const*)
959
0
inline static void ggml_vec_hardsigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
960
0
    for (int i = 0; i < n; ++i) {
961
0
        y[i] = GGML_CPU_FP32_TO_FP16(fminf(1.0f, fmaxf(0.0f, (GGML_CPU_FP16_TO_FP32(x[i]) + 3.0f) / 6.0f)));
962
0
    }
963
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_hardsigmoid_f16
Unexecuted instantiation: vec.cpp:ggml_vec_hardsigmoid_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_hardsigmoid_f16(int, unsigned short*, unsigned short const*)
964
0
inline static void ggml_vec_exp_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = expf(x[i]); }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_exp_f32
Unexecuted instantiation: vec.cpp:ggml_vec_exp_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_exp_f32(int, float*, float const*)
965
0
inline static void ggml_vec_exp_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
966
0
    for (int i = 0; i < n; ++i) {
967
0
        y[i] = GGML_CPU_FP32_TO_FP16(expf(GGML_CPU_FP16_TO_FP32(x[i])));
968
0
    }
969
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_exp_f16
Unexecuted instantiation: vec.cpp:ggml_vec_exp_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_exp_f16(int, unsigned short*, unsigned short const*)
970
971
static const float GELU_COEF_A     = 0.044715f;
972
static const float GELU_QUICK_COEF = -1.702f;
973
static const float SQRT_2_OVER_PI  = 0.79788456080286535587989211986876f;
974
static const float SQRT_2_INV      = 0.70710678118654752440084436210484f;
975
976
196k
inline static float ggml_gelu_f32(float x) {
977
196k
    return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
978
196k
}
ggml-cpu.c:ggml_gelu_f32
Line
Count
Source
976
196k
inline static float ggml_gelu_f32(float x) {
977
196k
    return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
978
196k
}
Unexecuted instantiation: vec.cpp:ggml_gelu_f32(float)
Unexecuted instantiation: ops.cpp:ggml_gelu_f32(float)
979
980
0
inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
981
0
    const uint16_t * i16 = (const uint16_t *) x;
982
0
    for (int i = 0; i < n; ++i) {
983
0
        y[i] = ggml_table_gelu_f16[i16[i]];
984
0
    }
985
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_gelu_f16
Unexecuted instantiation: vec.cpp:ggml_vec_gelu_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_gelu_f16(int, unsigned short*, unsigned short const*)
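ggml_vec_gelu_f16 costs one table load per element: the raw 16-bit pattern of each input half is used directly as an index into ggml_table_gelu_f16, which holds one precomputed GELU value per possible fp16 bit pattern (1 << 16 entries). A hedged sketch of how such a table could be filled at startup; fp16_to_fp32/fp32_to_fp16 stand in for the GGML_CPU_FP16_TO_FP32/GGML_CPU_FP32_TO_FP16 conversions, and the table and init names are hypothetical:

#include <stdint.h>
#include <math.h>

extern float    fp16_to_fp32(uint16_t h);   // assumed conversion helpers
extern uint16_t fp32_to_fp16(float f);

uint16_t gelu_table[1 << 16];

static void init_gelu_table(void) {
    for (uint32_t h = 0; h < (1u << 16); ++h) {
        const float x = fp16_to_fp32((uint16_t) h);
        const float g = 0.5f*x*(1.0f + tanhf(0.79788456f*x*(1.0f + 0.044715f*x*x)));
        gelu_table[h] = fp32_to_fp16(g);    // one entry per fp16 bit pattern
    }
}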
986
987
0
inline static void ggml_vec_gelu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
988
0
    for (int i = 0; i < n; ++i) {
989
0
        float xi = GGML_CPU_FP16_TO_FP32(x[i]);
990
0
        float res = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV));
991
0
        y[i] = GGML_CPU_FP32_TO_FP16(res);
992
0
    }
993
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_gelu_erf_f16
Unexecuted instantiation: vec.cpp:ggml_vec_gelu_erf_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_gelu_erf_f16(int, unsigned short*, unsigned short const*)
994
995
#ifdef GGML_GELU_FP16
996
0
inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
997
0
    uint16_t t;
998
0
    for (int i = 0; i < n; ++i) {
999
0
        if (x[i] <= -10.0f) {
1000
0
            y[i] = 0.0f;
1001
0
        } else if (x[i] >= 10.0f) {
1002
0
            y[i] = x[i];
1003
0
        } else {
1004
0
            ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
1005
0
            memcpy(&t, &fp16, sizeof(uint16_t));
1006
0
            y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[t]);
1007
0
        }
1008
0
    }
1009
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_gelu_f32
Unexecuted instantiation: vec.cpp:ggml_vec_gelu_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_gelu_f32(int, float*, float const*)
1010
#else
1011
inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
1012
    for (int i = 0; i < n; ++i) {
1013
        y[i] = ggml_gelu_f32(x[i]);
1014
    }
1015
}
1016
#endif
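Under GGML_GELU_FP16 the f32 path above also routes through the f16 table: the input is rounded to half precision, its bit pattern indexes the table, and the entry is widened back to f32. The two clamps handle the saturated tails, where rounding through fp16 would dominate the error: for x <= -10 the exact gelu(x) is on the order of -1e-22 and is returned as 0, and for x >= 10 gelu(x) agrees with x to far below an f32 ulp. A scalar model of that path, reusing the assumed helpers from the sketch above:

#include <stdint.h>

extern float    fp16_to_fp32(uint16_t h);   // assumed conversion helpers
extern uint16_t fp32_to_fp16(float f);
extern uint16_t gelu_table[1 << 16];        // hypothetical table, as sketched above

static float gelu_f32_via_f16_table(float x) {
    if (x <= -10.0f) return 0.0f;   // saturated tail
    if (x >=  10.0f) return x;      // saturated tail
    return fp16_to_fp32(gelu_table[fp32_to_fp16(x)]);
}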
1017
1018
0
inline static void ggml_vec_gelu_erf_f32(const int n, float * y, const float * x) {
1019
0
    for (int i = 0; i < n; ++i) {
1020
0
        float xi = x[i];
1021
0
        y[i] = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV));
1022
0
    }
1023
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_gelu_erf_f32
Unexecuted instantiation: vec.cpp:ggml_vec_gelu_erf_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_gelu_erf_f32(int, float*, float const*)
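ggml_vec_gelu_erf_f32/_f16 implement the exact definition: gelu(x) = x*Phi(x) = 0.5*x*(1 + erf(x/sqrt(2))), which is what SQRT_2_INV is for; the tanh expression in ggml_gelu_f32 above is the standard fast approximation of the same function. A small comparison of the two forms:

#include <math.h>
#include <stdio.h>

int main(void) {
    const float xs[] = { -3.0f, -1.0f, 0.0f, 0.5f, 1.0f, 3.0f };
    for (int i = 0; i < 6; ++i) {
        const float x = xs[i];
        const float exact  = 0.5f*x*(1.0f + erff(x*0.70710678f));                          // erf form
        const float approx = 0.5f*x*(1.0f + tanhf(0.79788456f*x*(1.0f + 0.044715f*x*x))); // tanh form
        printf("x=% .1f  erf=% .6f  tanh=% .6f  diff=% .1e\n", x, exact, approx, exact - approx);
    }
    return 0;
}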
1024
1025
196k
inline static float ggml_gelu_quick_f32(float x) {
1026
196k
    return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
1027
196k
}
ggml-cpu.c:ggml_gelu_quick_f32
Line
Count
Source
1025
196k
inline static float ggml_gelu_quick_f32(float x) {
1026
196k
    return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
1027
196k
}
Unexecuted instantiation: vec.cpp:ggml_gelu_quick_f32(float)
Unexecuted instantiation: ops.cpp:ggml_gelu_quick_f32(float)
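ggml_gelu_quick_f32 is the sigmoid shortcut gelu_quick(x) = x*sigma(1.702*x); since GELU_QUICK_COEF is -1.702, the expf argument arrives pre-negated. Equivalently, as a sketch:

#include <math.h>

static float sigmoid_ref(float x) {
    return 1.0f / (1.0f + expf(-x));
}

static float gelu_quick_ref(float x) {
    return x * sigmoid_ref(1.702f * x);   // matches x*(1.0f/(1.0f+expf(-1.702f*x)))
}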
1028
1029
//inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
1030
//    const uint16_t * i16 = (const uint16_t *) x;
1031
//    for (int i = 0; i < n; ++i) {
1032
//        y[i] = ggml_table_gelu_quick_f16[i16[i]];
1033
//    }
1034
//}
1035
1036
#ifdef GGML_GELU_QUICK_FP16
1037
0
inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
1038
0
    uint16_t t;
1039
0
    for (int i = 0; i < n; ++i) {
1040
0
        ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
1041
0
        memcpy(&t, &fp16, sizeof(uint16_t));
1042
0
        y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]);
1043
0
    }
1044
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_gelu_quick_f32
Unexecuted instantiation: vec.cpp:ggml_vec_gelu_quick_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_gelu_quick_f32(int, float*, float const*)
1045
#else
1046
inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
1047
    for (int i = 0; i < n; ++i) {
1048
        y[i] = ggml_gelu_quick_f32(x[i]);
1049
    }
1050
}
1051
#endif
1052
1053
0
inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
1054
0
    for (int i = 0; i < n; ++i) {
1055
0
        float v = GGML_CPU_FP16_TO_FP32(x[i]);
1056
0
        y[i] = GGML_CPU_FP32_TO_FP16(v*(1.0f/(1.0f+expf(GELU_QUICK_COEF*v))));
1057
0
    }
1058
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_gelu_quick_f16
Unexecuted instantiation: vec.cpp:ggml_vec_gelu_quick_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_gelu_quick_f16(int, unsigned short*, unsigned short const*)
1059
1060
// Sigmoid Linear Unit (SiLU) function
1061
0
inline static float ggml_silu_f32(float x) {
1062
0
    return x/(1.0f + expf(-x));
1063
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_silu_f32
Unexecuted instantiation: vec.cpp:ggml_silu_f32(float)
Unexecuted instantiation: ops.cpp:ggml_silu_f32(float)
1064
0
inline static ggml_fp16_t ggml_silu_f16(ggml_fp16_t x) {
1065
0
    float v = GGML_CPU_FP16_TO_FP32(x);
1066
0
    return GGML_CPU_FP32_TO_FP16(v/(1.0f + expf(-v)));
1067
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_silu_f16
Unexecuted instantiation: vec.cpp:ggml_silu_f16(unsigned short)
Unexecuted instantiation: ops.cpp:ggml_silu_f16(unsigned short)
1068
1069
#if __FINITE_MATH_ONLY__
1070
#error "some routines in ggml.c require non-finite math arithmetics -- pass -fno-finite-math-only to the compiler to fix"
1071
#error "ref: https://github.com/ggml-org/llama.cpp/pull/7154#issuecomment-2143844461"
1072
#endif
1073
1074
/* Below function was borrowed from the GitHub repository:
1075
https://github.com/openvinotoolkit/openvino/blob/master/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp */
1076
#if defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
1077
    inline static svfloat32_t exp_ps_sve(svbool_t pg, svfloat32_t src) {
1078
        // Constants
1079
        const svfloat32_t log2_e = svdup_n_f32(1.4426950409f);
1080
        const svfloat32_t ln2 = svdup_n_f32(0.6931473921f);
1081
        const svfloat32_t half_ln2_sq = svdup_n_f32(0.2413862043f);
1082
        const svuint32_t not_mask17 = svdup_n_u32(~((1u << 17) - 1));
1083
        const svfloat32_t one = svdup_n_f32(1.0f);
1084
        const svfloat32_t inactive1 = svdup_n_f32(0.0f);
1085
        const svint32_t inactive2 = svdup_n_s32(0);
1086
1087
        // Algorithm starts here
1088
        svfloat32_t t0 = svmul_f32_m(pg, src, log2_e);  // y = x * log2(e)
1089
        svfloat32_t t1 = svrintm_f32_m(inactive1, pg, t0);         // round down to int (float)
1090
        svint32_t t2 = svcvt_s32_f32_m(inactive2, pg, t1);         // n
1091
1092
        t1 = svsub_f32_m(pg, t0, t1);   // a = y - floor(y)
1093
        t1 = svadd_f32_m(pg, t1, one);  // b = a + 1
1094
1095
        svuint32_t t3 = svlsr_n_u32_m(pg, svreinterpret_u32_f32(t1), 17);  // v = b >> 17 (u32)
1096
        svfloat32_t t4 = svexpa_f32(t3);                                   // c = fexpa(v)
1097
        t4 = svscale_f32_m(pg, t4, t2);                                    // fexpa(v) * 2^(n)
1098
1099
        // and_(t2.d, t1.d, not_mask17.d)
1100
        svfloat32_t t5 = svreinterpret_f32_u32(svand_u32_m(pg, svreinterpret_u32_f32(t1), not_mask17));
1101
        t5 = svsub_f32_m(pg, t1, t5);                // z
1102
        t0 = svmla_f32_m(pg, ln2, t5, half_ln2_sq);  // ln2 + half_ln2_sq * z
1103
        t0 = svmla_f32_m(pg, one, t5, t0);           // 1 + (ln2 * z) + (half_ln2_sq * z * z)
1104
        t0 = svmul_f32_m(pg, t0, t4);                // Final result
1105
1106
        return t0;
1107
    }
1108
#endif
1109
1110
#if defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
1111
1112
inline static svfloat32_t ggml_v_expf(svbool_t pg, svfloat32_t x) {
1113
    const svfloat32_t r = svdup_n_f32_x(pg, 0x1.8p23f);
1114
    const svfloat32_t z = svmla_n_f32_x(pg, r, x, 0x1.715476p+0f);
1115
    const svfloat32_t n = svsub_f32_x(pg, z, r);
1116
    const svfloat32_t b = svmls_n_f32_x(pg, svmls_n_f32_x(pg, x, n, 0x1.62e4p-1f), n, 0x1.7f7d1cp-20f);
1117
    const svuint32_t e = svlsl_n_u32_x(pg, svreinterpret_u32_f32(z), 23);
1118
    const svfloat32_t k = svreinterpret_f32_u32(svadd_u32_x(pg, e, svreinterpret_u32_f32(svdup_n_f32_x(pg, 1))));
1119
    const svbool_t c = svacgt_n_f32(pg, n, 126);
1120
    const svfloat32_t u = svmul_f32_x(pg, b, b);
1121
    const svfloat32_t j = svmla_f32_x(pg,
1122
        svmul_n_f32_x(pg, b, 0x1.ffffecp-1f),
1123
        svmla_f32_x(pg, svmla_f32_x(pg, svdup_n_f32_x(pg, 0x1.fffdb6p-2f), svdup_n_f32_x(pg, 0x1.555e66p-3f), b),
1124
                        svmla_f32_x(pg, svdup_n_f32_x(pg, 0x1.573e2ep-5f), svdup_n_f32_x(pg, 0x1.0e4020p-7f), b), u), u);
1125
    const svuint32_t d = svdup_n_u32_z(svcmple_n_f32(pg, n, 0.0), 0x82000000);
1126
    const svfloat32_t s1 = svreinterpret_f32_u32(svadd_n_u32_x(pg, d, 0x7f000000));
1127
    const svfloat32_t s2 = svreinterpret_f32_u32(svsub_u32_x(pg, e, d));
1128
    return svsel_f32(svacgt_f32(pg, n, svdup_n_f32_x(pg, 192)), svmul_f32_x(pg, s1, s1),
1129
                     svsel_f32(c, svmul_f32_x(pg, svmla_f32_x(pg, s2, s2, j), s1), svmla_f32_x(pg, k, k, j)));
1130
}
1131
1132
// computes silu x/(1+exp(-x)) in single precision vector
1133
inline static svfloat32_t ggml_v_silu(svbool_t pg, svfloat32_t x) {
1134
    const svfloat32_t one = svdup_n_f32_x(pg, 1.0f);
1135
    const svfloat32_t zero = svdup_n_f32_x(pg, 0.0f);
1136
    const svfloat32_t neg_x = svsub_f32_x(pg, zero, x);
1137
    const svfloat32_t exp_neg_x = ggml_v_expf(pg, neg_x);
1138
    const svfloat32_t one_plus_exp_neg_x = svadd_f32_x(pg, one, exp_neg_x);
1139
    return svdiv_f32_x(pg, x, one_plus_exp_neg_x);
1140
}
1141
1142
#elif defined(__ARM_NEON) && defined(__aarch64__)
1143
1144
// adapted from arm limited optimized routine
1145
// the maximum error is 1.45358 plus 0.5 ulps
1146
// numbers above 88.38 will flush to infinity
1147
// numbers beneath -103.97 will flush to zero
1148
inline static float32x4_t ggml_v_expf(float32x4_t x) {
1149
    const float32x4_t r = vdupq_n_f32(0x1.8p23f);
1150
    const float32x4_t z = vfmaq_f32(r, x, vdupq_n_f32(0x1.715476p+0f));
1151
    const float32x4_t n = vsubq_f32(z, r);
1152
    const float32x4_t b = vfmsq_f32(vfmsq_f32(x, n, vdupq_n_f32(0x1.62e4p-1f)), n,
1153
                                    vdupq_n_f32(0x1.7f7d1cp-20f));
1154
    const uint32x4_t e = vshlq_n_u32(vreinterpretq_u32_f32(z), 23);
1155
    const float32x4_t k = vreinterpretq_f32_u32(vaddq_u32(e, vreinterpretq_u32_f32(vdupq_n_f32(1))));
1156
    const uint32x4_t c = vcagtq_f32(n, vdupq_n_f32(126));
1157
    const float32x4_t u = vmulq_f32(b, b);
1158
    const float32x4_t j = vfmaq_f32(
1159
        vmulq_f32(vdupq_n_f32(0x1.ffffecp-1f), b),
1160
        vfmaq_f32(vfmaq_f32(vdupq_n_f32(0x1.fffdb6p-2f), vdupq_n_f32(0x1.555e66p-3f), b),
1161
                  vfmaq_f32(vdupq_n_f32(0x1.573e2ep-5f), vdupq_n_f32(0x1.0e4020p-7f), b), u), u);
1162
    if (!vpaddd_u64(vreinterpretq_u64_u32(c)))
1163
        return vfmaq_f32(k, j, k);
1164
    const uint32x4_t d = vandq_u32(vclezq_f32(n), vdupq_n_u32(0x82000000));
1165
    const float32x4_t s1 = vreinterpretq_f32_u32(vaddq_u32(d, vdupq_n_u32(0x7f000000)));
1166
    const float32x4_t s2 = vreinterpretq_f32_u32(vsubq_u32(e, d));
1167
    return vbslq_f32(vcagtq_f32(n, vdupq_n_f32(192)), vmulq_f32(s1, s1),
1168
                     vbslq_f32(c, vmulq_f32(vfmaq_f32(s2, s2, j), s1), vfmaq_f32(k, k, j)));
1169
}
1170
1171
// computes silu x/(1+exp(-x)) in single precision vector
1172
inline static float32x4_t ggml_v_silu(float32x4_t x) {
1173
    const float32x4_t one = vdupq_n_f32(1.0f);
1174
    const float32x4_t zero = vdupq_n_f32(0.0f);
1175
    const float32x4_t neg_x = vsubq_f32(zero, x);
1176
    const float32x4_t exp_neg_x = ggml_v_expf(neg_x);
1177
    const float32x4_t one_plus_exp_neg_x = vaddq_f32(one, exp_neg_x);
1178
    return vdivq_f32(x, one_plus_exp_neg_x);
1179
}
1180
1181
#elif defined(__AVX512F__) && defined(__AVX512DQ__)
1182
1183
// adapted from arm limited optimized routine
1184
// the maximum error is 1.45358 plus 0.5 ulps
1185
// numbers above 88.38 will flush to infinity
1186
// numbers beneath -103.97 will flush to zero
1187
inline static __m512 ggml_v_expf(__m512 x) {
1188
  const __m512 r = _mm512_set1_ps(0x1.8p23f);
1189
  const __m512 z = _mm512_fmadd_ps(x, _mm512_set1_ps(0x1.715476p+0f), r);
1190
  const __m512 n = _mm512_sub_ps(z, r);
1191
  const __m512 b =
1192
      _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f),
1193
                       _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x));
1194
  const __mmask16 d =
1195
      _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(192), _CMP_GT_OQ);
1196
  const __m512 u = _mm512_mul_ps(b, b);
1197
  const __m512 j = _mm512_fmadd_ps(
1198
      _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b,
1199
                                      _mm512_set1_ps(0x1.573e2ep-5f)),
1200
                      u,
1201
                      _mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b,
1202
                                      _mm512_set1_ps(0x1.fffdb6p-2f))),
1203
      u,
1204
      _mm512_fmadd_ps(_mm512_set1_ps(0x1.ffffecp-1f), b, _mm512_set1_ps(1.0F)));
1205
  const __m512 res = _mm512_scalef_ps(j, n);
1206
  if (_mm512_kortestz(d, d))
1207
    return res;
1208
  const __m512 zero = _mm512_setzero_ps();
1209
  const __m512 alt = _mm512_mask_blend_ps(
1210
      _mm512_cmp_ps_mask(n, zero, _CMP_LE_OQ), _mm512_set1_ps(INFINITY), zero);
1211
  return _mm512_mask_blend_ps(d, res, alt);
1212
}
1213
1214
// computes silu x/(1+exp(-x)) in single precision vector
1215
inline static __m512 ggml_v_silu(__m512 x) {
1216
    const __m512 one = _mm512_set1_ps(1);
1217
    const __m512 zero = _mm512_setzero_ps();
1218
    const __m512 neg_x = _mm512_sub_ps(zero, x);
1219
    const __m512 exp_neg_x = ggml_v_expf(neg_x);
1220
    const __m512 one_plus_exp_neg_x = _mm512_add_ps(one, exp_neg_x);
1221
    return _mm512_div_ps(x, one_plus_exp_neg_x);
1222
}
1223
1224
#elif defined(__AVX2__) && defined(__FMA__)
1225
1226
// adapted from arm limited optimized routine
1227
// the maximum error is 1.45358 plus 0.5 ulps
1228
// numbers above 88.38 will flush to infinity
1229
// numbers beneath -103.97 will flush to zero
1230
0
inline static __m256 ggml_v_expf(__m256 x) {
1231
0
  const __m256 r = _mm256_set1_ps(0x1.8p23f);
1232
0
  const __m256 z = _mm256_fmadd_ps(x, _mm256_set1_ps(0x1.715476p+0f), r);
1233
0
  const __m256 n = _mm256_sub_ps(z, r);
1234
0
  const __m256 b = _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.7f7d1cp-20f),
1235
0
                                    _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.62e4p-1f), x));
1236
0
  const __m256i e = _mm256_slli_epi32(_mm256_castps_si256(z), 23);
1237
0
  const __m256 k = _mm256_castsi256_ps(
1238
0
      _mm256_add_epi32(e, _mm256_castps_si256(_mm256_set1_ps(1))));
1239
0
  const __m256i c = _mm256_castps_si256(
1240
0
      _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
1241
0
                    _mm256_set1_ps(126), _CMP_GT_OQ));
1242
0
  const __m256 u = _mm256_mul_ps(b, b);
1243
0
  const __m256 j = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_set1_ps(0x1.0e4020p-7f), b,
1244
0
                                                                   _mm256_set1_ps(0x1.573e2ep-5f)), u,
1245
0
                                                   _mm256_fmadd_ps(_mm256_set1_ps(0x1.555e66p-3f), b,
1246
0
                                                                   _mm256_set1_ps(0x1.fffdb6p-2f))),
1247
0
                                   u, _mm256_mul_ps(_mm256_set1_ps(0x1.ffffecp-1f), b));
1248
0
  if (!_mm256_movemask_ps(_mm256_castsi256_ps(c)))
1249
0
    return _mm256_fmadd_ps(j, k, k);
1250
0
  const __m256i g = _mm256_and_si256(
1251
0
      _mm256_castps_si256(_mm256_cmp_ps(n, _mm256_setzero_ps(), _CMP_LE_OQ)),
1252
0
      _mm256_set1_epi32(0x82000000u));
1253
0
  const __m256 s1 =
1254
0
      _mm256_castsi256_ps(_mm256_add_epi32(g, _mm256_set1_epi32(0x7f000000u)));
1255
0
  const __m256 s2 = _mm256_castsi256_ps(_mm256_sub_epi32(e, g));
1256
0
  const __m256i d = _mm256_castps_si256(
1257
0
      _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
1258
0
                    _mm256_set1_ps(192), _CMP_GT_OQ));
1259
0
  return _mm256_or_ps(
1260
0
      _mm256_and_ps(_mm256_castsi256_ps(d), _mm256_mul_ps(s1, s1)),
1261
0
      _mm256_andnot_ps(
1262
0
          _mm256_castsi256_ps(d),
1263
0
          _mm256_or_ps(
1264
0
              _mm256_and_ps(_mm256_castsi256_ps(c),
1265
0
                            _mm256_mul_ps(_mm256_fmadd_ps(s2, j, s2), s1)),
1266
0
              _mm256_andnot_ps(_mm256_castsi256_ps(c), _mm256_fmadd_ps(k, j, k)))));
1267
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_v_expf
Unexecuted instantiation: vec.cpp:ggml_v_expf(float __vector(8))
Unexecuted instantiation: ops.cpp:ggml_v_expf(float __vector(8))
1268
1269
// computes silu x/(1+exp(-x)) in single precision vector
1270
0
inline static __m256 ggml_v_silu(__m256 x) {
1271
0
    const __m256 one = _mm256_set1_ps(1);
1272
0
    const __m256 zero = _mm256_setzero_ps();
1273
0
    const __m256 neg_x = _mm256_sub_ps(zero, x);
1274
0
    const __m256 exp_neg_x = ggml_v_expf(neg_x);
1275
0
    const __m256 one_plus_exp_neg_x = _mm256_add_ps(one, exp_neg_x);
1276
0
    return _mm256_div_ps(x, one_plus_exp_neg_x);
1277
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_v_silu
Unexecuted instantiation: vec.cpp:ggml_v_silu(float __vector(8))
Unexecuted instantiation: ops.cpp:ggml_v_silu(float __vector(8))
1278
1279
#elif defined(__SSE2__) // __AVX2__ / __ARM_NEON
1280
1281
#if defined(__FMA__)
1282
#define MADD128(x, y, z) _mm_fmadd_ps(x, y, z)
1283
#define NMADD128(x, y, z) _mm_fnmadd_ps(x, y, z)
1284
#else
1285
#define MADD128(x, y, z) _mm_add_ps(_mm_mul_ps(x, y), z)
1286
#define NMADD128(x, y, z) _mm_sub_ps(z, _mm_mul_ps(x, y))
1287
#endif
1288
1289
// adapted from arm limited optimized routine
1290
// the maximum error is 1.45358 plus 0.5 ulps
1291
// numbers above 88.38 will flush to infinity
1292
// numbers beneath -103.97 will flush to zero
1293
inline static __m128 ggml_v_expf(__m128 x) {
1294
    const __m128 r = _mm_set1_ps(0x1.8p23f);
1295
    const __m128 z = MADD128(x, _mm_set1_ps(0x1.715476p+0f), r);
1296
    const __m128 n = _mm_sub_ps(z, r);
1297
    const __m128 b =
1298
        NMADD128(n, _mm_set1_ps(0x1.7f7d1cp-20f), NMADD128(n, _mm_set1_ps(0x1.62e4p-1f), x));
1299
    const __m128i e = _mm_slli_epi32(_mm_castps_si128(z), 23);
1300
    const __m128 k = _mm_castsi128_ps(_mm_add_epi32(e, _mm_castps_si128(_mm_set1_ps(1))));
1301
    const __m128i c =
1302
        _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(126)));
1303
    const __m128 u = _mm_mul_ps(b, b);
1304
    const __m128 j =
1305
        MADD128(MADD128(MADD128(_mm_set1_ps(0x1.0e4020p-7f), b, _mm_set1_ps(0x1.573e2ep-5f)), u,
1306
                        MADD128(_mm_set1_ps(0x1.555e66p-3f), b, _mm_set1_ps(0x1.fffdb6p-2f))),
1307
                u, _mm_mul_ps(_mm_set1_ps(0x1.ffffecp-1f), b));
1308
    if (!_mm_movemask_epi8(c))
1309
        return MADD128(j, k, k);
1310
    const __m128i g = _mm_and_si128(_mm_castps_si128(_mm_cmple_ps(n, _mm_setzero_ps())),
1311
                                    _mm_set1_epi32(0x82000000u));
1312
    const __m128 s1 = _mm_castsi128_ps(_mm_add_epi32(g, _mm_set1_epi32(0x7f000000u)));
1313
    const __m128 s2 = _mm_castsi128_ps(_mm_sub_epi32(e, g));
1314
    const __m128i d =
1315
        _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(192)));
1316
    return _mm_or_ps(
1317
        _mm_and_ps(_mm_castsi128_ps(d), _mm_mul_ps(s1, s1)),
1318
        _mm_andnot_ps(_mm_castsi128_ps(d),
1319
                      _mm_or_ps(_mm_and_ps(_mm_castsi128_ps(c), _mm_mul_ps(MADD128(s2, j, s2), s1)),
1320
                                _mm_andnot_ps(_mm_castsi128_ps(c), MADD128(k, j, k)))));
1321
}
1322
1323
// computes silu x/(1+exp(-x)) in single precision vector
1324
inline static __m128 ggml_v_silu(__m128 x) {
1325
    const __m128 one = _mm_set1_ps(1);
1326
    const __m128 zero = _mm_setzero_ps();
1327
    const __m128 neg_x = _mm_sub_ps(zero, x);
1328
    const __m128 exp_neg_x = ggml_v_expf(neg_x);
1329
    const __m128 one_plus_exp_neg_x = _mm_add_ps(one, exp_neg_x);
1330
    return _mm_div_ps(x, one_plus_exp_neg_x);
1331
}
1332
1333
#elif defined(__riscv_v_intrinsic)
1334
1335
// adapted from arm limited optimized routine
1336
// the maximum error is 1.45358 plus 0.5 ulps
1337
// numbers above 88.38 will flush to infinity
1338
// numbers beneath -103.97 will flush to zero
1339
inline static vfloat32m2_t ggml_v_expf_m2(vfloat32m2_t x, int vl) {
1340
    const vfloat32m2_t r = __riscv_vfmv_v_f_f32m2(0x1.8p23f, vl);
1341
#ifdef __riscv_xtheadvector
1342
    // workaround for compiler bug (gcc 14.3.0: Error: unrecognized opcode `th.vmv1r.v v2,v4')
1343
    vfloat32m2_t z = __riscv_vfadd_vf_f32m2(r, 0.0f, vl);
1344
    z = __riscv_vfmacc_vf_f32m2(z, 0x1.715476p+0f, x, vl);
1345
#else
1346
    const vfloat32m2_t z = __riscv_vfmacc_vf_f32m2(r, 0x1.715476p+0f, x, vl);
1347
#endif
1348
    const vfloat32m2_t n = __riscv_vfsub_vv_f32m2(z, r, vl);
1349
    const vfloat32m2_t b = __riscv_vfnmsac_vf_f32m2(__riscv_vfnmsac_vf_f32m2(x, 0x1.62e4p-1f, n, vl),
1350
                                                    0x1.7f7d1cp-20f, n, vl);
1351
    const vuint32m2_t e = __riscv_vsll_vx_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(z), 23, vl);
1352
    const vfloat32m2_t k = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vadd_vx_u32m2(e, 0x3f800000, vl)); // 1.0f
1353
    const vbool16_t c = __riscv_vmfgt_vf_f32m2_b16(__riscv_vfabs_v_f32m2(n, vl), 126.0f, vl);
1354
    const vfloat32m2_t u = __riscv_vfmul_vv_f32m2(b, b, vl);
1355
    const vfloat32m2_t j = __riscv_vfmacc_vv_f32m2(
1356
        __riscv_vfmul_vf_f32m2(b, 0x1.ffffecp-1f, vl),
1357
        __riscv_vfmacc_vv_f32m2(
1358
            __riscv_vfmacc_vf_f32m2(__riscv_vfmv_v_f_f32m2(0x1.fffdb6p-2f, vl), 0x1.555e66p-3f, b, vl),
1359
            __riscv_vfmacc_vf_f32m2(__riscv_vfmv_v_f_f32m2(0x1.573e2ep-5f, vl), 0x1.0e4020p-7f, b, vl),
1360
            u, vl), u, vl);
1361
    if (!__riscv_vcpop_m_b16(c, vl))
1362
        return __riscv_vfmacc_vv_f32m2(k, j, k, vl);
1363
    const vbool16_t  dm = __riscv_vmfle_vf_f32m2_b16(n, 0.0f, vl);
1364
    const vuint32m2_t d = __riscv_vmerge_vxm_u32m2(__riscv_vmv_v_x_u32m2(0, vl), 0x82000000, dm, vl);
1365
    const vfloat32m2_t s1 = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vadd_vx_u32m2(d, 0x7f000000, vl));
1366
    const vfloat32m2_t s2 = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vsub_vv_u32m2(e, d, vl));
1367
    const vfloat32m2_t r1 = __riscv_vmerge_vvm_f32m2(
1368
        __riscv_vfmacc_vv_f32m2(k, k, j, vl),
1369
        __riscv_vfmul_vv_f32m2(__riscv_vfmacc_vv_f32m2(s2, s2, j, vl), s1, vl),
1370
        c, vl);
1371
    return __riscv_vmerge_vvm_f32m2(
1372
        r1, __riscv_vfmul_vv_f32m2(s1, s1, vl),
1373
        __riscv_vmfgt_vf_f32m2_b16(__riscv_vfabs_v_f32m2(n, vl), 192.0f, vl),
1374
        vl);
1375
}
1376
1377
// computes silu x/(1+exp(-x)) in single precision vector
1378
inline static vfloat32m2_t ggml_v_silu_m2(vfloat32m2_t x, int vl) {
1379
    const vfloat32m2_t neg_x = __riscv_vfneg_v_f32m2(x, vl);
1380
    const vfloat32m2_t exp_neg_x = ggml_v_expf_m2(neg_x, vl);
1381
    const vfloat32m2_t one_plus_exp_neg_x = __riscv_vfadd_vf_f32m2(exp_neg_x, 1.0f, vl);
1382
    return __riscv_vfdiv_vv_f32m2(x, one_plus_exp_neg_x, vl);
1383
}
1384
1385
#endif // __ARM_NEON / __AVX2__ / __SSE2__ / __riscv_v_intrinsic
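All of the ggml_v_expf variants above share one scalar algorithm: write x = n*ln2 + b with integer n chosen by round-to-nearest, build 2^n by placing n directly into the exponent field of a float, and evaluate a short polynomial for expm1(b) on the small remainder. The rounding is the classic magic-number trick: adding 0x1.8p23f (1.5*2^23) forces the integer part of x*log2(e) into the low mantissa bits. A scalar model under those assumptions; it omits the |n| > 126 overflow/underflow handling that the SIMD versions perform with the s1/s2 scaling:

#include <stdint.h>
#include <string.h>

static float expf_sketch(float x) {
    const float r = 0x1.8p23f;               // 1.5 * 2^23
    const float z = x*0x1.715476p+0f + r;    // x*log2(e), biased into [2^23, 2^24)
    const float n = z - r;                   // round-to-nearest of x*log2(e)
    // b = x - n*ln2, with ln2 split hi/lo for extra precision
    const float b = x - n*0x1.62e4p-1f - n*0x1.7f7d1cp-20f;

    uint32_t zi;
    memcpy(&zi, &z, sizeof(zi));
    const uint32_t e = zi << 23;             // n shifted into the exponent field

    uint32_t ki = e + 0x3f800000u;           // add the bit pattern of 1.0f -> 2^n
    float k;
    memcpy(&k, &ki, sizeof(k));

    // polynomial j ~= expm1(b) for |b| <= ln2/2, same coefficients as above
    const float u = b*b;
    const float j = 0x1.ffffecp-1f*b
                  + ((0x1.fffdb6p-2f + 0x1.555e66p-3f*b)
                  +  (0x1.573e2ep-5f + 0x1.0e4020p-7f*b)*u)*u;
    return k + k*j;                          // 2^n * (1 + expm1(b))
}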
1386
1387
0
inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
1388
0
    for (int i = 0; i < n; ++i) {
1389
0
        y[i] = ggml_silu_f16(x[i]);
1390
0
    }
1391
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_silu_f16
Unexecuted instantiation: vec.cpp:ggml_vec_silu_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_silu_f16(int, unsigned short*, unsigned short const*)
1392
1393
0
inline static float ggml_silu_backward_f32(float x, float dy) {
1394
0
    const float s = 1.0f/(1.0f + expf(-x));
1395
0
    return dy*s*(1.0f + x*(1.0f - s));
1396
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_silu_backward_f32
Unexecuted instantiation: vec.cpp:ggml_silu_backward_f32(float, float)
Unexecuted instantiation: ops.cpp:ggml_silu_backward_f32(float, float)
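The factor in ggml_silu_backward_f32 is the closed-form SiLU derivative: with s = sigma(x), silu(x) = x*s and s' = s*(1 - s), so silu'(x) = s + x*s*(1 - s) = s*(1 + x*(1 - s)), which is then multiplied by the incoming gradient dy. A quick finite-difference check of that identity:

#include <math.h>
#include <stdio.h>

static float silu(float x) { return x / (1.0f + expf(-x)); }

static float silu_grad(float x) {            // s*(1 + x*(1 - s))
    const float s = 1.0f / (1.0f + expf(-x));
    return s * (1.0f + x * (1.0f - s));
}

int main(void) {
    const float x = 0.7f, h = 1e-3f;
    const float numeric = (silu(x + h) - silu(x - h)) / (2.0f * h);
    printf("analytic=%f numeric=%f\n", silu_grad(x), numeric); // should agree to ~1e-4
    return 0;
}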
1397
1398
0
inline static ggml_fp16_t ggml_silu_backward_f16(ggml_fp16_t x, ggml_fp16_t dy) {
1399
0
    const float v = GGML_CPU_FP16_TO_FP32(x);
1400
0
    const float s = 1.0f/(1.0f + expf(-v));
1401
0
    return GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(dy)*s*(1.0f + v*(1.0f - s)));
1402
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_silu_backward_f16
Unexecuted instantiation: vec.cpp:ggml_silu_backward_f16(unsigned short, unsigned short)
Unexecuted instantiation: ops.cpp:ggml_silu_backward_f16(unsigned short, unsigned short)
1403
1404
0
inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
1405
0
    for (int i = 0; i < n; ++i) {
1406
0
        dx[i] = ggml_silu_backward_f32(x[i], dy[i]);
1407
0
    }
1408
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_silu_backward_f32
Unexecuted instantiation: vec.cpp:ggml_vec_silu_backward_f32(int, float*, float const*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_silu_backward_f32(int, float*, float const*, float const*)
1409
1410
0
inline static void ggml_vec_silu_backward_f16(const int n, ggml_fp16_t * dx, const ggml_fp16_t * x, const ggml_fp16_t * dy) {
1411
0
    for (int i = 0; i < n; ++i) {
1412
0
        dx[i] = ggml_silu_backward_f16(x[i], dy[i]);
1413
0
    }
1414
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_silu_backward_f16
Unexecuted instantiation: vec.cpp:ggml_vec_silu_backward_f16(int, unsigned short*, unsigned short const*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_silu_backward_f16(int, unsigned short*, unsigned short const*, unsigned short const*)
1415
1416
0
inline static void ggml_vec_reglu_f32 (const int n, float * y, const float * x, const float * g) {
1417
0
    for (int i = 0; i < n; ++i) {
1418
0
        y[i] = (x[i] > 0.f) ? x[i] * g[i] : 0.f;
1419
0
    }
1420
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_reglu_f32
Unexecuted instantiation: vec.cpp:ggml_vec_reglu_f32(int, float*, float const*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_reglu_f32(int, float*, float const*, float const*)
1421
1422
0
inline static void ggml_vec_reglu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
1423
0
    for (int i = 0; i < n; ++i) {
1424
0
        float v = GGML_CPU_FP16_TO_FP32(x[i]);
1425
0
        y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v * GGML_CPU_FP16_TO_FP32(g[i]) : 0.f);
1426
0
    }
1427
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_reglu_f16
Unexecuted instantiation: vec.cpp:ggml_vec_reglu_f16(int, unsigned short*, unsigned short const*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_reglu_f16(int, unsigned short*, unsigned short const*, unsigned short const*)
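ggml_vec_reglu_* is the ReLU instance of the gated-linear-unit pattern that the following geglu/swiglu kernels repeat: the output is act(x[i]) * g[i], where x and g are the two halves of a gated projection. The shared shape, as a sketch (the real kernels inline each activation rather than taking a function pointer, which matters in hot loops):

typedef float (*act_fn)(float);

// generic GLU shape: y[i] = act(x[i]) * g[i]
static void vec_glu_f32(int n, float * y, const float * x, const float * g, act_fn act) {
    for (int i = 0; i < n; ++i) {
        y[i] = act(x[i]) * g[i];
    }
}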
1428
1429
#ifdef GGML_GELU_FP16
1430
0
inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x, const float * g) {
1431
0
    uint16_t t;
1432
0
    for (int i = 0; i < n; ++i) {
1433
0
        if (x[i] <= -10.0f) {
1434
0
            y[i] = 0.0f;
1435
0
        } else if (x[i] >= 10.0f) {
1436
0
            y[i] = x[i] * g[i];
1437
0
        } else {
1438
0
            ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
1439
0
            memcpy(&t, &fp16, sizeof(uint16_t));
1440
0
            y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[t]) * g[i];
1441
0
        }
1442
0
    }
1443
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_geglu_f32
Unexecuted instantiation: vec.cpp:ggml_vec_geglu_f32(int, float*, float const*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_geglu_f32(int, float*, float const*, float const*)
1444
#else
1445
inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x, const float * g) {
1446
    for (int i = 0; i < n; ++i) {
1447
        y[i] = ggml_gelu_f32(x[i]) * g[i];
1448
    }
1449
}
1450
#endif
1451
1452
0
inline static void ggml_vec_geglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
1453
0
    const uint16_t * i16 = (const uint16_t *) x;
1454
0
    for (int i = 0; i < n; ++i) {
1455
0
        float v = GGML_CPU_FP16_TO_FP32(g[i]);
1456
0
        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[i16[i]]) * v);
1457
0
    }
1458
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_geglu_f16
Unexecuted instantiation: vec.cpp:ggml_vec_geglu_f16(int, unsigned short*, unsigned short const*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_geglu_f16(int, unsigned short*, unsigned short const*, unsigned short const*)
1459
1460
void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float * g);
1461
1462
0
inline static void ggml_vec_swiglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
1463
0
    for (int i = 0; i < n; ++i) {
1464
0
        float xi = GGML_CPU_FP16_TO_FP32(x[i]);
1465
0
        float gi = GGML_CPU_FP16_TO_FP32(g[i]);
1466
0
        y[i] = GGML_CPU_FP32_TO_FP16((xi/(1.0f + expf(-xi))) * gi);
1467
0
    }
1468
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_swiglu_f16
Unexecuted instantiation: vec.cpp:ggml_vec_swiglu_f16(int, unsigned short*, unsigned short const*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_swiglu_f16(int, unsigned short*, unsigned short const*, unsigned short const*)
1469
1470
0
inline static void ggml_vec_geglu_erf_f32(const int n, float * y, const float * x, const float * g) {
1471
0
    for (int i = 0; i < n; ++i) {
1472
0
        float xi = x[i];
1473
0
        y[i] = 0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * g[i];
1474
0
    }
1475
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_geglu_erf_f32
Unexecuted instantiation: vec.cpp:ggml_vec_geglu_erf_f32(int, float*, float const*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_geglu_erf_f32(int, float*, float const*, float const*)
1476
1477
0
inline static void ggml_vec_geglu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
1478
0
    for (int i = 0; i < n; ++i) {
1479
0
        float xi = GGML_CPU_FP16_TO_FP32(x[i]);
1480
0
        float gi = GGML_CPU_FP16_TO_FP32(g[i]);
1481
0
        y[i] = GGML_CPU_FP32_TO_FP16(0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * gi);
1482
0
    }
1483
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_geglu_erf_f16
Unexecuted instantiation: vec.cpp:ggml_vec_geglu_erf_f16(int, unsigned short*, unsigned short const*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_geglu_erf_f16(int, unsigned short*, unsigned short const*, unsigned short const*)
1484
1485
#ifdef GGML_GELU_QUICK_FP16
1486
0
inline static void ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
1487
0
    uint16_t t;
1488
0
    for (int i = 0; i < n; ++i) {
1489
0
        ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
1490
0
        memcpy(&t, &fp16, sizeof(uint16_t));
1491
0
        y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]) * g[i];
1492
0
    }
1493
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_geglu_quick_f32
Unexecuted instantiation: vec.cpp:ggml_vec_geglu_quick_f32(int, float*, float const*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_geglu_quick_f32(int, float*, float const*, float const*)
1494
#else
1495
inline static void ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
1496
    for (int i = 0; i < n; ++i) {
1497
        y[i] = ggml_gelu_quick_f32(x[i]) * g[i];
1498
    }
1499
}
1500
#endif
1501
1502
0
inline static void ggml_vec_geglu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
1503
0
    const uint16_t * i16 = (const uint16_t *) x;
1504
0
    for (int i = 0; i < n; ++i) {
1505
0
        float v = GGML_CPU_FP16_TO_FP32(g[i]);
1506
0
        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[i16[i]]) * v);
1507
0
    }
1508
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_geglu_quick_f16
Unexecuted instantiation: vec.cpp:ggml_vec_geglu_quick_f16(int, unsigned short*, unsigned short const*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_geglu_quick_f16(int, unsigned short*, unsigned short const*, unsigned short const*)
1509
1510
0
inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
1511
0
#ifndef GGML_USE_ACCELERATE
1512
0
    ggml_float sum = 0.0;
1513
0
    for (int i = 0; i < n; ++i) {
1514
0
        sum += (ggml_float)x[i];
1515
0
    }
1516
0
    *s = (float)sum;
1517
#else
1518
    vDSP_sve(x, 1, s, n);
1519
#endif
1520
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_sum_f32
Unexecuted instantiation: vec.cpp:ggml_vec_sum_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_sum_f32(int, float*, float const*)
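ggml_vec_sum_f32 accumulates in ggml_float (double) and narrows only at the end: once a running f32 sum grows large relative to the addends, each addition discards low-order bits. A small demonstration of the drift this avoids:

#include <stdio.h>

int main(void) {
    float  sum_f32 = 0.0f;
    double sum_f64 = 0.0;
    for (int i = 0; i < 10000000; ++i) {       // 1e7 additions of 0.1f
        sum_f32 += 0.1f;
        sum_f64 += (double) 0.1f;
    }
    printf("f32 accumulator: %f\n", sum_f32);  // accumulates visible rounding error
    printf("f64 accumulator: %f\n", sum_f64);  // stays close to 1e6
    return 0;
}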
1521
1522
0
inline static void ggml_vec_cumsum_f32(const int n, float * y, const float * x) {
1523
0
    for (int i = 0; i < n; ++i) {
1524
0
        if (i == 0) {
1525
0
            y[i] = x[i];
1526
0
        } else {
1527
0
            y[i] = y[i - 1] + x[i];
1528
0
        }
1529
0
    }
1530
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_cumsum_f32
Unexecuted instantiation: vec.cpp:ggml_vec_cumsum_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_cumsum_f32(int, float*, float const*)
1531
1532
0
inline static void ggml_vec_sum_f32_ggf(const int n, ggml_float * s, const float * x) {
1533
0
    ggml_float sum = 0.0;
1534
0
    for (int i = 0; i < n; ++i) {
1535
0
        sum += (ggml_float)x[i];
1536
0
    }
1537
0
    *s = sum;
1538
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_sum_f32_ggf
Unexecuted instantiation: vec.cpp:ggml_vec_sum_f32_ggf(int, double*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_sum_f32_ggf(int, double*, float const*)
1539
1540
0
inline static void ggml_vec_sum_f16_ggf(const int n, float * s, const ggml_fp16_t * x) {
1541
0
    float sum = 0.0f;
1542
0
    for (int i = 0; i < n; ++i) {
1543
0
        sum += GGML_CPU_FP16_TO_FP32(x[i]);
1544
0
    }
1545
0
    *s = sum;
1546
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_sum_f16_ggf
Unexecuted instantiation: vec.cpp:ggml_vec_sum_f16_ggf(int, float*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_sum_f16_ggf(int, float*, unsigned short const*)
1547
1548
0
inline static void ggml_vec_sum_bf16_ggf(const int n, float * s, const ggml_bf16_t * x) {
1549
0
    float sum = 0.0f;
1550
0
    for (int i = 0; i < n; ++i) {
1551
0
        sum += GGML_BF16_TO_FP32(x[i]);
1552
0
    }
1553
0
    *s = sum;
1554
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_sum_bf16_ggf
Unexecuted instantiation: vec.cpp:ggml_vec_sum_bf16_ggf(int, float*, ggml_bf16_t const*)
Unexecuted instantiation: ops.cpp:ggml_vec_sum_bf16_ggf(int, float*, ggml_bf16_t const*)
1555
1556
0
inline static void ggml_vec_max_f32(const int n, float * s, const float * x) {
1557
0
#ifndef GGML_USE_ACCELERATE
1558
0
    float max = -INFINITY;
1559
0
    for (int i = 0; i < n; ++i) {
1560
0
        max = MAX(max, x[i]);
1561
0
    }
1562
0
    *s = max;
1563
#else
1564
    vDSP_maxv(x, 1, s, n);
1565
#endif
1566
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_max_f32
Unexecuted instantiation: vec.cpp:ggml_vec_max_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_max_f32(int, float*, float const*)
1567
1568
0
inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x) {
1569
0
    ggml_vec_norm_f32(n, s, x);
1570
0
    *s = 1.f/(*s);
1571
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_norm_inv_f32
Unexecuted instantiation: vec.cpp:ggml_vec_norm_inv_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_norm_inv_f32(int, float*, float const*)
1572
1573
0
inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) {
1574
0
    float max = -INFINITY;
1575
0
    int idx = 0;
1576
0
    for (int i = 0; i < n; ++i) {
1577
0
        max = MAX(max, x[i]);
1578
0
        if (max == x[i]) { idx = i; }
1579
0
    }
1580
0
    *s = idx;
1581
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_argmax_f32
Unexecuted instantiation: vec.cpp:ggml_vec_argmax_f32(int, int*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_argmax_f32(int, int*, float const*)
1582
1583
#ifdef __cplusplus
1584
}
1585
#endif