Coverage Report

Created: 2025-12-14 06:24

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/llama.cpp/ggml/src/ggml-cpu/vec.h
Line
Count
Source
1
// Vectorized functions for fundamental operations
2
3
#pragma once
4
5
#include "ggml-impl.h"
6
#include "simd-mappings.h"
7
#include "ggml.h"
8
#include "ggml-cpu.h"
9
10
#if defined(GGML_USE_ACCELERATE)
11
#include <Accelerate/Accelerate.h>
12
#endif
13
14
// floating point type used to accumulate sums
15
typedef double ggml_float;
16
17
#define GGML_GELU_FP16
18
#define GGML_GELU_QUICK_FP16
19
20
0
#define GGML_SOFT_MAX_UNROLL 4
21
#define GGML_VEC_DOT_UNROLL  2
22
0
#define GGML_VEC_MAD_UNROLL  32
23
24
#ifdef __cplusplus
25
extern "C" {
26
#endif
27
28
//
29
// global data
30
//
31
32
// precomputed gelu table for f16 (128 KB)
33
extern ggml_fp16_t ggml_table_gelu_f16[1 << 16];
34
35
// precomputed quick gelu table for f16 (128 KB)
36
extern ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
37
38
//
39
// fundamental operations
40
//
41
42
void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc);
43
void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc);
44
void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc);
45
46
void ggml_vec_silu_f32(const int n, float * y, const float * x);
47
ggml_float ggml_vec_cvar_f32(const int n, float * y, const float * x, const float mean); //it will also center y ( y = y - mean )
48
ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max);
49
ggml_float ggml_vec_log_soft_max_f32(const int n, float * y, const float * x, float max);
50
51
0
inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_set_i8
Unexecuted instantiation: vec.cpp:ggml_vec_set_i8(int, signed char*, signed char)
Unexecuted instantiation: ops.cpp:ggml_vec_set_i8(int, signed char*, signed char)
52
0
inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_set_i16
Unexecuted instantiation: vec.cpp:ggml_vec_set_i16(int, short*, short)
Unexecuted instantiation: ops.cpp:ggml_vec_set_i16(int, short*, short)
53
54
0
inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t   v) { for (int i = 0; i < n; ++i) x[i] = v;    }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_set_i32
Unexecuted instantiation: vec.cpp:ggml_vec_set_i32(int, int*, int)
Unexecuted instantiation: ops.cpp:ggml_vec_set_i32(int, int*, int)
55
0
inline static void ggml_vec_cpy_i32(const int n, int32_t * y, const int32_t * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_cpy_i32
Unexecuted instantiation: vec.cpp:ggml_vec_cpy_i32(int, int*, int const*)
Unexecuted instantiation: ops.cpp:ggml_vec_cpy_i32(int, int*, int const*)
56
57
0
inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const ggml_fp16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_set_f16
Unexecuted instantiation: vec.cpp:ggml_vec_set_f16(int, unsigned short*, unsigned short)
Unexecuted instantiation: ops.cpp:ggml_vec_set_f16(int, unsigned short*, unsigned short)
58
0
inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_set_bf16
Unexecuted instantiation: vec.cpp:ggml_vec_set_bf16(int, ggml_bf16_t*, ggml_bf16_t)
Unexecuted instantiation: ops.cpp:ggml_vec_set_bf16(int, ggml_bf16_t*, ggml_bf16_t)
59
60
0
inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) {
61
0
    int i = 0;
62
0
#if defined(__AVX2__)
63
0
    for (; i + 7 < n; i += 8) {
64
0
        __m256 vx = _mm256_loadu_ps(x + i);
65
0
        __m256 vy = _mm256_loadu_ps(y + i);
66
0
        __m256 vz = _mm256_add_ps(vx, vy);
67
0
        _mm256_storeu_ps(z + i, vz);
68
0
    }
69
0
#endif
70
0
    for (; i < n; ++i) {
71
0
        z[i] = x[i] + y[i];
72
0
    }
73
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_add_f32
Unexecuted instantiation: vec.cpp:ggml_vec_add_f32(int, float*, float const*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_add_f32(int, float*, float const*, float const*)
74
75
0
inline static void ggml_vec_add_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
76
0
    for (int i = 0; i < n; ++i) {
77
0
        z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) + GGML_CPU_FP16_TO_FP32(y[i]));
78
0
    }
79
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_add_f16
Unexecuted instantiation: vec.cpp:ggml_vec_add_f16(int, unsigned short*, unsigned short const*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_add_f16(int, unsigned short*, unsigned short const*, unsigned short const*)
80
0
inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float   v) { for (int i = 0; i < n; ++i) z[i]  = x[i] + v;    }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_add1_f32
Unexecuted instantiation: vec.cpp:ggml_vec_add1_f32(int, float*, float const*, float)
Unexecuted instantiation: ops.cpp:ggml_vec_add1_f32(int, float*, float const*, float)
81
0
inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x)                  { for (int i = 0; i < n; ++i) y[i] += x[i];        }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_acc_f32
Unexecuted instantiation: vec.cpp:ggml_vec_acc_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_acc_f32(int, float*, float const*)
82
0
inline static void ggml_vec_acc1_f32(const int n, float * y, const float   v)                  { for (int i = 0; i < n; ++i) y[i] += v;           }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_acc1_f32
Unexecuted instantiation: vec.cpp:ggml_vec_acc1_f32(int, float*, float)
Unexecuted instantiation: ops.cpp:ggml_vec_acc1_f32(int, float*, float)
83
0
inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i] - y[i]; }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_sub_f32
Unexecuted instantiation: vec.cpp:ggml_vec_sub_f32(int, float*, float const*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_sub_f32(int, float*, float const*, float const*)
84
0
inline static void ggml_vec_sub_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
85
0
    for (int i = 0; i < n; ++i) {
86
0
        z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) - GGML_CPU_FP16_TO_FP32(y[i]));
87
0
    }
88
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_sub_f16
Unexecuted instantiation: vec.cpp:ggml_vec_sub_f16(int, unsigned short*, unsigned short const*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_sub_f16(int, unsigned short*, unsigned short const*, unsigned short const*)
89
0
inline static void ggml_vec_set_f32 (const int n, float * x, const float   v)                  { for (int i = 0; i < n; ++i) x[i]  = v;           }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_set_f32
Unexecuted instantiation: vec.cpp:ggml_vec_set_f32(int, float*, float)
Unexecuted instantiation: ops.cpp:ggml_vec_set_f32(int, float*, float)
90
0
inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x)                  { for (int i = 0; i < n; ++i) y[i]  = x[i];        }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_cpy_f32
Unexecuted instantiation: vec.cpp:ggml_vec_cpy_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_cpy_f32(int, float*, float const*)
91
0
inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x)                  { for (int i = 0; i < n; ++i) y[i]  = -x[i];       }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_neg_f32
Unexecuted instantiation: vec.cpp:ggml_vec_neg_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_neg_f32(int, float*, float const*)
92
0
inline static void ggml_vec_neg_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
93
0
    for (int i = 0; i < n; ++i) {
94
0
        y[i] = GGML_CPU_FP32_TO_FP16(-GGML_CPU_FP16_TO_FP32(x[i]));
95
0
    }
96
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_neg_f16
Unexecuted instantiation: vec.cpp:ggml_vec_neg_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_neg_f16(int, unsigned short*, unsigned short const*)
97
98
0
inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i]*y[i];   }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_mul_f32
Unexecuted instantiation: vec.cpp:ggml_vec_mul_f32(int, float*, float const*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_mul_f32(int, float*, float const*, float const*)
99
0
inline static void ggml_vec_mul_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
100
0
    for (int i = 0; i < n; ++i) {
101
0
        z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) * GGML_CPU_FP16_TO_FP32(y[i]));
102
0
    }
103
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_mul_f16
Unexecuted instantiation: vec.cpp:ggml_vec_mul_f16(int, unsigned short*, unsigned short const*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_mul_f16(int, unsigned short*, unsigned short const*, unsigned short const*)
104
0
inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i]/y[i];   }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_div_f32
Unexecuted instantiation: vec.cpp:ggml_vec_div_f32(int, float*, float const*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_div_f32(int, float*, float const*, float const*)
105
0
inline static void ggml_vec_div_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
106
0
    for (int i = 0; i < n; ++i) {
107
0
        z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) / GGML_CPU_FP16_TO_FP32(y[i]));
108
0
    }
109
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_div_f16
Unexecuted instantiation: vec.cpp:ggml_vec_div_f16(int, unsigned short*, unsigned short const*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_div_f16(int, unsigned short*, unsigned short const*, unsigned short const*)
110
111
// compute GGML_VEC_DOT_UNROLL dot products at once
112
// xs - x row stride in bytes
113
0
inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GGML_RESTRICT s, void * GGML_RESTRICT xv, ggml_fp16_t * GGML_RESTRICT y) {
114
0
    ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 };
115
0
116
0
    ggml_fp16_t * GGML_RESTRICT x[GGML_VEC_DOT_UNROLL];
117
0
118
0
    for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
119
0
        x[i] = (ggml_fp16_t *) ((char *) xv + i*xs);
120
0
    }
121
0
122
0
#if defined(GGML_SIMD)
123
0
    #if defined(__ARM_FEATURE_SVE)
124
0
125
0
        const int sve_register_length = svcntb() * 8;
126
0
        const int ggml_f16_epr = sve_register_length / 16; // running when 16
127
0
        const int ggml_f16_step = 8 * ggml_f16_epr; // choose 8 SVE registers
128
0
129
0
        const int np = (n & ~(ggml_f16_step - 1));
130
0
131
0
        svfloat16_t sum_00 = svdup_n_f16(0.0f);
132
0
        svfloat16_t sum_01 = svdup_n_f16(0.0f);
133
0
        svfloat16_t sum_02 = svdup_n_f16(0.0f);
134
0
        svfloat16_t sum_03 = svdup_n_f16(0.0f);
135
0
136
0
        svfloat16_t sum_10 = svdup_n_f16(0.0f);
137
0
        svfloat16_t sum_11 = svdup_n_f16(0.0f);
138
0
        svfloat16_t sum_12 = svdup_n_f16(0.0f);
139
0
        svfloat16_t sum_13 = svdup_n_f16(0.0f);
140
0
141
0
        svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
142
0
        svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
143
0
144
0
        for (int i = 0; i < np; i += ggml_f16_step) {
145
0
            ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0); // 8 elements
146
0
147
0
            ax1 = GGML_F16x_VEC_LOAD(x[0] + i + 0*ggml_f16_epr, 0); // 8 elements
148
0
            sum_00 = GGML_F16x_VEC_FMA(sum_00, ax1, ay1);     // sum_00 = sum_00+ax1*ay1
149
0
            ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 0*ggml_f16_epr, 0); // 8 elements
150
0
            sum_10 = GGML_F16x_VEC_FMA(sum_10, ax1, ay1);
151
0
152
0
            ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1); // next 8 elements
153
0
154
0
            ax2 = GGML_F16x_VEC_LOAD(x[0] + i + 1*ggml_f16_epr, 1); // next 8 elements
155
0
            sum_01 = GGML_F16x_VEC_FMA(sum_01, ax2, ay2);
156
0
            ax2 = GGML_F16x_VEC_LOAD(x[1] + i + 1*ggml_f16_epr, 1);
157
0
            sum_11 = GGML_F16x_VEC_FMA(sum_11, ax2, ay2);
158
0
159
0
            ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
160
0
161
0
            ax3 = GGML_F16x_VEC_LOAD(x[0] + i + 2*ggml_f16_epr, 2);
162
0
            sum_02 = GGML_F16x_VEC_FMA(sum_02, ax3, ay3);
163
0
            ax3 = GGML_F16x_VEC_LOAD(x[1] + i + 2*ggml_f16_epr, 2);
164
0
            sum_12 = GGML_F16x_VEC_FMA(sum_12, ax3, ay3);
165
0
166
0
            ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
167
0
168
0
            ax4 = GGML_F16x_VEC_LOAD(x[0] + i + 3*ggml_f16_epr, 3);
169
0
            sum_03 = GGML_F16x_VEC_FMA(sum_03, ax4, ay4);
170
0
            ax4 = GGML_F16x_VEC_LOAD(x[1] + i + 3*ggml_f16_epr, 3);
171
0
            sum_13 = GGML_F16x_VEC_FMA(sum_13, ax4, ay4);
172
0
173
0
            ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
174
0
175
0
            ax5 = GGML_F16x_VEC_LOAD(x[0] + i + 4*ggml_f16_epr, 4);
176
0
177
0
            sum_00 = GGML_F16x_VEC_FMA(sum_00, ax5, ay5);
178
0
            ax5 = GGML_F16x_VEC_LOAD(x[1] + i + 4*ggml_f16_epr, 4);
179
0
            sum_10 = GGML_F16x_VEC_FMA(sum_10, ax5, ay5);
180
0
181
0
            ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
182
0
183
0
            ax6 = GGML_F16x_VEC_LOAD(x[0] + i + 5*ggml_f16_epr, 5);
184
0
185
0
            sum_01 = GGML_F16x_VEC_FMA(sum_01, ax6, ay6);
186
0
            ax6 = GGML_F16x_VEC_LOAD(x[1] + i + 5*ggml_f16_epr, 5);
187
0
            sum_11 = GGML_F16x_VEC_FMA(sum_11, ax6, ay6);
188
0
189
0
            ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
190
0
191
0
            ax7 = GGML_F16x_VEC_LOAD(x[0] + i + 6*ggml_f16_epr, 6);
192
0
193
0
            sum_02 = GGML_F16x_VEC_FMA(sum_02, ax7, ay7);
194
0
            ax7 = GGML_F16x_VEC_LOAD(x[1] + i + 6*ggml_f16_epr, 6);
195
0
            sum_12 = GGML_F16x_VEC_FMA(sum_12, ax7, ay7);
196
0
197
0
            ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
198
0
199
0
            ax8 = GGML_F16x_VEC_LOAD(x[0] + i + 7*ggml_f16_epr, 7);
200
0
201
0
            sum_03 = GGML_F16x_VEC_FMA(sum_03, ax8, ay8);
202
0
            ax8 = GGML_F16x_VEC_LOAD(x[1] + i + 7*ggml_f16_epr, 7);
203
0
            sum_13 = GGML_F16x_VEC_FMA(sum_13, ax8, ay8);
204
0
        }
205
0
206
0
        const int np2 = (n & ~(ggml_f16_epr - 1));
207
0
        for (int k = np; k < np2; k += ggml_f16_epr) {
208
0
            svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
209
0
210
0
            svfloat16_t rx = GGML_F16x_VEC_LOAD(x[0] + k, 0);
211
0
            sum_00 = GGML_F16x_VEC_FMA(sum_00, rx, ry);
212
0
            rx = GGML_F16x_VEC_LOAD(x[1] + k, 0);
213
0
            sum_10 = GGML_F16x_VEC_FMA(sum_10, rx, ry);
214
0
        }
215
0
216
0
        if (np2 < n) {
217
0
            svbool_t pg = svwhilelt_b16(np2, n);
218
0
            svfloat16_t hx_0 = svld1_f16(pg, (const __fp16 *)(x[0] + np2));
219
0
            svfloat16_t hx_1 = svld1_f16(pg, (const __fp16 *)(x[1] + np2));
220
0
            svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
221
0
222
0
            sum_00 = svmad_f16_x(pg, hx_0, hy, sum_00);
223
0
            sum_10 = svmad_f16_x(pg, hx_1, hy, sum_10);
224
0
        }
225
0
        GGML_F16x_VEC_REDUCE(sumf[0], sum_00, sum_01, sum_02, sum_03);
226
0
        GGML_F16x_VEC_REDUCE(sumf[1], sum_10, sum_11, sum_12, sum_13);
227
0
    #elif defined(__riscv_v_intrinsic)
228
0
      // todo: RVV impl
229
0
      for (int i = 0; i < n; ++i) {
230
0
          for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
231
0
              sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
232
0
          }
233
0
      }
234
0
    #else
235
0
        const int np = (n & ~(GGML_F16_STEP - 1));
236
0
237
0
        GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } };
238
0
239
0
        GGML_F16_VEC ax[GGML_F16_ARR];
240
0
        GGML_F16_VEC ay[GGML_F16_ARR];
241
0
242
0
        for (int i = 0; i < np; i += GGML_F16_STEP) {
243
0
            for (int j = 0; j < GGML_F16_ARR; j++) {
244
0
                ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
245
0
246
0
                for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
247
0
                    ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j);
248
0
249
0
                    sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]);
250
0
                }
251
0
            }
252
0
        }
253
0
254
0
        // reduce sum0..sum3 to sum0
255
0
        for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
256
0
            GGML_F16_VEC_REDUCE(sumf[k], sum[k]);
257
0
        }
258
0
259
0
        // leftovers
260
0
        for (int i = np; i < n; ++i) {
261
0
            for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
262
0
                sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
263
0
            }
264
0
        }
265
0
    #endif
266
0
#else
267
0
    for (int i = 0; i < n; ++i) {
268
0
        for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
269
0
            sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
270
0
        }
271
0
    }
272
0
#endif
273
0
274
0
    for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
275
0
        s[i] = (float)sumf[i];
276
0
    }
277
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_dot_f16_unroll
Unexecuted instantiation: vec.cpp:ggml_vec_dot_f16_unroll(int, int, float*, void*, unsigned short*)
Unexecuted instantiation: ops.cpp:ggml_vec_dot_f16_unroll(int, int, float*, void*, unsigned short*)
278
279
0
inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) {
280
0
#if defined(GGML_SIMD)
281
    #if defined(__ARM_FEATURE_SVE)
282
283
        const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
284
        const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
285
        const int ggml_f32_step = 8 * ggml_f32_epr; // choose 8 SVE registers
286
        GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
287
288
        const int np = (n & ~(ggml_f32_step - 1));
289
        svfloat32_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
290
        svfloat32_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
291
        for (int i = 0; i < np; i += ggml_f32_step) {
292
293
            ax1 = GGML_F32_VEC_LOAD(x + i);
294
            ay1 = GGML_F32_VEC_LOAD(y + i);
295
            ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx);
296
297
            GGML_F32_VEC_STORE(y + i, ay1);
298
299
            ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
300
            ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
301
            ay2 = GGML_F32_VEC_FMA(ay2, ax2, vx);
302
303
            GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);
304
305
            ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
306
            ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
307
            ay3 = GGML_F32_VEC_FMA(ay3, ax3, vx);
308
309
            GGML_F32_VEC_STORE(y + i + 2*ggml_f32_epr, ay3);
310
311
            ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
312
            ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
313
            ay4 = GGML_F32_VEC_FMA(ay4, ax4, vx);
314
315
            GGML_F32_VEC_STORE(y + i + 3*ggml_f32_epr, ay4);
316
317
            ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
318
            ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
319
            ay5 = GGML_F32_VEC_FMA(ay5, ax5, vx);
320
321
            GGML_F32_VEC_STORE(y + i + 4*ggml_f32_epr, ay5);
322
323
            ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
324
            ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
325
            ay6 = GGML_F32_VEC_FMA(ay6, ax6, vx);
326
327
            GGML_F32_VEC_STORE(y + i + 5*ggml_f32_epr, ay6);
328
329
            ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
330
            ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
331
            ay7 = GGML_F32_VEC_FMA(ay7, ax7, vx);
332
333
            GGML_F32_VEC_STORE(y + i + 6*ggml_f32_epr, ay7);
334
335
            ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
336
            ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
337
            ay8 = GGML_F32_VEC_FMA(ay8, ax8, vx);
338
339
            GGML_F32_VEC_STORE(y + i + 7*ggml_f32_epr, ay8);
340
        }
341
        // leftovers
342
        // Since 8 unrolls are done in above loop, leftovers lie in range [0, ggml_f32_step] which is handled in below loop
343
        const int np2 = (n & ~(ggml_f32_epr - 1));
344
        for (int i = np; i < np2; i += ggml_f32_epr) {
345
            ax1 = GGML_F32_VEC_LOAD(x + i);
346
            ay1 = GGML_F32_VEC_LOAD(y + i);
347
            ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx);
348
349
            GGML_F32_VEC_STORE(y + i, ay1);
350
        }
351
        // maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmad on available elements only
352
        if (np2 < n) {
353
            svbool_t pg =svwhilelt_b32(np2, n);
354
            ax1 = svld1_f32(pg, x + np2);
355
            ay1 = svld1_f32(pg, y + np2);
356
            ay1 = svmad_f32_m(pg, ax1, vx, ay1);
357
358
            svst1_f32(pg, y + np2, ay1);
359
        }
360
    #elif defined(__riscv_v_intrinsic)
361
        for (int i = 0, avl; i < n; i += avl) {
362
            avl = __riscv_vsetvl_e32m8(n - i);
363
            vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
364
            vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
365
            vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, v, ay, avl);
366
            __riscv_vse32_v_f32m8(&y[i], ny, avl);
367
        }
368
    #else
369
0
        const int np = (n & ~(GGML_F32_STEP - 1));
370
371
0
        GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
372
373
0
        GGML_F32_VEC ax[GGML_F32_ARR];
374
0
        GGML_F32_VEC ay[GGML_F32_ARR];
375
376
0
        for (int i = 0; i < np; i += GGML_F32_STEP) {
377
0
            for (int j = 0; j < GGML_F32_ARR; j++) {
378
0
                ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
379
0
                ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
380
0
                ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx);
381
382
0
                GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
383
0
            }
384
0
        }
385
386
        // leftovers
387
0
        for (int i = np; i < n; ++i) {
388
0
            y[i] += x[i]*v;
389
0
        }
390
0
    #endif
391
#else
392
    // scalar
393
    for (int i = 0; i < n; ++i) {
394
        y[i] += x[i]*v;
395
    }
396
#endif
397
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_mad_f32
Unexecuted instantiation: vec.cpp:ggml_vec_mad_f32(int, float*, float const*, float)
Unexecuted instantiation: ops.cpp:ggml_vec_mad_f32(int, float*, float const*, float)
398
399
0
inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) {
400
#if defined(GGML_SIMD) && defined(__ARM_FEATURE_SVE)
401
    const int sve_register_length = svcntb() * 8;
402
    const int ggml_f16_epr = sve_register_length / 16;
403
    const int ggml_f16_step = 8 * ggml_f16_epr;
404
405
    GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
406
407
    int np = (n & ~(ggml_f16_step - 1));
408
409
    svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
410
    svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
411
    for (int i = 0; i < np; i += ggml_f16_step) {
412
        ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0);
413
        ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0);
414
        ay1 = GGML_F16x_VEC_FMA(ay1, ax1, vx);
415
416
        GGML_F16x_VEC_STORE(y + i + 0 * ggml_f16_epr, ay1, 0);
417
418
        ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1);
419
        ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1);
420
        ay2 = GGML_F16x_VEC_FMA(ay2, ax2, vx);
421
422
        GGML_F16x_VEC_STORE(y + i + 1 * ggml_f16_epr, ay2, 1);
423
424
        ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2);
425
        ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
426
        ay3 = GGML_F16x_VEC_FMA(ay3, ax3, vx);
427
428
        GGML_F16x_VEC_STORE(y + i + 2 * ggml_f16_epr, ay3, 2);
429
430
        ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3);
431
        ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
432
        ay4 = GGML_F16x_VEC_FMA(ay4, ax4, vx);
433
434
        GGML_F16x_VEC_STORE(y + i + 3 * ggml_f16_epr, ay4, 3);
435
436
        ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4);
437
        ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
438
        ay5 = GGML_F16x_VEC_FMA(ay5, ax5, vx);
439
440
        GGML_F16x_VEC_STORE(y + i + 4 * ggml_f16_epr, ay5, 4);
441
442
        ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5);
443
        ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
444
        ay6 = GGML_F16x_VEC_FMA(ay6, ax6, vx);
445
446
        GGML_F16x_VEC_STORE(y + i + 5 * ggml_f16_epr, ay6, 5);
447
448
        ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6);
449
        ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
450
        ay7 = GGML_F16x_VEC_FMA(ay7, ax7, vx);
451
452
        GGML_F16x_VEC_STORE(y + i + 6 * ggml_f16_epr, ay7, 6);
453
454
        ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7);
455
        ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
456
        ay8 = GGML_F16x_VEC_FMA(ay8, ax8, vx);
457
458
        GGML_F16x_VEC_STORE(y + i + 7 * ggml_f16_epr, ay8, 7);
459
    }
460
    const int np2 = (n & ~(ggml_f16_epr - 1));
461
    for (int k = np; k < np2; k += ggml_f16_epr) {
462
        svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0);
463
        svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
464
        ry = GGML_F16x_VEC_FMA(ry, rx, vx);
465
466
        GGML_F16x_VEC_STORE(y + k, ry, 0);
467
    }
468
469
    if (np2 < n) {
470
        svbool_t pg = svwhilelt_b16(np2, n);
471
        svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
472
        svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
473
        hy = svmad_f16_x(pg, hx, vx, hy);
474
        svst1_f16(pg, (__fp16 *)(y + np2), hy);
475
    }
476
    np = n;
477
#elif defined(__riscv_zvfh) // implies __riscv_v_intrinsic
478
    const int np = n;
479
    _Float16 hv = (_Float16)v;
480
    for (int i = 0, avl; i < n; i += avl) {
481
        avl = __riscv_vsetvl_e16m8(n - i);
482
        vfloat16m8_t ax = __riscv_vle16_v_f16m8((const _Float16 *)&x[i], avl);
483
        vfloat16m8_t ay = __riscv_vle16_v_f16m8((_Float16 *)&y[i], avl);
484
        vfloat16m8_t ny = __riscv_vfmadd_vf_f16m8(ax, hv, ay, avl);
485
        __riscv_vse16_v_f16m8((_Float16 *)&y[i], ny, avl);
486
    }
487
#elif defined(GGML_SIMD)
488
0
    const int np = (n & ~(GGML_F16_STEP - 1));
489
490
0
    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
491
492
0
    GGML_F16_VEC ax[GGML_F16_ARR];
493
0
    GGML_F16_VEC ay[GGML_F16_ARR];
494
495
0
    for (int i = 0; i < np; i += GGML_F16_STEP) {
496
0
        for (int j = 0; j < GGML_F16_ARR; j++) {
497
0
            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
498
0
            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
499
0
            ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
500
501
0
            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
502
0
        }
503
0
    }
504
#else
505
    const int np = 0;
506
#endif
507
508
    // leftovers
509
0
    for (int i = np; i < n; ++i) {
510
0
        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
511
0
    }
512
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_mad_f16
Unexecuted instantiation: vec.cpp:ggml_vec_mad_f16(int, unsigned short*, unsigned short const*, float)
Unexecuted instantiation: ops.cpp:ggml_vec_mad_f16(int, unsigned short*, unsigned short const*, float)
513
514
// xs and vs are byte strides of x and v
515
0
inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * GGML_RESTRICT y, const float * GGML_RESTRICT xv, const float * GGML_RESTRICT vv) {
516
517
0
    const float * GGML_RESTRICT x[GGML_VEC_MAD_UNROLL];
518
0
    const float * GGML_RESTRICT v[GGML_VEC_MAD_UNROLL];
519
520
0
    for (int i = 0; i < GGML_VEC_MAD_UNROLL; ++i) {
521
0
        x[i] = (const float *) ((const char *) xv + i*xs);
522
0
        v[i] = (const float *) ((const char *) vv + i*vs);
523
0
    }
524
525
0
#if defined(GGML_SIMD)
526
    #if defined(__ARM_FEATURE_SVE)
527
        // scalar Route to scalar implementation       //TODO: Write SVE code
528
        for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
529
            for (int i = 0; i < n; ++i) {
530
                y[i] += x[k][i]*v[k][0];
531
            }
532
        }
533
    #elif defined(__riscv_v_intrinsic)
534
        for (int i = 0, avl; i < n; i += avl) {
535
            avl = __riscv_vsetvl_e32m8(n - i);
536
            vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
537
            for (int k = 0; k < GGML_VEC_MAD_UNROLL; k++) {
538
                vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[k][i], avl);
539
                ay = __riscv_vfmadd_vf_f32m8(ax, v[k][0], ay, avl);
540
            }
541
            __riscv_vse32_v_f32m8(&y[i], ay, avl);
542
        }
543
    #else
544
0
        const int np = (n & ~(GGML_F32_STEP - 1));
545
546
0
        GGML_F32_VEC vx[GGML_VEC_MAD_UNROLL];
547
548
0
        for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
549
0
            vx[k] = GGML_F32_VEC_SET1(v[k][0]);
550
0
        }
551
552
0
        GGML_F32_VEC ax[GGML_VEC_MAD_UNROLL][GGML_F32_ARR];
553
0
        GGML_F32_VEC ay[GGML_F32_ARR];
554
555
0
        for (int i = 0; i < np; i += GGML_F32_STEP) {
556
0
            for (int j = 0; j < GGML_F32_ARR; j++) {
557
0
                ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
558
559
0
                for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
560
0
                    ax[k][j] = GGML_F32_VEC_LOAD(x[k] + i + j*GGML_F32_EPR);
561
0
                    ay[j] = GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]);
562
0
                }
563
564
0
                GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
565
0
            }
566
0
        }
567
568
        // leftovers
569
0
        for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
570
0
            for (int i = np; i < n; ++i) {
571
0
                y[i] += x[k][i]*v[k][0];
572
0
            }
573
0
        }
574
0
    #endif
575
#else
576
    // scalar
577
    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
578
        for (int i = 0; i < n; ++i) {
579
            y[i] += x[k][i]*v[k][0];
580
        }
581
    }
582
#endif
583
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_mad_f32_unroll
Unexecuted instantiation: vec.cpp:ggml_vec_mad_f32_unroll(int, int, int, float*, float const*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_mad_f32_unroll(int, int, int, float*, float const*, float const*)
584
585
0
// y[i] = x[i]*s + b — fused multiply-add with scalar multiplier and scalar addend.
// Dispatch order: Accelerate (vDSP) > SVE (scalar fallback for now) > RVV > generic GGML_SIMD > scalar.
inline static void ggml_vec_mad1_f32(const int n, float * y, const float * x, const float s, const float b) {
#if defined(GGML_USE_ACCELERATE)
    // vDSP computes y = x*s + b in one call
    vDSP_vsmsa(x, 1, &s, &b, y, 1, n);
#elif defined(GGML_SIMD)
    #if defined(__ARM_FEATURE_SVE)
        // scalar ; TODO: Write SVE code
        for (int i = 0; i < n; ++i) {
            y[i] = x[i]*s + b;
        }
    #elif defined(__riscv_v_intrinsic)
        // RVV: vsetvl picks the active vector length per stripe, so no scalar tail is needed
        for (int i = 0, avl; i < n; i += avl) {
            avl = __riscv_vsetvl_e32m8(n - i);
            vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
            vfloat32m8_t vb = __riscv_vfmv_v_f_f32m8(b, avl);
            vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, s, vb, avl);
            __riscv_vse32_v_f32m8(&y[i], ny, avl);
        }
    #else
        // np = largest multiple of GGML_F32_STEP <= n (STEP is a power of two)
        const int np = (n & ~(GGML_F32_STEP - 1));

        GGML_F32_VEC vs = GGML_F32_VEC_SET1(s);
        GGML_F32_VEC vb = GGML_F32_VEC_SET1(b);

        GGML_F32_VEC ay[GGML_F32_ARR];

        for (int i = 0; i < np; i += GGML_F32_STEP) {
            for (int j = 0; j < GGML_F32_ARR; j++) {
                ay[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
                // FMA(a, b, c) accumulates a + b*c -> vb + x*vs
                ay[j] = GGML_F32_VEC_FMA(vb, ay[j], vs);

                GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
            }
        }

        // leftovers
        for (int i = np; i < n; ++i) {
            y[i] = x[i]*s + b;
        }
    #endif
#else
    // scalar
    for (int i = 0; i < n; ++i) {
        y[i] = x[i]*s + b;
    }
#endif
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_mad1_f32
Unexecuted instantiation: vec.cpp:ggml_vec_mad1_f32(int, float*, float const*, float, float)
Unexecuted instantiation: ops.cpp:ggml_vec_mad1_f32(int, float*, float const*, float, float)
631
632
//inline static void ggml_vec_scale_f32(const int n, float * y, const float   v) { for (int i = 0; i < n; ++i) y[i] *= v;          }
633
0
// In-place scale: y[i] *= v.
// Dispatch order: Accelerate (vDSP) > SVE > RVV > generic GGML_SIMD > scalar.
inline static void ggml_vec_scale_f32(const int n, float * y, const float   v) {
#if defined(GGML_USE_ACCELERATE)
    vDSP_vsmul(y, 1, &v, y, 1, n);
#elif defined(GGML_SIMD)
    #if defined(__ARM_FEATURE_SVE)
        // vector length is only known at runtime on SVE
        const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
        const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
        const int ggml_f32_step = 2 * ggml_f32_epr; // unroll by two vectors per iteration

        GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
        const int np = (n & ~(ggml_f32_step - 1));
        svfloat32_t ay1;
        svfloat32_t ay2;
        for (int i = 0; i < np; i += ggml_f32_step) {
            ay1 = GGML_F32_VEC_LOAD(y + i);
            ay1 = GGML_F32_VEC_MUL(ay1, vx);
            GGML_F32_VEC_STORE(y + i, ay1);

            ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
            ay2 = GGML_F32_VEC_MUL(ay2, vx);
            GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);
        }
        // leftovers
        // maximum number of leftover elements will be less than ggml_f32_epr. Apply predicated svmul on available elements only
        for (int i = np; i < n; i += ggml_f32_epr) {
            svbool_t pg = svwhilelt_b32(i, n);
            ay1 = svld1_f32(pg, y + i);
            ay1 = svmul_f32_m(pg, ay1, vx);
            svst1_f32(pg, y + i, ay1);
        }
    #elif defined(__riscv_v_intrinsic)
        // RVV: vsetvl handles the tail, no scalar leftover loop needed
        for (int i = 0, avl; i < n; i += avl) {
            avl = __riscv_vsetvl_e32m8(n - i);
            vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
            vfloat32m8_t ny = __riscv_vfmul_vf_f32m8(ay, v, avl);
            __riscv_vse32_v_f32m8(&y[i], ny, avl);
        }
    #else
        // np = largest multiple of GGML_F32_STEP <= n
        const int np = (n & ~(GGML_F32_STEP - 1));

        GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);

        GGML_F32_VEC ay[GGML_F32_ARR];

        for (int i = 0; i < np; i += GGML_F32_STEP) {
            for (int j = 0; j < GGML_F32_ARR; j++) {
                ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
                ay[j] = GGML_F32_VEC_MUL(ay[j], vx);

                GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
            }
        }

        // leftovers
        for (int i = np; i < n; ++i) {
            y[i] *= v;
        }
    #endif
#else
    // scalar
    for (int i = 0; i < n; ++i) {
        y[i] *= v;
    }
#endif
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_scale_f32
Unexecuted instantiation: vec.cpp:ggml_vec_scale_f32(int, float*, float)
Unexecuted instantiation: ops.cpp:ggml_vec_scale_f32(int, float*, float)
698
699
0
// In-place scale of an f16 vector by an f32 scalar: y[i] = fp16(fp32(y[i]) * v).
// Dispatch order: SVE f16 > RVV (with Zvfh, widening to f32 for the multiply) > generic GGML_SIMD > scalar.
inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) {
#if defined(GGML_SIMD) && defined(__ARM_FEATURE_SVE)
    const int sve_register_length = svcntb() * 8;
    const int ggml_f16_epr = sve_register_length / 16; // f16 lanes per vector
    const int ggml_f16_step = 2 * ggml_f16_epr;        // unroll by two vectors

    GGML_F16x_VEC vx =  GGML_F16x_VEC_SET1(v);
    const int np = (n & ~(ggml_f16_step - 1));
    svfloat16_t ay1, ay2;

    for (int i = 0; i < np; i += ggml_f16_step) {
        ay1 = GGML_F16x_VEC_LOAD(y + i + 0*ggml_f16_epr, 0);
        ay1 = GGML_F16x_VEC_MUL(ay1, vx);
        GGML_F16x_VEC_STORE(y + i + 0*ggml_f16_epr, ay1, 0);

        ay2 = GGML_F16x_VEC_LOAD(y + i + 1*ggml_f16_epr, 1);
        ay2 = GGML_F16x_VEC_MUL(ay2, vx);
        GGML_F16x_VEC_STORE(y + i + 1*ggml_f16_epr, ay2, 1);
    }
    // leftovers
    // maximum number of leftover elements will be less than ggml_f16_epr. Apply predicated svmul on available elements only
    if (np < n) {
        svbool_t pg = svwhilelt_b16(np, n);
        svfloat16_t hy = svld1_f16(pg, (__fp16 *)(y + np));
        svfloat16_t out = svmul_f16_m(pg, hy, vx);
        svst1_f16(pg, (__fp16 *)(y + np), out);
    }
#elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfh)
    // widen f16 -> f32, multiply, narrow back; vsetvl handles the tail
    for (int i = 0, vl; i < n; i += vl) {
        vl = __riscv_vsetvl_e16m2(n - i);
        vfloat16m2_t vy = __riscv_vle16_v_f16m2((_Float16 *)&y[i], vl);
        vfloat32m4_t vy32 = __riscv_vfwcvt_f_f_v_f32m4(vy, vl);
        vy32 = __riscv_vfmul_vf_f32m4(vy32, v, vl);
        vy = __riscv_vfncvt_f_f_w_f16m2(vy32, vl);
        __riscv_vse16_v_f16m2((_Float16 *)&y[i], vy, vl);
    }
#elif defined(GGML_SIMD)
    // np = largest multiple of GGML_F16_STEP <= n
    const int np = (n & ~(GGML_F16_STEP - 1));

    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);

    GGML_F16_VEC ay[GGML_F16_ARR];

    for (int i = 0; i < np; i += GGML_F16_STEP) {
        for (int j = 0; j < GGML_F16_ARR; j++) {
            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
            ay[j] = GGML_F16_VEC_MUL(ay[j], vx);

            // note: the STORE macro takes the whole ay array plus index j by design
            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
        }
    }

    // leftovers
    for (int i = np; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
    }
#else
    // scalar
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
    }
#endif
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_scale_f16
Unexecuted instantiation: vec.cpp:ggml_vec_scale_f16(int, unsigned short*, float)
Unexecuted instantiation: ops.cpp:ggml_vec_scale_f16(int, unsigned short*, float)
762
763
0
// Euclidean (L2) norm of x: *s = sqrt(dot(x, x)).
inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) {
    ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); // *s = sum(x[i]^2)
    *s = sqrtf(*s);
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_norm_f32
Unexecuted instantiation: vec.cpp:ggml_vec_norm_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_norm_f32(int, float*, float const*)
764
0
// Element-wise square: y[i] = x[i]^2.
inline static void ggml_vec_sqr_f32  (const int n, float * y, const float * x) {
    for (int k = 0; k < n; ++k) {
        const float xv = x[k];
        y[k] = xv*xv;
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_sqr_f32
Unexecuted instantiation: vec.cpp:ggml_vec_sqr_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_sqr_f32(int, float*, float const*)
765
0
// Element-wise square of an f16 vector, computed in f32 precision.
inline static void ggml_vec_sqr_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int k = 0; k < n; ++k) {
        const float xv = GGML_CPU_FP16_TO_FP32(x[k]);
        y[k] = GGML_CPU_FP32_TO_FP16(xv*xv);
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_sqr_f16
Unexecuted instantiation: vec.cpp:ggml_vec_sqr_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_sqr_f16(int, unsigned short*, unsigned short const*)
771
0
inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_sqrt_f32
Unexecuted instantiation: vec.cpp:ggml_vec_sqrt_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_sqrt_f32(int, float*, float const*)
772
0
// Element-wise square root of an f16 vector, computed in f32 precision.
inline static void ggml_vec_sqrt_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int k = 0; k < n; ++k) {
        const float xv = GGML_CPU_FP16_TO_FP32(x[k]);
        y[k] = GGML_CPU_FP32_TO_FP16(sqrtf(xv));
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_sqrt_f16
Unexecuted instantiation: vec.cpp:ggml_vec_sqrt_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_sqrt_f16(int, unsigned short*, unsigned short const*)
777
0
inline static void ggml_vec_log_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]);  }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_log_f32
Unexecuted instantiation: vec.cpp:ggml_vec_log_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_log_f32(int, float*, float const*)
778
0
// Element-wise natural logarithm of an f16 vector, computed in f32 precision.
inline static void ggml_vec_log_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int k = 0; k < n; ++k) {
        const float xv = GGML_CPU_FP16_TO_FP32(x[k]);
        y[k] = GGML_CPU_FP32_TO_FP16(logf(xv));
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_log_f16
Unexecuted instantiation: vec.cpp:ggml_vec_log_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_log_f16(int, unsigned short*, unsigned short const*)
783
0
inline static void ggml_vec_sin_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sinf(x[i]);  }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_sin_f32
Unexecuted instantiation: vec.cpp:ggml_vec_sin_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_sin_f32(int, float*, float const*)
784
0
// Element-wise sine of an f16 vector, computed in f32 precision.
inline static void ggml_vec_sin_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int k = 0; k < n; ++k) {
        const float xv = GGML_CPU_FP16_TO_FP32(x[k]);
        y[k] = GGML_CPU_FP32_TO_FP16(sinf(xv));
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_sin_f16
Unexecuted instantiation: vec.cpp:ggml_vec_sin_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_sin_f16(int, unsigned short*, unsigned short const*)
789
0
inline static void ggml_vec_cos_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = cosf(x[i]);  }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_cos_f32
Unexecuted instantiation: vec.cpp:ggml_vec_cos_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_cos_f32(int, float*, float const*)
790
0
// Element-wise cosine of an f16 vector, computed in f32 precision.
inline static void ggml_vec_cos_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int k = 0; k < n; ++k) {
        const float xv = GGML_CPU_FP16_TO_FP32(x[k]);
        y[k] = GGML_CPU_FP32_TO_FP16(cosf(xv));
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_cos_f16
Unexecuted instantiation: vec.cpp:ggml_vec_cos_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_cos_f16(int, unsigned short*, unsigned short const*)
795
0
// Element-wise absolute value: y[i] = |x[i]|.
inline static void ggml_vec_abs_f32  (const int n, float * y, const float * x) {
    for (int k = 0; k < n; ++k) {
        y[k] = fabsf(x[k]);
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_abs_f32
Unexecuted instantiation: vec.cpp:ggml_vec_abs_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_abs_f32(int, float*, float const*)
796
0
// Element-wise absolute value of an f16 vector, computed in f32 precision.
inline static void ggml_vec_abs_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int k = 0; k < n; ++k) {
        const float xv = GGML_CPU_FP16_TO_FP32(x[k]);
        y[k] = GGML_CPU_FP32_TO_FP16(fabsf(xv));
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_abs_f16
Unexecuted instantiation: vec.cpp:ggml_vec_abs_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_abs_f16(int, unsigned short*, unsigned short const*)
801
0
// Element-wise sign: -1 for negative, +1 for positive, 0 otherwise (incl. 0 and NaN).
inline static void ggml_vec_sgn_f32  (const int n, float * y, const float * x) {
    for (int k = 0; k < n; ++k) {
        float r = 0.f;
        if (x[k] > 0.f) {
            r = 1.f;
        } else if (x[k] < 0.f) {
            r = -1.f;
        }
        y[k] = r;
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_sgn_f32
Unexecuted instantiation: vec.cpp:ggml_vec_sgn_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_sgn_f32(int, float*, float const*)
802
0
// Element-wise sign of an f16 vector, computed in f32 precision.
inline static void ggml_vec_sgn_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int k = 0; k < n; ++k) {
        const float xv = GGML_CPU_FP16_TO_FP32(x[k]);
        float r = 0.f;
        if (xv > 0.f) {
            r = 1.f;
        } else if (xv < 0.f) {
            r = -1.f;
        }
        y[k] = GGML_CPU_FP32_TO_FP16(r);
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_sgn_f16
Unexecuted instantiation: vec.cpp:ggml_vec_sgn_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_sgn_f16(int, unsigned short*, unsigned short const*)
808
0
// Heaviside step: y[i] = 1 if x[i] > 0, else 0.
inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) {
    for (int k = 0; k < n; ++k) {
        y[k] = (x[k] > 0.f) ? 1.f : 0.f;
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_step_f32
Unexecuted instantiation: vec.cpp:ggml_vec_step_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_step_f32(int, float*, float const*)
809
0
// Heaviside step of an f16 vector, computed in f32 precision.
inline static void ggml_vec_step_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int k = 0; k < n; ++k) {
        const float xv = GGML_CPU_FP16_TO_FP32(x[k]);
        y[k] = GGML_CPU_FP32_TO_FP16((xv > 0.f) ? 1.f : 0.f);
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_step_f16
Unexecuted instantiation: vec.cpp:ggml_vec_step_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_step_f16(int, unsigned short*, unsigned short const*)
814
0
inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]);  }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_tanh_f32
Unexecuted instantiation: vec.cpp:ggml_vec_tanh_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_tanh_f32(int, float*, float const*)
815
0
// Element-wise tanh of an f16 vector, computed in f32 precision.
inline static void ggml_vec_tanh_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int k = 0; k < n; ++k) {
        const float xv = GGML_CPU_FP16_TO_FP32(x[k]);
        y[k] = GGML_CPU_FP32_TO_FP16(tanhf(xv));
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_tanh_f16
Unexecuted instantiation: vec.cpp:ggml_vec_tanh_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_tanh_f16(int, unsigned short*, unsigned short const*)
820
0
inline static void ggml_vec_elu_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_elu_f32
Unexecuted instantiation: vec.cpp:ggml_vec_elu_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_elu_f32(int, float*, float const*)
821
0
// ELU of an f16 vector, computed in f32 precision.
inline static void ggml_vec_elu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int k = 0; k < n; ++k) {
        const float xv = GGML_CPU_FP16_TO_FP32(x[k]);
        y[k] = GGML_CPU_FP32_TO_FP16((xv > 0.f) ? xv : expm1f(xv));
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_elu_f16
Unexecuted instantiation: vec.cpp:ggml_vec_elu_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_elu_f16(int, unsigned short*, unsigned short const*)
827
0
// ReLU: y[i] = max(x[i], 0); values <= 0 (and NaN) map to 0.
inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) {
    for (int k = 0; k < n; ++k) {
        y[k] = (x[k] > 0.f) ? x[k] : 0.f;
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_relu_f32
Unexecuted instantiation: vec.cpp:ggml_vec_relu_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_relu_f32(int, float*, float const*)
828
0
// ReLU of an f16 vector, computed in f32 precision.
inline static void ggml_vec_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int k = 0; k < n; ++k) {
        const float xv = GGML_CPU_FP16_TO_FP32(x[k]);
        y[k] = GGML_CPU_FP32_TO_FP16((xv > 0.f) ? xv : 0.f);
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_relu_f16
Unexecuted instantiation: vec.cpp:ggml_vec_relu_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_relu_f16(int, unsigned short*, unsigned short const*)
834
0
// Leaky ReLU with negative slope ns: positive part plus ns-scaled negative part.
inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) {
    for (int k = 0; k < n; ++k) {
        const float pos = (x[k] > 0.f)  ? x[k] : 0.f;
        const float neg = (x[k] < 0.0f) ? x[k] : 0.f;
        y[k] = pos + ns * neg;
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_leaky_relu_f32
Unexecuted instantiation: vec.cpp:ggml_vec_leaky_relu_f32(int, float*, float const*, float)
Unexecuted instantiation: ops.cpp:ggml_vec_leaky_relu_f32(int, float*, float const*, float)
835
0
// Leaky ReLU of an f16 vector, computed in f32 precision.
inline static void ggml_vec_leaky_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const float ns) {
    for (int k = 0; k < n; ++k) {
        const float xv  = GGML_CPU_FP16_TO_FP32(x[k]);
        const float pos = (xv > 0.f)  ? xv : 0.f;
        const float neg = (xv < 0.0f) ? xv : 0.f;
        y[k] = GGML_CPU_FP32_TO_FP16(pos + ns * neg);
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_leaky_relu_f16
Unexecuted instantiation: vec.cpp:ggml_vec_leaky_relu_f16(int, unsigned short*, unsigned short const*, float)
Unexecuted instantiation: ops.cpp:ggml_vec_leaky_relu_f16(int, unsigned short*, unsigned short const*, float)
841
0
inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_sigmoid_f32
Unexecuted instantiation: vec.cpp:ggml_vec_sigmoid_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_sigmoid_f32(int, float*, float const*)
842
0
// Logistic sigmoid of an f16 vector, computed in f32 precision.
inline static void ggml_vec_sigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int k = 0; k < n; ++k) {
        const float xv = GGML_CPU_FP16_TO_FP32(x[k]);
        y[k] = GGML_CPU_FP32_TO_FP16(1.f / (1.f + expf(-xv)));
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_sigmoid_f16
Unexecuted instantiation: vec.cpp:ggml_vec_sigmoid_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_sigmoid_f16(int, unsigned short*, unsigned short const*)
847
// TODO: optimize performance
848
0
inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_hardswish_f32
Unexecuted instantiation: vec.cpp:ggml_vec_hardswish_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_hardswish_f32(int, float*, float const*)
849
0
// Hardswish of an f16 vector, computed in f32 precision.
inline static void ggml_vec_hardswish_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int k = 0; k < n; ++k) {
        const float xv   = GGML_CPU_FP16_TO_FP32(x[k]);
        const float gate = fminf(1.0f, fmaxf(0.0f, (xv + 3.0f) / 6.0f));
        y[k] = GGML_CPU_FP32_TO_FP16(xv * gate);
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_hardswish_f16
Unexecuted instantiation: vec.cpp:ggml_vec_hardswish_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_hardswish_f16(int, unsigned short*, unsigned short const*)
855
0
inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_hardsigmoid_f32
Unexecuted instantiation: vec.cpp:ggml_vec_hardsigmoid_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_hardsigmoid_f32(int, float*, float const*)
856
0
// Hardsigmoid of an f16 vector, computed in f32 precision.
inline static void ggml_vec_hardsigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int k = 0; k < n; ++k) {
        const float xv = GGML_CPU_FP16_TO_FP32(x[k]);
        y[k] = GGML_CPU_FP32_TO_FP16(fminf(1.0f, fmaxf(0.0f, (xv + 3.0f) / 6.0f)));
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_hardsigmoid_f16
Unexecuted instantiation: vec.cpp:ggml_vec_hardsigmoid_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_hardsigmoid_f16(int, unsigned short*, unsigned short const*)
861
0
inline static void ggml_vec_exp_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = expf(x[i]); }
Unexecuted instantiation: ggml-cpu.c:ggml_vec_exp_f32
Unexecuted instantiation: vec.cpp:ggml_vec_exp_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_exp_f32(int, float*, float const*)
862
0
// Element-wise exponential of an f16 vector, computed in f32 precision.
inline static void ggml_vec_exp_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int k = 0; k < n; ++k) {
        const float xv = GGML_CPU_FP16_TO_FP32(x[k]);
        y[k] = GGML_CPU_FP32_TO_FP16(expf(xv));
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_exp_f16
Unexecuted instantiation: vec.cpp:ggml_vec_exp_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_exp_f16(int, unsigned short*, unsigned short const*)
867
868
static const float GELU_COEF_A     = 0.044715f;
869
static const float GELU_QUICK_COEF = -1.702f;
870
static const float SQRT_2_OVER_PI  = 0.79788456080286535587989211986876f;
871
static const float SQRT_2_INV      = 0.70710678118654752440084436210484f;
872
873
0
inline static float ggml_gelu_f32(float x) {
874
0
    return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
875
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_gelu_f32
Unexecuted instantiation: vec.cpp:ggml_gelu_f32(float)
Unexecuted instantiation: ops.cpp:ggml_gelu_f32(float)
876
877
0
// GELU via table lookup: the raw f16 bit pattern of x[i] indexes the precomputed table.
inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    const uint16_t * bits = (const uint16_t *) x;
    for (int k = 0; k < n; ++k) {
        y[k] = ggml_table_gelu_f16[bits[k]];
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_gelu_f16
Unexecuted instantiation: vec.cpp:ggml_vec_gelu_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_gelu_f16(int, unsigned short*, unsigned short const*)
883
884
0
// Exact (erf-based) GELU of an f16 vector, computed in f32 precision:
// gelu(x) = 0.5*x*(1 + erf(x/sqrt(2))).
inline static void ggml_vec_gelu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int k = 0; k < n; ++k) {
        const float xv = GGML_CPU_FP16_TO_FP32(x[k]);
        y[k] = GGML_CPU_FP32_TO_FP16(0.5f*xv*(1.0f + erff(xv*SQRT_2_INV)));
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_gelu_erf_f16
Unexecuted instantiation: vec.cpp:ggml_vec_gelu_erf_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_gelu_erf_f16(int, unsigned short*, unsigned short const*)
891
892
#ifdef GGML_GELU_FP16
893
0
inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
894
0
    uint16_t t;
895
0
    for (int i = 0; i < n; ++i) {
896
0
        if (x[i] <= -10.0f) {
897
0
            y[i] = 0.0f;
898
0
        } else if (x[i] >= 10.0f) {
899
0
            y[i] = x[i];
900
0
        } else {
901
0
            ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
902
0
            memcpy(&t, &fp16, sizeof(uint16_t));
903
0
            y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[t]);
904
0
        }
905
0
    }
906
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_gelu_f32
Unexecuted instantiation: vec.cpp:ggml_vec_gelu_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_gelu_f32(int, float*, float const*)
907
#else
908
// Direct (non-table) GELU over an f32 vector; used when GGML_GELU_FP16 is not defined.
inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
    for (int k = 0; k < n; ++k) {
        y[k] = ggml_gelu_f32(x[k]);
    }
}
913
#endif
914
915
0
inline static void ggml_vec_gelu_erf_f32(const int n, float * y, const float * x) {
916
0
    for (int i = 0; i < n; ++i) {
917
0
        float xi = x[i];
918
0
        y[i] = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV));
919
0
    }
920
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_gelu_erf_f32
Unexecuted instantiation: vec.cpp:ggml_vec_gelu_erf_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_gelu_erf_f32(int, float*, float const*)
921
922
0
inline static float ggml_gelu_quick_f32(float x) {
923
0
    return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
924
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_gelu_quick_f32
Unexecuted instantiation: vec.cpp:ggml_gelu_quick_f32(float)
Unexecuted instantiation: ops.cpp:ggml_gelu_quick_f32(float)
925
926
//inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
927
//    const uint16_t * i16 = (const uint16_t *) x;
928
//    for (int i = 0; i < n; ++i) {
929
//        y[i] = ggml_table_gelu_quick_f16[i16[i]];
930
//    }
931
//}
932
933
#ifdef GGML_GELU_QUICK_FP16
934
0
inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
935
0
    uint16_t t;
936
0
    for (int i = 0; i < n; ++i) {
937
0
        ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
938
0
        memcpy(&t, &fp16, sizeof(uint16_t));
939
0
        y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]);
940
0
    }
941
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_gelu_quick_f32
Unexecuted instantiation: vec.cpp:ggml_vec_gelu_quick_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_gelu_quick_f32(int, float*, float const*)
942
#else
943
// Direct (non-table) quick GELU; used when GGML_GELU_QUICK_FP16 is not defined.
inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
    for (int k = 0; k < n; ++k) {
        y[k] = ggml_gelu_quick_f32(x[k]);
    }
}
948
#endif
949
950
0
// Quick GELU of an f16 vector, computed in f32 precision: x * sigmoid(1.702*x).
inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int k = 0; k < n; ++k) {
        const float xv = GGML_CPU_FP16_TO_FP32(x[k]);
        y[k] = GGML_CPU_FP32_TO_FP16(xv*(1.0f/(1.0f+expf(GELU_QUICK_COEF*xv))));
    }
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_gelu_quick_f16
Unexecuted instantiation: vec.cpp:ggml_vec_gelu_quick_f16(int, unsigned short*, unsigned short const*)
Unexecuted instantiation: ops.cpp:ggml_vec_gelu_quick_f16(int, unsigned short*, unsigned short const*)
956
957
// Sigmoid Linear Unit (SiLU) function
958
0
inline static float ggml_silu_f32(float x) {
959
0
    return x/(1.0f + expf(-x));
960
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_silu_f32
Unexecuted instantiation: vec.cpp:ggml_silu_f32(float)
Unexecuted instantiation: ops.cpp:ggml_silu_f32(float)
961
0
// SiLU of a single f16 value, computed in f32 precision.
inline static ggml_fp16_t ggml_silu_f16(ggml_fp16_t x) {
    const float xv = GGML_CPU_FP16_TO_FP32(x);
    return GGML_CPU_FP32_TO_FP16(xv/(1.0f + expf(-xv)));
}
Unexecuted instantiation: ggml-cpu.c:ggml_silu_f16
Unexecuted instantiation: vec.cpp:ggml_silu_f16(unsigned short)
Unexecuted instantiation: ops.cpp:ggml_silu_f16(unsigned short)
965
966
#if __FINITE_MATH_ONLY__
967
#error "some routines in ggml.c require non-finite math arithmetics -- pass -fno-finite-math-only to the compiler to fix"
968
#error "ref: https://github.com/ggml-org/llama.cpp/pull/7154#issuecomment-2143844461"
969
#endif
970
971
/* Below function was borrowed from the GitHub repository:
972
https://github.com/openvinotoolkit/openvino/blob/master/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp */
973
#if defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
974
    // Vectorized single-precision exp over the active lanes of `src` (predicate `pg`).
    // Scheme: split y = src*log2(e) into integer part n and fraction, use the SVE
    // FEXPA instruction for a coarse 2^frac estimate and a short polynomial in z
    // for the residual correction. Borrowed from OpenVINO (see link above).
    inline static svfloat32_t exp_ps_sve(svbool_t pg, svfloat32_t src) {
        // Constants
        const svfloat32_t log2_e = svdup_n_f32(1.4426950409f);
        const svfloat32_t ln2 = svdup_n_f32(0.6931473921f);
        const svfloat32_t half_ln2_sq = svdup_n_f32(0.2413862043f);
        const svuint32_t not_mask17 = svdup_n_u32(~((1u << 17) - 1));
        const svfloat32_t one = svdup_n_f32(1.0f);
        // fill values for inactive lanes of the merging intrinsics below
        const svfloat32_t inactive1 = svdup_n_f32(0.0f);
        const svint32_t inactive2 = svdup_n_s32(0);

        // Algorithm starts here
        svfloat32_t t0 = svmul_f32_m(pg, src, log2_e);  // y = x * log2(e)
        svfloat32_t t1 = svrintm_f32_m(inactive1, pg, t0);         // round down to int (float)
        svint32_t t2 = svcvt_s32_f32_m(inactive2, pg, t1);         // n

        t1 = svsub_f32_m(pg, t0, t1);   // a = y - floor(y)
        t1 = svadd_f32_m(pg, t1, one);  // b = a + 1

        svuint32_t t3 = svlsr_n_u32_m(pg, svreinterpret_u32_f32(t1), 17);  // v = b >> 17 (u32)
        svfloat32_t t4 = svexpa_f32(t3);                                   // c = fexpa(v)
        t4 = svscale_f32_m(pg, t4, t2);                                    // fexpa(v) * 2^(n)

        // and_(t2.d, t1.d, not_mask17.d)
        svfloat32_t t5 = svreinterpret_f32_u32(svand_u32_m(pg, svreinterpret_u32_f32(t1), not_mask17));
        t5 = svsub_f32_m(pg, t1, t5);                // z
        t0 = svmla_f32_m(pg, ln2, t5, half_ln2_sq);  // ln2 + half_ln2_sq * z
        t0 = svmla_f32_m(pg, one, t5, t0);           // 1 + (ln2 * z) + (half_ln2_sq * z * z)
        t0 = svmul_f32_m(pg, t0, t4);                // Final result

        return t0;
    }
1005
#endif
1006
1007
#if defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
1008
1009
// Vectorized single-precision exp for SVE; same range-reduction + degree-5
// polynomial scheme as the NEON ggml_v_expf below (adapted from the Arm
// optimized routines): x = n*ln2 + b, exp(x) = 2^n * poly(b), with the
// final svsel handling overflow/underflow lanes (|n| large).
inline static svfloat32_t ggml_v_expf(svbool_t pg, svfloat32_t x) {
    const svfloat32_t r = svdup_n_f32_x(pg, 0x1.8p23f);                  // rounding shifter
    const svfloat32_t z = svmla_n_f32_x(pg, r, x, 0x1.715476p+0f);       // x*log2(e) + shifter
    const svfloat32_t n = svsub_f32_x(pg, z, r);                         // round(x*log2(e))
    // b = x - n*ln2, computed with a two-term (hi/lo) ln2 for accuracy
    const svfloat32_t b = svmls_n_f32_x(pg, svmls_n_f32_x(pg, x, n, 0x1.62e4p-1f), n, 0x1.7f7d1cp-20f);
    const svuint32_t e = svlsl_n_u32_x(pg, svreinterpret_u32_f32(z), 23);
    const svfloat32_t k = svreinterpret_f32_u32(svadd_u32_x(pg, e, svreinterpret_u32_f32(svdup_n_f32_x(pg, 1)))); // 2^n
    const svbool_t c = svacgt_n_f32(pg, n, 126);                         // lanes needing the slow path
    const svfloat32_t u = svmul_f32_x(pg, b, b);
    const svfloat32_t j = svmla_f32_x(pg,
        svmul_n_f32_x(pg, b, 0x1.ffffecp-1f),
        svmla_f32_x(pg, svmla_f32_x(pg, svdup_n_f32_x(pg, 0x1.fffdb6p-2f), svdup_n_f32_x(pg, 0x1.555e66p-3f), b),
                        svmla_f32_x(pg, svdup_n_f32_x(pg, 0x1.573e2ep-5f), svdup_n_f32_x(pg, 0x1.0e4020p-7f), b), u), u);
    const svuint32_t d = svdup_n_u32_z(svcmple_n_f32(pg, n, 0.0), 0x82000000);
    const svfloat32_t s1 = svreinterpret_f32_u32(svadd_n_u32_x(pg, d, 0x7f000000));
    const svfloat32_t s2 = svreinterpret_f32_u32(svsub_u32_x(pg, e, d));
    return svsel_f32(svacgt_f32(pg, n, svdup_n_f32_x(pg, 192)), svmul_f32_x(pg, s1, s1),
                     svsel_f32(c, svmul_f32_x(pg, svmla_f32_x(pg, s2, s2, j), s1), svmla_f32_x(pg, k, k, j)));
}
1028
1029
// computes silu x/(1+exp(-x)) in single precision vector
1030
inline static svfloat32_t ggml_v_silu(svbool_t pg, svfloat32_t x) {
1031
    const svfloat32_t one = svdup_n_f32_x(pg, 1.0f);
1032
    const svfloat32_t zero = svdup_n_f32_x(pg, 0.0f);
1033
    const svfloat32_t neg_x = svsub_f32_x(pg, zero, x);
1034
    const svfloat32_t exp_neg_x = ggml_v_expf(pg, neg_x);
1035
    const svfloat32_t one_plus_exp_neg_x = svadd_f32_x(pg, one, exp_neg_x);
1036
    return svdiv_f32_x(pg, x, one_plus_exp_neg_x);
1037
}
1038
1039
#elif defined(__ARM_NEON) && defined(__aarch64__)
1040
1041
// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
inline static float32x4_t ggml_v_expf(float32x4_t x) {
    const float32x4_t r = vdupq_n_f32(0x1.8p23f);                       // round-to-nearest-integer shifter
    const float32x4_t z = vfmaq_f32(r, x, vdupq_n_f32(0x1.715476p+0f)); // z = x*log2(e) + r
    const float32x4_t n = vsubq_f32(z, r);                              // n = round(x*log2(e))
    // b = x - n*ln2, with ln2 split into hi/lo parts for precision
    const float32x4_t b = vfmsq_f32(vfmsq_f32(x, n, vdupq_n_f32(0x1.62e4p-1f)), n,
                                    vdupq_n_f32(0x1.7f7d1cp-20f));
    const uint32x4_t e = vshlq_n_u32(vreinterpretq_u32_f32(z), 23);     // n shifted into the float exponent field
    const float32x4_t k = vreinterpretq_f32_u32(vaddq_u32(e, vreinterpretq_u32_f32(vdupq_n_f32(1)))); // k = 2^n
    const uint32x4_t c = vcagtq_f32(n, vdupq_n_f32(126));               // lanes with |n| > 126 need rescaling
    const float32x4_t u = vmulq_f32(b, b);
    // polynomial approximation of expm1(b) over the reduced range
    const float32x4_t j = vfmaq_f32(
        vmulq_f32(vdupq_n_f32(0x1.ffffecp-1f), b),
        vfmaq_f32(vfmaq_f32(vdupq_n_f32(0x1.fffdb6p-2f), vdupq_n_f32(0x1.555e66p-3f), b),
                  vfmaq_f32(vdupq_n_f32(0x1.573e2ep-5f), vdupq_n_f32(0x1.0e4020p-7f), b), u), u);
    if (!vpaddd_u64(vreinterpretq_u64_u32(c)))
        return vfmaq_f32(k, j, k);                                      // fast path: all lanes in range, 2^n*(1+expm1(b))
    // slow path: split the 2^n scale into s1*s2 so neither factor over/underflows the exponent field
    const uint32x4_t d = vandq_u32(vclezq_f32(n), vdupq_n_u32(0x82000000));
    const float32x4_t s1 = vreinterpretq_f32_u32(vaddq_u32(d, vdupq_n_u32(0x7f000000)));
    const float32x4_t s2 = vreinterpretq_f32_u32(vsubq_u32(e, d));
    // |n| > 192 saturates to s1*s1 (inf or 0); 126 < |n| <= 192 uses the rescaled product
    return vbslq_f32(vcagtq_f32(n, vdupq_n_f32(192)), vmulq_f32(s1, s1),
                     vbslq_f32(c, vmulq_f32(vfmaq_f32(s2, s2, j), s1), vfmaq_f32(k, k, j)));
}
1067
1068
// computes silu x/(1+exp(-x)) in single precision vector
1069
inline static float32x4_t ggml_v_silu(float32x4_t x) {
1070
    const float32x4_t one = vdupq_n_f32(1.0f);
1071
    const float32x4_t zero = vdupq_n_f32(0.0f);
1072
    const float32x4_t neg_x = vsubq_f32(zero, x);
1073
    const float32x4_t exp_neg_x = ggml_v_expf(neg_x);
1074
    const float32x4_t one_plus_exp_neg_x = vaddq_f32(one, exp_neg_x);
1075
    return vdivq_f32(x, one_plus_exp_neg_x);
1076
}
1077
1078
#elif defined(__AVX512F__) && defined(__AVX512DQ__)
1079
1080
// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
inline static __m512 ggml_v_expf(__m512 x) {
  const __m512 r = _mm512_set1_ps(0x1.8p23f);                             // round-to-nearest-integer shifter
  const __m512 z = _mm512_fmadd_ps(x, _mm512_set1_ps(0x1.715476p+0f), r); // z = x*log2(e) + r
  const __m512 n = _mm512_sub_ps(z, r);                                   // n = round(x*log2(e))
  // b = x - n*ln2, with ln2 split into hi/lo parts for precision
  const __m512 b =
      _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f),
                       _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x));
  const __mmask16 d =
      _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(192), _CMP_GT_OQ); // lanes that over/underflow
  const __m512 u = _mm512_mul_ps(b, b);
  // polynomial approximation of exp(b) over the reduced range (note the +1.0 term: j ~ exp(b))
  const __m512 j = _mm512_fmadd_ps(
      _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b,
                                      _mm512_set1_ps(0x1.573e2ep-5f)),
                      u,
                      _mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b,
                                      _mm512_set1_ps(0x1.fffdb6p-2f))),
      u,
      _mm512_fmadd_ps(_mm512_set1_ps(0x1.ffffecp-1f), b, _mm512_set1_ps(1.0F)));
  const __m512 res = _mm512_scalef_ps(j, n);  // res = j * 2^n (scalef handles the exponent safely)
  if (_mm512_kortestz(d, d))
    return res;                               // fast path: no lane out of range
  // out-of-range lanes: +inf when n > 0, 0 when n <= 0
  const __m512 zero = _mm512_setzero_ps();
  const __m512 alt = _mm512_mask_blend_ps(
      _mm512_cmp_ps_mask(n, zero, _CMP_LE_OQ), _mm512_set1_ps(INFINITY), zero);
  return _mm512_mask_blend_ps(d, res, alt);
}
1110
1111
// computes silu x/(1+exp(-x)) in single precision vector
1112
inline static __m512 ggml_v_silu(__m512 x) {
1113
    const __m512 one = _mm512_set1_ps(1);
1114
    const __m512 zero = _mm512_setzero_ps();
1115
    const __m512 neg_x = _mm512_sub_ps(zero, x);
1116
    const __m512 exp_neg_x = ggml_v_expf(neg_x);
1117
    const __m512 one_plus_exp_neg_x = _mm512_add_ps(one, exp_neg_x);
1118
    return _mm512_div_ps(x, one_plus_exp_neg_x);
1119
}
1120
1121
#elif defined(__AVX2__) && defined(__FMA__)
1122
1123
// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
inline static __m256 ggml_v_expf(__m256 x) {
  const __m256 r = _mm256_set1_ps(0x1.8p23f);                             // round-to-nearest-integer shifter
  const __m256 z = _mm256_fmadd_ps(x, _mm256_set1_ps(0x1.715476p+0f), r); // z = x*log2(e) + r
  const __m256 n = _mm256_sub_ps(z, r);                                   // n = round(x*log2(e))
  // b = x - n*ln2, with ln2 split into hi/lo parts for precision
  const __m256 b = _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.7f7d1cp-20f),
                                    _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.62e4p-1f), x));
  const __m256i e = _mm256_slli_epi32(_mm256_castps_si256(z), 23);        // n shifted into the float exponent field
  const __m256 k = _mm256_castsi256_ps(
      _mm256_add_epi32(e, _mm256_castps_si256(_mm256_set1_ps(1))));       // k = 2^n
  // c: lanes where |n| > 126 (andnot with -0.f clears the sign bit, i.e. fabs)
  const __m256i c = _mm256_castps_si256(
      _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
                    _mm256_set1_ps(126), _CMP_GT_OQ));
  const __m256 u = _mm256_mul_ps(b, b);
  // polynomial approximation of expm1(b) over the reduced range
  const __m256 j = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_set1_ps(0x1.0e4020p-7f), b,
                                                                   _mm256_set1_ps(0x1.573e2ep-5f)), u,
                                                   _mm256_fmadd_ps(_mm256_set1_ps(0x1.555e66p-3f), b,
                                                                   _mm256_set1_ps(0x1.fffdb6p-2f))),
                                   u, _mm256_mul_ps(_mm256_set1_ps(0x1.ffffecp-1f), b));
  if (!_mm256_movemask_ps(_mm256_castsi256_ps(c)))
    return _mm256_fmadd_ps(j, k, k);                                      // fast path: 2^n * (1 + expm1(b))
  // slow path: split the 2^n scale into s1*s2 so neither factor over/underflows the exponent field
  const __m256i g = _mm256_and_si256(
      _mm256_castps_si256(_mm256_cmp_ps(n, _mm256_setzero_ps(), _CMP_LE_OQ)),
      _mm256_set1_epi32(0x82000000u));
  const __m256 s1 =
      _mm256_castsi256_ps(_mm256_add_epi32(g, _mm256_set1_epi32(0x7f000000u)));
  const __m256 s2 = _mm256_castsi256_ps(_mm256_sub_epi32(e, g));
  // d: lanes where |n| > 192 — these saturate to s1*s1 (inf or 0)
  const __m256i d = _mm256_castps_si256(
      _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
                    _mm256_set1_ps(192), _CMP_GT_OQ));
  return _mm256_or_ps(
      _mm256_and_ps(_mm256_castsi256_ps(d), _mm256_mul_ps(s1, s1)),
      _mm256_andnot_ps(
          _mm256_castsi256_ps(d),
          _mm256_or_ps(
              _mm256_and_ps(_mm256_castsi256_ps(c),
                            _mm256_mul_ps(_mm256_fmadd_ps(s2, j, s2), s1)),
              _mm256_andnot_ps(_mm256_castsi256_ps(c), _mm256_fmadd_ps(k, j, k)))));
}
1165
1166
// computes silu x/(1+exp(-x)) in single precision vector
1167
0
inline static __m256 ggml_v_silu(__m256 x) {
1168
0
    const __m256 one = _mm256_set1_ps(1);
1169
0
    const __m256 zero = _mm256_setzero_ps();
1170
0
    const __m256 neg_x = _mm256_sub_ps(zero, x);
1171
0
    const __m256 exp_neg_x = ggml_v_expf(neg_x);
1172
0
    const __m256 one_plus_exp_neg_x = _mm256_add_ps(one, exp_neg_x);
1173
0
    return _mm256_div_ps(x, one_plus_exp_neg_x);
1174
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_v_silu
Unexecuted instantiation: vec.cpp:ggml_v_silu(float __vector(8))
Unexecuted instantiation: ops.cpp:ggml_v_silu(float __vector(8))
1175
1176
#elif defined(__SSE2__) // __AVX2__ / __ARM_NEON
1177
1178
#if defined(__FMA__)
1179
#define MADD128(x, y, z) _mm_fmadd_ps(x, y, z)
1180
#define NMADD128(x, y, z) _mm_fnmadd_ps(x, y, z)
1181
#else
1182
#define MADD128(x, y, z) _mm_add_ps(_mm_mul_ps(x, y), z)
1183
#define NMADD128(x, y, z) _mm_sub_ps(z, _mm_mul_ps(x, y))
1184
#endif
1185
1186
// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
inline static __m128 ggml_v_expf(__m128 x) {
    const __m128 r = _mm_set1_ps(0x1.8p23f);                  // round-to-nearest-integer shifter
    const __m128 z = MADD128(x, _mm_set1_ps(0x1.715476p+0f), r); // z = x*log2(e) + r
    const __m128 n = _mm_sub_ps(z, r);                        // n = round(x*log2(e))
    // b = x - n*ln2, with ln2 split into hi/lo parts for precision
    const __m128 b =
        NMADD128(n, _mm_set1_ps(0x1.7f7d1cp-20f), NMADD128(n, _mm_set1_ps(0x1.62e4p-1f), x));
    const __m128i e = _mm_slli_epi32(_mm_castps_si128(z), 23); // n shifted into the float exponent field
    const __m128 k = _mm_castsi128_ps(_mm_add_epi32(e, _mm_castps_si128(_mm_set1_ps(1)))); // k = 2^n
    // c: lanes where |n| > 126 (andnot with -0.f clears the sign bit, i.e. fabs)
    const __m128i c =
        _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(126)));
    const __m128 u = _mm_mul_ps(b, b);
    // polynomial approximation of expm1(b) over the reduced range
    const __m128 j =
        MADD128(MADD128(MADD128(_mm_set1_ps(0x1.0e4020p-7f), b, _mm_set1_ps(0x1.573e2ep-5f)), u,
                        MADD128(_mm_set1_ps(0x1.555e66p-3f), b, _mm_set1_ps(0x1.fffdb6p-2f))),
                u, _mm_mul_ps(_mm_set1_ps(0x1.ffffecp-1f), b));
    if (!_mm_movemask_epi8(c))
        return MADD128(j, k, k);                              // fast path: 2^n * (1 + expm1(b))
    // slow path: split the 2^n scale into s1*s2 so neither factor over/underflows the exponent field
    const __m128i g = _mm_and_si128(_mm_castps_si128(_mm_cmple_ps(n, _mm_setzero_ps())),
                                    _mm_set1_epi32(0x82000000u));
    const __m128 s1 = _mm_castsi128_ps(_mm_add_epi32(g, _mm_set1_epi32(0x7f000000u)));
    const __m128 s2 = _mm_castsi128_ps(_mm_sub_epi32(e, g));
    // d: lanes where |n| > 192 — these saturate to s1*s1 (inf or 0)
    const __m128i d =
        _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(192)));
    return _mm_or_ps(
        _mm_and_ps(_mm_castsi128_ps(d), _mm_mul_ps(s1, s1)),
        _mm_andnot_ps(_mm_castsi128_ps(d),
                      _mm_or_ps(_mm_and_ps(_mm_castsi128_ps(c), _mm_mul_ps(MADD128(s2, j, s2), s1)),
                                _mm_andnot_ps(_mm_castsi128_ps(c), MADD128(k, j, k)))));
}
1219
1220
// computes silu x/(1+exp(-x)) in single precision vector
1221
inline static __m128 ggml_v_silu(__m128 x) {
1222
    const __m128 one = _mm_set1_ps(1);
1223
    const __m128 zero = _mm_setzero_ps();
1224
    const __m128 neg_x = _mm_sub_ps(zero, x);
1225
    const __m128 exp_neg_x = ggml_v_expf(neg_x);
1226
    const __m128 one_plus_exp_neg_x = _mm_add_ps(one, exp_neg_x);
1227
    return _mm_div_ps(x, one_plus_exp_neg_x);
1228
}
1229
1230
#elif defined(__riscv_v_intrinsic)
1231
1232
// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
inline static vfloat32m2_t ggml_v_expf_m2(vfloat32m2_t x, int vl) {
    const vfloat32m2_t r = __riscv_vfmv_v_f_f32m2(0x1.8p23f, vl);   // round-to-nearest-integer shifter
#ifdef __riscv_xtheadvector
    // workaround for compiler bug (gcc 14.3.0: Error: unrecognized opcode `th.vmv1r.v v2,v4')
    vfloat32m2_t z = __riscv_vfadd_vf_f32m2(r, 0.0f, vl);
    z = __riscv_vfmacc_vf_f32m2(z, 0x1.715476p+0f, x, vl);          // z = x*log2(e) + r
#else
    const vfloat32m2_t z = __riscv_vfmacc_vf_f32m2(r, 0x1.715476p+0f, x, vl); // z = x*log2(e) + r
#endif
    const vfloat32m2_t n = __riscv_vfsub_vv_f32m2(z, r, vl);        // n = round(x*log2(e))
    // b = x - n*ln2, with ln2 split into hi/lo parts for precision
    const vfloat32m2_t b = __riscv_vfnmsac_vf_f32m2(__riscv_vfnmsac_vf_f32m2(x, 0x1.62e4p-1f, n, vl),
                                                    0x1.7f7d1cp-20f, n, vl);
    const vuint32m2_t e = __riscv_vsll_vx_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(z), 23, vl); // n in exponent field
    const vfloat32m2_t k = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vadd_vx_u32m2(e, 0x3f800000, vl)); // 1.0f -> k = 2^n
    const vbool16_t c = __riscv_vmfgt_vf_f32m2_b16(__riscv_vfabs_v_f32m2(n, vl), 126.0f, vl); // |n| > 126 lanes
    const vfloat32m2_t u = __riscv_vfmul_vv_f32m2(b, b, vl);
    // polynomial approximation of expm1(b) over the reduced range
    const vfloat32m2_t j = __riscv_vfmacc_vv_f32m2(
        __riscv_vfmul_vf_f32m2(b, 0x1.ffffecp-1f, vl),
        __riscv_vfmacc_vv_f32m2(
            __riscv_vfmacc_vf_f32m2(__riscv_vfmv_v_f_f32m2(0x1.fffdb6p-2f, vl), 0x1.555e66p-3f, b, vl),
            __riscv_vfmacc_vf_f32m2(__riscv_vfmv_v_f_f32m2(0x1.573e2ep-5f, vl), 0x1.0e4020p-7f, b, vl),
            u, vl), u, vl);
    if (!__riscv_vcpop_m_b16(c, vl))
        return __riscv_vfmacc_vv_f32m2(k, j, k, vl);                // fast path: 2^n * (1 + expm1(b))
    // slow path: split the 2^n scale into s1*s2 so neither factor over/underflows the exponent field
    const vbool16_t  dm = __riscv_vmfle_vf_f32m2_b16(n, 0.0f, vl);
    const vuint32m2_t d = __riscv_vmerge_vxm_u32m2(__riscv_vmv_v_x_u32m2(0, vl), 0x82000000, dm, vl);
    const vfloat32m2_t s1 = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vadd_vx_u32m2(d, 0x7f000000, vl));
    const vfloat32m2_t s2 = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vsub_vv_u32m2(e, d, vl));
    const vfloat32m2_t r1 = __riscv_vmerge_vvm_f32m2(
        __riscv_vfmacc_vv_f32m2(k, k, j, vl),
        __riscv_vfmul_vv_f32m2(__riscv_vfmacc_vv_f32m2(s2, s2, j, vl), s1, vl),
        c, vl);
    // |n| > 192 saturates to s1*s1 (inf or 0)
    return __riscv_vmerge_vvm_f32m2(
        r1, __riscv_vfmul_vv_f32m2(s1, s1, vl),
        __riscv_vmfgt_vf_f32m2_b16(__riscv_vfabs_v_f32m2(n, vl), 192.0f, vl),
        vl);
}
1273
1274
// computes silu x/(1+exp(-x)) in single precision vector
1275
inline static vfloat32m2_t ggml_v_silu_m2(vfloat32m2_t x, int vl) {
1276
    const vfloat32m2_t neg_x = __riscv_vfneg_v_f32m2(x, vl);
1277
    const vfloat32m2_t exp_neg_x = ggml_v_expf_m2(neg_x, vl);
1278
    const vfloat32m2_t one_plus_exp_neg_x = __riscv_vfadd_vf_f32m2(exp_neg_x, 1.0f, vl);
1279
    return __riscv_vfdiv_vv_f32m2(x, one_plus_exp_neg_x, vl);
1280
}
1281
1282
#endif // __ARM_NEON / __AVX2__ / __SSE2__ / __riscv_v_intrinsic
1283
1284
0
// apply silu element-wise to an f16 vector via the scalar f16 helper
inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    int k = 0;
    while (k < n) {
        y[k] = ggml_silu_f16(x[k]);
        ++k;
    }
}
1289
1290
0
inline static float ggml_silu_backward_f32(float x, float dy) {
1291
0
    const float s = 1.0f/(1.0f + expf(-x));
1292
0
    return dy*s*(1.0f + x*(1.0f - s));
1293
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_silu_backward_f32
Unexecuted instantiation: vec.cpp:ggml_silu_backward_f32(float, float)
Unexecuted instantiation: ops.cpp:ggml_silu_backward_f32(float, float)
1294
1295
0
// f16 wrapper around the f32 silu gradient: convert to f32, differentiate, convert back
inline static ggml_fp16_t ggml_silu_backward_f16(ggml_fp16_t x, ggml_fp16_t dy) {
    const float xf  = GGML_CPU_FP16_TO_FP32(x);
    const float sig = 1.0f/(1.0f + expf(-xf));
    return GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(dy)*sig*(1.0f + xf*(1.0f - sig)));
}
1300
1301
0
// element-wise silu gradient: dx[i] = dsilu(x[i]) * dy[i]
inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
    for (int k = 0; k < n; ++k) {
        dx[k] = ggml_silu_backward_f32(x[k], dy[k]);
    }
}
1306
1307
0
// element-wise silu gradient on f16 data: dx[i] = dsilu(x[i]) * dy[i]
inline static void ggml_vec_silu_backward_f16(const int n, ggml_fp16_t * dx, const ggml_fp16_t * x, const ggml_fp16_t * dy) {
    for (int k = 0; k < n; ++k) {
        dx[k] = ggml_silu_backward_f16(x[k], dy[k]);
    }
}
1312
1313
0
// ReGLU: y[i] = relu(x[i]) * g[i]
inline static void ggml_vec_reglu_f32 (const int n, float * y, const float * x, const float * g) {
    for (int k = 0; k < n; ++k) {
        if (x[k] > 0.f) {
            y[k] = x[k] * g[k];
        } else {
            y[k] = 0.f;
        }
    }
}
1318
1319
0
// ReGLU on f16 data: y[i] = relu(x[i]) * g[i], computed in f32
inline static void ggml_vec_reglu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
    for (int k = 0; k < n; ++k) {
        const float xv  = GGML_CPU_FP16_TO_FP32(x[k]);
        const float out = (xv > 0.f) ? xv * GGML_CPU_FP16_TO_FP32(g[k]) : 0.f;
        y[k] = GGML_CPU_FP32_TO_FP16(out);
    }
}
1325
1326
#ifdef GGML_GELU_FP16
1327
0
inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x, const float * g) {
1328
0
    uint16_t t;
1329
0
    for (int i = 0; i < n; ++i) {
1330
0
        if (x[i] <= -10.0f) {
1331
0
            y[i] = 0.0f;
1332
0
        } else if (x[i] >= 10.0f) {
1333
0
            y[i] = x[i] * g[i];
1334
0
        } else {
1335
0
            ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
1336
0
            memcpy(&t, &fp16, sizeof(uint16_t));
1337
0
            y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[t]) * g[i];
1338
0
        }
1339
0
    }
1340
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_geglu_f32
Unexecuted instantiation: vec.cpp:ggml_vec_geglu_f32(int, float*, float const*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_geglu_f32(int, float*, float const*, float const*)
1341
#else
1342
// GEGLU without the lookup table: exact scalar gelu per element
inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x, const float * g) {
    for (int k = 0; k < n; ++k) {
        y[k] = ggml_gelu_f32(x[k]) * g[k];
    }
}
1347
#endif
1348
1349
0
// GEGLU on f16 data: gelu(x) looked up in the precomputed table, multiplied by the gate g
inline static void ggml_vec_geglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
    const uint16_t * bits = (const uint16_t *) x;  // f16 bit patterns index the table directly
    for (int k = 0; k < n; ++k) {
        const float gate = GGML_CPU_FP16_TO_FP32(g[k]);
        y[k] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[bits[k]]) * gate);
    }
}
1356
1357
void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float * g);
1358
1359
0
// SwiGLU on f16 data: y[i] = silu(x[i]) * g[i], computed in f32
inline static void ggml_vec_swiglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
    for (int k = 0; k < n; ++k) {
        const float xv = GGML_CPU_FP16_TO_FP32(x[k]);
        const float gv = GGML_CPU_FP16_TO_FP32(g[k]);
        y[k] = GGML_CPU_FP32_TO_FP16((xv/(1.0f + expf(-xv))) * gv);
    }
}
1366
1367
0
inline static void ggml_vec_geglu_erf_f32(const int n, float * y, const float * x, const float * g) {
1368
0
    for (int i = 0; i < n; ++i) {
1369
0
        float xi = x[i];
1370
0
        y[i] = 0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * g[i];
1371
0
    }
1372
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_geglu_erf_f32
Unexecuted instantiation: vec.cpp:ggml_vec_geglu_erf_f32(int, float*, float const*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_geglu_erf_f32(int, float*, float const*, float const*)
1373
1374
0
// GEGLU with the exact (erf-based) gelu on f16 data, computed in f32
inline static void ggml_vec_geglu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
    for (int k = 0; k < n; ++k) {
        const float xv = GGML_CPU_FP16_TO_FP32(x[k]);
        const float gv = GGML_CPU_FP16_TO_FP32(g[k]);
        y[k] = GGML_CPU_FP32_TO_FP16(0.5f * xv * (1.0f + erff(xv*SQRT_2_INV)) * gv);
    }
}
1381
1382
#ifdef GGML_GELU_QUICK_FP16
1383
0
inline static void ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
1384
0
    uint16_t t;
1385
0
    for (int i = 0; i < n; ++i) {
1386
0
        ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
1387
0
        memcpy(&t, &fp16, sizeof(uint16_t));
1388
0
        y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]) * g[i];
1389
0
    }
1390
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_geglu_quick_f32
Unexecuted instantiation: vec.cpp:ggml_vec_geglu_quick_f32(int, float*, float const*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_geglu_quick_f32(int, float*, float const*, float const*)
1391
#else
1392
// GEGLU (quick gelu approximation) without the lookup table
inline static void ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
    for (int k = 0; k < n; ++k) {
        y[k] = ggml_gelu_quick_f32(x[k]) * g[k];
    }
}
1397
#endif
1398
1399
0
// GEGLU (quick gelu) on f16 data via the precomputed table
inline static void ggml_vec_geglu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
    const uint16_t * bits = (const uint16_t *) x;  // f16 bit patterns index the table directly
    for (int k = 0; k < n; ++k) {
        const float gate = GGML_CPU_FP16_TO_FP32(g[k]);
        y[k] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[bits[k]]) * gate);
    }
}
1406
1407
0
inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
1408
0
#ifndef GGML_USE_ACCELERATE
1409
0
    ggml_float sum = 0.0;
1410
0
    for (int i = 0; i < n; ++i) {
1411
0
        sum += (ggml_float)x[i];
1412
0
    }
1413
0
    *s = (float)sum;
1414
#else
1415
    vDSP_sve(x, 1, s, n);
1416
#endif
1417
0
}
Unexecuted instantiation: ggml-cpu.c:ggml_vec_sum_f32
Unexecuted instantiation: vec.cpp:ggml_vec_sum_f32(int, float*, float const*)
Unexecuted instantiation: ops.cpp:ggml_vec_sum_f32(int, float*, float const*)
1418
1419
0
// Inclusive prefix sum: y[i] = x[0] + ... + x[i].
// In-place operation (y == x) is safe since y[i] only depends on y[i-1] and x[i].
// Improvement: the original tested `i == 0` on every iteration; the first
// element is now handled once, outside the loop, and n <= 0 is an explicit no-op.
inline static void ggml_vec_cumsum_f32(const int n, float * y, const float * x) {
    if (n <= 0) {
        return;
    }
    y[0] = x[0];
    for (int i = 1; i < n; ++i) {
        y[i] = y[i - 1] + x[i];
    }
}
1428
1429
0
// sum of all elements, returned in full ggml_float (double) precision
inline static void ggml_vec_sum_f32_ggf(const int n, ggml_float * s, const float * x) {
    ggml_float acc = 0.0;
    for (int k = 0; k < n; ++k) {
        acc += (ggml_float)x[k];
    }
    *s = acc;
}
1436
1437
0
// sum of f16 elements, accumulated in f32
inline static void ggml_vec_sum_f16_ggf(const int n, float * s, const ggml_fp16_t * x) {
    float acc = 0.0f;
    for (int k = 0; k < n; ++k) {
        acc += GGML_CPU_FP16_TO_FP32(x[k]);
    }
    *s = acc;
}
1444
1445
0
// sum of bf16 elements, accumulated in f32
inline static void ggml_vec_sum_bf16_ggf(const int n, float * s, const ggml_bf16_t * x) {
    float acc = 0.0f;
    for (int k = 0; k < n; ++k) {
        acc += GGML_BF16_TO_FP32(x[k]);
    }
    *s = acc;
}
1452
1453
0
// maximum element of x; -INFINITY for an empty vector
inline static void ggml_vec_max_f32(const int n, float * s, const float * x) {
#ifndef GGML_USE_ACCELERATE
    float best = -INFINITY;
    for (int k = 0; k < n; ++k) {
        best = best > x[k] ? best : x[k];  // same selection as MAX(best, x[k])
    }
    *s = best;
#else
    vDSP_maxv(x, 1, s, n);
#endif
}
1464
1465
0
// reciprocal of the vector norm of x
inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x) {
    float norm;
    ggml_vec_norm_f32(n, &norm, x);
    *s = 1.f/norm;
}
1469
1470
0
// index of the maximum element of x; on ties the later index wins; 0 for an empty vector
inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) {
    float best   = -INFINITY;
    int   best_i = 0;
    for (int k = 0; k < n; ++k) {
        best = best > x[k] ? best : x[k];  // same selection as MAX(best, x[k])
        if (best == x[k]) {
            best_i = k;
        }
    }
    *s = best_i;
}
1479
1480
#ifdef __cplusplus
1481
}
1482
#endif