/src/llama.cpp/ggml/src/ggml-cpu/vec.h
Line | Count | Source |
1 | | // Vectorized functions for fundamental operations |
2 | | |
3 | | #pragma once |
4 | | |
5 | | #include "ggml-impl.h" |
6 | | #include "simd-mappings.h" |
7 | | #include "ggml.h" |
8 | | #include "ggml-cpu.h" |
9 | | |
10 | | #if defined(GGML_USE_ACCELERATE) |
11 | | #include <Accelerate/Accelerate.h> |
12 | | #endif |
13 | | |
14 | | // floating point type used to accumulate sums |
15 | | typedef double ggml_float; |
16 | | |
17 | | #if defined(__ARM_FEATURE_SVE) |
18 | | inline static void ggml_sve_f16_fma_widened( |
19 | | svfloat32_t * acc_lo, |
20 | | svfloat32_t * acc_hi, |
21 | | svfloat16_t x, |
22 | | svfloat16_t y) { |
23 | | #if defined(__ARM_FEATURE_SVE2) |
24 | | *acc_lo = svmlalb_f32(*acc_lo, x, y); |
25 | | *acc_hi = svmlalt_f32(*acc_hi, x, y); |
26 | | #else |
27 | | // Plain SVE fallback path if SVE2 instructions not available |
28 | | svfloat16_t x_even = svtrn1_f16(x, x); |
29 | | svfloat16_t x_odd = svtrn2_f16(x, x); |
30 | | |
31 | | svfloat16_t y_even = svtrn1_f16(y, y); |
32 | | svfloat16_t y_odd = svtrn2_f16(y, y); |
33 | | |
34 | | svbool_t pg = svptrue_b32(); |
35 | | |
36 | | *acc_lo = svmla_f32_x(pg, *acc_lo, svcvt_f32_f16_x(pg, x_even), svcvt_f32_f16_x(pg, y_even)); |
37 | | *acc_hi = svmla_f32_x(pg, *acc_hi, svcvt_f32_f16_x(pg, x_odd), svcvt_f32_f16_x(pg, y_odd)); |
38 | | #endif |
39 | | } |
40 | | |
41 | | inline static ggml_float ggml_sve_sum_f32x2(svfloat32_t sum_lo, svfloat32_t sum_hi) { |
42 | | return (ggml_float) (svaddv_f32(svptrue_b32(), sum_lo) + svaddv_f32(svptrue_b32(), sum_hi)); |
43 | | } |
44 | | #endif |
45 | | |
46 | | #define GGML_GELU_FP16 |
47 | | #define GGML_GELU_QUICK_FP16 |
48 | | |
49 | 0 | #define GGML_SOFT_MAX_UNROLL 4 |
50 | | #define GGML_VEC_DOT_UNROLL 2 |
51 | 0 | #define GGML_VEC_MAD_UNROLL 32 |
52 | | |
53 | | #ifdef __cplusplus |
54 | | extern "C" { |
55 | | #endif |
56 | | |
57 | | // |
58 | | // global data |
59 | | // |
60 | | |
61 | | // precomputed gelu table for f16 (128 KB) |
62 | | extern ggml_fp16_t ggml_table_gelu_f16[1 << 16]; |
63 | | |
64 | | // precomputed quick gelu table for f16 (128 KB) |
65 | | extern ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16]; |
66 | | |
67 | | // |
68 | | // fundamental operations |
69 | | // |
70 | | |
71 | | void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc); |
72 | | void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc); |
73 | | void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc); |
74 | | |
75 | | void ggml_vec_silu_f32(const int n, float * y, const float * x); |
76 | | ggml_float ggml_vec_cvar_f32(const int n, float * y, const float * x, const float mean); //it will also center y ( y = y - mean ) |
77 | | ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max); |
78 | | ggml_float ggml_vec_log_soft_max_f32(const int n, float * y, const float * x, float max); |
79 | | |
// Store v into x[0] .. x[n-1]; nothing is written when n <= 0.
inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) {
    int idx = 0;
    while (idx < n) {
        x[idx] = v;
        ++idx;
    }
}
// Store v into x[0] .. x[n-1]; nothing is written when n <= 0.
inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) {
    for (int k = 0; k < n; ++k) {
        x[k] = v;
    }
}
82 | | |
// Store v into x[0] .. x[n-1]; nothing is written when n <= 0.
inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) {
    int idx = 0;
    while (idx < n) {
        x[idx] = v;
        ++idx;
    }
}
// Copy x[0] .. x[n-1] into y, one element at a time in ascending index order.
inline static void ggml_vec_cpy_i32(const int n, int32_t * y, const int32_t * x) {
    for (int k = 0; k < n; ++k) {
        y[k] = x[k];
    }
}
85 | | |
86 | 0 | inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const ggml_fp16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }Unexecuted instantiation: ggml-cpu.c:ggml_vec_set_f16 Unexecuted instantiation: vec.cpp:ggml_vec_set_f16(int, unsigned short*, unsigned short) Unexecuted instantiation: ops.cpp:ggml_vec_set_f16(int, unsigned short*, unsigned short) |
87 | 0 | inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }Unexecuted instantiation: ggml-cpu.c:ggml_vec_set_bf16 Unexecuted instantiation: vec.cpp:ggml_vec_set_bf16(int, ggml_bf16_t*, ggml_bf16_t) Unexecuted instantiation: ops.cpp:ggml_vec_set_bf16(int, ggml_bf16_t*, ggml_bf16_t) |
88 | | |
// Element-wise sum: z[i] = x[i] + y[i] for i in [0, n).
inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) {
    int pos = 0;
#if defined(__AVX2__)
    // eight floats per 256-bit register, unaligned loads and stores
    while (pos + 7 < n) {
        const __m256 lhs = _mm256_loadu_ps(x + pos);
        const __m256 rhs = _mm256_loadu_ps(y + pos);
        const __m256 sum = _mm256_add_ps(lhs, rhs);
        _mm256_storeu_ps(z + pos, sum);
        pos += 8;
    }
#endif
    // scalar path; with AVX2 this only handles the last (n % 8) elements
    while (pos < n) {
        z[pos] = x[pos] + y[pos];
        ++pos;
    }
}
103 | | |
104 | 0 | inline static void ggml_vec_add_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) { |
105 | 0 | for (int i = 0; i < n; ++i) { |
106 | 0 | z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) + GGML_CPU_FP16_TO_FP32(y[i])); |
107 | 0 | } |
108 | 0 | } Unexecuted instantiation: ggml-cpu.c:ggml_vec_add_f16 Unexecuted instantiation: vec.cpp:ggml_vec_add_f16(int, unsigned short*, unsigned short const*, unsigned short const*) Unexecuted instantiation: ops.cpp:ggml_vec_add_f16(int, unsigned short*, unsigned short const*, unsigned short const*) |
// Add the scalar v to every element: z[i] = x[i] + v for i in [0, n).
inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) {
    for (int k = 0; k < n; ++k) {
        z[k] = x[k] + v;
    }
}
// Accumulate in place: y[i] += x[i] for i in [0, n).
inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) {
    int idx = 0;
    while (idx < n) {
        y[idx] += x[idx];
        ++idx;
    }
}
// Add the scalar v to every element in place: y[i] += v for i in [0, n).
inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) {
    for (int k = 0; k < n; ++k) {
        y[k] += v;
    }
}
// Element-wise difference: z[i] = x[i] - y[i] for i in [0, n).
inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) {
    int idx = 0;
    while (idx < n) {
        z[idx] = x[idx] - y[idx];
        ++idx;
    }
}
113 | 0 | inline static void ggml_vec_sub_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) { |
114 | 0 | for (int i = 0; i < n; ++i) { |
115 | 0 | z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) - GGML_CPU_FP16_TO_FP32(y[i])); |
116 | 0 | } |
117 | 0 | } Unexecuted instantiation: ggml-cpu.c:ggml_vec_sub_f16 Unexecuted instantiation: vec.cpp:ggml_vec_sub_f16(int, unsigned short*, unsigned short const*, unsigned short const*) Unexecuted instantiation: ops.cpp:ggml_vec_sub_f16(int, unsigned short*, unsigned short const*, unsigned short const*) |
// Store v into x[0] .. x[n-1]; nothing is written when n <= 0.
inline static void ggml_vec_set_f32 (const int n, float * x, const float v) {
    for (int k = 0; k < n; ++k) {
        x[k] = v;
    }
}
// Copy x[0] .. x[n-1] into y, one element at a time in ascending index order.
inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x) {
    int idx = 0;
    while (idx < n) {
        y[idx] = x[idx];
        ++idx;
    }
}
// Element-wise negation: y[i] = -x[i] for i in [0, n).
inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) {
    for (int k = 0; k < n; ++k) {
        y[k] = -x[k];
    }
}
121 | 0 | inline static void ggml_vec_neg_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { |
122 | 0 | for (int i = 0; i < n; ++i) { |
123 | 0 | y[i] = GGML_CPU_FP32_TO_FP16(-GGML_CPU_FP16_TO_FP32(x[i])); |
124 | 0 | } |
125 | 0 | } Unexecuted instantiation: ggml-cpu.c:ggml_vec_neg_f16 Unexecuted instantiation: vec.cpp:ggml_vec_neg_f16(int, unsigned short*, unsigned short const*) Unexecuted instantiation: ops.cpp:ggml_vec_neg_f16(int, unsigned short*, unsigned short const*) |
126 | | |
// Element-wise product: z[i] = x[i] * y[i] for i in [0, n).
inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) {
    int idx = 0;
    while (idx < n) {
        z[idx] = x[idx]*y[idx];
        ++idx;
    }
}
128 | 0 | inline static void ggml_vec_mul_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) { |
129 | 0 | for (int i = 0; i < n; ++i) { |
130 | 0 | z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) * GGML_CPU_FP16_TO_FP32(y[i])); |
131 | 0 | } |
132 | 0 | } Unexecuted instantiation: ggml-cpu.c:ggml_vec_mul_f16 Unexecuted instantiation: vec.cpp:ggml_vec_mul_f16(int, unsigned short*, unsigned short const*, unsigned short const*) Unexecuted instantiation: ops.cpp:ggml_vec_mul_f16(int, unsigned short*, unsigned short const*, unsigned short const*) |
// Element-wise quotient: z[i] = x[i] / y[i] for i in [0, n).
// No check is made for zero divisors.
inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) {
    for (int k = 0; k < n; ++k) {
        z[k] = x[k]/y[k];
    }
}
134 | 0 | inline static void ggml_vec_div_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) { |
135 | 0 | for (int i = 0; i < n; ++i) { |
136 | 0 | z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) / GGML_CPU_FP16_TO_FP32(y[i])); |
137 | 0 | } |
138 | 0 | } Unexecuted instantiation: ggml-cpu.c:ggml_vec_div_f16 Unexecuted instantiation: vec.cpp:ggml_vec_div_f16(int, unsigned short*, unsigned short const*, unsigned short const*) Unexecuted instantiation: ops.cpp:ggml_vec_div_f16(int, unsigned short*, unsigned short const*, unsigned short const*) |
139 | | |
140 | | // compute GGML_VEC_DOT_UNROLL dot products at once |
141 | | // xs - x row stride in bytes |
142 | 0 | inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GGML_RESTRICT s, void * GGML_RESTRICT xv, ggml_fp16_t * GGML_RESTRICT y) { |
143 | 0 | ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 }; |
144 | 0 |
|
145 | 0 | ggml_fp16_t * GGML_RESTRICT x[GGML_VEC_DOT_UNROLL]; |
146 | 0 |
|
147 | 0 | for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) { |
148 | 0 | x[i] = (ggml_fp16_t *) ((char *) xv + i*xs); |
149 | 0 | } |
150 | 0 |
|
151 | 0 | #if defined(GGML_SIMD) |
152 | 0 | #if defined(__ARM_FEATURE_SVE) |
153 | 0 |
|
154 | 0 | const int ggml_f16_epr = svcnth(); |
155 | 0 | const int ggml_f16_step = 2 * ggml_f16_epr; |
156 | 0 | int np = n - (n % ggml_f16_step); |
157 | 0 | int np2 = n - (n % ggml_f16_epr); |
158 | 0 |
|
159 | 0 | svfloat32_t sum_0_0_lo = svdup_n_f32(0.0f); |
160 | 0 | svfloat32_t sum_0_0_hi = svdup_n_f32(0.0f); |
161 | 0 | svfloat32_t sum_0_1_lo = svdup_n_f32(0.0f); |
162 | 0 | svfloat32_t sum_0_1_hi = svdup_n_f32(0.0f); |
163 | 0 | svfloat32_t sum_1_0_lo = svdup_n_f32(0.0f); |
164 | 0 | svfloat32_t sum_1_0_hi = svdup_n_f32(0.0f); |
165 | 0 | svfloat32_t sum_1_1_lo = svdup_n_f32(0.0f); |
166 | 0 | svfloat32_t sum_1_1_hi = svdup_n_f32(0.0f); |
167 | 0 |
|
168 | 0 | for (int i = 0; i < np; i += ggml_f16_step) { |
169 | 0 | const svfloat16_t ay0 = GGML_F16x_VEC_LOAD(y + i, 0); |
170 | 0 | const svfloat16_t ax00 = GGML_F16x_VEC_LOAD(x[0] + i, 0); |
171 | 0 | const svfloat16_t ax01 = GGML_F16x_VEC_LOAD(x[1] + i, 0); |
172 | 0 |
|
173 | 0 | ggml_sve_f16_fma_widened(&sum_0_0_lo, &sum_0_0_hi, ax00, ay0); |
174 | 0 | ggml_sve_f16_fma_widened(&sum_1_0_lo, &sum_1_0_hi, ax01, ay0); |
175 | 0 |
|
176 | 0 | const svfloat16_t ay1 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 0); |
177 | 0 | const svfloat16_t ax10 = GGML_F16x_VEC_LOAD(x[0] + i + 1 * ggml_f16_epr, 0); |
178 | 0 | const svfloat16_t ax11 = GGML_F16x_VEC_LOAD(x[1] + i + 1 * ggml_f16_epr, 0); |
179 | 0 |
|
180 | 0 | ggml_sve_f16_fma_widened(&sum_0_1_lo, &sum_0_1_hi, ax10, ay1); |
181 | 0 | ggml_sve_f16_fma_widened(&sum_1_1_lo, &sum_1_1_hi, ax11, ay1); |
182 | 0 | } |
183 | 0 |
|
184 | 0 | for (int i = np; i < np2; i += ggml_f16_epr) { |
185 | 0 | const svfloat16_t ry = GGML_F16x_VEC_LOAD(y + i, 0); |
186 | 0 | const svfloat16_t rx0 = GGML_F16x_VEC_LOAD(x[0] + i, 0); |
187 | 0 | const svfloat16_t rx1 = GGML_F16x_VEC_LOAD(x[1] + i, 0); |
188 | 0 |
|
189 | 0 | ggml_sve_f16_fma_widened(&sum_0_0_lo, &sum_0_0_hi, rx0, ry); |
190 | 0 | ggml_sve_f16_fma_widened(&sum_1_0_lo, &sum_1_0_hi, rx1, ry); |
191 | 0 | } |
192 | 0 |
|
193 | 0 | if (np2 < n) { |
194 | 0 | const svbool_t pg = svwhilelt_b16(np2, n); |
195 | 0 | const svfloat16_t ay = svld1_f16(pg, (const __fp16 *)(y + np2)); |
196 | 0 | const svfloat16_t ax0 = svld1_f16(pg, (const __fp16 *)(x[0] + np2)); |
197 | 0 | const svfloat16_t ax1 = svld1_f16(pg, (const __fp16 *)(x[1] + np2)); |
198 | 0 |
|
199 | 0 | ggml_sve_f16_fma_widened(&sum_0_0_lo, &sum_0_0_hi, ax0, ay); |
200 | 0 | ggml_sve_f16_fma_widened(&sum_1_0_lo, &sum_1_0_hi, ax1, ay); |
201 | 0 | } |
202 | 0 |
|
203 | 0 | svfloat32_t sum_0_lo = svadd_f32_x(DEFAULT_PG32, sum_0_0_lo, sum_0_1_lo); |
204 | 0 | svfloat32_t sum_0_hi = svadd_f32_x(DEFAULT_PG32, sum_0_0_hi, sum_0_1_hi); |
205 | 0 | svfloat32_t sum_1_lo = svadd_f32_x(DEFAULT_PG32, sum_1_0_lo, sum_1_1_lo); |
206 | 0 | svfloat32_t sum_1_hi = svadd_f32_x(DEFAULT_PG32, sum_1_0_hi, sum_1_1_hi); |
207 | 0 | sumf[0] = ggml_sve_sum_f32x2(sum_0_lo, sum_0_hi); |
208 | 0 | sumf[1] = ggml_sve_sum_f32x2(sum_1_lo, sum_1_hi); |
209 | 0 | np = n; |
210 | 0 | #elif defined(__riscv_v_intrinsic) |
211 | 0 | #if defined(__riscv_zvfh) |
212 | 0 | size_t vl = __riscv_vsetvlmax_e32m4(); |
213 | 0 |
|
214 | 0 | // initialize accumulators to all zeroes |
215 | 0 | vfloat32m4_t vsum0_0 = __riscv_vfmv_v_f_f32m4(0.0f, vl); |
216 | 0 | vfloat32m4_t vsum0_1 = __riscv_vfmv_v_f_f32m4(0.0f, vl); |
217 | 0 | vfloat32m4_t vsum1_0 = __riscv_vfmv_v_f_f32m4(0.0f, vl); |
218 | 0 | vfloat32m4_t vsum1_1 = __riscv_vfmv_v_f_f32m4(0.0f, vl); |
219 | 0 |
|
220 | 0 | // calculate step size |
221 | 0 | const size_t epr = __riscv_vsetvlmax_e16m2(); |
222 | 0 | const size_t step = epr * 2; |
223 | 0 | int np = (n & ~(step - 1)); |
224 | 0 |
|
225 | 0 | // unroll by 2 along the row dimension |
226 | 0 | for (int i = 0; i < np; i += step) { |
227 | 0 | vfloat16m2_t ay0 = __riscv_vle16_v_f16m2((const _Float16 *)(y + i), epr); |
228 | 0 | vfloat16m2_t ax0_0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i), epr); |
229 | 0 | vfloat16m2_t ax1_0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i), epr); |
230 | 0 | vsum0_0 = __riscv_vfwmacc_vv_f32m4(vsum0_0, ax0_0, ay0, epr); |
231 | 0 | vsum1_0 = __riscv_vfwmacc_vv_f32m4(vsum1_0, ax1_0, ay0, epr); |
232 | 0 |
|
233 | 0 | vfloat16m2_t ay1 = __riscv_vle16_v_f16m2((const _Float16 *)(y + i + epr), epr); |
234 | 0 | vfloat16m2_t ax0_1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i + epr), epr); |
235 | 0 | vfloat16m2_t ax1_1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i + epr), epr); |
236 | 0 | vsum0_1 = __riscv_vfwmacc_vv_f32m4(vsum0_1, ax0_1, ay1, epr); |
237 | 0 | vsum1_1 = __riscv_vfwmacc_vv_f32m4(vsum1_1, ax1_1, ay1, epr); |
238 | 0 | } |
239 | 0 |
|
240 | 0 | vfloat32m4_t vsum0 = __riscv_vfadd_vv_f32m4(vsum0_0, vsum0_1, vl); |
241 | 0 | vfloat32m4_t vsum1 = __riscv_vfadd_vv_f32m4(vsum1_0, vsum1_1, vl); |
242 | 0 |
|
243 | 0 | // leftovers |
244 | 0 | for (int i = np; i < n; i += vl) { |
245 | 0 | vl = __riscv_vsetvl_e16m2(n - i); |
246 | 0 | vfloat16m2_t ay = __riscv_vle16_v_f16m2((const _Float16 *)(y + i), vl); |
247 | 0 | vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i), vl); |
248 | 0 | vfloat16m2_t ax1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i), vl); |
249 | 0 |
|
250 | 0 | vsum0 = __riscv_vfwmacc_vv_f32m4(vsum0, ax0, ay, vl); |
251 | 0 | vsum1 = __riscv_vfwmacc_vv_f32m4(vsum1, ax1, ay, vl); |
252 | 0 | } |
253 | 0 |
|
254 | 0 | // reduce |
255 | 0 | vl = __riscv_vsetvlmax_e32m2(); |
256 | 0 | vfloat32m2_t acc0_0 = __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(vsum0, 0), |
257 | 0 | __riscv_vget_v_f32m4_f32m2(vsum0, 1), vl); |
258 | 0 | vl = __riscv_vsetvlmax_e32m1(); |
259 | 0 | vfloat32m1_t acc0_1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(acc0_0, 0), |
260 | 0 | __riscv_vget_v_f32m2_f32m1(acc0_0, 1), vl); |
261 | 0 | vfloat32m1_t redsum0 = __riscv_vfredusum_vs_f32m1_f32m1( |
262 | 0 | acc0_1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl); |
263 | 0 |
|
264 | 0 | vl = __riscv_vsetvlmax_e32m2(); |
265 | 0 | vfloat32m2_t acc1_0 = __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(vsum1, 0), |
266 | 0 | __riscv_vget_v_f32m4_f32m2(vsum1, 1), vl); |
267 | 0 | vl = __riscv_vsetvlmax_e32m1(); |
268 | 0 | vfloat32m1_t acc1_1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(acc1_0, 0), |
269 | 0 | __riscv_vget_v_f32m2_f32m1(acc1_0, 1), vl); |
270 | 0 | vfloat32m1_t redsum1 = __riscv_vfredusum_vs_f32m1_f32m1( |
271 | 0 | acc1_1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl); |
272 | 0 | sumf[0] = __riscv_vfmv_f_s_f32m1_f32(redsum0); |
273 | 0 | sumf[1] = __riscv_vfmv_f_s_f32m1_f32(redsum1); |
274 | 0 | np = n; |
275 | 0 | #else |
276 | 0 | const int np = 0; |
277 | 0 | #endif |
278 | 0 | #else |
279 | 0 | const int np = (n & ~(GGML_F16_STEP - 1)); |
280 | 0 |
|
281 | 0 | GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } }; |
282 | 0 |
|
283 | 0 | GGML_F16_VEC ax[GGML_F16_ARR]; |
284 | 0 | GGML_F16_VEC ay[GGML_F16_ARR]; |
285 | 0 |
|
286 | 0 | for (int i = 0; i < np; i += GGML_F16_STEP) { |
287 | 0 | for (int j = 0; j < GGML_F16_ARR; j++) { |
288 | 0 | ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); |
289 | 0 |
|
290 | 0 | for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) { |
291 | 0 | ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j); |
292 | 0 |
|
293 | 0 | sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]); |
294 | 0 | } |
295 | 0 | } |
296 | 0 | } |
297 | 0 |
|
298 | 0 | // reduce sum0..sum3 to sum0 |
299 | 0 | for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) { |
300 | 0 | GGML_F16_VEC_REDUCE(sumf[k], sum[k]); |
301 | 0 | } |
302 | 0 | #endif |
303 | 0 | #else |
304 | 0 | // scalar path |
305 | 0 | const int np = 0; |
306 | 0 | #endif |
307 | 0 | // scalar and leftovers |
308 | 0 | for (int i = np; i < n; ++i) { |
309 | 0 | for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) { |
310 | 0 | sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i])); |
311 | 0 | } |
312 | 0 | } |
313 | 0 |
|
314 | 0 | for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) { |
315 | 0 | s[i] = (float)sumf[i]; |
316 | 0 | } |
317 | 0 | } Unexecuted instantiation: ggml-cpu.c:ggml_vec_dot_f16_unroll Unexecuted instantiation: vec.cpp:ggml_vec_dot_f16_unroll(int, int, float*, void*, unsigned short*) Unexecuted instantiation: ops.cpp:ggml_vec_dot_f16_unroll(int, int, float*, void*, unsigned short*) |
318 | | |
319 | 0 | inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) { |
320 | 0 | #if defined(GGML_SIMD) |
321 | | #if defined(__ARM_FEATURE_SVE) |
322 | | |
323 | | const int sve_register_length = ggml_cpu_get_sve_cnt() * 8; |
324 | | const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16 |
325 | | const int ggml_f32_step = 8 * ggml_f32_epr; // choose 8 SVE registers |
326 | | GGML_F32_VEC vx = GGML_F32_VEC_SET1(v); |
327 | | |
328 | | const int np = (n & ~(ggml_f32_step - 1)); |
329 | | svfloat32_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8; |
330 | | svfloat32_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8; |
331 | | for (int i = 0; i < np; i += ggml_f32_step) { |
332 | | |
333 | | ax1 = GGML_F32_VEC_LOAD(x + i); |
334 | | ay1 = GGML_F32_VEC_LOAD(y + i); |
335 | | ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx); |
336 | | |
337 | | GGML_F32_VEC_STORE(y + i, ay1); |
338 | | |
339 | | ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr); |
340 | | ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr); |
341 | | ay2 = GGML_F32_VEC_FMA(ay2, ax2, vx); |
342 | | |
343 | | GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2); |
344 | | |
345 | | ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr); |
346 | | ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr); |
347 | | ay3 = GGML_F32_VEC_FMA(ay3, ax3, vx); |
348 | | |
349 | | GGML_F32_VEC_STORE(y + i + 2*ggml_f32_epr, ay3); |
350 | | |
351 | | ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr); |
352 | | ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr); |
353 | | ay4 = GGML_F32_VEC_FMA(ay4, ax4, vx); |
354 | | |
355 | | GGML_F32_VEC_STORE(y + i + 3*ggml_f32_epr, ay4); |
356 | | |
357 | | ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr); |
358 | | ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr); |
359 | | ay5 = GGML_F32_VEC_FMA(ay5, ax5, vx); |
360 | | |
361 | | GGML_F32_VEC_STORE(y + i + 4*ggml_f32_epr, ay5); |
362 | | |
363 | | ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr); |
364 | | ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr); |
365 | | ay6 = GGML_F32_VEC_FMA(ay6, ax6, vx); |
366 | | |
367 | | GGML_F32_VEC_STORE(y + i + 5*ggml_f32_epr, ay6); |
368 | | |
369 | | ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr); |
370 | | ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr); |
371 | | ay7 = GGML_F32_VEC_FMA(ay7, ax7, vx); |
372 | | |
373 | | GGML_F32_VEC_STORE(y + i + 6*ggml_f32_epr, ay7); |
374 | | |
375 | | ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr); |
376 | | ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr); |
377 | | ay8 = GGML_F32_VEC_FMA(ay8, ax8, vx); |
378 | | |
379 | | GGML_F32_VEC_STORE(y + i + 7*ggml_f32_epr, ay8); |
380 | | } |
381 | | // leftovers |
382 | | // Since 8 unrolls are done in above loop, leftovers lie in range [0, ggml_f32_step] which is handled in below loop |
383 | | const int np2 = (n & ~(ggml_f32_epr - 1)); |
384 | | for (int i = np; i < np2; i += ggml_f32_epr) { |
385 | | ax1 = GGML_F32_VEC_LOAD(x + i); |
386 | | ay1 = GGML_F32_VEC_LOAD(y + i); |
387 | | ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx); |
388 | | |
389 | | GGML_F32_VEC_STORE(y + i, ay1); |
390 | | } |
391 | | // maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmad on available elements only |
392 | | if (np2 < n) { |
393 | | svbool_t pg =svwhilelt_b32(np2, n); |
394 | | ax1 = svld1_f32(pg, x + np2); |
395 | | ay1 = svld1_f32(pg, y + np2); |
396 | | ay1 = svmad_f32_m(pg, ax1, vx, ay1); |
397 | | |
398 | | svst1_f32(pg, y + np2, ay1); |
399 | | } |
400 | | #elif defined(__riscv_v_intrinsic) |
401 | | for (int i = 0, avl; i < n; i += avl) { |
402 | | avl = __riscv_vsetvl_e32m8(n - i); |
403 | | vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl); |
404 | | vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl); |
405 | | vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, v, ay, avl); |
406 | | __riscv_vse32_v_f32m8(&y[i], ny, avl); |
407 | | } |
408 | | #else |
409 | 0 | const int np = (n & ~(GGML_F32_STEP - 1)); |
410 | |
|
411 | 0 | GGML_F32_VEC vx = GGML_F32_VEC_SET1(v); |
412 | |
|
413 | 0 | GGML_F32_VEC ax[GGML_F32_ARR]; |
414 | 0 | GGML_F32_VEC ay[GGML_F32_ARR]; |
415 | |
|
416 | 0 | for (int i = 0; i < np; i += GGML_F32_STEP) { |
417 | 0 | for (int j = 0; j < GGML_F32_ARR; j++) { |
418 | 0 | ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR); |
419 | 0 | ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR); |
420 | 0 | ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx); |
421 | |
|
422 | 0 | GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]); |
423 | 0 | } |
424 | 0 | } |
425 | | |
426 | | // leftovers |
427 | 0 | for (int i = np; i < n; ++i) { |
428 | 0 | y[i] += x[i]*v; |
429 | 0 | } |
430 | 0 | #endif |
431 | | #else |
432 | | // scalar |
433 | | for (int i = 0; i < n; ++i) { |
434 | | y[i] += x[i]*v; |
435 | | } |
436 | | #endif |
437 | 0 | } Unexecuted instantiation: ggml-cpu.c:ggml_vec_mad_f32 Unexecuted instantiation: vec.cpp:ggml_vec_mad_f32(int, float*, float const*, float) Unexecuted instantiation: ops.cpp:ggml_vec_mad_f32(int, float*, float const*, float) |
438 | | |
439 | 0 | inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) { |
440 | | #if defined(GGML_SIMD) && defined(__ARM_FEATURE_SVE) |
441 | | const int sve_register_length = svcntb() * 8; |
442 | | const int ggml_f16_epr = sve_register_length / 16; |
443 | | const int ggml_f16_step = 8 * ggml_f16_epr; |
444 | | |
445 | | GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v); |
446 | | |
447 | | int np = (n & ~(ggml_f16_step - 1)); |
448 | | |
449 | | svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8; |
450 | | svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8; |
451 | | for (int i = 0; i < np; i += ggml_f16_step) { |
452 | | ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0); |
453 | | ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0); |
454 | | ay1 = GGML_F16x_VEC_FMA(ay1, ax1, vx); |
455 | | |
456 | | GGML_F16x_VEC_STORE(y + i + 0 * ggml_f16_epr, ay1, 0); |
457 | | |
458 | | ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1); |
459 | | ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1); |
460 | | ay2 = GGML_F16x_VEC_FMA(ay2, ax2, vx); |
461 | | |
462 | | GGML_F16x_VEC_STORE(y + i + 1 * ggml_f16_epr, ay2, 1); |
463 | | |
464 | | ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2); |
465 | | ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2); |
466 | | ay3 = GGML_F16x_VEC_FMA(ay3, ax3, vx); |
467 | | |
468 | | GGML_F16x_VEC_STORE(y + i + 2 * ggml_f16_epr, ay3, 2); |
469 | | |
470 | | ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3); |
471 | | ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3); |
472 | | ay4 = GGML_F16x_VEC_FMA(ay4, ax4, vx); |
473 | | |
474 | | GGML_F16x_VEC_STORE(y + i + 3 * ggml_f16_epr, ay4, 3); |
475 | | |
476 | | ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4); |
477 | | ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4); |
478 | | ay5 = GGML_F16x_VEC_FMA(ay5, ax5, vx); |
479 | | |
480 | | GGML_F16x_VEC_STORE(y + i + 4 * ggml_f16_epr, ay5, 4); |
481 | | |
482 | | ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5); |
483 | | ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5); |
484 | | ay6 = GGML_F16x_VEC_FMA(ay6, ax6, vx); |
485 | | |
486 | | GGML_F16x_VEC_STORE(y + i + 5 * ggml_f16_epr, ay6, 5); |
487 | | |
488 | | ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6); |
489 | | ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6); |
490 | | ay7 = GGML_F16x_VEC_FMA(ay7, ax7, vx); |
491 | | |
492 | | GGML_F16x_VEC_STORE(y + i + 6 * ggml_f16_epr, ay7, 6); |
493 | | |
494 | | ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7); |
495 | | ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7); |
496 | | ay8 = GGML_F16x_VEC_FMA(ay8, ax8, vx); |
497 | | |
498 | | GGML_F16x_VEC_STORE(y + i + 7 * ggml_f16_epr, ay8, 7); |
499 | | } |
500 | | const int np2 = (n & ~(ggml_f16_epr - 1)); |
501 | | for (int k = np; k < np2; k += ggml_f16_epr) { |
502 | | svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0); |
503 | | svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0); |
504 | | ry = GGML_F16x_VEC_FMA(ry, rx, vx); |
505 | | |
506 | | GGML_F16x_VEC_STORE(y + k, ry, 0); |
507 | | } |
508 | | |
509 | | if (np2 < n) { |
510 | | svbool_t pg = svwhilelt_b16(np2, n); |
511 | | svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2)); |
512 | | svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2)); |
513 | | hy = svmad_f16_x(pg, hx, vx, hy); |
514 | | svst1_f16(pg, (__fp16 *)(y + np2), hy); |
515 | | } |
516 | | np = n; |
517 | | #elif defined(__riscv_v_intrinsic) // implies __riscv_v_intrinsic |
518 | | #if defined (__riscv_zvfh) |
519 | | const ggml_fp16_t s = GGML_CPU_FP32_TO_FP16(v); |
520 | | const _Float16 scale = *(const _Float16*)(&s); |
521 | | |
522 | | // calculate step size |
523 | | const int epr = __riscv_vsetvlmax_e16m4(); |
524 | | const int step = epr * 2; |
525 | | int np = (n & ~(step - 1)); |
526 | | |
527 | | // unroll by 2 |
528 | | for (int i = 0; i < np; i += step) { |
529 | | vfloat16m4_t ax0 = __riscv_vle16_v_f16m4((const _Float16*)x + i, epr); |
530 | | vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, epr); |
531 | | ay0 = __riscv_vfmacc_vf_f16m4(ay0, scale, ax0, epr); |
532 | | __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, epr); |
533 | | __asm__ __volatile__ ("" ::: "memory"); |
534 | | |
535 | | vfloat16m4_t ax1 = __riscv_vle16_v_f16m4((const _Float16*)x + i + epr, epr); |
536 | | vfloat16m4_t ay1 = __riscv_vle16_v_f16m4((const _Float16*)y + i + epr, epr); |
537 | | ay1 = __riscv_vfmacc_vf_f16m4(ay1, scale, ax1, epr); |
538 | | __riscv_vse16_v_f16m4((_Float16*)y + i + epr, ay1, epr); |
539 | | __asm__ __volatile__ ("" ::: "memory"); |
540 | | } |
541 | | |
542 | | // leftovers |
543 | | int vl; |
544 | | for (int i = np; i < n; i += vl) { |
545 | | vl = __riscv_vsetvl_e16m4(n - i); |
546 | | vfloat16m4_t ax0 = __riscv_vle16_v_f16m4((const _Float16*)x + i, vl); |
547 | | vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, vl); |
548 | | ay0 = __riscv_vfmacc_vf_f16m4(ay0, scale, ax0, vl); |
549 | | __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, vl); |
550 | | } |
551 | | np = n; |
552 | | #else |
553 | | // fall to scalar path |
554 | | const int np = 0; |
555 | | #endif |
556 | | #elif defined(GGML_SIMD) |
557 | 0 | const int np = (n & ~(GGML_F16_STEP - 1)); |
558 | |
|
559 | 0 | GGML_F16_VEC vx = GGML_F16_VEC_SET1(v); |
560 | |
|
561 | 0 | GGML_F16_VEC ax[GGML_F16_ARR]; |
562 | 0 | GGML_F16_VEC ay[GGML_F16_ARR]; |
563 | |
|
564 | 0 | for (int i = 0; i < np; i += GGML_F16_STEP) { |
565 | 0 | for (int j = 0; j < GGML_F16_ARR; j++) { |
566 | 0 | ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j); |
567 | 0 | ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); |
568 | 0 | ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx); |
569 | |
|
570 | 0 | GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j); |
571 | 0 | } |
572 | 0 | } |
573 | | #else |
574 | | // scalar path |
575 | | const int np = 0; |
576 | | #endif |
577 | | |
578 | | // scalar and leftovers |
579 | 0 | for (int i = np; i < n; ++i) { |
580 | 0 | y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v); |
581 | 0 | } |
582 | 0 | } Unexecuted instantiation: ggml-cpu.c:ggml_vec_mad_f16 Unexecuted instantiation: vec.cpp:ggml_vec_mad_f16(int, unsigned short*, unsigned short const*, float) Unexecuted instantiation: ops.cpp:ggml_vec_mad_f16(int, unsigned short*, unsigned short const*, float) |
583 | | |
// Accumulates GGML_VEC_MAD_UNROLL scaled rows into y:
//   y[i] += x[k][i] * v[k][0]   for k in [0, GGML_VEC_MAD_UNROLL), i in [0, n)
// Row k of x starts k*xs bytes after xv, and scalar k is the first float found
// k*vs bytes after vv (xs and vs are byte strides of x and v).
inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * GGML_RESTRICT y, const float * GGML_RESTRICT xv, const float * GGML_RESTRICT vv) {

    // per-row base pointers, resolved once from the byte strides
    const float * GGML_RESTRICT x[GGML_VEC_MAD_UNROLL];
    const float * GGML_RESTRICT v[GGML_VEC_MAD_UNROLL];

    for (int i = 0; i < GGML_VEC_MAD_UNROLL; ++i) {
        x[i] = (const float *) ((const char *) xv + i*xs);
        v[i] = (const float *) ((const char *) vv + i*vs);
    }

#if defined(GGML_SIMD)
    #if defined(__ARM_FEATURE_SVE)
        // no SVE kernel yet: route to the scalar implementation
        // TODO: Write SVE code
        for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
            for (int i = 0; i < n; ++i) {
                y[i] += x[k][i]*v[k][0];
            }
        }
    #elif defined(__riscv_v_intrinsic)
        // each pass handles avl elements (as granted by vsetvl), so no separate tail loop is needed
        for (int i = 0, avl; i < n; i += avl) {
            avl = __riscv_vsetvl_e32m8(n - i);
            vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
            // fold all rows into the same accumulator before storing it back once
            for (int k = 0; k < GGML_VEC_MAD_UNROLL; k++) {
                vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[k][i], avl);
                ay = __riscv_vfmadd_vf_f32m8(ax, v[k][0], ay, avl);
            }
            __riscv_vse32_v_f32m8(&y[i], ay, avl);
        }
    #else
        // largest multiple of GGML_F32_STEP that fits in n (the mask assumes the step is a power of two)
        const int np = (n & ~(GGML_F32_STEP - 1));

        // one broadcast vector per row scalar
        GGML_F32_VEC vx[GGML_VEC_MAD_UNROLL];

        for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
            vx[k] = GGML_F32_VEC_SET1(v[k][0]);
        }

        GGML_F32_VEC ax[GGML_VEC_MAD_UNROLL][GGML_F32_ARR];
        GGML_F32_VEC ay[GGML_F32_ARR];

        for (int i = 0; i < np; i += GGML_F32_STEP) {
            for (int j = 0; j < GGML_F32_ARR; j++) {
                // y is loaded and stored once per vector; every row is accumulated in between
                ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);

                for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
                    ax[k][j] = GGML_F32_VEC_LOAD(x[k] + i + j*GGML_F32_EPR);
                    ay[j] = GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]);
                }

                GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
            }
        }

        // leftovers: the last n - np elements, handled with the scalar formula
        for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
            for (int i = np; i < n; ++i) {
                y[i] += x[k][i]*v[k][0];
            }
        }
    #endif
#else
    // scalar
    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
        for (int i = 0; i < n; ++i) {
            y[i] += x[k][i]*v[k][0];
        }
    }
#endif
}
654 | | |
// Scale-and-shift: y[i] = x[i]*s + b for i in [0, n).
// x is only read and y is only written.
inline static void ggml_vec_mad1_f32(const int n, float * y, const float * x, const float s, const float b) {
#if defined(GGML_USE_ACCELERATE)
    vDSP_vsmsa(x, 1, &s, &b, y, 1, n);
#elif defined(GGML_SIMD)
    #if defined(__ARM_FEATURE_SVE)
        // scalar ; TODO: Write SVE code
        for (int i = 0; i < n; ++i) {
            y[i] = x[i]*s + b;
        }
    #elif defined(__riscv_v_intrinsic)
        // each pass handles avl elements (as granted by vsetvl), so no separate tail loop is needed
        for (int i = 0, avl; i < n; i += avl) {
            avl = __riscv_vsetvl_e32m8(n - i);
            vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
            // b is broadcast to a vector so it can serve as the addend of the fused multiply-add
            vfloat32m8_t vb = __riscv_vfmv_v_f_f32m8(b, avl);
            vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, s, vb, avl);
            __riscv_vse32_v_f32m8(&y[i], ny, avl);
        }
    #else
        // largest multiple of GGML_F32_STEP that fits in n (the mask assumes the step is a power of two)
        const int np = (n & ~(GGML_F32_STEP - 1));

        GGML_F32_VEC vs = GGML_F32_VEC_SET1(s);
        GGML_F32_VEC vb = GGML_F32_VEC_SET1(b);

        GGML_F32_VEC ay[GGML_F32_ARR];

        for (int i = 0; i < np; i += GGML_F32_STEP) {
            for (int j = 0; j < GGML_F32_ARR; j++) {
                ay[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
                // presumably vb + ay[j]*vs, matching the scalar leftover formula below - confirm against the macro
                ay[j] = GGML_F32_VEC_FMA(vb, ay[j], vs);

                GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
            }
        }

        // leftovers: the last n - np elements
        for (int i = np; i < n; ++i) {
            y[i] = x[i]*s + b;
        }
    #endif
#else
    // scalar
    for (int i = 0; i < n; ++i) {
        y[i] = x[i]*s + b;
    }
#endif
}
701 | | |
702 | | //inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; } |
// In-place scaling: y[i] *= v for i in [0, n).
inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
#if defined(GGML_USE_ACCELERATE)
    vDSP_vsmul(y, 1, &v, y, 1, n);
#elif defined(GGML_SIMD)
    #if defined(__ARM_FEATURE_SVE)
        const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
        // f32 lanes per vector register: SVE128:4, SVE256:8, SVE512:16
        const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
        // the main loop below is unrolled by two vectors
        const int ggml_f32_step = 2 * ggml_f32_epr;

        GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
        // largest multiple of the step that fits in n (the mask assumes the step is a power of two)
        const int np = (n & ~(ggml_f32_step - 1));
        svfloat32_t ay1;
        svfloat32_t ay2;
        for (int i = 0; i < np; i += ggml_f32_step) {
            ay1 = GGML_F32_VEC_LOAD(y + i);
            ay1 = GGML_F32_VEC_MUL(ay1, vx);
            GGML_F32_VEC_STORE(y + i, ay1);

            ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
            ay2 = GGML_F32_VEC_MUL(ay2, vx);
            GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);
        }
        // leftovers
        // up to 2*ggml_f32_epr - 1 elements can remain, so walk them one vector at a time;
        // the svwhilelt predicate restricts the load, multiply and store to indices below n
        for (int i = np; i < n; i += ggml_f32_epr) {
            svbool_t pg = svwhilelt_b32(i, n);
            ay1 = svld1_f32(pg, y + i);
            ay1 = svmul_f32_m(pg, ay1, vx);
            svst1_f32(pg, y + i, ay1);
        }
    #elif defined(__riscv_v_intrinsic)
        // each pass handles avl elements (as granted by vsetvl), so no separate tail loop is needed
        for (int i = 0, avl; i < n; i += avl) {
            avl = __riscv_vsetvl_e32m8(n - i);
            vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
            vfloat32m8_t ny = __riscv_vfmul_vf_f32m8(ay, v, avl);
            __riscv_vse32_v_f32m8(&y[i], ny, avl);
        }
    #else
        // largest multiple of GGML_F32_STEP that fits in n (the mask assumes the step is a power of two)
        const int np = (n & ~(GGML_F32_STEP - 1));

        GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);

        GGML_F32_VEC ay[GGML_F32_ARR];

        for (int i = 0; i < np; i += GGML_F32_STEP) {
            for (int j = 0; j < GGML_F32_ARR; j++) {
                ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
                ay[j] = GGML_F32_VEC_MUL(ay[j], vx);

                GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
            }
        }

        // leftovers: the last n - np elements
        for (int i = np; i < n; ++i) {
            y[i] *= v;
        }
    #endif
#else
    // scalar
    for (int i = 0; i < n; ++i) {
        y[i] *= v;
    }
#endif
}
768 | | |
769 | 0 | inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) { |
770 | | #if defined(GGML_SIMD) && defined(__ARM_FEATURE_SVE) |
771 | | const int sve_register_length = svcntb() * 8; |
772 | | const int ggml_f16_epr = sve_register_length / 16; |
773 | | const int ggml_f16_step = 2 * ggml_f16_epr; |
774 | | |
775 | | GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v); |
776 | | int np = (n & ~(ggml_f16_step - 1)); |
777 | | svfloat16_t ay1, ay2; |
778 | | |
779 | | for (int i = 0; i < np; i += ggml_f16_step) { |
780 | | ay1 = GGML_F16x_VEC_LOAD(y + i + 0*ggml_f16_epr, 0); |
781 | | ay1 = GGML_F16x_VEC_MUL(ay1, vx); |
782 | | GGML_F16x_VEC_STORE(y + i + 0*ggml_f16_epr, ay1, 0); |
783 | | |
784 | | ay2 = GGML_F16x_VEC_LOAD(y + i + 1*ggml_f16_epr, 1); |
785 | | ay2 = GGML_F16x_VEC_MUL(ay2, vx); |
786 | | GGML_F16x_VEC_STORE(y + i + 1*ggml_f16_epr, ay2, 1); |
787 | | } |
788 | | // leftovers |
789 | | // maximum number of leftover elements will be less that ggmlF_16x_epr. Apply predicated svmad on available elements only |
790 | | if (np < n) { |
791 | | svbool_t pg = svwhilelt_b16(np, n); |
792 | | svfloat16_t hy = svld1_f16(pg, (__fp16 *)(y + np)); |
793 | | svfloat16_t out = svmul_f16_m(pg, hy, vx); |
794 | | svst1_f16(pg, (__fp16 *)(y + np), out); |
795 | | } |
796 | | np = n; |
797 | | #elif defined(__riscv_v_intrinsic) |
798 | | #if defined(__riscv_zvfh) |
799 | | const ggml_fp16_t s = GGML_CPU_FP32_TO_FP16(v); |
800 | | const _Float16 scale = *(const _Float16*)(&s); |
801 | | |
802 | | // calculate step size |
803 | | const int epr = __riscv_vsetvlmax_e16m4(); |
804 | | const int step = epr * 2; |
805 | | int np = (n & ~(step - 1)); |
806 | | |
807 | | // unroll by 2 |
808 | | for (int i = 0; i < np; i += step) { |
809 | | vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, epr); |
810 | | ay0 = __riscv_vfmul_vf_f16m4(ay0, scale, epr); |
811 | | __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, epr); |
812 | | __asm__ __volatile__ ("" ::: "memory"); |
813 | | |
814 | | vfloat16m4_t ay1 = __riscv_vle16_v_f16m4((const _Float16*)y + i + epr, epr); |
815 | | ay1 = __riscv_vfmul_vf_f16m4(ay1, scale, epr); |
816 | | __riscv_vse16_v_f16m4((_Float16*)y + i + epr, ay1, epr); |
817 | | __asm__ __volatile__ ("" ::: "memory"); |
818 | | } |
819 | | |
820 | | // leftovers |
821 | | int vl; |
822 | | for (int i = np; i < n; i += vl) { |
823 | | vl = __riscv_vsetvl_e16m4(n - i); |
824 | | vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, vl); |
825 | | ay0 = __riscv_vfmul_vf_f16m4(ay0, scale, vl); |
826 | | __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, vl); |
827 | | } |
828 | | np = n; |
829 | | #else |
830 | | // fall to scalar path |
831 | | const int np = 0; |
832 | | #endif |
833 | | #elif defined(GGML_SIMD) |
834 | 0 | const int np = (n & ~(GGML_F16_STEP - 1)); |
835 | |
|
836 | 0 | GGML_F16_VEC vx = GGML_F16_VEC_SET1(v); |
837 | |
|
838 | 0 | GGML_F16_VEC ay[GGML_F16_ARR]; |
839 | |
|
840 | 0 | for (int i = 0; i < np; i += GGML_F16_STEP) { |
841 | 0 | for (int j = 0; j < GGML_F16_ARR; j++) { |
842 | 0 | ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); |
843 | 0 | ay[j] = GGML_F16_VEC_MUL(ay[j], vx); |
844 | |
|
845 | 0 | GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j); |
846 | 0 | } |
847 | 0 | } |
848 | | #else |
849 | | // scalar path |
850 | | const int np = 0; |
851 | | #endif |
852 | | // scalar and leftovers |
853 | 0 | for (int i = np; i < n; ++i) { |
854 | 0 | y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v); |
855 | 0 | } |
856 | 0 | } Unexecuted instantiation: ggml-cpu.c:ggml_vec_scale_f16 Unexecuted instantiation: vec.cpp:ggml_vec_scale_f16(int, unsigned short*, float) Unexecuted instantiation: ops.cpp:ggml_vec_scale_f16(int, unsigned short*, float) |
857 | | |
// Calls ggml_vec_dot_f32 with x as both operands so the result lands in *s,
// then replaces *s with its square root.
inline static void ggml_vec_norm_f32(const int n, float * s, const float * x) {
    ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1);
    const float dot_xx = *s;
    *s = sqrtf(dot_xx);
}
// Element-wise square: y[k] = x[k]*x[k] for k in [0, n).
inline static void ggml_vec_sqr_f32(const int n, float * y, const float * x) {
    for (int k = 0; k < n; ++k) {
        const float xk = x[k];
        y[k] = xk*xk;
    }
}
860 | 0 | inline static void ggml_vec_sqr_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { |
861 | 0 | for (int i = 0; i < n; ++i) { |
862 | 0 | float v = GGML_CPU_FP16_TO_FP32(x[i]); |
863 | 0 | y[i] = GGML_CPU_FP32_TO_FP16(v*v); |
864 | 0 | } |
865 | 0 | } Unexecuted instantiation: ggml-cpu.c:ggml_vec_sqr_f16 Unexecuted instantiation: vec.cpp:ggml_vec_sqr_f16(int, unsigned short*, unsigned short const*) Unexecuted instantiation: ops.cpp:ggml_vec_sqr_f16(int, unsigned short*, unsigned short const*) |
866 | 0 | inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }Unexecuted instantiation: ggml-cpu.c:ggml_vec_sqrt_f32 Unexecuted instantiation: vec.cpp:ggml_vec_sqrt_f32(int, float*, float const*) Unexecuted instantiation: ops.cpp:ggml_vec_sqrt_f32(int, float*, float const*) |
867 | 0 | inline static void ggml_vec_sqrt_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { |
868 | 0 | for (int i = 0; i < n; ++i) { |
869 | 0 | y[i] = GGML_CPU_FP32_TO_FP16(sqrtf(GGML_CPU_FP16_TO_FP32(x[i]))); |
870 | 0 | } |
871 | 0 | } Unexecuted instantiation: ggml-cpu.c:ggml_vec_sqrt_f16 Unexecuted instantiation: vec.cpp:ggml_vec_sqrt_f16(int, unsigned short*, unsigned short const*) Unexecuted instantiation: ops.cpp:ggml_vec_sqrt_f16(int, unsigned short*, unsigned short const*) |
872 | 0 | inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); }Unexecuted instantiation: ggml-cpu.c:ggml_vec_log_f32 Unexecuted instantiation: vec.cpp:ggml_vec_log_f32(int, float*, float const*) Unexecuted instantiation: ops.cpp:ggml_vec_log_f32(int, float*, float const*) |
873 | 0 | inline static void ggml_vec_log_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { |
874 | 0 | for (int i = 0; i < n; ++i) { |
875 | 0 | y[i] = GGML_CPU_FP32_TO_FP16(logf(GGML_CPU_FP16_TO_FP32(x[i]))); |
876 | 0 | } |
877 | 0 | } Unexecuted instantiation: ggml-cpu.c:ggml_vec_log_f16 Unexecuted instantiation: vec.cpp:ggml_vec_log_f16(int, unsigned short*, unsigned short const*) Unexecuted instantiation: ops.cpp:ggml_vec_log_f16(int, unsigned short*, unsigned short const*) |
878 | 0 | inline static void ggml_vec_sin_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sinf(x[i]); }Unexecuted instantiation: ggml-cpu.c:ggml_vec_sin_f32 Unexecuted instantiation: vec.cpp:ggml_vec_sin_f32(int, float*, float const*) Unexecuted instantiation: ops.cpp:ggml_vec_sin_f32(int, float*, float const*) |
879 | 0 | inline static void ggml_vec_sin_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { |
880 | 0 | for (int i = 0; i < n; ++i) { |
881 | 0 | y[i] = GGML_CPU_FP32_TO_FP16(sinf(GGML_CPU_FP16_TO_FP32(x[i]))); |
882 | 0 | } |
883 | 0 | } Unexecuted instantiation: ggml-cpu.c:ggml_vec_sin_f16 Unexecuted instantiation: vec.cpp:ggml_vec_sin_f16(int, unsigned short*, unsigned short const*) Unexecuted instantiation: ops.cpp:ggml_vec_sin_f16(int, unsigned short*, unsigned short const*) |
884 | 0 | inline static void ggml_vec_cos_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = cosf(x[i]); }Unexecuted instantiation: ggml-cpu.c:ggml_vec_cos_f32 Unexecuted instantiation: vec.cpp:ggml_vec_cos_f32(int, float*, float const*) Unexecuted instantiation: ops.cpp:ggml_vec_cos_f32(int, float*, float const*) |
885 | 0 | inline static void ggml_vec_cos_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { |
886 | 0 | for (int i = 0; i < n; ++i) { |
887 | 0 | y[i] = GGML_CPU_FP32_TO_FP16(cosf(GGML_CPU_FP16_TO_FP32(x[i]))); |
888 | 0 | } |
889 | 0 | } Unexecuted instantiation: ggml-cpu.c:ggml_vec_cos_f16 Unexecuted instantiation: vec.cpp:ggml_vec_cos_f16(int, unsigned short*, unsigned short const*) Unexecuted instantiation: ops.cpp:ggml_vec_cos_f16(int, unsigned short*, unsigned short const*) |
// Element-wise absolute value: y[k] = fabsf(x[k]) for k in [0, n).
inline static void ggml_vec_abs_f32(const int n, float * y, const float * x) {
    for (int k = 0; k < n; ++k) {
        const float xk = x[k];
        y[k] = fabsf(xk);
    }
}
891 | 0 | inline static void ggml_vec_abs_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { |
892 | 0 | for (int i = 0; i < n; ++i) { |
893 | 0 | y[i] = GGML_CPU_FP32_TO_FP16(fabsf(GGML_CPU_FP16_TO_FP32(x[i]))); |
894 | 0 | } |
895 | 0 | } Unexecuted instantiation: ggml-cpu.c:ggml_vec_abs_f16 Unexecuted instantiation: vec.cpp:ggml_vec_abs_f16(int, unsigned short*, unsigned short const*) Unexecuted instantiation: ops.cpp:ggml_vec_abs_f16(int, unsigned short*, unsigned short const*) |
// Element-wise sign: y[k] is 1 for positive x[k], -1 for negative x[k], and 0
// otherwise (which includes values that compare neither above nor below zero).
inline static void ggml_vec_sgn_f32(const int n, float * y, const float * x) {
    for (int k = 0; k < n; ++k) {
        const float xk = x[k];
        float sign = 0.f;
        if (xk > 0.f) {
            sign = 1.f;
        } else if (xk < 0.f) {
            sign = -1.f;
        }
        y[k] = sign;
    }
}
897 | 0 | inline static void ggml_vec_sgn_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { |
898 | 0 | for (int i = 0; i < n; ++i) { |
899 | 0 | float v = GGML_CPU_FP16_TO_FP32(x[i]); |
900 | 0 | y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? 1.f : ((v < 0.f) ? -1.f : 0.f)); |
901 | 0 | } |
902 | 0 | } Unexecuted instantiation: ggml-cpu.c:ggml_vec_sgn_f16 Unexecuted instantiation: vec.cpp:ggml_vec_sgn_f16(int, unsigned short*, unsigned short const*) Unexecuted instantiation: ops.cpp:ggml_vec_sgn_f16(int, unsigned short*, unsigned short const*) |
// Element-wise unit step: y[k] = 1 when x[k] > 0, otherwise 0.
inline static void ggml_vec_step_f32(const int n, float * y, const float * x) {
    for (int k = 0; k < n; ++k) {
        if (x[k] > 0.f) {
            y[k] = 1.f;
        } else {
            y[k] = 0.f;
        }
    }
}
904 | 0 | inline static void ggml_vec_step_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { |
905 | 0 | for (int i = 0; i < n; ++i) { |
906 | 0 | y[i] = GGML_CPU_FP32_TO_FP16((GGML_CPU_FP16_TO_FP32(x[i]) > 0.f) ? 1.f : 0.f); |
907 | 0 | } |
908 | 0 | } Unexecuted instantiation: ggml-cpu.c:ggml_vec_step_f16 Unexecuted instantiation: vec.cpp:ggml_vec_step_f16(int, unsigned short*, unsigned short const*) Unexecuted instantiation: ops.cpp:ggml_vec_step_f16(int, unsigned short*, unsigned short const*) |
909 | 0 | inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }Unexecuted instantiation: ggml-cpu.c:ggml_vec_tanh_f32 Unexecuted instantiation: vec.cpp:ggml_vec_tanh_f32(int, float*, float const*) Unexecuted instantiation: ops.cpp:ggml_vec_tanh_f32(int, float*, float const*) |
910 | 0 | inline static void ggml_vec_tanh_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { |
911 | 0 | for (int i = 0; i < n; ++i) { |
912 | 0 | y[i] = GGML_CPU_FP32_TO_FP16(tanhf(GGML_CPU_FP16_TO_FP32(x[i]))); |
913 | 0 | } |
914 | 0 | } Unexecuted instantiation: ggml-cpu.c:ggml_vec_tanh_f16 Unexecuted instantiation: vec.cpp:ggml_vec_tanh_f16(int, unsigned short*, unsigned short const*) Unexecuted instantiation: ops.cpp:ggml_vec_tanh_f16(int, unsigned short*, unsigned short const*) |
915 | 0 | inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }Unexecuted instantiation: ggml-cpu.c:ggml_vec_elu_f32 Unexecuted instantiation: vec.cpp:ggml_vec_elu_f32(int, float*, float const*) Unexecuted instantiation: ops.cpp:ggml_vec_elu_f32(int, float*, float const*) |
916 | 0 | inline static void ggml_vec_elu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { |
917 | 0 | for (int i = 0; i < n; ++i) { |
918 | 0 | const float v = GGML_CPU_FP16_TO_FP32(x[i]); |
919 | 0 | y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v : expm1f(v)); |
920 | 0 | } |
921 | 0 | } Unexecuted instantiation: ggml-cpu.c:ggml_vec_elu_f16 Unexecuted instantiation: vec.cpp:ggml_vec_elu_f16(int, unsigned short*, unsigned short const*) Unexecuted instantiation: ops.cpp:ggml_vec_elu_f16(int, unsigned short*, unsigned short const*) |
// Element-wise ReLU: y[k] = x[k] when x[k] > 0, otherwise 0.
inline static void ggml_vec_relu_f32(const int n, float * y, const float * x) {
    for (int k = 0; k < n; ++k) {
        const float xk = x[k];
        if (xk > 0.f) {
            y[k] = xk;
        } else {
            y[k] = 0.f;
        }
    }
}
923 | 0 | inline static void ggml_vec_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { |
924 | 0 | for (int i = 0; i < n; ++i) { |
925 | 0 | float v = GGML_CPU_FP16_TO_FP32(x[i]); |
926 | 0 | y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v : 0.f); |
927 | 0 | } |
928 | 0 | } Unexecuted instantiation: ggml-cpu.c:ggml_vec_relu_f16 Unexecuted instantiation: vec.cpp:ggml_vec_relu_f16(int, unsigned short*, unsigned short const*) Unexecuted instantiation: ops.cpp:ggml_vec_relu_f16(int, unsigned short*, unsigned short const*) |
// Element-wise leaky ReLU: the positive part of x[k] plus ns times its negative part.
inline static void ggml_vec_leaky_relu_f32(const int n, float * y, const float * x, const float ns) {
    for (int k = 0; k < n; ++k) {
        const float xk = x[k];
        const float pos = (xk > 0.f) ? xk : 0.f;
        const float neg = (xk < 0.0f) ? xk : 0.f;
        y[k] = pos + ns * neg;
    }
}
930 | 0 | inline static void ggml_vec_leaky_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const float ns) { |
931 | 0 | for (int i = 0; i < n; ++i) { |
932 | 0 | float v = GGML_CPU_FP16_TO_FP32(x[i]); |
933 | 0 | y[i] = GGML_CPU_FP32_TO_FP16(((v > 0.f) ? v : 0.f) + ns * ((v < 0.0f) ? v : 0.f)); |
934 | 0 | } |
935 | 0 | } Unexecuted instantiation: ggml-cpu.c:ggml_vec_leaky_relu_f16 Unexecuted instantiation: vec.cpp:ggml_vec_leaky_relu_f16(int, unsigned short*, unsigned short const*, float) Unexecuted instantiation: ops.cpp:ggml_vec_leaky_relu_f16(int, unsigned short*, unsigned short const*, float) |
936 | 0 | inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }Unexecuted instantiation: ggml-cpu.c:ggml_vec_sigmoid_f32 Unexecuted instantiation: vec.cpp:ggml_vec_sigmoid_f32(int, float*, float const*) Unexecuted instantiation: ops.cpp:ggml_vec_sigmoid_f32(int, float*, float const*) |
937 | 0 | inline static void ggml_vec_sigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { |
938 | 0 | for (int i = 0; i < n; ++i) { |
939 | 0 | y[i] = GGML_CPU_FP32_TO_FP16(1.f / (1.f + expf(-GGML_CPU_FP16_TO_FP32(x[i])))); |
940 | 0 | } |
941 | 0 | } Unexecuted instantiation: ggml-cpu.c:ggml_vec_sigmoid_f16 Unexecuted instantiation: vec.cpp:ggml_vec_sigmoid_f16(int, unsigned short*, unsigned short const*) Unexecuted instantiation: ops.cpp:ggml_vec_sigmoid_f16(int, unsigned short*, unsigned short const*) |
942 | | // TODO: optimize performance |
943 | 0 | inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }Unexecuted instantiation: ggml-cpu.c:ggml_vec_hardswish_f32 Unexecuted instantiation: vec.cpp:ggml_vec_hardswish_f32(int, float*, float const*) Unexecuted instantiation: ops.cpp:ggml_vec_hardswish_f32(int, float*, float const*) |
944 | 0 | inline static void ggml_vec_hardswish_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { |
945 | 0 | for (int i = 0; i < n; ++i) { |
946 | 0 | float v = GGML_CPU_FP16_TO_FP32(x[i]); |
947 | 0 | y[i] = GGML_CPU_FP32_TO_FP16(v * fminf(1.0f, fmaxf(0.0f, (v + 3.0f) / 6.0f))); |
948 | 0 | } |
949 | 0 | } Unexecuted instantiation: ggml-cpu.c:ggml_vec_hardswish_f16 Unexecuted instantiation: vec.cpp:ggml_vec_hardswish_f16(int, unsigned short*, unsigned short const*) Unexecuted instantiation: ops.cpp:ggml_vec_hardswish_f16(int, unsigned short*, unsigned short const*) |
950 | 0 | inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }Unexecuted instantiation: ggml-cpu.c:ggml_vec_hardsigmoid_f32 Unexecuted instantiation: vec.cpp:ggml_vec_hardsigmoid_f32(int, float*, float const*) Unexecuted instantiation: ops.cpp:ggml_vec_hardsigmoid_f32(int, float*, float const*) |
951 | 0 | inline static void ggml_vec_hardsigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { |
952 | 0 | for (int i = 0; i < n; ++i) { |
953 | 0 | y[i] = GGML_CPU_FP32_TO_FP16(fminf(1.0f, fmaxf(0.0f, (GGML_CPU_FP16_TO_FP32(x[i]) + 3.0f) / 6.0f))); |
954 | 0 | } |
955 | 0 | } Unexecuted instantiation: ggml-cpu.c:ggml_vec_hardsigmoid_f16 Unexecuted instantiation: vec.cpp:ggml_vec_hardsigmoid_f16(int, unsigned short*, unsigned short const*) Unexecuted instantiation: ops.cpp:ggml_vec_hardsigmoid_f16(int, unsigned short*, unsigned short const*) |
956 | 0 | inline static void ggml_vec_exp_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = expf(x[i]); }Unexecuted instantiation: ggml-cpu.c:ggml_vec_exp_f32 Unexecuted instantiation: vec.cpp:ggml_vec_exp_f32(int, float*, float const*) Unexecuted instantiation: ops.cpp:ggml_vec_exp_f32(int, float*, float const*) |
957 | 0 | inline static void ggml_vec_exp_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { |
958 | 0 | for (int i = 0; i < n; ++i) { |
959 | 0 | y[i] = GGML_CPU_FP32_TO_FP16(expf(GGML_CPU_FP16_TO_FP32(x[i]))); |
960 | 0 | } |
961 | 0 | } Unexecuted instantiation: ggml-cpu.c:ggml_vec_exp_f16 Unexecuted instantiation: vec.cpp:ggml_vec_exp_f16(int, unsigned short*, unsigned short const*) Unexecuted instantiation: ops.cpp:ggml_vec_exp_f16(int, unsigned short*, unsigned short const*) |
962 | | |
// Constants shared by the GELU variants below.
// coefficient of the cubic term inside the tanh-based GELU approximation
static const float GELU_COEF_A = 0.044715f;
// multiplier of x inside expf() in the "quick" GELU approximation
static const float GELU_QUICK_COEF = -1.702f;
// sqrt(2/pi)
static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
// 1/sqrt(2)
static const float SQRT_2_INV = 0.70710678118654752440084436210484f;
967 | | |
968 | 65.5k | inline static float ggml_gelu_f32(float x) { |
969 | 65.5k | return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x))); |
970 | 65.5k | } Line | Count | Source | 968 | 65.5k | inline static float ggml_gelu_f32(float x) { | 969 | 65.5k | return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x))); | 970 | 65.5k | } |
Unexecuted instantiation: vec.cpp:ggml_gelu_f32(float) Unexecuted instantiation: ops.cpp:ggml_gelu_f32(float) |
971 | | |
972 | 0 | inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { |
973 | 0 | const uint16_t * i16 = (const uint16_t *) x; |
974 | 0 | for (int i = 0; i < n; ++i) { |
975 | 0 | y[i] = ggml_table_gelu_f16[i16[i]]; |
976 | 0 | } |
977 | 0 | } Unexecuted instantiation: ggml-cpu.c:ggml_vec_gelu_f16 Unexecuted instantiation: vec.cpp:ggml_vec_gelu_f16(int, unsigned short*, unsigned short const*) Unexecuted instantiation: ops.cpp:ggml_vec_gelu_f16(int, unsigned short*, unsigned short const*) |
978 | | |
979 | 0 | inline static void ggml_vec_gelu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { |
980 | 0 | for (int i = 0; i < n; ++i) { |
981 | 0 | float xi = GGML_CPU_FP16_TO_FP32(x[i]); |
982 | 0 | float res = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV)); |
983 | 0 | y[i] = GGML_CPU_FP32_TO_FP16(res); |
984 | 0 | } |
985 | 0 | } Unexecuted instantiation: ggml-cpu.c:ggml_vec_gelu_erf_f16 Unexecuted instantiation: vec.cpp:ggml_vec_gelu_erf_f16(int, unsigned short*, unsigned short const*) Unexecuted instantiation: ops.cpp:ggml_vec_gelu_erf_f16(int, unsigned short*, unsigned short const*) |
986 | | |
#ifdef GGML_GELU_FP16
// GELU on an f32 vector through the f16 lookup table.
// Inputs <= -10 give 0 and inputs >= 10 pass through unchanged; everything else
// is rounded to f16, looked up in ggml_table_gelu_f16 by bit pattern, and widened back.
inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
    uint16_t idx;
    for (int k = 0; k < n; ++k) {
        if (x[k] <= -10.0f) {
            y[k] = 0.0f;
            continue;
        }
        if (x[k] >= 10.0f) {
            y[k] = x[k];
            continue;
        }
        ggml_fp16_t half = GGML_CPU_FP32_TO_FP16(x[k]);
        // copy the raw bits out instead of casting, to obtain the table index
        memcpy(&idx, &half, sizeof(uint16_t));
        y[k] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[idx]);
    }
}
#else
// GELU on an f32 vector, computed element by element with ggml_gelu_f32.
inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
    for (int k = 0; k < n; ++k) {
        const float xk = x[k];
        y[k] = ggml_gelu_f32(xk);
    }
}
#endif
1009 | | |
1010 | 0 | inline static void ggml_vec_gelu_erf_f32(const int n, float * y, const float * x) { |
1011 | 0 | for (int i = 0; i < n; ++i) { |
1012 | 0 | float xi = x[i]; |
1013 | 0 | y[i] = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV)); |
1014 | 0 | } |
1015 | 0 | } Unexecuted instantiation: ggml-cpu.c:ggml_vec_gelu_erf_f32 Unexecuted instantiation: vec.cpp:ggml_vec_gelu_erf_f32(int, float*, float const*) Unexecuted instantiation: ops.cpp:ggml_vec_gelu_erf_f32(int, float*, float const*) |
1016 | | |
1017 | 65.5k | inline static float ggml_gelu_quick_f32(float x) { |
1018 | 65.5k | return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x))); |
1019 | 65.5k | } ggml-cpu.c:ggml_gelu_quick_f32 Line | Count | Source | 1017 | 65.5k | inline static float ggml_gelu_quick_f32(float x) { | 1018 | 65.5k | return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x))); | 1019 | 65.5k | } |
Unexecuted instantiation: vec.cpp:ggml_gelu_quick_f32(float) Unexecuted instantiation: ops.cpp:ggml_gelu_quick_f32(float) |
1020 | | |
1021 | 0 | inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { |
1022 | 0 | const uint16_t * i16 = (const uint16_t *) x; |
1023 | 0 | for (int i = 0; i < n; ++i) { |
1024 | 0 | y[i] = ggml_table_gelu_quick_f16[i16[i]]; |
1025 | 0 | } |
1026 | 0 | } Unexecuted instantiation: ggml-cpu.c:ggml_vec_gelu_quick_f16 Unexecuted instantiation: vec.cpp:ggml_vec_gelu_quick_f16(int, unsigned short*, unsigned short const*) Unexecuted instantiation: ops.cpp:ggml_vec_gelu_quick_f16(int, unsigned short*, unsigned short const*) |
1027 | | |
1028 | | #ifdef GGML_GELU_QUICK_FP16 |
1029 | 0 | inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) { |
1030 | 0 | uint16_t t; |
1031 | 0 | for (int i = 0; i < n; ++i) { |
1032 | 0 | ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]); |
1033 | 0 | memcpy(&t, &fp16, sizeof(uint16_t)); |
1034 | 0 | y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]); |
1035 | 0 | } |
1036 | 0 | } Unexecuted instantiation: ggml-cpu.c:ggml_vec_gelu_quick_f32 Unexecuted instantiation: vec.cpp:ggml_vec_gelu_quick_f32(int, float*, float const*) Unexecuted instantiation: ops.cpp:ggml_vec_gelu_quick_f32(int, float*, float const*) |
1037 | | #else |
// Direct variant (GGML_GELU_QUICK_FP16 not defined): applies ggml_gelu_quick_f32 to
// every element of x and stores the result in y.
inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
    const float * src = x;
    float       * dst = y;
    for (int left = n; left > 0; --left) {
        *dst++ = ggml_gelu_quick_f32(*src++);
    }
}
1043 | | #endif |
1044 | | |
1045 | | // Sigmoid Linear Unit (SiLU) function |
1046 | 0 | inline static float ggml_silu_f32(float x) { |
1047 | 0 | return x/(1.0f + expf(-x)); |
1048 | 0 | } Unexecuted instantiation: ggml-cpu.c:ggml_silu_f32 Unexecuted instantiation: vec.cpp:ggml_silu_f32(float) Unexecuted instantiation: ops.cpp:ggml_silu_f32(float) |
1049 | 0 | inline static ggml_fp16_t ggml_silu_f16(ggml_fp16_t x) { |
1050 | 0 | float v = GGML_CPU_FP16_TO_FP32(x); |
1051 | 0 | return GGML_CPU_FP32_TO_FP16(v/(1.0f + expf(-v))); |
1052 | 0 | } Unexecuted instantiation: ggml-cpu.c:ggml_silu_f16 Unexecuted instantiation: vec.cpp:ggml_silu_f16(unsigned short) Unexecuted instantiation: ops.cpp:ggml_silu_f16(unsigned short) |
1053 | | |
1054 | | #if __FINITE_MATH_ONLY__ |
1055 | | #error "some routines in ggml.c require non-finite math arithmetics -- pass -fno-finite-math-only to the compiler to fix" |
1056 | | #error "ref: https://github.com/ggml-org/llama.cpp/pull/7154#issuecomment-2143844461" |
1057 | | #endif |
1058 | | |
1059 | | /* Below function was borrowed from the GitHub repository: |
1060 | | https://github.com/openvinotoolkit/openvino/blob/master/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp */ |
1061 | | #if defined(__ARM_FEATURE_SVE) && defined(__aarch64__) |
// Vector exp(x) approximation for SVE built around the FEXPA instruction (svexpa).
// Outline, following the inline notes below:
//   y = x * log2(e) is split into n = floor(y) and a = y - n;
//   b = a + 1 provides the FEXPA operand (its bit pattern shifted right by 17);
//   z, the part of b held in its low 17 bits, is corrected with a quadratic polynomial;
//   the product is finally scaled by 2^n.
// pg is the governing predicate; the arithmetic uses the merging (_m) intrinsic forms.
// NOTE(review): `ln2` and `half_ln2_sq` are not the exact values of ln(2) and ln(2)^2/2
// (0.69314718..., 0.24022650...) -- presumably fitted polynomial coefficients; confirm
// against the OpenVINO original before changing them.
inline static svfloat32_t exp_ps_sve(svbool_t pg, svfloat32_t src) {
    // Constants
    const svfloat32_t log2_e = svdup_n_f32(1.4426950409f);
    const svfloat32_t ln2 = svdup_n_f32(0.6931473921f);
    const svfloat32_t half_ln2_sq = svdup_n_f32(0.2413862043f);
    const svuint32_t not_mask17 = svdup_n_u32(~((1u << 17) - 1)); // keeps everything except the low 17 bits
    const svfloat32_t one = svdup_n_f32(1.0f);
    const svfloat32_t inactive1 = svdup_n_f32(0.0f); // value supplied for inactive lanes of the float rounding
    const svint32_t inactive2 = svdup_n_s32(0);      // value supplied for inactive lanes of the int conversion

    // Algorithm starts here
    svfloat32_t t0 = svmul_f32_m(pg, src, log2_e); // y = x * log2(e)
    svfloat32_t t1 = svrintm_f32_m(inactive1, pg, t0); // floor(y), still held as a float
    svint32_t t2 = svcvt_s32_f32_m(inactive2, pg, t1); // n

    t1 = svsub_f32_m(pg, t0, t1); // a = y - floor(y)
    t1 = svadd_f32_m(pg, t1, one); // b = a + 1

    svuint32_t t3 = svlsr_n_u32_m(pg, svreinterpret_u32_f32(t1), 17); // v = b >> 17 (u32)
    svfloat32_t t4 = svexpa_f32(t3); // c = fexpa(v)
    t4 = svscale_f32_m(pg, t4, t2); // fexpa(v) * 2^(n)

    // and_(t2.d, t1.d, not_mask17.d)
    svfloat32_t t5 = svreinterpret_f32_u32(svand_u32_m(pg, svreinterpret_u32_f32(t1), not_mask17));
    t5 = svsub_f32_m(pg, t1, t5); // z
    t0 = svmla_f32_m(pg, ln2, t5, half_ln2_sq); // ln2 + half_ln2_sq * z
    t0 = svmla_f32_m(pg, one, t5, t0); // 1 + (ln2 * z) + (half_ln2_sq * z * z)
    t0 = svmul_f32_m(pg, t0, t4); // Final result

    return t0;
}
1093 | | #endif |
1094 | | |
1095 | | #if defined(__ARM_FEATURE_SVE) && defined(__aarch64__) |
1096 | | |
// SVE vector expf(x) under predicate pg.
// Same scheme as the other ggml_v_expf variants in this file:
//   r  - shift constant 1.5*2^23; z = r + x*log2(e) leaves the rounded integer n in the
//        low mantissa bits of z, and n = z - r recovers it as a float
//   b  - reduced argument x - n*ln(2), with ln(2) subtracted in two parts
//   e  - bits of z shifted left by 23, i.e. n moved into the exponent field
//   k  - e added to the bit pattern of 1.0f (2^n while n is in the normal exponent range)
//   j  - degree-5 polynomial in b without constant term (the exp(b) - 1 part)
//   c  - lanes with |n| > 126, where k alone cannot represent the scale
// Unlike the NEON/x86 versions there is no early-out: all three candidate results are
// computed and chosen per lane with svsel.
inline static svfloat32_t ggml_v_expf(svbool_t pg, svfloat32_t x) {
    const svfloat32_t r = svdup_n_f32_x(pg, 0x1.8p23f);
    const svfloat32_t z = svmla_n_f32_x(pg, r, x, 0x1.715476p+0f);
    const svfloat32_t n = svsub_f32_x(pg, z, r);
    const svfloat32_t b = svmls_n_f32_x(pg, svmls_n_f32_x(pg, x, n, 0x1.62e4p-1f), n, 0x1.7f7d1cp-20f);
    const svuint32_t e = svlsl_n_u32_x(pg, svreinterpret_u32_f32(z), 23);
    const svfloat32_t k = svreinterpret_f32_u32(svadd_u32_x(pg, e, svreinterpret_u32_f32(svdup_n_f32_x(pg, 1))));
    const svbool_t c = svacgt_n_f32(pg, n, 126);
    const svfloat32_t u = svmul_f32_x(pg, b, b);
    const svfloat32_t j = svmla_f32_x(pg,
        svmul_n_f32_x(pg, b, 0x1.ffffecp-1f),
        svmla_f32_x(pg, svmla_f32_x(pg, svdup_n_f32_x(pg, 0x1.fffdb6p-2f), svdup_n_f32_x(pg, 0x1.555e66p-3f), b),
                    svmla_f32_x(pg, svdup_n_f32_x(pg, 0x1.573e2ep-5f), svdup_n_f32_x(pg, 0x1.0e4020p-7f), b), u), u);
    // d is 0x82000000 where n <= 0 and 0 elsewhere, so s1 has the bit pattern
    // 0x7f000000 (2^127) for n > 0 and 0x01000000 (2^-125) for n <= 0;
    // s2 is the remaining part of the scale (e with the same bias removed)
    const svuint32_t d = svdup_n_u32_z(svcmple_n_f32(pg, n, 0.0), 0x82000000);
    const svfloat32_t s1 = svreinterpret_f32_u32(svadd_n_u32_x(pg, d, 0x7f000000));
    const svfloat32_t s2 = svreinterpret_f32_u32(svsub_u32_x(pg, e, d));
    // |n| > 192: s1*s1 (overflows to inf or underflows to 0);
    // |n| > 126: (s2 + s2*j)*s1, the scale applied in two steps;
    // otherwise: k + k*j
    return svsel_f32(svacgt_f32(pg, n, svdup_n_f32_x(pg, 192)), svmul_f32_x(pg, s1, s1),
                     svsel_f32(c, svmul_f32_x(pg, svmla_f32_x(pg, s2, s2, j), s1), svmla_f32_x(pg, k, k, j)));
}
1116 | | |
1117 | | // computes silu x/(1+exp(-x)) in single precision vector |
1118 | | inline static svfloat32_t ggml_v_silu(svbool_t pg, svfloat32_t x) { |
1119 | | const svfloat32_t one = svdup_n_f32_x(pg, 1.0f); |
1120 | | const svfloat32_t zero = svdup_n_f32_x(pg, 0.0f); |
1121 | | const svfloat32_t neg_x = svsub_f32_x(pg, zero, x); |
1122 | | const svfloat32_t exp_neg_x = ggml_v_expf(pg, neg_x); |
1123 | | const svfloat32_t one_plus_exp_neg_x = svadd_f32_x(pg, one, exp_neg_x); |
1124 | | return svdiv_f32_x(pg, x, one_plus_exp_neg_x); |
1125 | | } |
1126 | | |
1127 | | #elif defined(__ARM_NEON) && defined(__aarch64__) |
1128 | | |
// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
//
// NEON (4 x f32) expf. Local names:
//   r  - shift constant 1.5*2^23; z = r + x*log2(e) leaves the rounded integer n in the
//        low mantissa bits of z, and n = z - r recovers it as a float
//   b  - reduced argument x - n*ln(2), with ln(2) subtracted in two parts
//   e  - bits of z shifted left by 23, i.e. n moved into the exponent field
//   k  - e added to the bit pattern of 1.0f (2^n while n is in the normal exponent range)
//   c  - per-lane mask of |n| > 126, where k alone cannot represent the scale
//   j  - degree-5 polynomial in b without constant term (the exp(b) - 1 part)
inline static float32x4_t ggml_v_expf(float32x4_t x) {
    const float32x4_t r = vdupq_n_f32(0x1.8p23f);
    const float32x4_t z = vfmaq_f32(r, x, vdupq_n_f32(0x1.715476p+0f));
    const float32x4_t n = vsubq_f32(z, r);
    const float32x4_t b = vfmsq_f32(vfmsq_f32(x, n, vdupq_n_f32(0x1.62e4p-1f)), n,
                                    vdupq_n_f32(0x1.7f7d1cp-20f));
    const uint32x4_t e = vshlq_n_u32(vreinterpretq_u32_f32(z), 23);
    const float32x4_t k = vreinterpretq_f32_u32(vaddq_u32(e, vreinterpretq_u32_f32(vdupq_n_f32(1))));
    const uint32x4_t c = vcagtq_f32(n, vdupq_n_f32(126));
    const float32x4_t u = vmulq_f32(b, b);
    const float32x4_t j = vfmaq_f32(
        vmulq_f32(vdupq_n_f32(0x1.ffffecp-1f), b),
        vfmaq_f32(vfmaq_f32(vdupq_n_f32(0x1.fffdb6p-2f), vdupq_n_f32(0x1.555e66p-3f), b),
                  vfmaq_f32(vdupq_n_f32(0x1.573e2ep-5f), vdupq_n_f32(0x1.0e4020p-7f), b), u), u);
    // fast path: no lane is flagged in c, so k + j*k is the answer for the whole vector
    if (!vpaddd_u64(vreinterpretq_u64_u32(c)))
        return vfmaq_f32(k, j, k);
    // d is 0x82000000 where n <= 0 and 0 elsewhere, so s1 has the bit pattern
    // 0x7f000000 (2^127) for n > 0 and 0x01000000 (2^-125) for n <= 0;
    // s2 is the remaining part of the scale (e with the same bias removed)
    const uint32x4_t d = vandq_u32(vclezq_f32(n), vdupq_n_u32(0x82000000));
    const float32x4_t s1 = vreinterpretq_f32_u32(vaddq_u32(d, vdupq_n_u32(0x7f000000)));
    const float32x4_t s2 = vreinterpretq_f32_u32(vsubq_u32(e, d));
    // |n| > 192: s1*s1 (overflows to inf or underflows to 0);
    // |n| > 126: (s2 + s2*j)*s1, the scale applied in two steps;
    // otherwise: k + k*j
    return vbslq_f32(vcagtq_f32(n, vdupq_n_f32(192)), vmulq_f32(s1, s1),
                     vbslq_f32(c, vmulq_f32(vfmaq_f32(s2, s2, j), s1), vfmaq_f32(k, k, j)));
}
1155 | | |
1156 | | // computes silu x/(1+exp(-x)) in single precision vector |
1157 | | inline static float32x4_t ggml_v_silu(float32x4_t x) { |
1158 | | const float32x4_t one = vdupq_n_f32(1.0f); |
1159 | | const float32x4_t zero = vdupq_n_f32(0.0f); |
1160 | | const float32x4_t neg_x = vsubq_f32(zero, x); |
1161 | | const float32x4_t exp_neg_x = ggml_v_expf(neg_x); |
1162 | | const float32x4_t one_plus_exp_neg_x = vaddq_f32(one, exp_neg_x); |
1163 | | return vdivq_f32(x, one_plus_exp_neg_x); |
1164 | | } |
1165 | | |
1166 | | #elif defined(__AVX512F__) && defined(__AVX512DQ__) |
1167 | | |
// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
//
// AVX-512 (16 x f32) expf. Local names:
//   r  - shift constant 1.5*2^23; z = x*log2(e) + r, and n = z - r is the rounded integer
//        multiple as a float
//   b  - reduced argument x - n*ln(2), with ln(2) subtracted in two parts
//   d  - lane mask of |n| > 192 (results that are replaced by 0 or infinity below)
//   j  - degree-5 polynomial in b; unlike the other variants it includes the constant
//        term 1.0, so it approximates exp(b) itself
// The power-of-two scaling is done by _mm512_scalef_ps instead of exponent-bit arithmetic.
inline static __m512 ggml_v_expf(__m512 x) {
    const __m512 r = _mm512_set1_ps(0x1.8p23f);
    const __m512 z = _mm512_fmadd_ps(x, _mm512_set1_ps(0x1.715476p+0f), r);
    const __m512 n = _mm512_sub_ps(z, r);
    const __m512 b =
        _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f),
                         _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x));
    const __mmask16 d =
        _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(192), _CMP_GT_OQ);
    const __m512 u = _mm512_mul_ps(b, b);
    const __m512 j = _mm512_fmadd_ps(
        _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b,
                                        _mm512_set1_ps(0x1.573e2ep-5f)),
                        u,
                        _mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b,
                                        _mm512_set1_ps(0x1.fffdb6p-2f))),
        u,
        _mm512_fmadd_ps(_mm512_set1_ps(0x1.ffffecp-1f), b, _mm512_set1_ps(1.0F)));
    const __m512 res = _mm512_scalef_ps(j, n);
    // fast path: no lane has |n| > 192
    if (_mm512_kortestz(d, d))
        return res;
    const __m512 zero = _mm512_setzero_ps();
    // replacement for the flagged lanes: 0 where n <= 0, +infinity otherwise
    const __m512 alt = _mm512_mask_blend_ps(
        _mm512_cmp_ps_mask(n, zero, _CMP_LE_OQ), _mm512_set1_ps(INFINITY), zero);
    return _mm512_mask_blend_ps(d, res, alt);
}
1198 | | |
1199 | | // computes silu x/(1+exp(-x)) in single precision vector |
1200 | | inline static __m512 ggml_v_silu(__m512 x) { |
1201 | | const __m512 one = _mm512_set1_ps(1); |
1202 | | const __m512 zero = _mm512_setzero_ps(); |
1203 | | const __m512 neg_x = _mm512_sub_ps(zero, x); |
1204 | | const __m512 exp_neg_x = ggml_v_expf(neg_x); |
1205 | | const __m512 one_plus_exp_neg_x = _mm512_add_ps(one, exp_neg_x); |
1206 | | return _mm512_div_ps(x, one_plus_exp_neg_x); |
1207 | | } |
1208 | | |
1209 | | #elif defined(__AVX2__) && defined(__FMA__) |
1210 | | |
// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
//
// AVX2 + FMA (8 x f32) expf. Local names:
//   r  - shift constant 1.5*2^23; z = x*log2(e) + r leaves the rounded integer n in the
//        low mantissa bits of z, and n = z - r recovers it as a float
//   b  - reduced argument x - n*ln(2), with ln(2) subtracted in two parts
//   e  - bits of z shifted left by 23, i.e. n moved into the exponent field
//   k  - e added to the bit pattern of 1.0f (2^n while n is in the normal exponent range)
//   c  - per-lane mask of |n| > 126 (|n| is n with the sign bit cleared)
//   j  - degree-5 polynomial in b without constant term (the exp(b) - 1 part)
inline static __m256 ggml_v_expf(__m256 x) {
    const __m256 r = _mm256_set1_ps(0x1.8p23f);
    const __m256 z = _mm256_fmadd_ps(x, _mm256_set1_ps(0x1.715476p+0f), r);
    const __m256 n = _mm256_sub_ps(z, r);
    const __m256 b = _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.7f7d1cp-20f),
                                      _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.62e4p-1f), x));
    const __m256i e = _mm256_slli_epi32(_mm256_castps_si256(z), 23);
    const __m256 k = _mm256_castsi256_ps(
        _mm256_add_epi32(e, _mm256_castps_si256(_mm256_set1_ps(1))));
    const __m256i c = _mm256_castps_si256(
        _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
                      _mm256_set1_ps(126), _CMP_GT_OQ));
    const __m256 u = _mm256_mul_ps(b, b);
    const __m256 j = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_set1_ps(0x1.0e4020p-7f), b,
                                                                     _mm256_set1_ps(0x1.573e2ep-5f)), u,
                                                     _mm256_fmadd_ps(_mm256_set1_ps(0x1.555e66p-3f), b,
                                                                     _mm256_set1_ps(0x1.fffdb6p-2f))),
                                     u, _mm256_mul_ps(_mm256_set1_ps(0x1.ffffecp-1f), b));
    // fast path: no lane is flagged in c, so j*k + k is the answer for the whole vector
    if (!_mm256_movemask_ps(_mm256_castsi256_ps(c)))
        return _mm256_fmadd_ps(j, k, k);
    // g is 0x82000000 where n <= 0 and 0 elsewhere, so s1 has the bit pattern
    // 0x7f000000 (2^127) for n > 0 and 0x01000000 (2^-125) for n <= 0;
    // s2 is the remaining part of the scale (e with the same bias removed)
    const __m256i g = _mm256_and_si256(
        _mm256_castps_si256(_mm256_cmp_ps(n, _mm256_setzero_ps(), _CMP_LE_OQ)),
        _mm256_set1_epi32(0x82000000u));
    const __m256 s1 =
        _mm256_castsi256_ps(_mm256_add_epi32(g, _mm256_set1_epi32(0x7f000000u)));
    const __m256 s2 = _mm256_castsi256_ps(_mm256_sub_epi32(e, g));
    // d: per-lane mask of |n| > 192
    const __m256i d = _mm256_castps_si256(
        _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
                      _mm256_set1_ps(192), _CMP_GT_OQ));
    // bitwise select: d lanes take s1*s1 (overflows to inf or underflows to 0),
    // c lanes take (s2*j + s2)*s1, all remaining lanes take k*j + k
    return _mm256_or_ps(
        _mm256_and_ps(_mm256_castsi256_ps(d), _mm256_mul_ps(s1, s1)),
        _mm256_andnot_ps(
            _mm256_castsi256_ps(d),
            _mm256_or_ps(
                _mm256_and_ps(_mm256_castsi256_ps(c),
                              _mm256_mul_ps(_mm256_fmadd_ps(s2, j, s2), s1)),
                _mm256_andnot_ps(_mm256_castsi256_ps(c), _mm256_fmadd_ps(k, j, k)))));
}
1253 | | |
1254 | | // computes silu x/(1+exp(-x)) in single precision vector |
1255 | 0 | inline static __m256 ggml_v_silu(__m256 x) { |
1256 | 0 | const __m256 one = _mm256_set1_ps(1); |
1257 | 0 | const __m256 zero = _mm256_setzero_ps(); |
1258 | 0 | const __m256 neg_x = _mm256_sub_ps(zero, x); |
1259 | 0 | const __m256 exp_neg_x = ggml_v_expf(neg_x); |
1260 | 0 | const __m256 one_plus_exp_neg_x = _mm256_add_ps(one, exp_neg_x); |
1261 | 0 | return _mm256_div_ps(x, one_plus_exp_neg_x); |
1262 | 0 | } Unexecuted instantiation: ggml-cpu.c:ggml_v_silu Unexecuted instantiation: vec.cpp:ggml_v_silu(float __vector(8)) Unexecuted instantiation: ops.cpp:ggml_v_silu(float __vector(8)) |
1263 | | |
1264 | | #elif defined(__SSE2__) // __AVX2__ / __ARM_NEON |
1265 | | |
// Multiply-add helpers for the 128-bit path:
//   MADD128(x, y, z)  ->  x*y + z
//   NMADD128(x, y, z) ->  z - x*y
// With __FMA__ they map to the fused intrinsics; otherwise they expand to a separate
// multiply followed by an add/subtract.
#if defined(__FMA__)
#define MADD128(x, y, z) _mm_fmadd_ps(x, y, z)
#define NMADD128(x, y, z) _mm_fnmadd_ps(x, y, z)
#else
#define MADD128(x, y, z) _mm_add_ps(_mm_mul_ps(x, y), z)
#define NMADD128(x, y, z) _mm_sub_ps(z, _mm_mul_ps(x, y))
#endif
1273 | | |
// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
//
// SSE2 (4 x f32) expf; multiply-adds go through MADD128/NMADD128. Local names:
//   r  - shift constant 1.5*2^23; z = x*log2(e) + r leaves the rounded integer n in the
//        low mantissa bits of z, and n = z - r recovers it as a float
//   b  - reduced argument x - n*ln(2), with ln(2) subtracted in two parts
//   e  - bits of z shifted left by 23, i.e. n moved into the exponent field
//   k  - e added to the bit pattern of 1.0f (2^n while n is in the normal exponent range)
//   c  - per-lane mask of |n| > 126 (|n| is n with the sign bit cleared)
//   j  - degree-5 polynomial in b without constant term (the exp(b) - 1 part)
inline static __m128 ggml_v_expf(__m128 x) {
    const __m128 r = _mm_set1_ps(0x1.8p23f);
    const __m128 z = MADD128(x, _mm_set1_ps(0x1.715476p+0f), r);
    const __m128 n = _mm_sub_ps(z, r);
    const __m128 b =
        NMADD128(n, _mm_set1_ps(0x1.7f7d1cp-20f), NMADD128(n, _mm_set1_ps(0x1.62e4p-1f), x));
    const __m128i e = _mm_slli_epi32(_mm_castps_si128(z), 23);
    const __m128 k = _mm_castsi128_ps(_mm_add_epi32(e, _mm_castps_si128(_mm_set1_ps(1))));
    const __m128i c =
        _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(126)));
    const __m128 u = _mm_mul_ps(b, b);
    const __m128 j =
        MADD128(MADD128(MADD128(_mm_set1_ps(0x1.0e4020p-7f), b, _mm_set1_ps(0x1.573e2ep-5f)), u,
                        MADD128(_mm_set1_ps(0x1.555e66p-3f), b, _mm_set1_ps(0x1.fffdb6p-2f))),
                u, _mm_mul_ps(_mm_set1_ps(0x1.ffffecp-1f), b));
    // fast path: no lane is flagged in c, so j*k + k is the answer for the whole vector
    if (!_mm_movemask_epi8(c))
        return MADD128(j, k, k);
    // g is 0x82000000 where n <= 0 and 0 elsewhere, so s1 has the bit pattern
    // 0x7f000000 (2^127) for n > 0 and 0x01000000 (2^-125) for n <= 0;
    // s2 is the remaining part of the scale (e with the same bias removed)
    const __m128i g = _mm_and_si128(_mm_castps_si128(_mm_cmple_ps(n, _mm_setzero_ps())),
                                    _mm_set1_epi32(0x82000000u));
    const __m128 s1 = _mm_castsi128_ps(_mm_add_epi32(g, _mm_set1_epi32(0x7f000000u)));
    const __m128 s2 = _mm_castsi128_ps(_mm_sub_epi32(e, g));
    // d: per-lane mask of |n| > 192
    const __m128i d =
        _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(192)));
    // bitwise select: d lanes take s1*s1 (overflows to inf or underflows to 0),
    // c lanes take (s2*j + s2)*s1, all remaining lanes take k*j + k
    return _mm_or_ps(
        _mm_and_ps(_mm_castsi128_ps(d), _mm_mul_ps(s1, s1)),
        _mm_andnot_ps(_mm_castsi128_ps(d),
                      _mm_or_ps(_mm_and_ps(_mm_castsi128_ps(c), _mm_mul_ps(MADD128(s2, j, s2), s1)),
                                _mm_andnot_ps(_mm_castsi128_ps(c), MADD128(k, j, k)))));
}
1307 | | |
1308 | | // computes silu x/(1+exp(-x)) in single precision vector |
1309 | | inline static __m128 ggml_v_silu(__m128 x) { |
1310 | | const __m128 one = _mm_set1_ps(1); |
1311 | | const __m128 zero = _mm_setzero_ps(); |
1312 | | const __m128 neg_x = _mm_sub_ps(zero, x); |
1313 | | const __m128 exp_neg_x = ggml_v_expf(neg_x); |
1314 | | const __m128 one_plus_exp_neg_x = _mm_add_ps(one, exp_neg_x); |
1315 | | return _mm_div_ps(x, one_plus_exp_neg_x); |
1316 | | } |
1317 | | |
1318 | | #elif defined(__riscv_v_intrinsic) |
1319 | | |
// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
//
// RISC-V vector (f32, LMUL=2) expf over vl elements. Local names:
//   r  - shift constant 1.5*2^23; z = r + log2(e)*x leaves the rounded integer n in the
//        low mantissa bits of z, and n = z - r recovers it as a float
//   b  - reduced argument x - n*ln(2), with ln(2) subtracted in two parts
//   e  - bits of z shifted left by 23, i.e. n moved into the exponent field
//   k  - e plus 0x3f800000 (2^n while n is in the normal exponent range)
//   c  - mask of elements with |n| > 126, where k alone cannot represent the scale
//   j  - degree-5 polynomial in b without constant term (the exp(b) - 1 part)
inline static vfloat32m2_t ggml_v_expf_m2(vfloat32m2_t x, int vl) {
    const vfloat32m2_t r = __riscv_vfmv_v_f_f32m2(0x1.8p23f, vl);
#ifdef __riscv_xtheadvector
    // workaround for compiler bug (gcc 14.3.0: Error: unrecognized opcode `th.vmv1r.v v2,v4')
    vfloat32m2_t z = __riscv_vfadd_vf_f32m2(r, 0.0f, vl);
    z = __riscv_vfmacc_vf_f32m2(z, 0x1.715476p+0f, x, vl);
#else
    const vfloat32m2_t z = __riscv_vfmacc_vf_f32m2(r, 0x1.715476p+0f, x, vl);
#endif
    const vfloat32m2_t n = __riscv_vfsub_vv_f32m2(z, r, vl);
    const vfloat32m2_t b = __riscv_vfnmsac_vf_f32m2(__riscv_vfnmsac_vf_f32m2(x, 0x1.62e4p-1f, n, vl),
                                                    0x1.7f7d1cp-20f, n, vl);
    const vuint32m2_t e = __riscv_vsll_vx_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(z), 23, vl);
    const vfloat32m2_t k = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vadd_vx_u32m2(e, 0x3f800000, vl)); // 1.0f
    const vbool16_t c = __riscv_vmfgt_vf_f32m2_b16(__riscv_vfabs_v_f32m2(n, vl), 126.0f, vl);
    const vfloat32m2_t u = __riscv_vfmul_vv_f32m2(b, b, vl);
    const vfloat32m2_t j = __riscv_vfmacc_vv_f32m2(
        __riscv_vfmul_vf_f32m2(b, 0x1.ffffecp-1f, vl),
        __riscv_vfmacc_vv_f32m2(
            __riscv_vfmacc_vf_f32m2(__riscv_vfmv_v_f_f32m2(0x1.fffdb6p-2f, vl), 0x1.555e66p-3f, b, vl),
            __riscv_vfmacc_vf_f32m2(__riscv_vfmv_v_f_f32m2(0x1.573e2ep-5f, vl), 0x1.0e4020p-7f, b, vl),
            u, vl), u, vl);
    // fast path: no element is flagged in c, so k + j*k is the answer for the whole vector
    if (!__riscv_vcpop_m_b16(c, vl))
        return __riscv_vfmacc_vv_f32m2(k, j, k, vl);
    // d is 0x82000000 where n <= 0 and 0 elsewhere, so s1 has the bit pattern
    // 0x7f000000 (2^127) for n > 0 and 0x01000000 (2^-125) for n <= 0;
    // s2 is the remaining part of the scale (e with the same bias removed)
    const vbool16_t dm = __riscv_vmfle_vf_f32m2_b16(n, 0.0f, vl);
    const vuint32m2_t d = __riscv_vmerge_vxm_u32m2(__riscv_vmv_v_x_u32m2(0, vl), 0x82000000, dm, vl);
    const vfloat32m2_t s1 = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vadd_vx_u32m2(d, 0x7f000000, vl));
    const vfloat32m2_t s2 = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vsub_vv_u32m2(e, d, vl));
    // r1: k + k*j by default, (s2 + s2*j)*s1 for the elements flagged in c
    const vfloat32m2_t r1 = __riscv_vmerge_vvm_f32m2(
        __riscv_vfmacc_vv_f32m2(k, k, j, vl),
        __riscv_vfmul_vv_f32m2(__riscv_vfmacc_vv_f32m2(s2, s2, j, vl), s1, vl),
        c, vl);
    // elements with |n| > 192 take s1*s1 instead (overflows to inf or underflows to 0)
    return __riscv_vmerge_vvm_f32m2(
        r1, __riscv_vfmul_vv_f32m2(s1, s1, vl),
        __riscv_vmfgt_vf_f32m2_b16(__riscv_vfabs_v_f32m2(n, vl), 192.0f, vl),
        vl);
}
1361 | | |
1362 | | // computes silu x/(1+exp(-x)) in single precision vector |
1363 | | inline static vfloat32m2_t ggml_v_silu_m2(vfloat32m2_t x, int vl) { |
1364 | | const vfloat32m2_t neg_x = __riscv_vfneg_v_f32m2(x, vl); |
1365 | | const vfloat32m2_t exp_neg_x = ggml_v_expf_m2(neg_x, vl); |
1366 | | const vfloat32m2_t one_plus_exp_neg_x = __riscv_vfadd_vf_f32m2(exp_neg_x, 1.0f, vl); |
1367 | | return __riscv_vfdiv_vv_f32m2(x, one_plus_exp_neg_x, vl); |
1368 | | } |
1369 | | |
1370 | | #endif // __ARM_NEON / __AVX2__ / __SSE2__ / __riscv_v_intrinsic |
1371 | | |
1372 | 0 | inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { |
1373 | 0 | for (int i = 0; i < n; ++i) { |
1374 | 0 | y[i] = ggml_silu_f16(x[i]); |
1375 | 0 | } |
1376 | 0 | } Unexecuted instantiation: ggml-cpu.c:ggml_vec_silu_f16 Unexecuted instantiation: vec.cpp:ggml_vec_silu_f16(int, unsigned short*, unsigned short const*) Unexecuted instantiation: ops.cpp:ggml_vec_silu_f16(int, unsigned short*, unsigned short const*) |
1377 | | |
1378 | 0 | inline static float ggml_silu_backward_f32(float x, float dy) { |
1379 | 0 | const float s = 1.0f/(1.0f + expf(-x)); |
1380 | 0 | return dy*s*(1.0f + x*(1.0f - s)); |
1381 | 0 | } Unexecuted instantiation: ggml-cpu.c:ggml_silu_backward_f32 Unexecuted instantiation: vec.cpp:ggml_silu_backward_f32(float, float) Unexecuted instantiation: ops.cpp:ggml_silu_backward_f32(float, float) |
1382 | | |
1383 | 0 | inline static ggml_fp16_t ggml_silu_backward_f16(ggml_fp16_t x, ggml_fp16_t dy) { |
1384 | 0 | const float v = GGML_CPU_FP16_TO_FP32(x); |
1385 | 0 | const float s = 1.0f/(1.0f + expf(-v)); |
1386 | 0 | return GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(dy)*s*(1.0f + v*(1.0f - s))); |
1387 | 0 | } Unexecuted instantiation: ggml-cpu.c:ggml_silu_backward_f16 Unexecuted instantiation: vec.cpp:ggml_silu_backward_f16(unsigned short, unsigned short) Unexecuted instantiation: ops.cpp:ggml_silu_backward_f16(unsigned short, unsigned short) |
1388 | | |
// Element-wise SiLU gradient: dx[i] = ggml_silu_backward_f32(x[i], dy[i]) for i in [0, n).
inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
    for (int k = 0; k < n; ++k) {
        const float grad = ggml_silu_backward_f32(x[k], dy[k]);
        dx[k] = grad;
    }
}
1394 | | |
1395 | 0 | inline static void ggml_vec_silu_backward_f16(const int n, ggml_fp16_t * dx, const ggml_fp16_t * x, const ggml_fp16_t * dy) { |
1396 | 0 | for (int i = 0; i < n; ++i) { |
1397 | 0 | dx[i] = ggml_silu_backward_f16(x[i], dy[i]); |
1398 | 0 | } |
1399 | 0 | } Unexecuted instantiation: ggml-cpu.c:ggml_vec_silu_backward_f16 Unexecuted instantiation: vec.cpp:ggml_vec_silu_backward_f16(int, unsigned short*, unsigned short const*, unsigned short const*) Unexecuted instantiation: ops.cpp:ggml_vec_silu_backward_f16(int, unsigned short*, unsigned short const*, unsigned short const*) |
1400 | | |
// ReLU-gated product: y[i] = x[i]*g[i] when x[i] > 0, otherwise 0.
//   n - number of elements
//   y - output array
//   x - values passed through the ReLU test
//   g - gate values multiplied in
inline static void ggml_vec_reglu_f32 (const int n, float * y, const float * x, const float * g) {
    for (int k = 0; k < n; ++k) {
        const float v = x[k];
        if (v > 0.f) {
            y[k] = v * g[k];
        } else {
            y[k] = 0.f;
        }
    }
}
1406 | | |
1407 | 0 | inline static void ggml_vec_reglu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) { |
1408 | 0 | for (int i = 0; i < n; ++i) { |
1409 | 0 | float v = GGML_CPU_FP16_TO_FP32(x[i]); |
1410 | 0 | y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v * GGML_CPU_FP16_TO_FP32(g[i]) : 0.f); |
1411 | 0 | } |
1412 | 0 | } Unexecuted instantiation: ggml-cpu.c:ggml_vec_reglu_f16 Unexecuted instantiation: vec.cpp:ggml_vec_reglu_f16(int, unsigned short*, unsigned short const*, unsigned short const*) Unexecuted instantiation: ops.cpp:ggml_vec_reglu_f16(int, unsigned short*, unsigned short const*, unsigned short const*) |
1413 | | |
1414 | | #ifdef GGML_GELU_FP16 |
1415 | 0 | inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x, const float * g) { |
1416 | 0 | uint16_t t; |
1417 | 0 | for (int i = 0; i < n; ++i) { |
1418 | 0 | if (x[i] <= -10.0f) { |
1419 | 0 | y[i] = 0.0f; |
1420 | 0 | } else if (x[i] >= 10.0f) { |
1421 | 0 | y[i] = x[i] * g[i]; |
1422 | 0 | } else { |
1423 | 0 | ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]); |
1424 | 0 | memcpy(&t, &fp16, sizeof(uint16_t)); |
1425 | 0 | y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[t]) * g[i]; |
1426 | 0 | } |
1427 | 0 | } |
1428 | 0 | } Unexecuted instantiation: ggml-cpu.c:ggml_vec_geglu_f32 Unexecuted instantiation: vec.cpp:ggml_vec_geglu_f32(int, float*, float const*, float const*) Unexecuted instantiation: ops.cpp:ggml_vec_geglu_f32(int, float*, float const*, float const*) |
1429 | | #else |
// Direct GEGLU (GGML_GELU_FP16 not defined): y[i] = ggml_gelu_f32(x[i]) * g[i].
inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x, const float * g) {
    for (int k = 0; k < n; ++k) {
        const float act = ggml_gelu_f32(x[k]);
        y[k] = act * g[k];
    }
}
1435 | | #endif |
1436 | | |
1437 | 0 | inline static void ggml_vec_geglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) { |
1438 | 0 | const uint16_t * i16 = (const uint16_t *) x; |
1439 | 0 | for (int i = 0; i < n; ++i) { |
1440 | 0 | float v = GGML_CPU_FP16_TO_FP32(g[i]); |
1441 | 0 | y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[i16[i]]) * v); |
1442 | 0 | } |
1443 | 0 | } Unexecuted instantiation: ggml-cpu.c:ggml_vec_geglu_f16 Unexecuted instantiation: vec.cpp:ggml_vec_geglu_f16(int, unsigned short*, unsigned short const*, unsigned short const*) Unexecuted instantiation: ops.cpp:ggml_vec_geglu_f16(int, unsigned short*, unsigned short const*, unsigned short const*) |
1444 | | |
1445 | | void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float * g); |
1446 | | |
1447 | 0 | inline static void ggml_vec_swiglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) { |
1448 | 0 | for (int i = 0; i < n; ++i) { |
1449 | 0 | float xi = GGML_CPU_FP16_TO_FP32(x[i]); |
1450 | 0 | float gi = GGML_CPU_FP16_TO_FP32(g[i]); |
1451 | 0 | y[i] = GGML_CPU_FP32_TO_FP16((xi/(1.0f + expf(-xi))) * gi); |
1452 | 0 | } |
1453 | 0 | } Unexecuted instantiation: ggml-cpu.c:ggml_vec_swiglu_f16 Unexecuted instantiation: vec.cpp:ggml_vec_swiglu_f16(int, unsigned short*, unsigned short const*, unsigned short const*) Unexecuted instantiation: ops.cpp:ggml_vec_swiglu_f16(int, unsigned short*, unsigned short const*, unsigned short const*) |
1454 | | |
1455 | 0 | inline static void ggml_vec_geglu_erf_f32(const int n, float * y, const float * x, const float * g) { |
1456 | 0 | for (int i = 0; i < n; ++i) { |
1457 | 0 | float xi = x[i]; |
1458 | 0 | y[i] = 0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * g[i]; |
1459 | 0 | } |
1460 | 0 | } Unexecuted instantiation: ggml-cpu.c:ggml_vec_geglu_erf_f32 Unexecuted instantiation: vec.cpp:ggml_vec_geglu_erf_f32(int, float*, float const*, float const*) Unexecuted instantiation: ops.cpp:ggml_vec_geglu_erf_f32(int, float*, float const*, float const*) |
1461 | | |
1462 | 0 | inline static void ggml_vec_geglu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) { |
1463 | 0 | for (int i = 0; i < n; ++i) { |
1464 | 0 | float xi = GGML_CPU_FP16_TO_FP32(x[i]); |
1465 | 0 | float gi = GGML_CPU_FP16_TO_FP32(g[i]); |
1466 | 0 | y[i] = GGML_CPU_FP32_TO_FP16(0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * gi); |
1467 | 0 | } |
1468 | 0 | } Unexecuted instantiation: ggml-cpu.c:ggml_vec_geglu_erf_f16 Unexecuted instantiation: vec.cpp:ggml_vec_geglu_erf_f16(int, unsigned short*, unsigned short const*, unsigned short const*) Unexecuted instantiation: ops.cpp:ggml_vec_geglu_erf_f16(int, unsigned short*, unsigned short const*, unsigned short const*) |
1469 | | |
1470 | | #ifdef GGML_GELU_QUICK_FP16 |
1471 | 0 | inline static void ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) { |
1472 | 0 | uint16_t t; |
1473 | 0 | for (int i = 0; i < n; ++i) { |
1474 | 0 | ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]); |
1475 | 0 | memcpy(&t, &fp16, sizeof(uint16_t)); |
1476 | 0 | y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]) * g[i]; |
1477 | 0 | } |
1478 | 0 | } Unexecuted instantiation: ggml-cpu.c:ggml_vec_geglu_quick_f32 Unexecuted instantiation: vec.cpp:ggml_vec_geglu_quick_f32(int, float*, float const*, float const*) Unexecuted instantiation: ops.cpp:ggml_vec_geglu_quick_f32(int, float*, float const*, float const*) |
1479 | | #else |
// Direct quick GEGLU (GGML_GELU_QUICK_FP16 not defined): y[i] = ggml_gelu_quick_f32(x[i]) * g[i].
inline static void ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
    for (int k = 0; k < n; ++k) {
        const float act = ggml_gelu_quick_f32(x[k]);
        y[k] = act * g[k];
    }
}
1485 | | #endif |
1486 | | |
1487 | 0 | inline static void ggml_vec_geglu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) { |
1488 | 0 | const uint16_t * i16 = (const uint16_t *) x; |
1489 | 0 | for (int i = 0; i < n; ++i) { |
1490 | 0 | float v = GGML_CPU_FP16_TO_FP32(g[i]); |
1491 | 0 | y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[i16[i]]) * v); |
1492 | 0 | } |
1493 | 0 | } Unexecuted instantiation: ggml-cpu.c:ggml_vec_geglu_quick_f16 Unexecuted instantiation: vec.cpp:ggml_vec_geglu_quick_f16(int, unsigned short*, unsigned short const*, unsigned short const*) Unexecuted instantiation: ops.cpp:ggml_vec_geglu_quick_f16(int, unsigned short*, unsigned short const*, unsigned short const*) |
1494 | | |
1495 | 0 | inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) { |
1496 | 0 | #ifndef GGML_USE_ACCELERATE |
1497 | 0 | ggml_float sum = 0.0; |
1498 | 0 | for (int i = 0; i < n; ++i) { |
1499 | 0 | sum += (ggml_float)x[i]; |
1500 | 0 | } |
1501 | 0 | *s = (float)sum; |
1502 | | #else |
1503 | | vDSP_sve(x, 1, s, n); |
1504 | | #endif |
1505 | 0 | } Unexecuted instantiation: ggml-cpu.c:ggml_vec_sum_f32 Unexecuted instantiation: vec.cpp:ggml_vec_sum_f32(int, float*, float const*) Unexecuted instantiation: ops.cpp:ggml_vec_sum_f32(int, float*, float const*) |
1506 | | |
// Inclusive prefix sum: y[0] = x[0], y[i] = y[i-1] + x[i] for 0 < i < n.
// Nothing is written when n <= 0.
inline static void ggml_vec_cumsum_f32(const int n, float * y, const float * x) {
    if (n <= 0) {
        return;
    }
    y[0] = x[0];
    for (int k = 1; k < n; ++k) {
        y[k] = y[k - 1] + x[k];
    }
}
1516 | | |
1517 | 0 | inline static void ggml_vec_sum_f32_ggf(const int n, ggml_float * s, const float * x) { |
1518 | 0 | ggml_float sum = 0.0; |
1519 | 0 | for (int i = 0; i < n; ++i) { |
1520 | 0 | sum += (ggml_float)x[i]; |
1521 | 0 | } |
1522 | 0 | *s = sum; |
1523 | 0 | } Unexecuted instantiation: ggml-cpu.c:ggml_vec_sum_f32_ggf Unexecuted instantiation: vec.cpp:ggml_vec_sum_f32_ggf(int, double*, float const*) Unexecuted instantiation: ops.cpp:ggml_vec_sum_f32_ggf(int, double*, float const*) |
1524 | | |
1525 | 0 | inline static void ggml_vec_sum_f16_ggf(const int n, float * s, const ggml_fp16_t * x) { |
1526 | 0 | float sum = 0.0f; |
1527 | 0 | for (int i = 0; i < n; ++i) { |
1528 | 0 | sum += GGML_CPU_FP16_TO_FP32(x[i]); |
1529 | 0 | } |
1530 | 0 | *s = sum; |
1531 | 0 | } Unexecuted instantiation: ggml-cpu.c:ggml_vec_sum_f16_ggf Unexecuted instantiation: vec.cpp:ggml_vec_sum_f16_ggf(int, float*, unsigned short const*) Unexecuted instantiation: ops.cpp:ggml_vec_sum_f16_ggf(int, float*, unsigned short const*) |
1532 | | |
1533 | 0 | inline static void ggml_vec_sum_bf16_ggf(const int n, float * s, const ggml_bf16_t * x) { |
1534 | 0 | float sum = 0.0f; |
1535 | 0 | for (int i = 0; i < n; ++i) { |
1536 | 0 | sum += GGML_BF16_TO_FP32(x[i]); |
1537 | 0 | } |
1538 | 0 | *s = sum; |
1539 | 0 | } Unexecuted instantiation: ggml-cpu.c:ggml_vec_sum_bf16_ggf Unexecuted instantiation: vec.cpp:ggml_vec_sum_bf16_ggf(int, float*, ggml_bf16_t const*) Unexecuted instantiation: ops.cpp:ggml_vec_sum_bf16_ggf(int, float*, ggml_bf16_t const*) |
1540 | | |
// Stores the largest of the n floats in x into *s.
// With GGML_USE_ACCELERATE the work is delegated to vDSP_maxv; otherwise a running
// maximum starting at -INFINITY is kept (so *s is -INFINITY when n <= 0 on that path).
inline static void ggml_vec_max_f32(const int n, float * s, const float * x) {
#if defined(GGML_USE_ACCELERATE)
    vDSP_maxv(x, 1, s, n);
#else
    float best = -INFINITY;
    for (int k = 0; k < n; ++k) {
        best = MAX(best, x[k]);
    }
    *s = best;
#endif
}
1552 | | |
// Stores in *s the reciprocal of the value that ggml_vec_norm_f32(n, s, x) produces.
inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x) {
    ggml_vec_norm_f32(n, s, x);
    const float norm = *s;
    *s = 1.f/norm;
}
1557 | | |
// Stores in *s the index of the largest of the n floats in x.
// The index is updated whenever an element equals the running maximum, so when the
// maximum occurs more than once the last occurrence is reported. *s is 0 when n <= 0.
inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) {
    float best     = -INFINITY;
    int   best_idx = 0;
    for (int k = 0; k < n; ++k) {
        best = MAX(best, x[k]);
        if (best == x[k]) {
            best_idx = k;
        }
    }
    *s = best_idx;
}
1567 | | |
1568 | | #ifdef __cplusplus |
1569 | | } |
1570 | | #endif |