/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h
Line | Count | Source |
1 | | #pragma once |
2 | | |
3 | | #include "ggml-cpu-impl.h" |
4 | | |
5 | | #ifdef __ARM_FEATURE_SVE |
6 | | #include <arm_sve.h> |
7 | | #endif // __ARM_FEATURE_SVE |
8 | | |
9 | | #if defined(__ARM_NEON) && !defined(__CUDACC__) && !defined(__MUSACC__) |
10 | | // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example: |
11 | | // |
12 | | // $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/ |
13 | | // |
14 | | #include <arm_neon.h> |
15 | | #endif |
16 | | |
17 | | #if defined(__riscv_v_intrinsic) |
18 | | #include <riscv_vector.h> |
19 | | #endif |
20 | | |
21 | | #ifdef __cplusplus |
22 | | extern "C" { |
23 | | #endif |
24 | | |
25 | | // |
26 | | // simd mappings |
27 | | // |
28 | | |
29 | | // FP16 <-> FP32 conversion |
30 | | |
31 | | // 16-bit float |
32 | | // on Arm, we use __fp16 |
33 | | // on x86, we use uint16_t |
34 | | // |
35 | | // for old CUDA compilers (<= 11), we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/10616 |
36 | | // for MUSA compilers, we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/11843 |
37 | | // |
38 | | #if defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__) |
39 | | #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) neon_compute_fp16_to_fp32(x) |
40 | | #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) neon_compute_fp32_to_fp16(x) |
41 | | |
42 | | #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x) |
43 | | |
44 | | static inline float neon_compute_fp16_to_fp32(ggml_fp16_t h) { |
45 | | __fp16 tmp; |
46 | | memcpy(&tmp, &h, sizeof(ggml_fp16_t)); |
47 | | return (float)tmp; |
48 | | } |
49 | | |
50 | | static inline ggml_fp16_t neon_compute_fp32_to_fp16(float f) { |
51 | | ggml_fp16_t res; |
52 | | __fp16 tmp = f; |
53 | | memcpy(&res, &tmp, sizeof(ggml_fp16_t)); |
54 | | return res; |
55 | | } |
56 | | #elif defined(__F16C__) |
57 | | #ifdef _MSC_VER |
58 | | #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x))) |
59 | | #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0) |
60 | | #else |
61 | | #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x) |
62 | | #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0) |
63 | | #endif |
64 | | #elif defined(__POWER9_VECTOR__) |
65 | | #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) power_compute_fp16_to_fp32(x) |
66 | | #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) power_compute_fp32_to_fp16(x) |
67 | | /* the inline asm below is about 12% faster than the lookup method */ |
68 | | #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x) |
69 | | #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x) |
70 | | |
71 | | static inline float power_compute_fp16_to_fp32(ggml_fp16_t h) { |
72 | | float f; |
73 | | double d; |
74 | | __asm__( |
75 | | "mtfprd %0,%2\n" |
76 | | "xscvhpdp %0,%0\n" |
77 | | "frsp %1,%0\n" : |
78 | | /* temp */ "=d"(d), |
79 | | /* out */ "=f"(f): |
80 | | /* in */ "r"(h)); |
81 | | return f; |
82 | | } |
83 | | |
84 | | static inline ggml_fp16_t power_compute_fp32_to_fp16(float f) { |
85 | | double d; |
86 | | ggml_fp16_t r; |
87 | | __asm__( /* xscvdphp can work on double or single precision */ |
88 | | "xscvdphp %0,%2\n" |
89 | | "mffprd %1,%0\n" : |
90 | | /* temp */ "=d"(d), |
91 | | /* out */ "=r"(r): |
92 | | /* in */ "f"(f)); |
93 | | return r; |
94 | | } |
95 | | #elif defined(__riscv) && defined(__riscv_zfhmin) |
96 | | static inline float riscv_compute_fp16_to_fp32(ggml_fp16_t h) { |
97 | | _Float16 hf; |
98 | | memcpy(&hf, &h, sizeof(ggml_fp16_t)); |
99 | | return hf; |
100 | | } |
101 | | |
102 | | static inline ggml_fp16_t riscv_compute_fp32_to_fp16(float f) { |
103 | | ggml_fp16_t res; |
104 | | _Float16 hf = (_Float16)f; |
105 | | memcpy(&res, &hf, sizeof(ggml_fp16_t)); |
106 | | return res; |
107 | | } |
108 | | |
109 | | #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) riscv_compute_fp16_to_fp32(x) |
110 | | #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) riscv_compute_fp32_to_fp16(x) |
111 | | #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x) |
112 | | #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x) |
113 | | #endif |
114 | | |
115 | | // precomputed f32 table for f16 (256 KB) |
116 | | // defined in ggml-cpu.c, initialized in ggml_cpu_init() |
117 | | extern float ggml_table_f32_f16[1 << 16]; |
118 | | |
119 | | // precomputed f32 table for e8m0 half (1 KB) |
120 | | // defined in ggml-cpu.c, initialized in ggml_cpu_init() |
121 | | extern float ggml_table_f32_e8m0_half[1 << 8]; |
122 | | |
123 | | // Use lookup table for E8M0 on x86 (faster than bit manipulation) |
124 | | #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) |
125 | 0 | #define GGML_CPU_E8M0_TO_FP32_HALF(x) ggml_table_f32_e8m0_half[(uint8_t)(x)] |
126 | | #else |
127 | | #define GGML_CPU_E8M0_TO_FP32_HALF(x) GGML_E8M0_TO_FP32_HALF(x) |
128 | | #endif |
129 | | |
130 | | // On ARM NEON, it's quicker to convert FP16 -> FP32 directly instead of calling into ggml_lookup_fp16_to_fp32, |
131 | | // so we define GGML_CPU_FP16_TO_FP32 and GGML_CPU_FP32_TO_FP16 elsewhere for NEON. |
132 | | // This is also true for POWER9. |
133 | | #if !defined(GGML_CPU_FP16_TO_FP32) |
134 | 0 | inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { |
135 | 0 | uint16_t s; |
136 | 0 | memcpy(&s, &f, sizeof(uint16_t)); |
137 | 0 | return ggml_table_f32_f16[s]; |
138 | 0 | } |
  Unexecuted instantiations of ggml_lookup_fp16_to_fp32: repack.cpp, ggml-cpu.c, quants.c, binary-ops.cpp, unary-ops.cpp, vec.cpp, ops.cpp, sgemm.cpp
139 | | |
140 | 0 | #define GGML_CPU_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x) |
141 | | #endif |
142 | | |
143 | | #if !defined(GGML_CPU_FP32_TO_FP16) |
144 | 131k | #define GGML_CPU_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) |
145 | | #endif |
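// A scalar round trip through whichever conversion pair is active on the current target
// (illustrative sketch only):
//
//     ggml_fp16_t h = GGML_CPU_FP32_TO_FP16(1.5f);
//     float       f = GGML_CPU_FP16_TO_FP32(h);   // 1.5f is exactly representable, so f == 1.5f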
146 | | |
147 | | |
148 | | // we define a common set of C macros which map to specific intrinsics based on the current architecture |
149 | | // we then implement the fundamental computation operations below using only these macros |
150 | | // adding support for new architectures requires defining the corresponding SIMD macros |
151 | | // |
152 | | // GGML_F32_STEP / GGML_F16_STEP |
153 | | // number of elements to process in a single step |
154 | | // |
155 | | // GGML_F32_EPR / GGML_F16_EPR |
156 | | // number of elements to fit in a single register |
157 | | // |
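// As a rough usage sketch (assuming GGML_SIMD is defined and ignoring the SVE branch,
// which sizes its step from the runtime vector length), an f32 dot product built on
// these macros looks roughly like this, with x and y being const float pointers, n the
// element count, and sumf the resulting ggml_float:
//
//     ggml_float sumf = 0.0;
//     GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
//     GGML_F32_VEC ax[GGML_F32_ARR];
//     GGML_F32_VEC ay[GGML_F32_ARR];
//
//     const int np = (n & ~(GGML_F32_STEP - 1));
//     for (int i = 0; i < np; i += GGML_F32_STEP) {
//         for (int j = 0; j < GGML_F32_ARR; j++) {
//             ax[j]  = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
//             ay[j]  = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
//             sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
//         }
//     }
//     GGML_F32_VEC_REDUCE(sumf, sum);   // horizontal sum of all accumulators
//     for (int i = np; i < n; ++i) {    // scalar tail for the leftover elements
//         sumf += (ggml_float)(x[i]*y[i]);
//     }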
158 | | |
159 | | #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_FMA) |
160 | | |
161 | | #define GGML_SIMD |
162 | | |
163 | | // F32 SVE |
164 | | #define GGML_F32_EPR 8 |
165 | | #define DEFAULT_PG svptrue_b32() |
166 | | |
167 | | #define GGML_F32xt svfloat32_t |
168 | | #define GGML_F32xt_ZERO svdup_n_f32(0.0f) |
169 | | #define GGML_F32xt_SET1(x) svdup_n_f32(x) |
170 | | #define GGML_F32xt_LOAD_IMPL(pg, a) svld1_f32(pg, a) |
171 | | #define GGML_F32xt_LOAD(a) GGML_F32xt_LOAD_IMPL(DEFAULT_PG, a) |
172 | | #define GGML_F32xt_STORE_IMPL(pg, a, b) svst1_f32(pg, a, b) |
173 | | #define GGML_F32xt_STORE(a, b) GGML_F32xt_STORE_IMPL(DEFAULT_PG, a, b) |
174 | | #define GGML_F32xt_FMA_IMPL(pg, a, b, c) svmad_f32_m(pg, b, c, a) |
175 | | #define GGML_F32xt_FMA(a, b, c) GGML_F32xt_FMA_IMPL(DEFAULT_PG, a, b, c) |
176 | | #define GGML_F32xt_ADD_IMPL(pg, a, b) svadd_f32_m(pg, a, b) |
177 | | #define GGML_F32xt_ADD(a, b) GGML_F32xt_ADD_IMPL(DEFAULT_PG, a, b) |
178 | | #define GGML_F32xt_MUL_IMPL(pg, a, b) svmul_f32_m(pg, a, b) |
179 | | #define GGML_F32xt_MUL(a, b) GGML_F32xt_MUL_IMPL(DEFAULT_PG, a, b) |
180 | | #define GGML_F32xt_REDUCE_ONE_IMPL(pg, a) svaddv(pg, a) |
181 | | #define GGML_F32xt_REDUCE_ONE(a) GGML_F32xt_REDUCE_ONE_IMPL(DEFAULT_PG, a) |
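// pairwise tree reduction: eight SVE accumulators are folded into one vector, which is
// then summed horizontally with GGML_F32xt_REDUCE_ONE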
182 | | #define GGML_F32xt_REDUCE_IMPL(pg, res, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8) \ |
183 | | { \ |
184 | | sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum2); \ |
185 | | sum3 = svadd_f32_m(DEFAULT_PG, sum3, sum4); \ |
186 | | sum5 = svadd_f32_m(DEFAULT_PG, sum5, sum6); \ |
187 | | sum7 = svadd_f32_m(DEFAULT_PG, sum7, sum8); \ |
188 | | sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum3); \ |
189 | | sum5 = svadd_f32_m(DEFAULT_PG, sum5, sum7); \ |
190 | | sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum5); \ |
191 | | (res) = (ggml_float) GGML_F32xt_REDUCE_ONE(sum1); \ |
192 | | } |
193 | | #define GGML_F32xt_REDUCE(res, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8) \ |
194 | | GGML_F32xt_REDUCE_IMPL(DEFAULT_PG, res, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8) |
195 | | |
196 | | #define GGML_F32_VEC GGML_F32xt |
197 | | #define GGML_F32_VEC_ZERO GGML_F32xt_ZERO |
198 | | #define GGML_F32_VEC_SET1 GGML_F32xt_SET1 |
199 | | #define GGML_F32_VEC_LOAD GGML_F32xt_LOAD |
200 | | #define GGML_F32_VEC_STORE GGML_F32xt_STORE |
201 | | #define GGML_F32_VEC_FMA GGML_F32xt_FMA |
202 | | #define GGML_F32_VEC_ADD GGML_F32xt_ADD |
203 | | #define GGML_F32_VEC_MUL GGML_F32xt_MUL |
204 | | #define GGML_F32_VEC_REDUCE GGML_F32xt_REDUCE |
205 | | |
206 | | // F16 SVE |
207 | | #define DEFAULT_PG32 svptrue_b32() |
208 | | #define DEFAULT_PG16 svptrue_b16() |
209 | | |
210 | | #define GGML_F32Cxt svfloat16_t |
211 | | #define GGML_F32Cxt_ZERO svdup_n_f16(0.0f) |
212 | | #define GGML_F32Cxt_SET1(x) svdup_n_f16(x) |
213 | | #define GGML_F32Cxt_LOAD(p) svld1_f16(DEFAULT_PG16, (const __fp16 *)(p)) |
214 | | #define GGML_F32Cxt_STORE(dst_ptr, src_vec) svst1_f16(DEFAULT_PG16, (__fp16 *)(dst_ptr), (src_vec)) |
215 | | |
216 | | #define GGML_F32Cxt_FMA_IMPL(pg, a, b, c) svmad_f16_x(pg, b, c, a) |
217 | | #define GGML_F32Cxt_FMA(a, b, c) GGML_F32Cxt_FMA_IMPL(DEFAULT_PG16, a, b, c) |
218 | | #define GGML_F32Cxt_ADD_IMPL(pg, a, b) svadd_f16_x(pg, a, b) |
219 | | #define GGML_F32Cxt_ADD(a, b) GGML_F32Cxt_ADD_IMPL(DEFAULT_PG16, a, b) |
220 | | #define GGML_F32Cxt_MUL_IMPL(pg, a, b) svmul_f16_x(pg, a, b) |
221 | | #define GGML_F32Cxt_MUL(a, b) GGML_F32Cxt_MUL_IMPL(DEFAULT_PG16, a, b) |
222 | | #define GGML_F32Cxt_REDUCE GGML_F16xt_REDUCE_MIXED |
223 | | |
224 | | #define GGML_F16x_VEC GGML_F32Cxt |
225 | | #define GGML_F16x_VEC_ZERO GGML_F32Cxt_ZERO |
226 | | #define GGML_F16x_VEC_SET1 GGML_F32Cxt_SET1 |
227 | | #define GGML_F16x_VEC_LOAD(p, i) GGML_F32Cxt_LOAD(p) |
228 | | #define GGML_F16x_VEC_STORE(p, r, i) GGML_F32Cxt_STORE((__fp16 *)(p), r) |
229 | | #define GGML_F16x_VEC_FMA GGML_F32Cxt_FMA |
230 | | #define GGML_F16x_VEC_ADD GGML_F32Cxt_ADD |
231 | | #define GGML_F16x_VEC_MUL GGML_F32Cxt_MUL |
232 | | #define GGML_F16x_VEC_REDUCE GGML_F32Cxt_REDUCE |
233 | | |
234 | | #define GGML_F16xt_REDUCE_ONE_IMPL(pg, a) svaddv_f16(pg, a) |
235 | | #define GGML_F16xt_REDUCE_ONE(a) GGML_F16xt_REDUCE_ONE_IMPL(DEFAULT_PG16, a) |
236 | | |
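// fold four f16 accumulators pairwise, take the horizontal sum in f16, then widen the
// scalar result to ggml_float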
237 | | #define GGML_F16xt_REDUCE_MIXED_IMPL(pg16, res, sum1, sum2, sum3, sum4) \ |
238 | | { \ |
239 | | sum1 = svadd_f16_x(pg16, sum1, sum2); \ |
240 | | sum3 = svadd_f16_x(pg16, sum3, sum4); \ |
241 | | sum1 = svadd_f16_x(pg16, sum1, sum3); \ |
242 | | __fp16 sum_f16 = svaddv_f16(pg16, sum1); \ |
243 | | (res) = (ggml_float) sum_f16; \ |
244 | | } |
245 | | #define GGML_F16xt_REDUCE_MIXED(res, sum1, sum2, sum3, sum4) \ |
246 | | GGML_F16xt_REDUCE_MIXED_IMPL(DEFAULT_PG16, res, sum1, sum2, sum3, sum4) |
247 | | |
248 | | // F16 NEON |
249 | | |
250 | | #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) |
251 | | #define GGML_F16_STEP 32 |
252 | | #define GGML_F16_EPR 8 |
253 | | |
254 | | #define GGML_F16x8 float16x8_t |
255 | | #define GGML_F16x8_ZERO vdupq_n_f16(0.0f) |
256 | | #define GGML_F16x8_SET1(x) vdupq_n_f16(x) |
257 | | #define GGML_F16x8_LOAD(x) vld1q_f16((const __fp16 *)(x)) |
258 | | #define GGML_F16x8_STORE vst1q_f16 |
259 | | #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c) |
260 | | #define GGML_F16x8_ADD vaddq_f16 |
261 | | #define GGML_F16x8_MUL vmulq_f16 |
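// pairwise tree reduction of GGML_F16_ARR f16 accumulators; the surviving register is
// widened to two f32x4 halves before the final horizontal add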
262 | | #define GGML_F16x8_REDUCE(res, x) \ |
263 | | do { \ |
264 | | int offset = GGML_F16_ARR >> 1; \ |
265 | | for (int i = 0; i < offset; ++i) { \ |
266 | | (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \ |
267 | | } \ |
268 | | offset >>= 1; \ |
269 | | for (int i = 0; i < offset; ++i) { \ |
270 | | (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \ |
271 | | } \ |
272 | | offset >>= 1; \ |
273 | | for (int i = 0; i < offset; ++i) { \ |
274 | | (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \ |
275 | | } \ |
276 | | const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \ |
277 | | const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \ |
278 | | (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \ |
279 | | } while (0) |
280 | | |
281 | | #define GGML_F16_VEC GGML_F16x8 |
282 | | #define GGML_F16_VEC_ZERO GGML_F16x8_ZERO |
283 | | #define GGML_F16_VEC_SET1 GGML_F16x8_SET1 |
284 | | #define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p) |
285 | | #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((__fp16 *)(p), (r)[i]) |
286 | | #define GGML_F16_VEC_FMA GGML_F16x8_FMA |
287 | | #define GGML_F16_VEC_ADD GGML_F16x8_ADD |
288 | | #define GGML_F16_VEC_MUL GGML_F16x8_MUL |
289 | | #define GGML_F16_VEC_REDUCE GGML_F16x8_REDUCE |
290 | | #else |
291 | | // if FP16 vector arithmetic is not supported, we use FP32 instead |
292 | | // and take advantage of the vcvt_ functions to convert to/from FP16 |
293 | | |
294 | | #define GGML_F16_STEP 16 |
295 | | #define GGML_F16_EPR 4 |
296 | | |
297 | | #define GGML_F32Cx4 float32x4_t |
298 | | #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f) |
299 | | #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x) |
300 | | #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const __fp16 *)(x))) |
301 | | #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y)) |
302 | | #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c) |
303 | | #define GGML_F32Cx4_ADD vaddq_f32 |
304 | | #define GGML_F32Cx4_MUL vmulq_f32 |
305 | | #define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE |
306 | | |
307 | | #define GGML_F16_VEC GGML_F32Cx4 |
308 | | #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO |
309 | | #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1 |
310 | | #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p) |
311 | | #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((__fp16 *)(p), r[i]) |
312 | | #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA |
313 | | #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD |
314 | | #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL |
315 | | #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE |
316 | | #endif |
317 | | |
318 | | #elif defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA) |
319 | | |
320 | | #define GGML_SIMD |
321 | | |
322 | | // F32 NEON |
323 | | |
324 | | #define GGML_F32_STEP 16 |
325 | | #define GGML_F32_EPR 4 |
326 | | |
327 | | #define GGML_F32x4 float32x4_t |
328 | | #define GGML_F32x4_ZERO vdupq_n_f32(0.0f) |
329 | | #define GGML_F32x4_SET1(x) vdupq_n_f32(x) |
330 | | #define GGML_F32x4_LOAD vld1q_f32 |
331 | | #define GGML_F32x4_STORE vst1q_f32 |
332 | | #define GGML_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c) |
333 | | #define GGML_F32x4_ADD vaddq_f32 |
334 | | #define GGML_F32x4_MUL vmulq_f32 |
335 | | #define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x) |
336 | | #define GGML_F32x4_REDUCE(res, x) \ |
337 | | { \ |
338 | | int offset = GGML_F32_ARR >> 1; \ |
339 | | for (int i = 0; i < offset; ++i) { \ |
340 | | (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \ |
341 | | } \ |
342 | | offset >>= 1; \ |
343 | | for (int i = 0; i < offset; ++i) { \ |
344 | | (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \ |
345 | | } \ |
346 | | offset >>= 1; \ |
347 | | for (int i = 0; i < offset; ++i) { \ |
348 | | (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \ |
349 | | } \ |
350 | | (res) = (ggml_float) GGML_F32x4_REDUCE_ONE((x)[0]); \ |
351 | | } |
352 | | |
353 | | #define GGML_F32_VEC GGML_F32x4 |
354 | | #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO |
355 | | #define GGML_F32_VEC_SET1 GGML_F32x4_SET1 |
356 | | #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD |
357 | | #define GGML_F32_VEC_STORE GGML_F32x4_STORE |
358 | | #define GGML_F32_VEC_FMA GGML_F32x4_FMA |
359 | | #define GGML_F32_VEC_ADD GGML_F32x4_ADD |
360 | | #define GGML_F32_VEC_MUL GGML_F32x4_MUL |
361 | | #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE |
362 | | |
363 | | // F16 NEON |
364 | | |
365 | | #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) |
366 | | #define GGML_F16_STEP 32 |
367 | | #define GGML_F16_EPR 8 |
368 | | |
369 | | #define GGML_F16x8 float16x8_t |
370 | | #define GGML_F16x8_ZERO vdupq_n_f16(0.0f) |
371 | | #define GGML_F16x8_SET1(x) vdupq_n_f16(x) |
372 | | #define GGML_F16x8_LOAD(x) vld1q_f16((const __fp16 *)(x)) |
373 | | #define GGML_F16x8_STORE vst1q_f16 |
374 | | #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c) |
375 | | #define GGML_F16x8_ADD vaddq_f16 |
376 | | #define GGML_F16x8_MUL vmulq_f16 |
377 | | #define GGML_F16x8_REDUCE(res, x) \ |
378 | | do { \ |
379 | | int offset = GGML_F16_ARR >> 1; \ |
380 | | for (int i = 0; i < offset; ++i) { \ |
381 | | (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \ |
382 | | } \ |
383 | | offset >>= 1; \ |
384 | | for (int i = 0; i < offset; ++i) { \ |
385 | | (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \ |
386 | | } \ |
387 | | offset >>= 1; \ |
388 | | for (int i = 0; i < offset; ++i) { \ |
389 | | (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \ |
390 | | } \ |
391 | | const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \ |
392 | | const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \ |
393 | | (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \ |
394 | | } while (0) |
395 | | |
396 | | #define GGML_F16_VEC GGML_F16x8 |
397 | | #define GGML_F16_VEC_ZERO GGML_F16x8_ZERO |
398 | | #define GGML_F16_VEC_SET1 GGML_F16x8_SET1 |
399 | | #define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p) |
400 | | #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((__fp16 *)(p), (r)[i]) |
401 | | #define GGML_F16_VEC_FMA GGML_F16x8_FMA |
402 | | #define GGML_F16_VEC_ADD GGML_F16x8_ADD |
403 | | #define GGML_F16_VEC_MUL GGML_F16x8_MUL |
404 | | #define GGML_F16_VEC_REDUCE GGML_F16x8_REDUCE |
405 | | #else |
406 | | // if FP16 vector arithmetic is not supported, we use FP32 instead |
407 | | // and take advantage of the vcvt_ functions to convert to/from FP16 |
408 | | |
409 | | #define GGML_F16_STEP 16 |
410 | | #define GGML_F16_EPR 4 |
411 | | |
412 | | #define GGML_F32Cx4 float32x4_t |
413 | | #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f) |
414 | | #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x) |
415 | | #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const __fp16 *)(x))) |
416 | | #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y)) |
417 | | #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c) |
418 | | #define GGML_F32Cx4_ADD vaddq_f32 |
419 | | #define GGML_F32Cx4_MUL vmulq_f32 |
420 | | #define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE |
421 | | |
422 | | #define GGML_F16_VEC GGML_F32Cx4 |
423 | | #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO |
424 | | #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1 |
425 | | #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p) |
426 | | #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((__fp16 *)(p), r[i]) |
427 | | #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA |
428 | | #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD |
429 | | #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL |
430 | | #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE |
431 | | #endif |
432 | | |
433 | | #elif defined(__AVX512F__) |
434 | | |
435 | | #define GGML_SIMD |
436 | | |
437 | | // F32 AVX512 |
438 | | |
439 | | #define GGML_F32_STEP 64 |
440 | | #define GGML_F32_EPR 16 |
441 | | |
442 | | #define GGML_F32x16 __m512 |
443 | | #define GGML_F32x16_ZERO _mm512_setzero_ps() |
444 | | #define GGML_F32x16_SET1(x) _mm512_set1_ps(x) |
445 | | #define GGML_F32x16_LOAD _mm512_loadu_ps |
446 | | #define GGML_F32x16_STORE _mm512_storeu_ps |
447 | | // _mm512_fmadd_ps is defined in AVX512F so no guard is required |
448 | | #define GGML_F32x16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a) |
449 | | #define GGML_F32x16_ADD _mm512_add_ps |
450 | | #define GGML_F32x16_MUL _mm512_mul_ps |
451 | | #define GGML_F32x16_REDUCE(res, x) \ |
452 | | do { \ |
453 | | int offset = GGML_F32_ARR >> 1; \ |
454 | | for (int i = 0; i < offset; ++i) { \ |
455 | | x[i] = _mm512_add_ps(x[i], x[offset+i]); \ |
456 | | } \ |
457 | | offset >>= 1; \ |
458 | | for (int i = 0; i < offset; ++i) { \ |
459 | | x[i] = _mm512_add_ps(x[i], x[offset+i]); \ |
460 | | } \ |
461 | | offset >>= 1; \ |
462 | | for (int i = 0; i < offset; ++i) { \ |
463 | | x[i] = _mm512_add_ps(x[i], x[offset+i]); \ |
464 | | } \ |
465 | | res = (ggml_float) _mm512_reduce_add_ps(x[0]); \ |
466 | | } while (0) |
467 | | |
468 | | // TODO: is this optimal ? |
469 | | |
470 | | #define GGML_F32_VEC GGML_F32x16 |
471 | | #define GGML_F32_VEC_ZERO GGML_F32x16_ZERO |
472 | | #define GGML_F32_VEC_SET1 GGML_F32x16_SET1 |
473 | | #define GGML_F32_VEC_LOAD GGML_F32x16_LOAD |
474 | | #define GGML_F32_VEC_STORE GGML_F32x16_STORE |
475 | | #define GGML_F32_VEC_FMA GGML_F32x16_FMA |
476 | | #define GGML_F32_VEC_ADD GGML_F32x16_ADD |
477 | | #define GGML_F32_VEC_MUL GGML_F32x16_MUL |
478 | | #define GGML_F32_VEC_REDUCE GGML_F32x16_REDUCE |
479 | | |
480 | | // F16 AVX512 |
481 | | |
482 | | #if defined(__AVX512FP16__) |
483 | | |
484 | | #define GGML_F16_STEP 128 |
485 | | #define GGML_F16_EPR 32 |
486 | | |
487 | | #define GGML_F16x32 __m512h |
488 | | #define GGML_F16x32_ZERO _mm512_setzero_ph() |
489 | | #define GGML_F16x32_SET1(x) _mm512_set1_ph(__extension__(_Float16)(x)) |
490 | | #define GGML_F16x32_LOAD(x) _mm512_loadu_ph(x) |
491 | | #define GGML_F16x32_STORE(x, y) _mm512_storeu_ph(x, y) |
492 | | #define GGML_F16x32_FMA(a, b, c) _mm512_fmadd_ph(b, c, a) |
493 | | #define GGML_F16x32_ADD _mm512_add_ph |
494 | | #define GGML_F16x32_MUL _mm512_mul_ph |
495 | | #define GGML_F16x32_REDUCE(res, x) \ |
496 | | do { \ |
497 | | int offset = GGML_F16_ARR >> 1; \ |
498 | | for (int i = 0; i < offset; ++i) { \ |
499 | | x[i] = _mm512_add_ph(x[i], x[offset+i]); \ |
500 | | } \ |
501 | | offset >>= 1; \ |
502 | | for (int i = 0; i < offset; ++i) { \ |
503 | | x[i] = _mm512_add_ph(x[i], x[offset+i]); \ |
504 | | } \ |
505 | | offset >>= 1; \ |
506 | | for (int i = 0; i < offset; ++i) { \ |
507 | | x[i] = _mm512_add_ph(x[i], x[offset+i]); \ |
508 | | } \ |
509 | | res = (ggml_float) _mm512_reduce_add_ph(x[0]); \ |
510 | | } while (0) |
511 | | |
512 | | #define GGML_F16_VEC GGML_F16x32 |
513 | | #define GGML_F16_VEC_ZERO GGML_F16x32_ZERO |
514 | | #define GGML_F16_VEC_SET1 GGML_F16x32_SET1 |
515 | | #define GGML_F16_VEC_LOAD(p, i) GGML_F16x32_LOAD(p) |
516 | | #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x32_STORE(p, r[i]) |
517 | | #define GGML_F16_VEC_FMA GGML_F16x32_FMA |
518 | | #define GGML_F16_VEC_ADD GGML_F16x32_ADD |
519 | | #define GGML_F16_VEC_MUL GGML_F16x32_MUL |
520 | | #define GGML_F16_VEC_REDUCE GGML_F16x32_REDUCE |
521 | | |
522 | | #else // Fallback FP16 <-> FP32 |
523 | | |
524 | | #define GGML_F16_STEP 64 |
525 | | #define GGML_F16_EPR 16 |
526 | | |
527 | | #define GGML_F32Cx16 __m512 |
528 | | #define GGML_F32Cx16_ZERO _mm512_setzero_ps() |
529 | | #define GGML_F32Cx16_SET1(x) _mm512_set1_ps(x) |
530 | | |
531 | | // unlike the _mm256_cvt intrinsics, which require F16C, _mm512_cvt is defined in AVX512F, |
532 | | // so an F16C guard isn't required |
533 | | #define GGML_F32Cx16_LOAD(x) _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(x))) |
534 | | #define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0)) |
535 | | |
536 | | #define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a) |
537 | | #define GGML_F32Cx16_ADD _mm512_add_ps |
538 | | #define GGML_F32Cx16_MUL _mm512_mul_ps |
539 | | #define GGML_F32Cx16_REDUCE(res, x) \ |
540 | | do { \ |
541 | | int offset = GGML_F32_ARR >> 1; \ |
542 | | for (int i = 0; i < offset; ++i) { \ |
543 | | x[i] = _mm512_add_ps(x[i], x[offset+i]); \ |
544 | | } \ |
545 | | offset >>= 1; \ |
546 | | for (int i = 0; i < offset; ++i) { \ |
547 | | x[i] = _mm512_add_ps(x[i], x[offset+i]); \ |
548 | | } \ |
549 | | offset >>= 1; \ |
550 | | for (int i = 0; i < offset; ++i) { \ |
551 | | x[i] = _mm512_add_ps(x[i], x[offset+i]); \ |
552 | | } \ |
553 | | res = (ggml_float) _mm512_reduce_add_ps(x[0]); \ |
554 | | } while (0) |
555 | | |
556 | | #define GGML_F16_VEC GGML_F32Cx16 |
557 | | #define GGML_F16_VEC_ZERO GGML_F32Cx16_ZERO |
558 | | #define GGML_F16_VEC_SET1 GGML_F32Cx16_SET1 |
559 | | #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx16_LOAD(p) |
560 | | #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx16_STORE(p, r[i]) |
561 | | #define GGML_F16_VEC_FMA GGML_F32Cx16_FMA |
562 | | #define GGML_F16_VEC_ADD GGML_F32Cx16_ADD |
563 | | #define GGML_F16_VEC_MUL GGML_F32Cx16_MUL |
564 | | |
565 | | #define GGML_F16_VEC_REDUCE GGML_F32Cx16_REDUCE |
566 | | |
567 | | #endif // __AVX512FP16__ |
568 | | #elif defined(__AVX__) |
569 | | |
570 | | #define GGML_SIMD |
571 | | |
572 | | // F32 AVX |
573 | | |
574 | 0 | #define GGML_F32_STEP 32 |
575 | 0 | #define GGML_F32_EPR 8 |
576 | | |
577 | 0 | #define GGML_F32x8 __m256 |
578 | 0 | #define GGML_F32x8_ZERO _mm256_setzero_ps() |
579 | 0 | #define GGML_F32x8_SET1(x) _mm256_set1_ps(x) |
580 | 0 | #define GGML_F32x8_LOAD _mm256_loadu_ps |
581 | 0 | #define GGML_F32x8_STORE _mm256_storeu_ps |
582 | | #if defined(__FMA__) |
583 | 0 | #define GGML_F32x8_FMA(a, b, c) _mm256_fmadd_ps(b, c, a) |
584 | | #else |
585 | | #define GGML_F32x8_FMA(a, b, c) _mm256_add_ps(_mm256_mul_ps(b, c), a) |
586 | | #endif |
587 | 0 | #define GGML_F32x8_ADD _mm256_add_ps |
588 | 0 | #define GGML_F32x8_MUL _mm256_mul_ps |
589 | 0 | #define GGML_F32x8_REDUCE(res, x) \ |
590 | 0 | do { \ |
591 | 0 | int offset = GGML_F32_ARR >> 1; \ |
592 | 0 | for (int i = 0; i < offset; ++i) { \ |
593 | 0 | x[i] = _mm256_add_ps(x[i], x[offset+i]); \ |
594 | 0 | } \ |
595 | 0 | offset >>= 1; \ |
596 | 0 | for (int i = 0; i < offset; ++i) { \ |
597 | 0 | x[i] = _mm256_add_ps(x[i], x[offset+i]); \ |
598 | 0 | } \ |
599 | 0 | offset >>= 1; \ |
600 | 0 | for (int i = 0; i < offset; ++i) { \ |
601 | 0 | x[i] = _mm256_add_ps(x[i], x[offset+i]); \ |
602 | 0 | } \ |
603 | 0 | const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \ |
604 | 0 | _mm256_extractf128_ps(x[0], 1)); \ |
605 | 0 | const __m128 t1 = _mm_hadd_ps(t0, t0); \ |
606 | 0 | res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \ |
607 | 0 | } while (0) |
608 | | // TODO: is this optimal ? |
609 | | |
610 | 0 | #define GGML_F32_VEC GGML_F32x8 |
611 | 0 | #define GGML_F32_VEC_ZERO GGML_F32x8_ZERO |
612 | 0 | #define GGML_F32_VEC_SET1 GGML_F32x8_SET1 |
613 | 0 | #define GGML_F32_VEC_LOAD GGML_F32x8_LOAD |
614 | 0 | #define GGML_F32_VEC_STORE GGML_F32x8_STORE |
615 | 0 | #define GGML_F32_VEC_FMA GGML_F32x8_FMA |
616 | 0 | #define GGML_F32_VEC_ADD GGML_F32x8_ADD |
617 | 0 | #define GGML_F32_VEC_MUL GGML_F32x8_MUL |
618 | 0 | #define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE |
619 | | |
620 | | // F16 AVX |
621 | | |
622 | 0 | #define GGML_F16_STEP 32 |
623 | 0 | #define GGML_F16_EPR 8 |
624 | | |
625 | | // F16 arithmetic is not supported by AVX, so we use F32 instead |
626 | | |
627 | 0 | #define GGML_F32Cx8 __m256 |
628 | 0 | #define GGML_F32Cx8_ZERO _mm256_setzero_ps() |
629 | 0 | #define GGML_F32Cx8_SET1(x) _mm256_set1_ps(x) |
630 | | |
631 | | #if defined(__F16C__) |
632 | | // the _mm256_cvt intrinsics require F16C |
633 | 0 | #define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x))) |
634 | 0 | #define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0)) |
635 | | #else |
636 | | static inline __m256 __avx_f32cx8_load(const ggml_fp16_t * x) { |
637 | | float tmp[8]; |
638 | | |
639 | | for (int i = 0; i < 8; i++) { |
640 | | tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]); |
641 | | } |
642 | | |
643 | | return _mm256_loadu_ps(tmp); |
644 | | } |
645 | | static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) { |
646 | | float arr[8]; |
647 | | |
648 | | _mm256_storeu_ps(arr, y); |
649 | | |
650 | | for (int i = 0; i < 8; i++) |
651 | | x[i] = GGML_CPU_FP32_TO_FP16(arr[i]); |
652 | | } |
653 | | #define GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x) |
654 | | #define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y) |
655 | | #endif |
656 | | |
657 | 0 | #define GGML_F32Cx8_FMA GGML_F32x8_FMA |
658 | | #define GGML_F32Cx8_ADD _mm256_add_ps |
659 | 0 | #define GGML_F32Cx8_MUL _mm256_mul_ps |
660 | 0 | #define GGML_F32Cx8_REDUCE GGML_F32x8_REDUCE |
661 | | |
662 | 0 | #define GGML_F16_VEC GGML_F32Cx8 |
663 | 0 | #define GGML_F16_VEC_ZERO GGML_F32Cx8_ZERO |
664 | 0 | #define GGML_F16_VEC_SET1 GGML_F32Cx8_SET1 |
665 | 0 | #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx8_LOAD(p) |
666 | 0 | #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i]) |
667 | 0 | #define GGML_F16_VEC_FMA GGML_F32Cx8_FMA |
668 | | #define GGML_F16_VEC_ADD GGML_F32Cx8_ADD |
669 | 0 | #define GGML_F16_VEC_MUL GGML_F32Cx8_MUL |
670 | 0 | #define GGML_F16_VEC_REDUCE GGML_F32Cx8_REDUCE |
671 | | |
672 | | #elif defined(__POWER9_VECTOR__) |
673 | | |
674 | | #define GGML_SIMD |
675 | | |
676 | | // F32 POWER9 |
677 | | |
678 | | #define GGML_F32_STEP 32 |
679 | | #define GGML_F32_EPR 4 |
680 | | |
681 | | #define GGML_F32x4 vector float |
682 | | #define GGML_F32x4_ZERO {0.0f} |
683 | | #define GGML_F32x4_SET1 vec_splats |
684 | | #define GGML_F32x4_LOAD(p) vec_xl(0, p) |
685 | | #define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p) |
686 | | #define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a) |
687 | | #define GGML_F32x4_ADD vec_add |
688 | | #define GGML_F32x4_MUL vec_mul |
689 | | #define GGML_F32x4_REDUCE(res, x) \ |
690 | | { \ |
691 | | int offset = GGML_F32_ARR >> 1; \ |
692 | | for (int i = 0; i < offset; ++i) { \ |
693 | | x[i] = vec_add(x[i], x[offset+i]); \ |
694 | | } \ |
695 | | offset >>= 1; \ |
696 | | for (int i = 0; i < offset; ++i) { \ |
697 | | x[i] = vec_add(x[i], x[offset+i]); \ |
698 | | } \ |
699 | | offset >>= 1; \ |
700 | | for (int i = 0; i < offset; ++i) { \ |
701 | | x[i] = vec_add(x[i], x[offset+i]); \ |
702 | | } \ |
703 | | res = vec_extract(x[0], 0) + \ |
704 | | vec_extract(x[0], 1) + \ |
705 | | vec_extract(x[0], 2) + \ |
706 | | vec_extract(x[0], 3); \ |
707 | | } |
708 | | #define GGML_F32x4_REDUCE_4(res, s0, s1, s2, s3) \ |
709 | | { \ |
710 | | vector float v = vec_add(vec_add(s0, s1), \ |
711 | | vec_add(s2, s3)); \ |
712 | | v = vec_add(v, vec_sld(v, v, 8)); \ |
713 | | v = vec_add(v, vec_sld(v, v, 4)); \ |
714 | | res += (ggml_float) vec_extract(v, 0); \ |
715 | | } |
716 | | |
717 | | #define GGML_F32_VEC GGML_F32x4 |
718 | | #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO |
719 | | #define GGML_F32_VEC_SET1 GGML_F32x4_SET1 |
720 | | #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD |
721 | | #define GGML_F32_VEC_STORE GGML_F32x4_STORE |
722 | | #define GGML_F32_VEC_FMA GGML_F32x4_FMA |
723 | | #define GGML_F32_VEC_ADD GGML_F32x4_ADD |
724 | | #define GGML_F32_VEC_MUL GGML_F32x4_MUL |
725 | | #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE |
726 | | |
727 | | // F16 POWER9 |
728 | | #define GGML_F16_STEP GGML_F32_STEP |
729 | | #define GGML_F16_EPR GGML_F32_EPR |
730 | | #define GGML_F16_VEC GGML_F32x4 |
731 | | #define GGML_F16_VEC_ZERO GGML_F32x4_ZERO |
732 | | #define GGML_F16_VEC_SET1 GGML_F32x4_SET1 |
733 | | #define GGML_F16_VEC_FMA GGML_F32x4_FMA |
734 | | #define GGML_F16_VEC_ADD GGML_F32x4_ADD |
735 | | #define GGML_F16_VEC_MUL GGML_F32x4_MUL |
736 | | #define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE |
737 | | // Use vec_xl, not vec_ld, in case the load address is not aligned. |
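// Each 128-bit load covers two consecutive groups of GGML_F16_EPR f16 values: even i
// converts one half to f32, odd i the other, and the store packs two f32 registers back
// into eight f16 values only on odd i.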
738 | | #define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ? \ |
739 | | vec_extract_fp32_from_shorth(vec_xl(0, p - GGML_F16_EPR)) : \ |
740 | | vec_extract_fp32_from_shortl(vec_xl(0, p)) |
741 | | static inline unsigned char ggml_endian_byte(int i) { |
742 | | uint16_t tmp_val = 1; |
743 | | return ((unsigned char *)&tmp_val)[i]; |
744 | | } |
745 | | #define GGML_ENDIAN_BYTE(i) ggml_endian_byte(i) |
746 | | #define GGML_F16_VEC_STORE(p, r, i) \ |
747 | | if (i & 0x1) \ |
748 | | vec_xst(vec_pack_to_short_fp32(r[i - GGML_ENDIAN_BYTE(1)], \ |
749 | | r[i - GGML_ENDIAN_BYTE(0)]), \ |
750 | | 0, p - GGML_F16_EPR) |
751 | | |
752 | | //BF16 POWER9 |
753 | | #define GGML_BF16_STEP 16 |
754 | | #define GGML_BF16_EPR 8 |
755 | | |
756 | | #define GGML_BF16x8 vector unsigned short |
757 | | #define GGML_BF16x8_ZERO vec_splats((unsigned short)0) |
758 | | #define GGML_BF16x8_LOAD(p) vec_xl(0, (const unsigned short *)(p)) |
759 | | |
760 | | #define GGML_BF16_VEC GGML_BF16x8 |
761 | | #define GGML_BF16_VEC_ZERO GGML_BF16x8_ZERO |
762 | | #define GGML_BF16_VEC_LOAD GGML_BF16x8_LOAD |
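// a bf16 value is the upper 16 bits of the corresponding f32 bit pattern, so interleaving
// each lane with zeros (in the endian-appropriate order) reconstructs f32 lanes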
763 | | #if defined(__LITTLE_ENDIAN__) |
764 | | #define GGML_BF16_TO_F32_LO(v) ((vector float) vec_mergel(GGML_BF16_VEC_ZERO, (v))) |
765 | | #define GGML_BF16_TO_F32_HI(v) ((vector float) vec_mergeh(GGML_BF16_VEC_ZERO, (v))) |
766 | | #else |
767 | | #define GGML_BF16_TO_F32_LO(v) ((vector float) vec_mergel((v), GGML_BF16_VEC_ZERO)) |
768 | | #define GGML_BF16_TO_F32_HI(v) ((vector float) vec_mergeh((v), GGML_BF16_VEC_ZERO)) |
769 | | #endif |
770 | | #define GGML_BF16_FMA_LO(acc, x, y) \ |
771 | | (acc) = GGML_F32x4_FMA((acc), GGML_BF16_TO_F32_LO(x), GGML_BF16_TO_F32_LO(y)) |
772 | | #define GGML_BF16_FMA_HI(acc, x, y) \ |
773 | | (acc) = GGML_F32x4_FMA((acc), GGML_BF16_TO_F32_HI(x), GGML_BF16_TO_F32_HI(y)) |
774 | | |
775 | | #elif defined(__wasm_simd128__) |
776 | | |
777 | | #define GGML_SIMD |
778 | | |
779 | | // F32 WASM |
780 | | |
781 | | #define GGML_F32_STEP 16 |
782 | | #define GGML_F32_EPR 4 |
783 | | |
784 | | #define GGML_F32x4 v128_t |
785 | | #define GGML_F32x4_ZERO wasm_f32x4_splat(0.0f) |
786 | | #define GGML_F32x4_SET1(x) wasm_f32x4_splat(x) |
787 | | #define GGML_F32x4_LOAD wasm_v128_load |
788 | | #define GGML_F32x4_STORE wasm_v128_store |
789 | | #define GGML_F32x4_FMA(a, b, c) wasm_f32x4_add(wasm_f32x4_mul(b, c), a) |
790 | | #define GGML_F32x4_ADD wasm_f32x4_add |
791 | | #define GGML_F32x4_MUL wasm_f32x4_mul |
792 | | #define GGML_F32x4_REDUCE(res, x) \ |
793 | | { \ |
794 | | int offset = GGML_F32_ARR >> 1; \ |
795 | | for (int i = 0; i < offset; ++i) { \ |
796 | | x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ |
797 | | } \ |
798 | | offset >>= 1; \ |
799 | | for (int i = 0; i < offset; ++i) { \ |
800 | | x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ |
801 | | } \ |
802 | | offset >>= 1; \ |
803 | | for (int i = 0; i < offset; ++i) { \ |
804 | | x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ |
805 | | } \ |
806 | | res = wasm_f32x4_extract_lane(x[0], 0) + \ |
807 | | wasm_f32x4_extract_lane(x[0], 1) + \ |
808 | | wasm_f32x4_extract_lane(x[0], 2) + \ |
809 | | wasm_f32x4_extract_lane(x[0], 3); \ |
810 | | } |
811 | | |
812 | | #define GGML_F32_VEC GGML_F32x4 |
813 | | #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO |
814 | | #define GGML_F32_VEC_SET1 GGML_F32x4_SET1 |
815 | | #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD |
816 | | #define GGML_F32_VEC_STORE GGML_F32x4_STORE |
817 | | #define GGML_F32_VEC_FMA GGML_F32x4_FMA |
818 | | #define GGML_F32_VEC_ADD GGML_F32x4_ADD |
819 | | #define GGML_F32_VEC_MUL GGML_F32x4_MUL |
820 | | #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE |
821 | | |
822 | | // F16 WASM |
823 | | |
824 | | #define GGML_F16_STEP 16 |
825 | | #define GGML_F16_EPR 4 |
826 | | |
827 | | inline static v128_t __wasm_f16x4_load(const ggml_fp16_t * p) { |
828 | | float tmp[4]; |
829 | | |
830 | | tmp[0] = GGML_CPU_FP16_TO_FP32(p[0]); |
831 | | tmp[1] = GGML_CPU_FP16_TO_FP32(p[1]); |
832 | | tmp[2] = GGML_CPU_FP16_TO_FP32(p[2]); |
833 | | tmp[3] = GGML_CPU_FP16_TO_FP32(p[3]); |
834 | | |
835 | | return wasm_v128_load(tmp); |
836 | | } |
837 | | |
838 | | inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) { |
839 | | float tmp[4]; |
840 | | |
841 | | wasm_v128_store(tmp, x); |
842 | | |
843 | | p[0] = GGML_CPU_FP32_TO_FP16(tmp[0]); |
844 | | p[1] = GGML_CPU_FP32_TO_FP16(tmp[1]); |
845 | | p[2] = GGML_CPU_FP32_TO_FP16(tmp[2]); |
846 | | p[3] = GGML_CPU_FP32_TO_FP16(tmp[3]); |
847 | | } |
848 | | |
849 | | #define GGML_F16x4 v128_t |
850 | | #define GGML_F16x4_ZERO wasm_f32x4_splat(0.0f) |
851 | | #define GGML_F16x4_SET1(x) wasm_f32x4_splat(x) |
852 | | #define GGML_F16x4_LOAD(x) __wasm_f16x4_load(x) |
853 | | #define GGML_F16x4_STORE(x, y) __wasm_f16x4_store(x, y) |
854 | | #define GGML_F16x4_FMA GGML_F32x4_FMA |
855 | | #define GGML_F16x4_ADD wasm_f32x4_add |
856 | | #define GGML_F16x4_MUL wasm_f32x4_mul |
857 | | #define GGML_F16x4_REDUCE(res, x) \ |
858 | | { \ |
859 | | int offset = GGML_F16_ARR >> 1; \ |
860 | | for (int i = 0; i < offset; ++i) { \ |
861 | | x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ |
862 | | } \ |
863 | | offset >>= 1; \ |
864 | | for (int i = 0; i < offset; ++i) { \ |
865 | | x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ |
866 | | } \ |
867 | | offset >>= 1; \ |
868 | | for (int i = 0; i < offset; ++i) { \ |
869 | | x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ |
870 | | } \ |
871 | | res = (ggml_float) (wasm_f32x4_extract_lane(x[0], 0) + \ |
872 | | wasm_f32x4_extract_lane(x[0], 1) + \ |
873 | | wasm_f32x4_extract_lane(x[0], 2) + \ |
874 | | wasm_f32x4_extract_lane(x[0], 3)); \ |
875 | | } |
876 | | |
877 | | #define GGML_F16_VEC GGML_F16x4 |
878 | | #define GGML_F16_VEC_ZERO GGML_F16x4_ZERO |
879 | | #define GGML_F16_VEC_SET1 GGML_F16x4_SET1 |
880 | | #define GGML_F16_VEC_LOAD(p, i) GGML_F16x4_LOAD(p) |
881 | | #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x4_STORE(p, r[i]) |
882 | | #define GGML_F16_VEC_FMA GGML_F16x4_FMA |
883 | | #define GGML_F16_VEC_ADD GGML_F16x4_ADD |
884 | | #define GGML_F16_VEC_MUL GGML_F16x4_MUL |
885 | | #define GGML_F16_VEC_REDUCE GGML_F16x4_REDUCE |
886 | | |
887 | | #elif defined(__SSE3__) |
888 | | |
889 | | #define GGML_SIMD |
890 | | |
891 | | // F32 SSE |
892 | | |
893 | | #define GGML_F32_STEP 32 |
894 | | #define GGML_F32_EPR 4 |
895 | | |
896 | | #define GGML_F32x4 __m128 |
897 | | #define GGML_F32x4_ZERO _mm_setzero_ps() |
898 | | #define GGML_F32x4_SET1(x) _mm_set1_ps(x) |
899 | | #define GGML_F32x4_LOAD _mm_loadu_ps |
900 | | #define GGML_F32x4_STORE _mm_storeu_ps |
901 | | #if defined(__FMA__) |
902 | | // TODO: Does this work? |
903 | | #define GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a) |
904 | | #else |
905 | | #define GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a) |
906 | | #endif |
907 | | #define GGML_F32x4_ADD _mm_add_ps |
908 | | #define GGML_F32x4_MUL _mm_mul_ps |
909 | | #define GGML_F32x4_REDUCE(res, x) \ |
910 | | { \ |
911 | | int offset = GGML_F32_ARR >> 1; \ |
912 | | for (int i = 0; i < offset; ++i) { \ |
913 | | x[i] = _mm_add_ps(x[i], x[offset+i]); \ |
914 | | } \ |
915 | | offset >>= 1; \ |
916 | | for (int i = 0; i < offset; ++i) { \ |
917 | | x[i] = _mm_add_ps(x[i], x[offset+i]); \ |
918 | | } \ |
919 | | offset >>= 1; \ |
920 | | for (int i = 0; i < offset; ++i) { \ |
921 | | x[i] = _mm_add_ps(x[i], x[offset+i]); \ |
922 | | } \ |
923 | | const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \ |
924 | | res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \ |
925 | | } |
926 | | // TODO: is this optimal ? |
927 | | |
928 | | #define GGML_F32_VEC GGML_F32x4 |
929 | | #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO |
930 | | #define GGML_F32_VEC_SET1 GGML_F32x4_SET1 |
931 | | #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD |
932 | | #define GGML_F32_VEC_STORE GGML_F32x4_STORE |
933 | | #define GGML_F32_VEC_FMA GGML_F32x4_FMA |
934 | | #define GGML_F32_VEC_ADD GGML_F32x4_ADD |
935 | | #define GGML_F32_VEC_MUL GGML_F32x4_MUL |
936 | | #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE |
937 | | |
938 | | // F16 SSE |
939 | | |
940 | | #define GGML_F16_STEP 32 |
941 | | #define GGML_F16_EPR 4 |
942 | | |
943 | | static inline __m128 __sse_f16x4_load(const ggml_fp16_t * x) { |
944 | | float tmp[4]; |
945 | | |
946 | | tmp[0] = GGML_CPU_FP16_TO_FP32(x[0]); |
947 | | tmp[1] = GGML_CPU_FP16_TO_FP32(x[1]); |
948 | | tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]); |
949 | | tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]); |
950 | | |
951 | | return _mm_loadu_ps(tmp); |
952 | | } |
953 | | |
954 | | static inline void __sse_f16x4_store(ggml_fp16_t * x, __m128 y) { |
955 | | float arr[4]; |
956 | | |
957 | | _mm_storeu_ps(arr, y); |
958 | | |
959 | | x[0] = GGML_CPU_FP32_TO_FP16(arr[0]); |
960 | | x[1] = GGML_CPU_FP32_TO_FP16(arr[1]); |
961 | | x[2] = GGML_CPU_FP32_TO_FP16(arr[2]); |
962 | | x[3] = GGML_CPU_FP32_TO_FP16(arr[3]); |
963 | | } |
964 | | |
965 | | #define GGML_F32Cx4 __m128 |
966 | | #define GGML_F32Cx4_ZERO _mm_setzero_ps() |
967 | | #define GGML_F32Cx4_SET1(x) _mm_set1_ps(x) |
968 | | #define GGML_F32Cx4_LOAD(x) __sse_f16x4_load(x) |
969 | | #define GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y) |
970 | | #define GGML_F32Cx4_FMA GGML_F32x4_FMA |
971 | | #define GGML_F32Cx4_ADD _mm_add_ps |
972 | | #define GGML_F32Cx4_MUL _mm_mul_ps |
973 | | #define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE |
974 | | |
975 | | #define GGML_F16_VEC GGML_F32Cx4 |
976 | | #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO |
977 | | #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1 |
978 | | #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p) |
979 | | #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i]) |
980 | | #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA |
981 | | #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD |
982 | | #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL |
983 | | #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE |
984 | | |
985 | | #elif defined(__loongarch_asx) |
986 | | |
987 | | #define GGML_SIMD |
988 | | |
989 | | // F32 LASX |
990 | | #define GGML_F32_STEP 32 |
991 | | #define GGML_F32_EPR 8 |
992 | | |
993 | | #define GGML_F32x8 __m256 |
994 | | #define GGML_F32x8_ZERO (__m256)__lasx_xvldi(0) |
995 | | #define GGML_F32x8_SET1(x) (__m256)__lasx_xvreplfr2vr_s((x)) |
996 | | #define GGML_F32x8_LOAD(x) (__m256)__lasx_xvld((x), 0) |
997 | | #define GGML_F32x8_STORE(x,y) __lasx_xvst((y), (x), 0) |
998 | | #define GGML_F32x8_FMA(a, b, c) __lasx_xvfmadd_s(b, c, a) |
999 | | #define GGML_F32x8_ADD __lasx_xvfadd_s |
1000 | | #define GGML_F32x8_MUL __lasx_xvfmul_s |
1001 | | #define GGML_F32x8_REDUCE(res, x) \ |
1002 | | do { \ |
1003 | | int offset = GGML_F32_ARR >> 1; \ |
1004 | | for (int i = 0; i < offset; ++i) { \ |
1005 | | x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \ |
1006 | | } \ |
1007 | | offset >>= 1; \ |
1008 | | for (int i = 0; i < offset; ++i) { \ |
1009 | | x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \ |
1010 | | } \ |
1011 | | offset >>= 1; \ |
1012 | | for (int i = 0; i < offset; ++i) { \ |
1013 | | x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \ |
1014 | | } \ |
1015 | | float *tmp_p = (float *)&x[0]; \ |
1016 | | res = tmp_p[0] + tmp_p[1] + tmp_p[2] + tmp_p[3] + tmp_p[4] + tmp_p[5] + tmp_p[6] + tmp_p[7]; \ |
1017 | | } while (0) |
1018 | | // TODO: is this optimal ? |
1019 | | |
1020 | | #define GGML_F32_VEC GGML_F32x8 |
1021 | | #define GGML_F32_VEC_ZERO GGML_F32x8_ZERO |
1022 | | #define GGML_F32_VEC_SET1 GGML_F32x8_SET1 |
1023 | | #define GGML_F32_VEC_LOAD GGML_F32x8_LOAD |
1024 | | #define GGML_F32_VEC_STORE GGML_F32x8_STORE |
1025 | | #define GGML_F32_VEC_FMA GGML_F32x8_FMA |
1026 | | #define GGML_F32_VEC_ADD GGML_F32x8_ADD |
1027 | | #define GGML_F32_VEC_MUL GGML_F32x8_MUL |
1028 | | #define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE |
1029 | | |
1030 | | // F16 LASX |
1031 | | |
1032 | | #define GGML_F16_STEP 32 |
1033 | | #define GGML_F16_EPR 8 |
1034 | | |
1035 | | // F16 arithmetic is not supported by LASX, so we use F32 instead |
1036 | | |
1037 | | #define GGML_F32Cx8 __m256 |
1038 | | #define GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0) |
1039 | | #define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplfr2vr_s((x)) |
1040 | | |
1041 | | static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) { |
1042 | | __m256i a; |
1043 | | memcpy(&a, x, sizeof(ggml_fp16_t) * 8); |
1044 | | a = __lasx_xvpermi_d(a, 0 | (1 << 4)); |
1045 | | return __lasx_xvfcvtl_s_h(a); |
1046 | | } |
1047 | | |
1048 | | static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) { |
1049 | | __m256i a = __lasx_xvfcvt_h_s(y, y); |
1050 | | a = __lasx_xvpermi_d(a, 0 | (2 << 2)); |
1051 | | memcpy(x, &a, sizeof(ggml_fp16_t) * 8); |
1052 | | } |
1053 | | #define GGML_F32Cx8_LOAD(x) __lasx_f32cx8_load(x) |
1054 | | #define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y) |
1055 | | |
1056 | | #define GGML_F32Cx8_FMA GGML_F32x8_FMA |
1057 | | #define GGML_F32Cx8_ADD __lasx_xvfadd_s |
1058 | | #define GGML_F32Cx8_MUL __lasx_xvfmul_s |
1059 | | #define GGML_F32Cx8_REDUCE GGML_F32x8_REDUCE |
1060 | | |
1061 | | #define GGML_F16_VEC GGML_F32Cx8 |
1062 | | #define GGML_F16_VEC_ZERO GGML_F32Cx8_ZERO |
1063 | | #define GGML_F16_VEC_SET1 GGML_F32Cx8_SET1 |
1064 | | #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx8_LOAD(p) |
1065 | | #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i]) |
1066 | | #define GGML_F16_VEC_FMA GGML_F32Cx8_FMA |
1067 | | #define GGML_F16_VEC_ADD GGML_F32Cx8_ADD |
1068 | | #define GGML_F16_VEC_MUL GGML_F32Cx8_MUL |
1069 | | #define GGML_F16_VEC_REDUCE GGML_F32Cx8_REDUCE |
1070 | | |
1071 | | #elif defined(__loongarch_sx) |
1072 | | |
1073 | | #define GGML_SIMD |
1074 | | |
1075 | | // F32 LSX |
1076 | | |
1077 | | #define GGML_F32_STEP 32 |
1078 | | #define GGML_F32_EPR 4 |
1079 | | |
1080 | | #define GGML_F32x4 __m128 |
1081 | | #define GGML_F32x4_ZERO (__m128)__lsx_vldi(0) |
1082 | | #define GGML_F32x4_SET1(x) (__m128)__lsx_vreplfr2vr_s((x)) |
1083 | | #define GGML_F32x4_LOAD(x) (__m128)__lsx_vld((x), 0) |
1084 | | #define GGML_F32x4_STORE(x, y) __lsx_vst(y, x, 0) |
1085 | | #define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a) |
1086 | | #define GGML_F32x4_ADD __lsx_vfadd_s |
1087 | | #define GGML_F32x4_MUL __lsx_vfmul_s |
1088 | | |
1089 | | #define GGML_F32x4_REDUCE(res, x) \ |
1090 | | { \ |
1091 | | int offset = GGML_F32_ARR >> 1; \ |
1092 | | for (int i = 0; i < offset; ++i) { \ |
1093 | | x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \ |
1094 | | } \ |
1095 | | offset >>= 1; \ |
1096 | | for (int i = 0; i < offset; ++i) { \ |
1097 | | x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \ |
1098 | | } \ |
1099 | | offset >>= 1; \ |
1100 | | for (int i = 0; i < offset; ++i) { \ |
1101 | | x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \ |
1102 | | } \ |
1103 | | __m128i t0 = __lsx_vpickev_w((__m128i)x[0], (__m128i)x[0]); \ |
1104 | | __m128i t1 = __lsx_vpickod_w((__m128i)x[0], (__m128i)x[0]); \ |
1105 | | __m128 t2 = __lsx_vfadd_s((__m128)t0, (__m128)t1); \ |
1106 | | __m128i t3 = __lsx_vpickev_w((__m128i)t2, (__m128i)t2); \ |
1107 | | __m128i t4 = __lsx_vpickod_w((__m128i)t2, (__m128i)t2); \ |
1108 | | __m128 t5 = __lsx_vfadd_s((__m128)t3, (__m128)t4); \ |
1109 | | res = (ggml_float) ((v4f32)t5)[0]; \ |
1110 | | } |
1111 | | |
1112 | | #define GGML_F32_VEC GGML_F32x4 |
1113 | | #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO |
1114 | | #define GGML_F32_VEC_SET1 GGML_F32x4_SET1 |
1115 | | #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD |
1116 | | #define GGML_F32_VEC_STORE GGML_F32x4_STORE |
1117 | | #define GGML_F32_VEC_FMA GGML_F32x4_FMA |
1118 | | #define GGML_F32_VEC_ADD GGML_F32x4_ADD |
1119 | | #define GGML_F32_VEC_MUL GGML_F32x4_MUL |
1120 | | #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE |
1121 | | |
1122 | | // F16 LSX |
1123 | | |
1124 | | #define GGML_F16_STEP 32 |
1125 | | #define GGML_F16_EPR 4 |
1126 | | |
1127 | | static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) { |
1128 | | float tmp[4]; |
1129 | | |
1130 | | tmp[0] = GGML_CPU_FP16_TO_FP32(x[0]); |
1131 | | tmp[1] = GGML_CPU_FP16_TO_FP32(x[1]); |
1132 | | tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]); |
1133 | | tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]); |
1134 | | |
1135 | | return (__m128)__lsx_vld(tmp, 0); |
1136 | | } |
1137 | | |
1138 | | static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) { |
1139 | | float arr[4]; |
1140 | | |
1141 | | __lsx_vst(y, arr, 0); |
1142 | | |
1143 | | x[0] = GGML_CPU_FP32_TO_FP16(arr[0]); |
1144 | | x[1] = GGML_CPU_FP32_TO_FP16(arr[1]); |
1145 | | x[2] = GGML_CPU_FP32_TO_FP16(arr[2]); |
1146 | | x[3] = GGML_CPU_FP32_TO_FP16(arr[3]); |
1147 | | } |
1148 | | |
1149 | | #define GGML_F32Cx4 __m128 |
1150 | | #define GGML_F32Cx4_ZERO (__m128)__lsx_vldi(0) |
1151 | | #define GGML_F32Cx4_SET1(x) (__m128)__lsx_vreplfr2vr_s((x)) |
1152 | | #define GGML_F32Cx4_LOAD(x) (__m128)__lsx_f16x4_load(x) |
1153 | | #define GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y) |
1154 | | #define GGML_F32Cx4_FMA GGML_F32x4_FMA |
1155 | | #define GGML_F32Cx4_ADD __lsx_vfadd_s |
1156 | | #define GGML_F32Cx4_MUL __lsx_vfmul_s |
1157 | | #define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE |
1158 | | |
1159 | | #define GGML_F16_VEC GGML_F32Cx4 |
1160 | | #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO |
1161 | | #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1 |
1162 | | #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p) |
1163 | | #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i]) |
1164 | | #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA |
1165 | | #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD |
1166 | | #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL |
1167 | | #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE |
1168 | | |
1169 | | #elif defined(__VXE__) || defined(__VXE2__) |
1170 | | |
1171 | | #define GGML_SIMD |
1172 | | |
1173 | | // F32 s390x |
1174 | | |
1175 | | #define GGML_F32_STEP 32 |
1176 | | #define GGML_F32_EPR 4 |
1177 | | |
1178 | | #define GGML_F32x4 float32x4_t |
1179 | | #define GGML_F32x4_ZERO vec_splats(0.0f) |
1180 | | #define GGML_F32x4_SET1 vec_splats |
1181 | | #define GGML_F32x4_LOAD(p) vec_xl(0, p) |
1182 | | #define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p) |
1183 | | #define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a) |
1184 | | #define GGML_F32x4_ADD vec_add |
1185 | | #define GGML_F32x4_MUL vec_mul |
1186 | | #define GGML_F32x4_REDUCE(res, x) \ |
1187 | | { \ |
1188 | | int offset = GGML_F32_ARR >> 1; \ |
1189 | | for (int i = 0; i < offset; ++i) { \ |
1190 | | x[i] = vec_add(x[i], x[offset + i]); \ |
1191 | | } \ |
1192 | | offset >>= 1; \ |
1193 | | for (int i = 0; i < offset; ++i) { \ |
1194 | | x[i] = vec_add(x[i], x[offset + i]); \ |
1195 | | } \ |
1196 | | offset >>= 1; \ |
1197 | | for (int i = 0; i < offset; ++i) { \ |
1198 | | x[i] = vec_add(x[i], x[offset + i]); \ |
1199 | | } \ |
1200 | | float32x4_t tmp = x[0] + vec_reve(x[0]); \ |
1201 | | res = tmp[0] + tmp[1]; \ |
1202 | | } |
1203 | | #define GGML_F32x4_REDUCE_4(res, s0, s1, s2, s3) \ |
1204 | | { \ |
1205 | | float32x4_t v = vec_add(vec_add(s0, s1), \ |
1206 | | vec_add(s2, s3)); \ |
1207 | | v = vec_add(v, vec_sld(v, v, 8)); \ |
1208 | | v = vec_add(v, vec_sld(v, v, 4)); \ |
1209 | | res += (ggml_float)vec_extract(v, 0); \ |
1210 | | } |
1211 | | |
1212 | | #define GGML_F32_VEC GGML_F32x4 |
1213 | | #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO |
1214 | | #define GGML_F32_VEC_SET1 GGML_F32x4_SET1 |
1215 | | #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD |
1216 | | #define GGML_F32_VEC_STORE GGML_F32x4_STORE |
1217 | | #define GGML_F32_VEC_FMA GGML_F32x4_FMA |
1218 | | #define GGML_F32_VEC_ADD GGML_F32x4_ADD |
1219 | | #define GGML_F32_VEC_MUL GGML_F32x4_MUL |
1220 | | #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE |
1221 | | |
1222 | | // F16 s390x |
1223 | | #define GGML_F16_STEP GGML_F32_STEP |
1224 | | #define GGML_F16_EPR GGML_F32_EPR |
1225 | | |
1226 | | static inline float32x4_t __lzs_f16cx4_load(const ggml_fp16_t * x) { |
1227 | | float tmp[4]; |
1228 | | |
1229 | | for (int i = 0; i < 4; i++) { |
1230 | | tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]); |
1231 | | } |
1232 | | |
1233 | | // note: keep type-cast here to prevent compiler bugs |
1234 | | // see: https://github.com/ggml-org/llama.cpp/issues/12846 |
1235 | | return vec_xl(0, (const float *)(tmp)); |
1236 | | } |
1237 | | |
1238 | | static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) { |
1239 | | float arr[4]; |
1240 | | |
1241 | | // note: keep type-cast here to prevent compiler bugs |
1242 | | // see: https://github.com/ggml-org/llama.cpp/issues/12846 |
1243 | | vec_xst(v_y, 0, (float *)(arr)); |
1244 | | |
1245 | | for (int i = 0; i < 4; i++) { |
1246 | | x[i] = GGML_CPU_FP32_TO_FP16(arr[i]); |
1247 | | } |
1248 | | } |
1249 | | |
1250 | | #define GGML_F16_VEC GGML_F32x4 |
1251 | | #define GGML_F16_VEC_ZERO GGML_F32x4_ZERO |
1252 | | #define GGML_F16_VEC_SET1 GGML_F32x4_SET1 |
1253 | | #define GGML_F16_VEC_LOAD(p, i) __lzs_f16cx4_load(p) |
1254 | | #define GGML_F16_VEC_STORE(p, r, i) __lzs_f16cx4_store(p, r[i]) |
1255 | | #define GGML_F16_VEC_FMA GGML_F32x4_FMA |
1256 | | #define GGML_F16_VEC_ADD GGML_F32x4_ADD |
1257 | | #define GGML_F16_VEC_MUL GGML_F32x4_MUL |
1258 | | #define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE |
1259 | | |
1260 | | // BF16 s390x |
1261 | | #define GGML_BF16_STEP 16 |
1262 | | #define GGML_BF16_EPR 8 |
1263 | | |
1264 | | #define GGML_BF16x8 __vector unsigned short |
1265 | | #define GGML_BF16x8_ZERO vec_splats((unsigned short)0) |
1266 | | #define GGML_BF16x8_LOAD(p) vec_xl(0, (const unsigned short *)(p)) |
1267 | | |
1268 | | #define GGML_BF16_VEC GGML_BF16x8 |
1269 | | #define GGML_BF16_VEC_ZERO GGML_BF16x8_ZERO |
1270 | | #define GGML_BF16_VEC_LOAD GGML_BF16x8_LOAD |
1271 | | #define GGML_BF16_TO_F32_LO(v) ((float32x4_t) vec_mergel((v), GGML_BF16_VEC_ZERO)) |
1272 | | #define GGML_BF16_TO_F32_HI(v) ((float32x4_t) vec_mergeh((v), GGML_BF16_VEC_ZERO)) |
1273 | | #define GGML_BF16_FMA_LO(acc, x, y) \ |
1274 | | (acc) = GGML_F32x4_FMA((acc), GGML_BF16_TO_F32_LO(x), GGML_BF16_TO_F32_LO(y)) |
1275 | | #define GGML_BF16_FMA_HI(acc, x, y) \ |
1276 | | (acc) = GGML_F32x4_FMA((acc), GGML_BF16_TO_F32_HI(x), GGML_BF16_TO_F32_HI(y)) |
1277 | | |
1278 | | #elif defined(__riscv_v_intrinsic) |
1279 | | |
1280 | | // compatible with vlen >= 128 |
1281 | | |
1282 | | #define GGML_SIMD |
1283 | | |
1284 | | // F32 |
1285 | | |
1286 | | #define GGML_F32_STEP 16 |
1287 | | #define GGML_F32_EPR 4 |
1288 | | |
1289 | | #define GGML_F32x4 vfloat32m1_t |
1290 | | #define GGML_F32x4_ZERO __riscv_vfmv_v_f_f32m1(0.0f, GGML_F32_EPR) |
1291 | | #define GGML_F32x4_SET1(x) __riscv_vfmv_v_f_f32m1(x, GGML_F32_EPR) |
1292 | | #define GGML_F32x4_LOAD(x) __riscv_vle32_v_f32m1(x, GGML_F32_EPR) |
1293 | | #define GGML_F32x4_STORE(b, v) __riscv_vse32_v_f32m1(b, v, GGML_F32_EPR) |
1294 | | #define GGML_F32x4_FMA(a, b, c) __riscv_vfmacc_vv_f32m1(a, b, c, GGML_F32_EPR) |
1295 | | #define GGML_F32x4_ADD(a, b) __riscv_vfadd_vv_f32m1(a, b, GGML_F32_EPR) |
1296 | | #define GGML_F32x4_MUL(a, b) __riscv_vfmul_vv_f32m1(a, b, GGML_F32_EPR) |
1297 | | |
1298 | | #define GGML_F32_VEC GGML_F32x4 |
1299 | | #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO |
1300 | | #define GGML_F32_VEC_SET1 GGML_F32x4_SET1 |
1301 | | #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD |
1302 | | #define GGML_F32_VEC_STORE GGML_F32x4_STORE |
1303 | | #define GGML_F32_VEC_FMA GGML_F32x4_FMA |
1304 | | #define GGML_F32_VEC_ADD GGML_F32x4_ADD |
1305 | | #define GGML_F32_VEC_MUL GGML_F32x4_MUL |
1306 | | #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE |
1307 | | |
1308 | | #endif |
1309 | | |
1310 | | // GGML_F32_ARR / GGML_F16_ARR |
1311 | | // number of registers to use per step |
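// e.g. on AVX: GGML_F32_STEP = 32 and GGML_F32_EPR = 8, so GGML_F32_ARR = 4 accumulators
// are processed per step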
1312 | | #ifdef GGML_SIMD |
1313 | 0 | #define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR) |
1314 | 0 | #define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR) |
1315 | | #endif |
1316 | | |
1317 | | #ifdef __cplusplus |
1318 | | } |
1319 | | #endif |