/src/xnnpack/src/f32-gemm/gen/f32-gemm-1x16-minmax-avx-broadcast.c
Line | Count | Source |
1 | | // clang-format off |
2 | | // Auto-generated file. Do not edit! |
3 | | // Template: src/f32-gemm/avx-broadcast.c.in |
4 | | // Generator: tools/xngen |
5 | | // |
6 | | // Copyright 2019 Google LLC |
7 | | // |
8 | | // This source code is licensed under the BSD-style license found in the |
9 | | // LICENSE file in the root directory of this source tree. |
10 | | |
11 | | #include <assert.h> |
12 | | #include <stddef.h> |
13 | | #include <stdint.h> |
14 | | |
15 | | #include <immintrin.h> |
16 | | |
17 | | #include "src/xnnpack/common.h" |
18 | | #include "src/xnnpack/gemm.h" |
19 | | #include "src/xnnpack/microparams.h" |
20 | | |
21 | | |
// Single-row (mr == 1), 16-column f32 GEMM micro-kernel with min/max clamping,
// implemented with AVX broadcast, multiply and add intrinsics.
//
// Parameters, as used by the body below:
//   mr        - asserted to be non-zero and <= 1, i.e. exactly one row.
//   nc        - number of output columns still to produce; consumed 16 at a
//               time, with an 8/4/2/1 tail for a final partial tile.
//   kc        - reduction length expressed in bytes (asserted non-zero and a
//               multiple of sizeof(float)); the inner loop consumes
//               sizeof(float) of it per step.
//   a         - input row; one float is read per reduction step.
//   a_stride  - not referenced in this body (only one row is processed).
//   w         - weights, read with aligned 256-bit loads. Per 16-column tile:
//               16 floats that seed the accumulators (presumably the bias -
//               confirm against the weight-packing routine), followed by 16
//               floats for every reduction step.
//   c         - output row.
//   cm_stride - not referenced in this body (only one row is processed).
//   cn_stride - byte offset added to the output pointer after each full
//               16-column tile.
//   params    - supplies scalar.min / scalar.max used to clamp the results.
22 | | void xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast( |
23 | | size_t mr, |
24 | | size_t nc, |
25 | | size_t kc, |
26 | | const float* restrict a, |
27 | | size_t a_stride, |
28 | | const float* restrict w, |
29 | | float* restrict c, |
30 | | size_t cm_stride, |
31 | | size_t cn_stride, |
32 | | const struct xnn_f32_minmax_params* restrict params) |
33 | 0 | { |
34 | 0 | assert(mr != 0); |
35 | 0 | assert(mr <= 1); |
36 | 0 | assert(nc != 0); |
37 | 0 | assert(kc != 0); |
38 | 0 | assert(kc % sizeof(float) == 0); |
39 | 0 | assert(a != NULL); |
40 | 0 | assert(w != NULL); |
41 | 0 | assert(c != NULL); |
42 | | |
// Only one row is handled, so the working pointers start at the caller's
// input and output pointers.
43 | 0 | const float* a0 = a; |
44 | 0 | float* c0 = c; |
45 | |

// Replicate the clamping bounds across all 8 lanes once, outside the loops.
// NOTE(review): XNN_FORCE_REALIZATION is defined outside this file;
// presumably it stops the compiler from re-materializing these constants
// inside the loop - confirm in src/xnnpack/common.h.
46 | 0 | const __m256 vmin = _mm256_set1_ps(params->scalar.min); |
47 | 0 | const __m256 vmax = _mm256_set1_ps(params->scalar.max); |
48 | 0 | XNN_FORCE_REALIZATION(vmin); |
49 | 0 | XNN_FORCE_REALIZATION(vmax); |
50 | |

// Outer loop: one iteration per tile of up to 16 output columns.
51 | 0 | do { |
// Seed the two 8-lane accumulators from the first 16 floats of this tile's
// weights. _mm256_load_ps is the aligned-load form, so w must be 32-byte
// aligned here.
52 | 0 | __m256 vacc0x01234567 = _mm256_load_ps(w + 0); |
53 | 0 | __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8); |
54 | 0 | w += 16; |
55 | |

// Inner loop over the reduction dimension; k counts the remaining bytes of
// the input row.
56 | 0 | size_t k = kc; |
57 | 0 | do { |
// Replicate the current input element across all lanes, then step to the
// next element.
58 | 0 | const __m256 va0 = _mm256_broadcast_ss(a0); |
59 | 0 | a0 += 1; |
60 | |

// Load the 16 weights that belong to this reduction step (aligned loads).
61 | 0 | const __m256 vb01234567 = _mm256_load_ps(w); |
62 | 0 | const __m256 vb89ABCDEF = _mm256_load_ps(w + 8); |
63 | 0 | w += 16; |
64 | |

// acc += a * b, issued as a separate multiply and add.
65 | 0 | vacc0x01234567 = _mm256_add_ps(vacc0x01234567, _mm256_mul_ps(va0, vb01234567)); |
66 | 0 | vacc0x89ABCDEF = _mm256_add_ps(vacc0x89ABCDEF, _mm256_mul_ps(va0, vb89ABCDEF)); |
67 | |

// One float of the input row has been consumed.
68 | 0 | k -= sizeof(float); |
69 | 0 | } while (k != 0); |
70 | |

// Clamp from below with the min bound ...
71 | 0 | vacc0x01234567 = _mm256_max_ps(vmin, vacc0x01234567); |
72 | 0 | vacc0x89ABCDEF = _mm256_max_ps(vmin, vacc0x89ABCDEF); |
73 | |

// ... and from above with the max bound.
74 | 0 | vacc0x01234567 = _mm256_min_ps(vmax, vacc0x01234567); |
75 | 0 | vacc0x89ABCDEF = _mm256_min_ps(vmax, vacc0x89ABCDEF); |
76 | |

// Full tile: store all 16 results with unaligned stores.
// NOTE(review): XNN_LIKELY is defined outside this file; presumably a
// branch-prediction hint - confirm in src/xnnpack/common.h.
77 | 0 | if XNN_LIKELY(nc >= 16) { |
78 | 0 | _mm256_storeu_ps(c0, vacc0x01234567); |
79 | 0 | _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF); |
// cn_stride is applied as a byte offset (the arithmetic is done on uintptr_t).
80 | 0 | c0 = (float*) ((uintptr_t) c0 + cn_stride); |
81 | |

// Rewind the input pointer by kc bytes so the next tile re-reads the same row.
82 | 0 | a0 = (const float*) ((uintptr_t) a0 - kc); |
83 | |

84 | 0 | nc -= 16; |
85 | 0 | } else { |
// Partial tile (nc < 16): store 8, 4, 2 and 1 floats according to the set
// bits of nc, moving the not-yet-stored lanes down after each store.
86 | 0 | if (nc & 8) { |
87 | 0 | _mm256_storeu_ps(c0, vacc0x01234567); |
88 | |

// The upper 8 columns become the pending data.
89 | 0 | vacc0x01234567 = vacc0x89ABCDEF; |
90 | |

91 | 0 | c0 += 8; |
92 | 0 | } |
// Continue with the low 128 bits (4 floats) of the pending vector.
93 | 0 | __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567); |
94 | 0 | if (nc & 4) { |
95 | 0 | _mm_storeu_ps(c0, vacc0x0123); |
96 | |

// Switch to the high 128-bit half for the remaining columns.
97 | 0 | vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1); |
98 | |

99 | 0 | c0 += 4; |
100 | 0 | } |
101 | 0 | if (nc & 2) { |
// Store the two lowest floats.
102 | 0 | _mm_storel_pi((__m64*) c0, vacc0x0123); |
103 | |

// Move the upper two floats into the lower two positions.
104 | 0 | vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123); |
105 | |

106 | 0 | c0 += 2; |
107 | 0 | } |
108 | 0 | if (nc & 1) { |
// Store the single remaining float.
109 | 0 | _mm_store_ss(c0, vacc0x0123); |
110 | 0 | } |
111 | |

// A partial tile is always the last one; clear nc to leave the outer loop.
112 | 0 | nc = 0; |
113 | 0 | } |
114 | 0 | } while (nc != 0); |
115 | 0 | } |