Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright 2020 Google LLC. |
3 | | * |
4 | | * Use of this source code is governed by a BSD-style license that can be |
5 | | * found in the LICENSE file. |
6 | | */ |
7 | | |
8 | | #ifndef GrVx_DEFINED |
9 | | #define GrVx_DEFINED |
10 | | |
11 | | #include "include/core/SkTypes.h" |
12 | | #include "include/private/SkVx.h" |
13 | | |
14 | | // grvx is Ganesh's addendum to skvx, Skia's SIMD library. Here we introduce functions that are |
15 | | // approximate and/or have LSB differences from platform to platform (e.g., by using hardware FMAs |
16 | | // when available). When a function is approximate, its error range is well documented and tested. |
17 | | namespace grvx { |
18 | | |
19 | | // Allow floating point contraction. e.g., allow a*x + y to be compiled to a single FMA even though |
20 | | // it introduces LSB differences on platforms that don't have an FMA instruction. |
21 | | #if defined(__clang__) |
22 | | #pragma STDC FP_CONTRACT ON |
23 | | #endif |
24 | | |
25 | | // Use familiar type names and functions from SkSL and GLSL. |
26 | | template<int N> using vec = skvx::Vec<N, float>; |
27 | | using float2 = vec<2>; |
28 | | using float4 = vec<4>; |
29 | | |
30 | | template<int N> using ivec = skvx::Vec<N, int32_t>; |
31 | | using int2 = ivec<2>; |
32 | | using int4 = ivec<4>; |
33 | | |
34 | | template<int N> using uvec = skvx::Vec<N, uint32_t>; |
35 | | using uint2 = uvec<2>; |
36 | | using uint4 = uvec<4>; |
37 | | |
38 | 0 | static SK_ALWAYS_INLINE float dot(float2 a, float2 b) { |
39 | 0 | float2 ab = a*b; |
40 | 0 | return ab[0] + ab[1]; |
41 | 0 | } Unexecuted instantiation: AAConvexPathRenderer.cpp:grvx::dot(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: AAHairLinePathRenderer.cpp:grvx::dot(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: AALinearizingConvexPathRenderer.cpp:grvx::dot(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: DefaultPathRenderer.cpp:grvx::dot(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: TriangulatingPathRenderer.cpp:grvx::dot(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: SkShadowTessellator.cpp:grvx::dot(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: GrDistanceFieldGenFromVector.cpp:grvx::dot(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: GrAAConvexTessellator.cpp:grvx::dot(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: GrPathUtils.cpp:grvx::dot(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: GrTriangulator.cpp:grvx::dot(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: AtlasPathRenderer.cpp:grvx::dot(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: GrFillRRectOp.cpp:grvx::dot(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: TessellationPathRenderer.cpp:grvx::dot(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: GrAtlasRenderTask.cpp:grvx::dot(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: PathInnerTriangulateOp.cpp:grvx::dot(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: PathStencilCoverOp.cpp:grvx::dot(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: PathTessellateOp.cpp:grvx::dot(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: StrokeTessellateOp.cpp:grvx::dot(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: GrPathCurveTessellator.cpp:grvx::dot(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: GrPathWedgeTessellator.cpp:grvx::dot(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: GrStrokeFixedCountTessellator.cpp:grvx::dot(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: GrStrokeHardwareTessellator.cpp:grvx::dot(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: GrPathTessellationShader_Hardware.cpp:grvx::dot(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: GrPathTessellationShader_MiddleOut.cpp:grvx::dot(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: GrStrokeTessellationShader.cpp:grvx::dot(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: GrStrokeTessellationShader_HardwareImpl.cpp:grvx::dot(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: GrStrokeTessellationShader_InstancedImpl.cpp:grvx::dot(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: FuzzTriangulation.cpp:grvx::dot(skvx::Vec<2, float>, skvx::Vec<2, float>) |
42 | | |
43 | 0 | static SK_ALWAYS_INLINE float cross(float2 a, float2 b) { |
44 | 0 | float2 x = a*skvx::shuffle<1,0>(b); |
45 | 0 | return x[0] - x[1]; |
46 | 0 | } Unexecuted instantiation: AAConvexPathRenderer.cpp:grvx::cross(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: AAHairLinePathRenderer.cpp:grvx::cross(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: AALinearizingConvexPathRenderer.cpp:grvx::cross(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: DefaultPathRenderer.cpp:grvx::cross(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: TriangulatingPathRenderer.cpp:grvx::cross(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: SkShadowTessellator.cpp:grvx::cross(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: GrDistanceFieldGenFromVector.cpp:grvx::cross(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: GrAAConvexTessellator.cpp:grvx::cross(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: GrPathUtils.cpp:grvx::cross(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: GrTriangulator.cpp:grvx::cross(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: AtlasPathRenderer.cpp:grvx::cross(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: GrFillRRectOp.cpp:grvx::cross(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: TessellationPathRenderer.cpp:grvx::cross(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: GrAtlasRenderTask.cpp:grvx::cross(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: PathInnerTriangulateOp.cpp:grvx::cross(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: PathStencilCoverOp.cpp:grvx::cross(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: PathTessellateOp.cpp:grvx::cross(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: StrokeTessellateOp.cpp:grvx::cross(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: GrPathCurveTessellator.cpp:grvx::cross(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: GrPathWedgeTessellator.cpp:grvx::cross(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: GrStrokeFixedCountTessellator.cpp:grvx::cross(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: GrStrokeHardwareTessellator.cpp:grvx::cross(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: GrPathTessellationShader_Hardware.cpp:grvx::cross(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: GrPathTessellationShader_MiddleOut.cpp:grvx::cross(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: GrStrokeTessellationShader.cpp:grvx::cross(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: GrStrokeTessellationShader_HardwareImpl.cpp:grvx::cross(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: GrStrokeTessellationShader_InstancedImpl.cpp:grvx::cross(skvx::Vec<2, float>, skvx::Vec<2, float>) Unexecuted instantiation: FuzzTriangulation.cpp:grvx::cross(skvx::Vec<2, float>, skvx::Vec<2, float>) |
47 | | |
48 | | // Returns f*m + a. The actual implementation may or may not be fused, depending on hardware |
49 | | // support. We call this method "fast_madd" to draw attention to the fact that the operation may |
50 | | // give different results on different platforms. |
51 | 122k | template<int N> SK_ALWAYS_INLINE vec<N> fast_madd(vec<N> f, vec<N> m, vec<N> a) { |
52 | | #if FP_FAST_FMAF |
53 | | return skvx::fma(f,m,a); |
54 | | #else |
55 | 122k | return f*m + a; |
56 | 122k | #endif |
57 | 122k | } skvx::Vec<2, float> grvx::fast_madd<2>(skvx::Vec<2, float>, skvx::Vec<2, float>, skvx::Vec<2, float>) Line | Count | Source | 51 | 106k | template<int N> SK_ALWAYS_INLINE vec<N> fast_madd(vec<N> f, vec<N> m, vec<N> a) { | 52 | | #if FP_FAST_FMAF | 53 | | return skvx::fma(f,m,a); | 54 | | #else | 55 | 106k | return f*m + a; | 56 | 106k | #endif | 57 | 106k | } |
skvx::Vec<4, float> grvx::fast_madd<4>(skvx::Vec<4, float>, skvx::Vec<4, float>, skvx::Vec<4, float>) Line | Count | Source | 51 | 15.3k | template<int N> SK_ALWAYS_INLINE vec<N> fast_madd(vec<N> f, vec<N> m, vec<N> a) { | 52 | | #if FP_FAST_FMAF | 53 | | return skvx::fma(f,m,a); | 54 | | #else | 55 | 15.3k | return f*m + a; | 56 | 15.3k | #endif | 57 | 15.3k | } |
|
58 | | |
59 | | // Approximates the inverse cosine of x within 0.96 degrees using the rational polynomial: |
60 | | // |
61 | | // acos(x) ~= (bx^3 + ax) / (dx^4 + cx^2 + 1) + pi/2 |
62 | | // |
63 | | // See: https://stackoverflow.com/a/36387954 |
64 | | // |
65 | | // For a proof of max error, see the "grvx_approx_acos" unit test. |
66 | | // |
67 | | // NOTE: This function deviates immediately from pi and 0 outside -1 and 1. (The derivatives are |
68 | | // infinite at -1 and 1). So the input must still be clamped between -1 and 1. |
69 | 0 | #define GRVX_APPROX_ACOS_MAX_ERROR SkDegreesToRadians(.96f) |
70 | 0 | template<int N> SK_ALWAYS_INLINE vec<N> approx_acos(vec<N> x) { |
71 | 0 | constexpr static float a = -0.939115566365855f; |
72 | 0 | constexpr static float b = 0.9217841528914573f; |
73 | 0 | constexpr static float c = -1.2845906244690837f; |
74 | 0 | constexpr static float d = 0.295624144969963174f; |
75 | 0 | constexpr static float pi_over_2 = 1.5707963267948966f; |
76 | 0 | vec<N> xx = x*x; |
77 | 0 | vec<N> numer = fast_madd<N>(b,xx,a); |
78 | 0 | vec<N> denom = fast_madd<N>(xx, fast_madd<N>(d,xx,c), 1); |
79 | 0 | return fast_madd<N>(x, numer/denom, pi_over_2); |
80 | 0 | } |
81 | | |
82 | | // Approximates the angle between vectors a and b within .96 degrees (GRVX_FAST_ACOS_MAX_ERROR). |
83 | | // a (and b) represent "N" (Nx2/2) 2d vectors in SIMD, with the x values found in a.lo, and the |
84 | | // y values in a.hi. |
85 | | // |
86 | | // Due to fp32 overflow, this method is only valid for magnitudes in the range (2^-31, 2^31) |
87 | | // exclusive. Results are undefined if the inputs fall outside this range. |
88 | | // |
89 | | // NOTE: If necessary, we can extend our valid range to 2^(+/-63) by normalizing a and b separately. |
90 | | // i.e.: "cosTheta = dot(a,b) / sqrt(dot(a,a)) / sqrt(dot(b,b))". |
91 | | template<int Nx2> |
92 | | SK_ALWAYS_INLINE vec<Nx2/2> approx_angle_between_vectors(vec<Nx2> a, vec<Nx2> b) { |
93 | | auto aa=a*a, bb=b*b, ab=a*b; |
94 | | auto cosTheta = (ab.lo + ab.hi) / skvx::sqrt((aa.lo + aa.hi) * (bb.lo + bb.hi)); |
95 | | // Clamp cosTheta such that if it is NaN (e.g., if a or b was 0), then we return acos(1) = 0. |
96 | | cosTheta = skvx::max(skvx::min(1, cosTheta), -1); |
97 | | return approx_acos(cosTheta); |
98 | | } |
99 | | |
100 | | // De-interleaving load of 4 vectors. |
101 | | // |
102 | | // WARNING: These are really only supported well on NEON. Consider restructuring your data before |
103 | | // resorting to these methods. |
104 | | template<typename T> |
105 | | SK_ALWAYS_INLINE void strided_load4(const T* v, skvx::Vec<1,T>& a, skvx::Vec<1,T>& b, |
106 | | skvx::Vec<1,T>& c, skvx::Vec<1,T>& d) { |
107 | | a.val = v[0]; |
108 | | b.val = v[1]; |
109 | | c.val = v[2]; |
110 | | d.val = v[3]; |
111 | | } |
112 | | template<int N, typename T> |
113 | | SK_ALWAYS_INLINE typename std::enable_if<N >= 2, void>::type |
114 | | strided_load4(const T* v, skvx::Vec<N,T>& a, skvx::Vec<N,T>& b, skvx::Vec<N,T>& c, |
115 | | skvx::Vec<N,T>& d) { |
116 | | strided_load4(v, a.lo, b.lo, c.lo, d.lo); |
117 | | strided_load4(v + 4*(N/2), a.hi, b.hi, c.hi, d.hi); |
118 | | } |
119 | | #if !defined(SKNX_NO_SIMD) |
120 | | #if defined(__ARM_NEON) |
121 | | #define IMPL_LOAD4_TRANSPOSED(N, T, VLD) \ |
122 | | template<> \ |
123 | | SK_ALWAYS_INLINE void strided_load4(const T* v, skvx::Vec<N,T>& a, skvx::Vec<N,T>& b, \ |
124 | | skvx::Vec<N,T>& c, skvx::Vec<N,T>& d) { \ |
125 | | auto mat = VLD(v); \ |
126 | | a = skvx::bit_pun<skvx::Vec<N,T>>(mat.val[0]); \ |
127 | | b = skvx::bit_pun<skvx::Vec<N,T>>(mat.val[1]); \ |
128 | | c = skvx::bit_pun<skvx::Vec<N,T>>(mat.val[2]); \ |
129 | | d = skvx::bit_pun<skvx::Vec<N,T>>(mat.val[3]); \ |
130 | | } |
131 | | IMPL_LOAD4_TRANSPOSED(2, uint32_t, vld4_u32); |
132 | | IMPL_LOAD4_TRANSPOSED(4, uint16_t, vld4_u16); |
133 | | IMPL_LOAD4_TRANSPOSED(8, uint8_t, vld4_u8); |
134 | | IMPL_LOAD4_TRANSPOSED(2, int32_t, vld4_s32); |
135 | | IMPL_LOAD4_TRANSPOSED(4, int16_t, vld4_s16); |
136 | | IMPL_LOAD4_TRANSPOSED(8, int8_t, vld4_s8); |
137 | | IMPL_LOAD4_TRANSPOSED(2, float, vld4_f32); |
138 | | IMPL_LOAD4_TRANSPOSED(4, uint32_t, vld4q_u32); |
139 | | IMPL_LOAD4_TRANSPOSED(8, uint16_t, vld4q_u16); |
140 | | IMPL_LOAD4_TRANSPOSED(16, uint8_t, vld4q_u8); |
141 | | IMPL_LOAD4_TRANSPOSED(4, int32_t, vld4q_s32); |
142 | | IMPL_LOAD4_TRANSPOSED(8, int16_t, vld4q_s16); |
143 | | IMPL_LOAD4_TRANSPOSED(16, int8_t, vld4q_s8); |
144 | | IMPL_LOAD4_TRANSPOSED(4, float, vld4q_f32); |
145 | | #undef IMPL_LOAD4_TRANSPOSED |
146 | | #elif defined(__SSE__) |
147 | | template<> |
148 | 0 | SK_ALWAYS_INLINE void strided_load4(const float* v, float4& a, float4& b, float4& c, float4& d) { |
149 | 0 | using skvx::bit_pun; |
150 | 0 | __m128 a_ = _mm_loadu_ps(v); |
151 | 0 | __m128 b_ = _mm_loadu_ps(v+4); |
152 | 0 | __m128 c_ = _mm_loadu_ps(v+8); |
153 | 0 | __m128 d_ = _mm_loadu_ps(v+12); |
154 | 0 | _MM_TRANSPOSE4_PS(a_, b_, c_, d_); |
155 | 0 | a = bit_pun<float4>(a_); |
156 | 0 | b = bit_pun<float4>(b_); |
157 | 0 | c = bit_pun<float4>(c_); |
158 | 0 | d = bit_pun<float4>(d_); |
159 | 0 | } |
160 | | #endif |
161 | | #endif |
162 | | |
163 | | // De-interleaving load of 2 vectors. |
164 | | // |
165 | | // WARNING: These are really only supported well on NEON. Consider restructuring your data before |
166 | | // resorting to these methods. |
167 | | template<typename T> |
168 | 0 | SK_ALWAYS_INLINE void strided_load2(const T* v, skvx::Vec<1,T>& a, skvx::Vec<1,T>& b) { |
169 | 0 | a.val = v[0]; |
170 | 0 | b.val = v[1]; |
171 | 0 | } |
172 | | template<int N, typename T> |
173 | | SK_ALWAYS_INLINE typename std::enable_if<N >= 2, void>::type |
174 | 0 | strided_load2(const T* v, skvx::Vec<N,T>& a, skvx::Vec<N,T>& b) { |
175 | 0 | strided_load2(v, a.lo, b.lo); |
176 | 0 | strided_load2(v + 2*(N/2), a.hi, b.hi); |
177 | 0 | } Unexecuted instantiation: std::__1::enable_if<(4)>=(2), void>::type grvx::strided_load2<4, float>(float const*, skvx::Vec<4, float>&, skvx::Vec<4, float>&) Unexecuted instantiation: std::__1::enable_if<(2)>=(2), void>::type grvx::strided_load2<2, float>(float const*, skvx::Vec<2, float>&, skvx::Vec<2, float>&) |
178 | | #if !defined(SKNX_NO_SIMD) |
179 | | #if defined(__ARM_NEON) |
180 | | #define IMPL_LOAD2_TRANSPOSED(N, T, VLD) \ |
181 | | template<> \ |
182 | | SK_ALWAYS_INLINE void strided_load2(const T* v, skvx::Vec<N,T>& a, skvx::Vec<N,T>& b) { \ |
183 | | auto mat = VLD(v); \ |
184 | | a = skvx::bit_pun<skvx::Vec<N,T>>(mat.val[0]); \ |
185 | | b = skvx::bit_pun<skvx::Vec<N,T>>(mat.val[1]); \ |
186 | | } |
187 | | IMPL_LOAD2_TRANSPOSED(2, uint32_t, vld2_u32); |
188 | | IMPL_LOAD2_TRANSPOSED(4, uint16_t, vld2_u16); |
189 | | IMPL_LOAD2_TRANSPOSED(8, uint8_t, vld2_u8); |
190 | | IMPL_LOAD2_TRANSPOSED(2, int32_t, vld2_s32); |
191 | | IMPL_LOAD2_TRANSPOSED(4, int16_t, vld2_s16); |
192 | | IMPL_LOAD2_TRANSPOSED(8, int8_t, vld2_s8); |
193 | | IMPL_LOAD2_TRANSPOSED(2, float, vld2_f32); |
194 | | IMPL_LOAD2_TRANSPOSED(4, uint32_t, vld2q_u32); |
195 | | IMPL_LOAD2_TRANSPOSED(8, uint16_t, vld2q_u16); |
196 | | IMPL_LOAD2_TRANSPOSED(16, uint8_t, vld2q_u8); |
197 | | IMPL_LOAD2_TRANSPOSED(4, int32_t, vld2q_s32); |
198 | | IMPL_LOAD2_TRANSPOSED(8, int16_t, vld2q_s16); |
199 | | IMPL_LOAD2_TRANSPOSED(16, int8_t, vld2q_s8); |
200 | | IMPL_LOAD2_TRANSPOSED(4, float, vld2q_f32); |
201 | | #undef IMPL_LOAD2_TRANSPOSED |
202 | | #endif |
203 | | #endif |
204 | | |
205 | | #if defined(__clang__) |
206 | | #pragma STDC FP_CONTRACT DEFAULT |
207 | | #endif |
208 | | |
209 | | }; // namespace grvx |
210 | | |
211 | | #endif |