/rust/registry/src/index.crates.io-1949cf8c6b5b557f/zune-jpeg-0.4.21/src/idct.rs
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2023. |
3 | | * |
4 | | * This software is free software; |
5 | | * |
6 | | * You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license |
7 | | */ |
8 | | |
9 | | //! Routines for IDCT |
10 | | //! |
11 | | //! Essentially we provide 2 routines for IDCT: a scalar implementation and a not-super-optimized |
12 | | //! AVX2 one. I'll talk about them here. |
13 | | //! |
14 | | //! There are 2 reasons why we have the avx one |
15 | | //! 1. No one compiles with `-C target-feature=+avx2`, hence binaries probably won't take advantage of it (even |
16 | | //! if it exists). |
17 | | //! 2. AVX employs zero short circuit in a way the scalar code cannot employ it. |
18 | | //! - AVX does this by checking for MCU's whose 63 AC coefficients are zero and if true, it writes |
19 | | //! values directly, if false, it goes the long way of calculating. |
20 | | //! - Although this can be trivially implemented in the scalar version, it generates code |
21 | | //! I'm not happy with (the scalar version basically loops, and that is too many branches for me). |
22 | | //! The AVX one does a better job by using bitwise ORs (`_mm256_or_si256`), which is orders of magnitude faster |
23 | | //! than anything I could come up with. |
24 | | //! |
25 | | //! The AVX code also has some cool transpose_u16 instructions which look complicated enough to be cool |
26 | | //! (spoiler alert: I barely understand how they work, that's why I credited the owner). |
27 | | //! |
28 | | #![allow( |
29 | | clippy::excessive_precision, |
30 | | clippy::unreadable_literal, |
31 | | clippy::module_name_repetitions, |
32 | | unused_parens, |
33 | | clippy::wildcard_imports |
34 | | )] |
35 | | |
36 | | use zune_core::log::debug; |
37 | | use zune_core::options::DecoderOptions; |
38 | | |
39 | | use crate::decoder::IDCTPtr; |
40 | | use crate::idct::scalar::idct_int; |
41 | | |
42 | | #[cfg(feature = "x86")] |
43 | | pub mod avx2; |
44 | | #[cfg(feature = "neon")] |
45 | | pub mod neon; |
46 | | |
47 | | pub mod scalar; |
48 | | |
49 | | /// Choose an appropriate IDCT function |
50 | | #[allow(unused_variables)] |
51 | 5.19k | pub fn choose_idct_func(options: &DecoderOptions) -> IDCTPtr { |
52 | | #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] |
53 | | #[cfg(feature = "x86")] |
54 | | { |
55 | 5.19k | if options.use_avx2() { |
56 | | debug!("Using vector integer IDCT"); |
57 | | // use avx one |
58 | 5.19k | return crate::idct::avx2::idct_avx2; |
59 | 0 | } |
60 | | } |
61 | | #[cfg(target_arch = "aarch64")] |
62 | | #[cfg(feature = "neon")] |
63 | | { |
64 | | if options.use_neon() { |
65 | | debug!("Using vector integer IDCT"); |
66 | | return crate::idct::neon::idct_neon; |
67 | | } |
68 | | } |
69 | | debug!("Using scalar integer IDCT"); |
70 | | // use generic one |
71 | 0 | return idct_int; |
72 | 5.19k | } |
73 | | |
#[cfg(test)]
// When a SIMD branch of `idct_fnc` is compiled in, its scalar fallback
// expression becomes unreachable.
#[allow(unreachable_code, dead_code)]
mod tests {
    use super::*;

    /// Runs the routine returned by `idct_fnc` and the scalar `idct_int` on
    /// identical copies of `$coeff` (stride 8, 64-element outputs) and asserts
    /// that both produce the same output.
    ///
    /// This is a macro rather than a function so that the array element types
    /// keep being inferred from the IDCT signatures.
    macro_rules! assert_matches_scalar {
        ($coeff:expr) => {{
            let stride = 8;
            let mut vector_input = $coeff;
            let mut scalar_input = vector_input;
            let mut scalar_output = [0; 64];
            let mut vector_output = [0; 64];

            idct_fnc()(&mut vector_input, &mut vector_output, stride);
            idct_int(&mut scalar_input, &mut scalar_output, stride);

            assert_eq!(scalar_output, vector_output, "IDCT and scalar do not match");
        }};
    }

    /// Every coefficient set to 10.
    #[test]
    fn idct_test0() {
        assert_matches_scalar!([10; 64]);
    }

    /// Every coefficient set to 14.
    #[test]
    fn do_idct_test1() {
        assert_matches_scalar!([14; 64]);
    }

    /// Only the first and last coefficients are non-zero.
    #[test]
    fn do_idct_test2() {
        let mut coeff = [0; 64];
        coeff[0] = 255;
        coeff[63] = -256;
        assert_matches_scalar!(coeff);
    }

    /// Every coefficient is zero.
    #[test]
    fn do_idct_zeros() {
        assert_matches_scalar!([0; 64]);
    }

    /// Returns the IDCT routine under test: the NEON one on `aarch64` with the
    /// `neon` feature, the AVX2 one on `x86`/`x86_64` with the `x86` feature,
    /// and the scalar routine otherwise.
    ///
    /// The choice is made purely at compile time; no runtime CPU check is done
    /// here.
    fn idct_fnc() -> IDCTPtr {
        #[cfg(all(feature = "neon", target_arch = "aarch64"))]
        {
            return crate::idct::neon::idct_neon;
        }

        #[cfg(all(feature = "x86", any(target_arch = "x86", target_arch = "x86_64")))]
        {
            return crate::idct::avx2::idct_avx2;
        }

        idct_int
    }
}