/rust/registry/src/index.crates.io-1949cf8c6b5b557f/exr-1.74.0/src/compression/mod.rs
Line | Count | Source |
1 | | |
2 | | //! Contains the compression attribute definition |
3 | | //! and methods to compress and decompress data. |
4 | | |
5 | | |
6 | | // private modules make non-breaking changes easier |
7 | | mod zip; |
8 | | mod rle; |
9 | | mod piz; |
10 | | mod pxr24; |
11 | | mod b44; |
12 | | |
13 | | |
14 | | use std::convert::TryInto; |
15 | | use crate::meta::attribute::{IntegerBounds, SampleType, ChannelList}; |
16 | | use crate::error::{Result, Error, usize_to_i32, UnitResult}; |
17 | | use crate::meta::header::Header; |
18 | | |
19 | | |
20 | | /// A byte vector. |
21 | | pub type ByteVec = Vec<u8>; |
22 | | |
23 | | /// A byte slice. |
24 | | pub type Bytes<'s> = &'s [u8]; |
25 | | |
26 | | /// Specifies which compression method to use. |
27 | | /// Use uncompressed data for the fastest loading and writing speeds.
28 | | /// Use RLE compression for fast loading and writing with slight file size savings.
29 | | /// Use ZIP compression for slow processing with large file size savings.
30 | | #[derive(Debug, Clone, Copy, PartialEq)] |
31 | | pub enum Compression { |
32 | | |
33 | | /// Store uncompressed values. |
34 | | /// Produces large files that can be read and written very quickly. |
35 | | /// Consider using RLE instead, as it provides some compression with almost equivalent speed. |
36 | | Uncompressed, |
37 | | |
38 | | /// Produces slightly smaller files |
39 | | /// that can still be read and written rather quickly. |
40 | | /// The compressed file size is usually between 60 and 75 percent of the uncompressed size. |
41 | | /// Works best for images with large flat areas, such as masks and abstract graphics. |
42 | | /// This compression method is lossless. |
43 | | RLE, |
44 | | |
45 | | /// Uses ZIP compression to compress each line. Slowly produces small images |
46 | | /// which can be read with moderate speed. This compression method is lossless. |
47 | | /// Might be slightly faster but larger than `ZIP16`.
48 | | ZIP1, // TODO ZIP { individual_lines: bool, compression_level: Option<u8> } // TODO specify zip compression level? |
49 | | |
50 | | /// Uses ZIP compression to compress blocks of 16 lines. Slowly produces small images |
51 | | /// which can be read with moderate speed. This compression method is lossless. |
52 | | /// Might be slightly slower but smaller than `ZIP1`.
53 | | ZIP16, // TODO collapse with ZIP1 |
54 | | |
55 | | /// PIZ compression works well for noisy and natural images. Works better with larger tiles. |
56 | | /// Only supported for flat images, not for deep data.
57 | | /// This compression method is lossless. |
58 | | // A wavelet transform is applied to the pixel data, and the result is Huffman- |
59 | | // encoded. This scheme tends to provide the best compression ratio for the types of |
60 | | // images that are typically processed at Industrial Light & Magic. Files are |
61 | | // compressed and decompressed at roughly the same speed. For photographic |
62 | | // images with film grain, the files are reduced to between 35 and 55 percent of their |
63 | | // uncompressed size. |
64 | | // PIZ compression works well for scan-line based files, and also for tiled files with |
65 | | // large tiles, but small tiles do not shrink much. (PIZ-compressed data start with a |
66 | | // relatively long header; if the input to the compressor is short, adding the header |
67 | | // tends to offset any size reduction of the input.) |
68 | | PIZ, |
69 | | |
70 | | /// Like `ZIP1`, but reduces precision of `f32` images to `f24`. |
71 | | /// Therefore, this is lossless compression for `f16` and `u32` data, lossy compression for `f32` data. |
72 | | /// This compression method works well for depth |
73 | | /// buffers and similar images, where the possible range of values is very large, but |
74 | | /// where full 32-bit floating-point accuracy is not necessary. Rounding improves |
75 | | /// compression significantly by eliminating the pixels' 8 least significant bits, which |
76 | | /// tend to be very noisy, and therefore difficult to compress. |
77 | | /// This produces really small image files. Only supported for flat images, not for deep data. |
78 | | // After reducing 32-bit floating-point data to 24 bits by rounding (while leaving 16-bit |
79 | | // floating-point data unchanged), differences between horizontally adjacent pixels |
80 | | // are compressed with zlib, similar to ZIP. PXR24 compression preserves image |
81 | | // channels of type HALF and UINT exactly, but the relative error of FLOAT data |
82 | | // increases to about ???. |
83 | | PXR24, // TODO specify zip compression level? |
84 | | |
85 | | /// This is a lossy compression method for f16 images. |
86 | | /// It's the predecessor of the `B44A` compression, |
87 | | /// which has improved compression rates for uniformly colored areas. |
88 | | /// You should probably use `B44A` instead of the plain `B44`. |
89 | | /// |
90 | | /// Only supported for flat images, not for deep data. |
91 | | // lossy 4-by-4 pixel block compression, |
92 | | // flat fields are compressed more |
93 | | // Channels of type HALF are split into blocks of four by four pixels or 32 bytes. Each |
94 | | // block is then packed into 14 bytes, reducing the data to 44 percent of their |
95 | | // uncompressed size. When B44 compression is applied to RGB images in |
96 | | // combination with luminance/chroma encoding (see below), the size of the |
97 | | // compressed pixels is about 22 percent of the size of the original RGB data. |
98 | | // Channels of type UINT or FLOAT are not compressed. |
99 | | // Decoding is fast enough to allow real-time playback of B44-compressed OpenEXR |
100 | | // image sequences on commodity hardware. |
101 | | // The size of a B44-compressed file depends on the number of pixels in the image, |
102 | | // but not on the data in the pixels. All images with the same resolution and the same |
103 | | // set of channels have the same size. This can be advantageous for systems that |
104 | | // support real-time playback of image sequences; the predictable file size makes it |
105 | | // easier to allocate space on storage media efficiently. |
106 | | // B44 compression is only supported for flat images. |
107 | | B44, // TODO B44 { optimize_uniform_areas: bool } |
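// Worked size example from the figures above: a 256x256 HALF channel occupies
// 256*256*2 = 131072 bytes uncompressed; B44 packs each of its (256/4) * (256/4) = 4096
// blocks into 14 bytes, i.e. 4096 * 14 = 57344 bytes, which is the quoted 44 percent
// (57344 / 131072 = 0.4375).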
108 | | |
109 | | /// This is a lossy compression method for f16 images. |
110 | | /// All f32 and u32 channels will be stored without compression. |
111 | | /// All the f16 pixels are divided into 4x4 blocks. |
112 | | /// Each block is then compressed as a whole. |
113 | | /// |
114 | | /// The 32 bytes of a block will require only ~14 bytes after compression, |
115 | | /// independent of the actual pixel contents. With chroma subsampling, |
116 | | /// a block will be compressed to ~7 bytes. |
117 | | /// Uniformly colored blocks will be compressed to ~3 bytes. |
118 | | /// |
119 | | /// The 64 bytes of an f32 block will not be compressed at all.
120 | | /// |
121 | | /// Should be fast enough for real-time playback.
122 | | /// Only supported for flat images, not for deep data. |
123 | | B44A, // TODO collapse with B44 |
124 | | |
125 | | /// __This lossy compression is not yet supported by this implementation.__ |
126 | | // lossy DCT based compression, in blocks |
127 | | // of 32 scanlines. More efficient for partial buffer access. |
128 | | DWAA(Option<f32>), // TODO does this have a default value? make this non optional? default Compression Level setting is 45.0 |
129 | | |
130 | | /// __This lossy compression is not yet supported by this implementation.__ |
131 | | // lossy DCT based compression, in blocks |
132 | | // of 256 scanlines. More efficient space |
133 | | // wise and faster to decode full frames |
134 | | // than DWAA_COMPRESSION. |
135 | | DWAB(Option<f32>), // TODO collapse with DWAA. default Compression Level setting is 45.0 |
136 | | |
137 | | /// __This compression is not yet supported by this implementation.__
138 | | // High-Throughput JPEG 2000 (32 lines) |
139 | | HTJ2K32, |
140 | | |
141 | | /// __This compression is not yet supported by this implementation.__
142 | | // High-Throughput JPEG 2000 (256 lines) |
143 | | HTJ2K256, |
144 | | } |
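// A minimal, hypothetical sketch of the f32-to-f24 rounding idea described on the
// `PXR24` variant above. This is not the code run by `pxr24::compress` (the real
// encoder, like OpenEXR, additionally handles NaN, infinity and exponent overflow);
// it only illustrates rounding away the 8 least significant mantissa bits:
#[allow(unused)]
fn f32_to_f24_bits_sketch(value: f32) -> u32 {
    let bits = value.to_bits();
    let sign = bits & 0x8000_0000;
    let rest = bits & 0x7fff_ffff;
    // round to nearest by adding half of the discarded range, then zero the low 8 bits
    let rounded = (rest + 0x80) & !0xff;
    (sign | rounded) >> 8 // 24 remaining bits: 1 sign, 8 exponent, 15 mantissa
}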
145 | | |
146 | | impl std::fmt::Display for Compression { |
147 | 1 | fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { |
148 | 1 | write!(formatter, "{} compression", match self { |
149 | 0 | Compression::Uncompressed => "no", |
150 | 0 | Compression::RLE => "rle", |
151 | 0 | Compression::ZIP1 => "zip line", |
152 | 0 | Compression::ZIP16 => "zip block", |
153 | 0 | Compression::B44 => "b44", |
154 | 0 | Compression::B44A => "b44a", |
155 | 0 | Compression::DWAA(_) => "dwaa", |
156 | 1 | Compression::DWAB(_) => "dwab", |
157 | 0 | Compression::PIZ => "piz", |
158 | 0 | Compression::PXR24 => "pxr24", |
159 | 0 | Compression::HTJ2K32 => "ht j2k 32", |
160 | 0 | Compression::HTJ2K256 => "ht j2k 256", |
161 | | }) |
162 | 1 | } |
163 | | } |
164 | | |
165 | | |
166 | | |
167 | | impl Compression { |
168 | | |
169 | | /// Compress the image section, converting from native-endian into little-endian format.
170 | 67.4k | pub fn compress_image_section_to_le(self, header: &Header, uncompressed_native_endian: ByteVec, pixel_section: IntegerBounds) -> Result<ByteVec> { |
171 | 67.4k | let max_tile_size = header.max_block_pixel_size(); |
172 | | |
173 | 67.4k | assert!(pixel_section.validate(Some(max_tile_size)).is_ok(), "decompress tile coordinate bug"); |
174 | 67.4k | if header.deep { assert!(self.supports_deep_data()) } |
175 | | |
176 | | use self::Compression::*; |
177 | 67.4k | let compressed_little_endian = match self { |
178 | | Uncompressed => { |
179 | 0 | return convert_current_to_little_endian( |
180 | 0 | uncompressed_native_endian, &header.channels, pixel_section |
181 | | ) |
182 | | }, |
183 | | |
184 | | // we need to clone here, because we might have to fall back to the uncompressed data later (when the compressed data is larger than the raw data)
185 | 0 | ZIP16 => zip::compress_bytes(&header.channels, uncompressed_native_endian.clone(), pixel_section), |
186 | 0 | ZIP1 => zip::compress_bytes(&header.channels, uncompressed_native_endian.clone(), pixel_section), |
187 | 67.4k | RLE => rle::compress_bytes(&header.channels, uncompressed_native_endian.clone(), pixel_section), |
188 | 0 | PIZ => piz::compress(&header.channels, uncompressed_native_endian.clone(), pixel_section), |
189 | 0 | PXR24 => pxr24::compress(&header.channels, uncompressed_native_endian.clone(), pixel_section), |
190 | 0 | B44 => b44::compress(&header.channels, uncompressed_native_endian.clone(), pixel_section, false), |
191 | 0 | B44A => b44::compress(&header.channels, uncompressed_native_endian.clone(), pixel_section, true), |
192 | 0 | _ => return Err(Error::unsupported(format!("compression method not yet implemented: {}", self)))
193 | | }; |
194 | | |
195 | 67.4k | let compressed_little_endian = compressed_little_endian.map_err(|_| |
196 | 0 | Error::invalid(format!("pixels cannot be compressed ({})", self)) |
197 | 0 | )?; |
198 | | |
199 | 67.4k | if self == Uncompressed || compressed_little_endian.len() < uncompressed_native_endian.len() { |
200 | | // only write compressed if it actually is smaller than raw |
201 | 67.3k | Ok(compressed_little_endian) |
202 | | } |
203 | | else { |
204 | | // if we do not use compression, manually convert uncompressed data |
205 | 55 | convert_current_to_little_endian(uncompressed_native_endian, &header.channels, pixel_section) |
206 | | } |
207 | 67.4k | } |
208 | | |
209 | | /// Decompress the image section from little-endian bytes, returning bytes in native-endian format.
210 | 72.5k | pub fn decompress_image_section_from_le(self, header: &Header, compressed_le: ByteVec, pixel_section: IntegerBounds, pedantic: bool) -> Result<ByteVec> { |
211 | 72.5k | let max_tile_size = header.max_block_pixel_size(); |
212 | | |
213 | 72.5k | assert!(pixel_section.validate(Some(max_tile_size)).is_ok(), "decompress tile coordinate bug"); |
214 | 72.5k | if header.deep { assert!(self.supports_deep_data()) } |
215 | | |
216 | 72.5k | let expected_byte_size = pixel_section.size.area() * header.channels.bytes_per_pixel; // FIXME this needs to account for subsampling anywhere |
217 | | |
218 | | // note: always true when self == Uncompressed
219 | 72.5k | if compressed_le.len() == expected_byte_size { |
220 | | // the compressed data would have been larger than the raw data, so the raw data was written instead
221 | 72 | convert_little_endian_to_current(compressed_le, &header.channels, pixel_section) |
222 | | } |
223 | | else { |
224 | | use self::Compression::*; |
225 | 72.4k | let bytes_ne = match self { |
226 | 2 | Uncompressed => convert_little_endian_to_current(compressed_le, &header.channels, pixel_section), |
227 | 7 | ZIP16 => zip::decompress_bytes(&header.channels, compressed_le, pixel_section, expected_byte_size, pedantic), |
228 | 0 | ZIP1 => zip::decompress_bytes(&header.channels, compressed_le, pixel_section, expected_byte_size, pedantic), |
229 | 72.2k | RLE => rle::decompress_bytes(&header.channels, compressed_le, pixel_section, expected_byte_size, pedantic), |
230 | 63 | PIZ => piz::decompress(&header.channels, compressed_le, pixel_section, expected_byte_size, pedantic), |
231 | 1 | PXR24 => pxr24::decompress(&header.channels, compressed_le, pixel_section, expected_byte_size, pedantic), |
232 | 130 | B44 | B44A => b44::decompress(&header.channels, compressed_le, pixel_section, expected_byte_size, pedantic), |
233 | 1 | _ => return Err(Error::unsupported(format!("compression method not yet implemented: {}", self)))
234 | | }; |
235 | | |
236 | | // map all errors to compression errors |
237 | 72.4k | let bytes_ne = bytes_ne |
238 | 72.4k | .map_err(|decompression_error| match decompression_error { |
239 | 0 | Error::NotSupported(message) => |
240 | 0 | Error::unsupported(format!("not yet implemented compression special case ({})", message)),
241 | | |
242 | 204 | error => Error::invalid(format!( |
243 | 204 | "compressed {:?} data ({})", |
244 | 204 | self, error.to_string() |
245 | | )), |
246 | 204 | })?; |
247 | | |
248 | 72.2k | if bytes_ne.len() != expected_byte_size { |
249 | 146 | Err(Error::invalid("decompressed data")) |
250 | | } |
251 | | |
252 | 72.1k | else { Ok(bytes_ne) } |
253 | | } |
254 | 72.5k | } |
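// Note how the two methods above cooperate: the writer silently falls back to raw
// little-endian bytes whenever compression does not shrink a block, and the reader
// detects that fallback purely by comparing the stored byte count against
// `expected_byte_size`, without any explicit flag in the file.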
255 | | |
256 | | /// For scan line images and deep scan line images, one or more scan lines may be |
257 | | /// stored together as a scan line block. The number of scan lines per block |
258 | | /// depends on how the pixel data are compressed. |
259 | 961k | pub fn scan_lines_per_block(self) -> usize { |
260 | | use self::Compression::*; |
261 | 961k | match self { |
262 | 349k | Uncompressed | RLE | ZIP1 => 1, |
263 | 2.81k | ZIP16 | PXR24 => 16, |
264 | 609k | PIZ | B44 | B44A | DWAA(_) | HTJ2K32 => 32, |
265 | 0 | DWAB(_) | HTJ2K256 => 256, |
266 | | } |
267 | 961k | } |
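// For example, a PIZ-compressed scan line image of height 1080 is stored in
// ceil(1080 / 32) = 34 blocks, where the last block holds only 1080 - 33 * 32 = 24 lines.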
268 | | |
269 | | /// Deep data can only be compressed using RLE or ZIP compression. |
270 | 0 | pub fn supports_deep_data(self) -> bool { |
271 | | use self::Compression::*; |
272 | 0 | match self { |
273 | | Uncompressed | RLE | ZIP1 => |
274 | 0 | true, |
275 | | |
276 | | ZIP16 | PXR24 | PIZ | B44 | B44A | |
277 | | DWAA(_) | DWAB(_) | HTJ2K256 | HTJ2K32 => |
278 | 0 | false, |
279 | | } |
280 | 0 | } |
281 | | |
282 | | /// Most compression methods will reconstruct the exact pixel bytes, |
283 | | /// but some might throw away unimportant data for specific types of samples. |
284 | 0 | pub fn is_lossless_for(self, sample_type: SampleType) -> bool { |
285 | | use self::Compression::*; |
286 | 0 | match self { |
287 | 0 | PXR24 => sample_type != SampleType::F32, // pxr reduces f32 to f24 |
288 | 0 | B44 | B44A => sample_type != SampleType::F16, // b44 only compresses f16 values, others are left uncompressed |
289 | 0 | Uncompressed | RLE | ZIP1 | ZIP16 | PIZ | HTJ2K32 | HTJ2K256 => true, |
290 | 0 | DWAB(_) | DWAA(_) => false, |
291 | | } |
292 | 0 | } |
293 | | |
294 | | /// Most compression methods will reconstruct the exact pixel bytes, |
295 | | /// but some might throw away unimportant data in some cases. |
296 | 67.4k | pub fn may_loose_data(self) -> bool { |
297 | | use self::Compression::*; |
298 | 67.4k | match self { |
299 | 67.4k | Uncompressed | RLE | ZIP1 | ZIP16 | PIZ | HTJ2K32 | HTJ2K256 => false, |
300 | 0 | PXR24 | B44 | B44A | DWAB(_) | DWAA(_) => true, |
301 | | } |
302 | 67.4k | } |
303 | | |
304 | | /// Most compression methods will reconstruct the exact pixel bytes, |
305 | | /// but some might replace NaN with zeroes. |
306 | | /// This might also depend on the sample type of the pixels. |
307 | | /// Even a compression method that supports NaN might change the bit patterns of those NaNs. |
308 | 0 | pub fn supports_nan(self) -> bool { |
309 | | use self::Compression::*; |
310 | 0 | match self { |
311 | 0 | B44A | DWAB(_) | DWAA(_) => false, |
312 | 0 | Uncompressed | PXR24 | RLE | ZIP1 | ZIP16 | PIZ | B44 | HTJ2K32 | HTJ2K256 => true, |
313 | | } |
314 | 0 | } |
315 | | |
316 | | /// Most compression methods will reconstruct the exact pixel and NaN bits, |
317 | | /// but some might replace NaN bits with other NaN bits. |
318 | | /// This might also depend on the sample type of the pixels. |
319 | 0 | pub fn preserves_nan_bits(self) -> bool { |
320 | | use self::Compression::*; |
321 | 0 | match self { |
322 | 0 | B44A | PXR24 | DWAB(_) | DWAA(_) => false, |
323 | 0 | B44 | Uncompressed | RLE | ZIP1 | ZIP16 | PIZ | HTJ2K32 | HTJ2K256 => true |
324 | | } |
325 | 0 | } |
326 | | |
327 | | } |
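// A hypothetical, standalone usage sketch of the queries above (not part of this
// module's API): prefer a small lossy format, but fall back to a lossless one when
// the exact f32 bit patterns must survive a round trip.
#[allow(unused)]
fn pick_compression_sketch(needs_exact_f32: bool) -> Compression {
    let preferred = Compression::PXR24;
    if needs_exact_f32 && !preferred.is_lossless_for(SampleType::F32) {
        Compression::ZIP16 // lossless for every sample type
    }
    else { preferred }
}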
328 | | |
329 | | // see https://github.com/AcademySoftwareFoundation/openexr/blob/6a9f8af6e89547bcd370ae3cec2b12849eee0b54/OpenEXR/IlmImf/ImfMisc.cpp#L1456-L1541 |
330 | | |
331 | | #[allow(unused)] // allows the extra parameters to be unused |
332 | 67.4k | fn convert_current_to_little_endian(mut bytes: ByteVec, channels: &ChannelList, rectangle: IntegerBounds) -> Result<ByteVec> { |
333 | | #[cfg(target_endian = "big")] |
334 | | reverse_block_endianness(&mut bytes, channels, rectangle)?; |
335 | | |
336 | 67.4k | Ok(bytes) |
337 | 67.4k | } |
338 | | |
339 | | #[allow(unused)] // allows the extra parameters to be unused |
340 | 72.3k | fn convert_little_endian_to_current(mut bytes: ByteVec, channels: &ChannelList, rectangle: IntegerBounds) -> Result<ByteVec> { |
341 | | #[cfg(target_endian = "big")] |
342 | | reverse_block_endianness(&mut bytes, channels, rectangle)?; |
343 | | |
344 | 72.3k | Ok(bytes) |
345 | 72.3k | } |
346 | | |
347 | | #[allow(unused)] // unused when on little endian system |
348 | 0 | fn reverse_block_endianness(bytes: &mut [u8], channels: &ChannelList, rectangle: IntegerBounds) -> UnitResult { |
349 | 0 | let mut remaining_bytes: &mut [u8] = bytes; |
350 | | |
351 | 0 | for y in rectangle.position.y() .. rectangle.end().y() { |
352 | 0 | for channel in &channels.list { |
353 | 0 | let line_is_subsampled = mod_p(y, usize_to_i32(channel.sampling.y(), "sampling")?) != 0; |
354 | 0 | if line_is_subsampled { continue; } |
355 | | |
356 | 0 | let sample_count = rectangle.size.width() / channel.sampling.x(); |
357 | | |
358 | 0 | match channel.sample_type { |
359 | | SampleType::F16 => |
360 | 0 | remaining_bytes = convert_byte_chunks(reverse_2_bytes, 2, remaining_bytes, sample_count), |
361 | | |
362 | | SampleType::F32 => |
363 | 0 | remaining_bytes = convert_byte_chunks(reverse_4_bytes, 4, remaining_bytes, sample_count), |
364 | | |
365 | | SampleType::U32 => |
366 | 0 | remaining_bytes = convert_byte_chunks(reverse_4_bytes, 4, remaining_bytes, sample_count), |
367 | | } |
368 | | } |
369 | | } |
370 | | |
371 | | // Converts groups of bytes (e.g. 2 bytes), as many groups as specified. Returns a slice of the remaining bytes. |
372 | | #[inline] |
373 | 0 | fn convert_byte_chunks(convert_single_value: fn(&mut[u8]), batch_size: usize, bytes: &mut [u8], batch_count: usize) -> &mut [u8] { |
374 | 0 | let (line_bytes, rest) = bytes.split_at_mut(batch_count * batch_size); |
375 | 0 | let value_byte_chunks = line_bytes.chunks_exact_mut(batch_size); |
376 | | |
377 | 0 | for value_bytes in value_byte_chunks { |
378 | 0 | convert_single_value(value_bytes); |
379 | 0 | } |
380 | | |
381 | 0 | rest |
382 | 0 | } |
383 | | |
384 | 0 | debug_assert!(remaining_bytes.is_empty(), "not all bytes were converted to little endian"); |
385 | 0 | Ok(()) |
386 | 0 | } |
387 | | |
388 | | #[inline] |
389 | 0 | fn reverse_2_bytes(bytes: &mut [u8]){ |
390 | | // this code seems like it could be optimized easily by the compiler |
391 | 0 | let two_bytes: [u8; 2] = bytes.try_into().expect("invalid byte count"); |
392 | 0 | bytes.copy_from_slice(&[two_bytes[1], two_bytes[0]]); |
393 | 0 | } |
394 | | |
395 | | #[inline] |
396 | 0 | fn reverse_4_bytes(bytes: &mut [u8]){ |
397 | 0 | let four_bytes: [u8; 4] = bytes.try_into().expect("invalid byte count"); |
398 | 0 | bytes.copy_from_slice(&[four_bytes[3], four_bytes[2], four_bytes[1], four_bytes[0]]); |
399 | 0 | } |
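// An equivalent, illustrative formulation of `reverse_4_bytes` via an integer byte
// swap (an assumption for clarity, not the code this module actually runs):
#[allow(unused)]
fn reverse_4_bytes_via_swap(bytes: &mut [u8]) {
    let value = u32::from_ne_bytes(bytes.try_into().expect("invalid byte count"));
    bytes.copy_from_slice(&value.swap_bytes().to_ne_bytes());
}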
400 | | |
401 | | #[inline] |
402 | 10.2k | fn div_p (x: i32, y: i32) -> i32 { |
403 | 10.2k | if x >= 0 { |
404 | 10.2k | if y >= 0 { x / y } |
405 | 0 | else { -(x / -y) } |
406 | | } |
407 | | else { |
408 | 0 | if y >= 0 { -((y-1-x) / y) } |
409 | 0 | else { (-y-1-x) / -y } |
410 | | } |
411 | 10.2k | } |
412 | | |
413 | | #[inline] |
414 | 10.2k | fn mod_p(x: i32, y: i32) -> i32 { |
415 | 10.2k | x - y * div_p(x, y) |
416 | 10.2k | } |
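// Worked example: unlike Rust's truncating `/` and `%`, these implement floor
// division with a non-negative remainder, which the subsampling check in
// `reverse_block_endianness` relies on for negative coordinates.
// A quick check (hypothetical, not part of the crate's tests):
#[cfg(test)]
mod floor_division_test {
    #[test]
    fn negative_coordinates(){
        assert_eq!(super::div_p(-5, 4), -2); // -((4 - 1 - (-5)) / 4) = -2
        assert_eq!(super::mod_p(-5, 4), 3);  // -5 - 4 * (-2) = 3
        assert_eq!(super::mod_p(5, 4), 1);
    }
}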
417 | | |
418 | | /// A collection of functions used to prepare data for compression. |
419 | | mod optimize_bytes { |
420 | | |
421 | | /// Integrate over all differences to the previous value in order to reconstruct sample values. |
422 | 72.2k | pub fn differences_to_samples(buffer: &mut [u8]) { |
423 | | // The naive implementation is very simple: |
424 | | // |
425 | | // for index in 1..buffer.len() { |
426 | | // buffer[index] = (buffer[index - 1] as i32 + buffer[index] as i32 - 128) as u8; |
427 | | // } |
428 | | // |
429 | | // But we process elements in pairs to take advantage of instruction-level parallelism. |
430 | | // When computations within a pair do not depend on each other, they can be processed in parallel. |
431 | | // Since this function is responsible for a very large chunk of execution time, |
432 | | // this tweak alone improves decoding performance of RLE images by 20%. |
433 | 72.2k | if let Some(first) = buffer.get(0) { |
434 | 72.2k | let mut previous = *first as i16; |
435 | 243M | for chunk in &mut buffer[1..].chunks_exact_mut(2) { |
436 | 243M | // no bounds checks here due to indices and chunk size being constant |
437 | 243M | let diff0 = chunk[0] as i16; |
438 | 243M | let diff1 = chunk[1] as i16; |
439 | 243M | // these two computations do not depend on each other, unlike in the naive version, |
440 | 243M | // so they can be executed by the CPU in parallel via instruction-level parallelism |
441 | 243M | let sample0 = (previous + diff0 - 128) as u8; |
442 | 243M | let sample1 = (previous + diff0 + diff1 - 128 * 2) as u8; |
443 | 243M | chunk[0] = sample0; |
444 | 243M | chunk[1] = sample1; |
445 | 243M | previous = sample1 as i16; |
446 | 243M | } |
447 | | // handle the remaining element at the end not processed by the loop over pairs, if present |
448 | 72.2k | for elem in &mut buffer[1..].chunks_exact_mut(2).into_remainder().iter_mut() { |
449 | 72.1k | let sample = (previous + *elem as i16 - 128) as u8; |
450 | 72.1k | *elem = sample; |
451 | 72.1k | previous = sample as i16; |
452 | 72.1k | } |
453 | 3 | } |
454 | 72.2k | } |
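// Worked example for both transforms: starting from samples [10, 12, 11],
// `samples_to_differences` (below) keeps the first byte and yields
// [10, 12 - 10 + 128, 11 - 12 + 128] = [10, 130, 127], while
// `differences_to_samples` (above) restores [10, 10 + 130 - 128, 12 + 127 - 128] = [10, 12, 11].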
455 | | |
456 | | /// Replace each sample with its difference to the previous value, the inverse of `differences_to_samples`.
457 | 67.4k | pub fn samples_to_differences(buffer: &mut [u8]){ |
458 | | // naive version: |
459 | | // for index in (1..buffer.len()).rev() { |
460 | | // buffer[index] = (buffer[index] as i32 - buffer[index - 1] as i32 + 128) as u8; |
461 | | // } |
462 | | // |
463 | | // But we process elements in batches to take advantage of autovectorization. |
464 | | // If the target platform has no vector instructions (e.g. 32-bit ARM without `-C target-cpu=native`) |
465 | | // this will instead take advantage of instruction-level parallelism. |
466 | 67.4k | if let Some(first) = buffer.get(0) { |
467 | 67.4k | let mut previous = *first as i16; |
468 | | // Chunk size is 16 because we process bytes (8 bits), |
469 | | // and 8*16 = 128 bits is the size of a typical SIMD register. |
470 | | // Even WASM has 128-bit SIMD registers. |
471 | 13.6M | for chunk in &mut buffer[1..].chunks_exact_mut(16) { |
472 | 13.6M | // no bounds checks here due to indices and chunk size being constant |
473 | 13.6M | let sample0 = chunk[0] as i16; |
474 | 13.6M | let sample1 = chunk[1] as i16; |
475 | 13.6M | let sample2 = chunk[2] as i16; |
476 | 13.6M | let sample3 = chunk[3] as i16; |
477 | 13.6M | let sample4 = chunk[4] as i16; |
478 | 13.6M | let sample5 = chunk[5] as i16; |
479 | 13.6M | let sample6 = chunk[6] as i16; |
480 | 13.6M | let sample7 = chunk[7] as i16; |
481 | 13.6M | let sample8 = chunk[8] as i16; |
482 | 13.6M | let sample9 = chunk[9] as i16; |
483 | 13.6M | let sample10 = chunk[10] as i16; |
484 | 13.6M | let sample11 = chunk[11] as i16; |
485 | 13.6M | let sample12 = chunk[12] as i16; |
486 | 13.6M | let sample13 = chunk[13] as i16; |
487 | 13.6M | let sample14 = chunk[14] as i16; |
488 | 13.6M | let sample15 = chunk[15] as i16; |
489 | 13.6M | // Unlike in decoding, computations in here are truly independent from each other, |
490 | 13.6M | // which enables the compiler to vectorize this loop. |
491 | 13.6M | // Even if the target platform has no vector instructions,
492 | 13.6M | // using more parallelism doesn't imply doing more work,
493 | 13.6M | // and we're not really limited in how wide we can go.
494 | 13.6M | chunk[0] = (sample0 - previous + 128) as u8; |
495 | 13.6M | chunk[1] = (sample1 - sample0 + 128) as u8; |
496 | 13.6M | chunk[2] = (sample2 - sample1 + 128) as u8; |
497 | 13.6M | chunk[3] = (sample3 - sample2 + 128) as u8; |
498 | 13.6M | chunk[4] = (sample4 - sample3 + 128) as u8; |
499 | 13.6M | chunk[5] = (sample5 - sample4 + 128) as u8; |
500 | 13.6M | chunk[6] = (sample6 - sample5 + 128) as u8; |
501 | 13.6M | chunk[7] = (sample7 - sample6 + 128) as u8; |
502 | 13.6M | chunk[8] = (sample8 - sample7 + 128) as u8; |
503 | 13.6M | chunk[9] = (sample9 - sample8 + 128) as u8; |
504 | 13.6M | chunk[10] = (sample10 - sample9 + 128) as u8; |
505 | 13.6M | chunk[11] = (sample11 - sample10 + 128) as u8; |
506 | 13.6M | chunk[12] = (sample12 - sample11 + 128) as u8; |
507 | 13.6M | chunk[13] = (sample13 - sample12 + 128) as u8; |
508 | 13.6M | chunk[14] = (sample14 - sample13 + 128) as u8; |
509 | 13.6M | chunk[15] = (sample15 - sample14 + 128) as u8; |
510 | 13.6M | previous = sample15; |
511 | 13.6M | } |
512 | | // Handle the remaining element at the end not processed by the loop over batches, if present |
513 | | // This is what the iterator-based version of this function would look like without vectorization |
514 | 1.01M | for elem in &mut buffer[1..].chunks_exact_mut(16).into_remainder().iter_mut() { |
515 | 1.01M | let diff = (*elem as i16 - previous + 128) as u8; |
516 | 1.01M | previous = *elem as i16; |
517 | 1.01M | *elem = diff; |
518 | 1.01M | } |
519 | 0 | } |
520 | 67.4k | } |
521 | | |
522 | | use std::cell::Cell; |
523 | | thread_local! { |
524 | | // A buffer that is reused between invocations of interleaving and deinterleaving.
525 | | // Allocating memory is cheap, but zeroing or otherwise initializing it is not. |
526 | | // Doing it hundreds of times (once per block) would be expensive. |
527 | | // This optimization brings down the time spent in interleaving from 15% to 5%. |
528 | | static SCRATCH_SPACE: Cell<Vec<u8>> = Cell::new(Vec::new()); |
529 | | } |
530 | | |
531 | 139k | fn with_reused_buffer<F>(length: usize, mut func: F) where F: FnMut(&mut [u8]) { |
532 | 139k | SCRATCH_SPACE.with(|scratch_space| { |
533 | | // reuse a buffer if we've already initialized one |
534 | 139k | let mut buffer = scratch_space.take(); |
535 | 139k | if buffer.len() < length { |
536 | 4.81k | // Efficiently create a zeroed Vec by requesting zeroed memory from the OS. |
537 | 4.81k | // This is slightly faster than a `memcpy()` plus `memset()` that would happen otherwise, |
538 | 4.81k | // but is not a big deal either way since it's not a hot codepath. |
539 | 4.81k | buffer = vec![0u8; length]; |
540 | 134k | } |
541 | | |
542 | | // call the function |
543 | 139k | func(&mut buffer[..length]); |
544 | | |
545 | | // save the internal buffer for reuse |
546 | 139k | scratch_space.set(buffer); |
547 | 139k | });
548 | 139k | }
549 | | |
550 | | /// Interleave the two halves of the array, such that the bytes from the second half end up at every other position.
551 | 72.2k | pub fn interleave_byte_blocks(separated: &mut [u8]) { |
552 | 72.2k | with_reused_buffer(separated.len(), |interleaved| { |
553 | | |
554 | | // Split the two halves that we are going to interleave. |
555 | 72.2k | let (first_half, second_half) = separated.split_at((separated.len() + 1) / 2); |
556 | | // The first half can be 1 byte longer than the second if the length of the input is odd, |
557 | | // but the loop below only processes numbers in pairs. |
558 | | // To handle it, preserve the last element of the first slice, to be handled after the loop. |
559 | 72.2k | let first_half_last = first_half.last(); |
560 | | // Truncate the first half to match the length of the second one; more optimizer-friendly
561 | 72.2k | let first_half_iter = &first_half[..second_half.len()]; |
562 | | |
563 | | // Main loop that performs the interleaving |
564 | 243M | for ((first, second), interleaved) in first_half_iter.iter().zip(second_half.iter()) |
565 | 243M | .zip(interleaved.chunks_exact_mut(2)) { |
566 | 243M | // The length of each chunk is known to be 2 at compile time, |
567 | 243M | // and each index is also a constant. |
568 | 243M | // This allows the compiler to remove the bounds checks. |
569 | 243M | interleaved[0] = *first; |
570 | 243M | interleaved[1] = *second; |
571 | 243M | } |
572 | | |
573 | | // If the length of the slice was odd, restore the last element of the first half that we saved |
574 | 72.2k | if interleaved.len() % 2 == 1 { |
575 | 71 | if let Some(value) = first_half_last { |
576 | 71 | // we can unwrap() here because we just checked that the length is non-zero:
577 | 71 | // `% 2 == 1` will fail for zero |
578 | 71 | *interleaved.last_mut().unwrap() = *value; |
579 | 71 | } |
580 | 72.1k | } |
581 | | |
582 | | // write out the results |
583 | 72.2k | separated.copy_from_slice(&interleaved); |
584 | 72.2k | }); |
585 | 72.2k | } |
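// Worked example: interleaving the separated bytes [1, 2, 3, 4, 5, 6] yields
// [1, 4, 2, 5, 3, 6]: the first half fills the even positions and the second half
// the odd ones. `separate_bytes_fragments` below performs the inverse permutation.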
586 | | |
587 | | /// Separate the bytes such that the second half receives every other byte.
588 | | /// This performs deinterleaving, the inverse of `interleave_byte_blocks`.
589 | 67.4k | pub fn separate_bytes_fragments(source: &mut [u8]) { |
590 | 67.4k | with_reused_buffer(source.len(), |separated| { |
591 | | |
592 | | // Split the output buffer into the two halves that we are going to fill.
593 | 67.4k | let (first_half, second_half) = separated.split_at_mut((source.len() + 1) / 2); |
594 | | // The first half can be 1 byte longer than the second if the length of the input is odd, |
595 | | // but the loop below only processes numbers in pairs. |
596 | | // To handle it, preserve the last element of the input, to be handled after the loop. |
597 | 67.4k | let last = source.last(); |
598 | 67.4k | let first_half_iter = &mut first_half[..second_half.len()]; |
599 | | |
600 | | // Main loop that performs the deinterleaving |
601 | 109M | for ((first, second), interleaved) in first_half_iter.iter_mut().zip(second_half.iter_mut()) |
602 | 109M | .zip(source.chunks_exact(2)) { |
603 | 109M | // The length of each chunk is known to be 2 at compile time, |
604 | 109M | // and each index is also a constant. |
605 | 109M | // This allows the compiler to remove the bounds checks. |
606 | 109M | *first = interleaved[0]; |
607 | 109M | *second = interleaved[1]; |
608 | 109M | } |
609 | | |
610 | | // If the length of the slice was odd, restore the last element of the input that we saved |
611 | 67.4k | if source.len() % 2 == 1 { |
612 | 0 | if let Some(value) = last { |
613 | 0 | // we can unwrap() here because we just checked that the length is non-zero:
614 | 0 | // `% 2 == 1` will fail for zero |
615 | 0 | *first_half.last_mut().unwrap() = *value; |
616 | 0 | } |
617 | 67.4k | } |
618 | | |
619 | | // write out the results |
620 | 67.4k | source.copy_from_slice(&separated); |
621 | 67.4k | }); |
622 | 67.4k | } |
623 | | |
624 | | |
625 | | #[cfg(test)] |
626 | | pub mod test { |
627 | | |
628 | | #[test] |
629 | | fn roundtrip_interleave(){ |
630 | | let source = vec![ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ]; |
631 | | let mut modified = source.clone(); |
632 | | |
633 | | super::separate_bytes_fragments(&mut modified); |
634 | | super::interleave_byte_blocks(&mut modified); |
635 | | |
636 | | assert_eq!(source, modified); |
637 | | } |
638 | | |
639 | | #[test] |
640 | | fn roundtrip_derive(){ |
641 | | let source = vec![ 0, 1, 2, 7, 4, 5, 6, 7, 13, 9, 10 ]; |
642 | | let mut modified = source.clone(); |
643 | | |
644 | | super::samples_to_differences(&mut modified); |
645 | | super::differences_to_samples(&mut modified); |
646 | | |
647 | | assert_eq!(source, modified); |
648 | | } |
649 | | |
650 | | } |
651 | | } |
652 | | |
653 | | |
654 | | #[cfg(test)] |
655 | | mod test { |
656 | | use super::*; |
657 | | use crate::meta::attribute::ChannelDescription; |
658 | | use crate::block::samples::IntoNativeSample; |
659 | | |
660 | | #[test] |
661 | | fn roundtrip_endianness_mixed_channels(){ |
662 | | let a32 = ChannelDescription::new("A", SampleType::F32, true); |
663 | | let y16 = ChannelDescription::new("Y", SampleType::F16, true); |
664 | | let channels = ChannelList::new(smallvec![ a32, y16 ]); |
665 | | |
666 | | let data = vec![ |
667 | | 23582740683_f32.to_ne_bytes().as_slice(), |
668 | | 35827420683_f32.to_ne_bytes().as_slice(), |
669 | | 27406832358_f32.to_f16().to_ne_bytes().as_slice(), |
670 | | 74062358283_f32.to_f16().to_ne_bytes().as_slice(), |
671 | | |
672 | | 52582740683_f32.to_ne_bytes().as_slice(), |
673 | | 45827420683_f32.to_ne_bytes().as_slice(), |
674 | | 15406832358_f32.to_f16().to_ne_bytes().as_slice(), |
675 | | 65062358283_f32.to_f16().to_ne_bytes().as_slice(), |
676 | | ].into_iter().flatten().map(|x| *x).collect(); |
677 | | |
678 | | roundtrip_convert_endianness( |
679 | | data, &channels, |
680 | | IntegerBounds::from_dimensions((2, 2)) |
681 | | ); |
682 | | } |
683 | | |
684 | | fn roundtrip_convert_endianness( |
685 | | current_endian: ByteVec, channels: &ChannelList, rectangle: IntegerBounds |
686 | | ){ |
687 | | let little_endian = convert_current_to_little_endian( |
688 | | current_endian.clone(), channels, rectangle |
689 | | ).unwrap(); |
690 | | |
691 | | let current_endian_decoded = convert_little_endian_to_current( |
692 | | little_endian.clone(), channels, rectangle |
693 | | ).unwrap(); |
694 | | |
695 | | assert_eq!(current_endian, current_endian_decoded, "endianness conversion failed"); |
696 | | } |
697 | | } |