/rust/registry/src/index.crates.io-1949cf8c6b5b557f/exr-1.73.0/src/compression/mod.rs
Line | Count | Source |
1 | | |
2 | | //! Contains the compression attribute definition |
3 | | //! and methods to compress and decompress data. |
4 | | |
5 | | |
6 | | // private modules make non-breaking changes easier |
7 | | mod zip; |
8 | | mod rle; |
9 | | mod piz; |
10 | | mod pxr24; |
11 | | mod b44; |
12 | | |
13 | | |
14 | | use std::convert::TryInto; |
15 | | use std::mem::size_of; |
16 | | use half::f16; |
17 | | use crate::meta::attribute::{IntegerBounds, SampleType, ChannelList}; |
18 | | use crate::error::{Result, Error, usize_to_i32}; |
19 | | use crate::meta::header::Header; |
20 | | |
21 | | |
22 | | /// A byte vector. |
23 | | pub type ByteVec = Vec<u8>; |
24 | | |
25 | | /// A byte slice. |
26 | | pub type Bytes<'s> = &'s [u8]; |
27 | | |
28 | | /// Specifies which compression method to use. |
29 | | /// Use uncompressed data for fastest loading and writing speeds. |
30 | | /// Use RLE compression for fast loading and writing with slight file size savings.
31 | | /// Use ZIP compression for slow processing with large file size savings.
32 | | #[derive(Debug, Clone, Copy, PartialEq)] |
33 | | pub enum Compression { |
34 | | |
35 | | /// Store uncompressed values. |
36 | | /// Produces large files that can be read and written very quickly. |
37 | | /// Consider using RLE instead, as it provides some compression with almost equivalent speed. |
38 | | Uncompressed, |
39 | | |
40 | | /// Produces slightly smaller files |
41 | | /// that can still be read and written rather quickly. |
42 | | /// The compressed file size is usually between 60 and 75 percent of the uncompressed size. |
43 | | /// Works best for images with large flat areas, such as masks and abstract graphics. |
44 | | /// This compression method is lossless. |
45 | | RLE, |
46 | | |
47 | | /// Uses ZIP compression to compress each line. Slowly produces small images |
48 | | /// which can be read with moderate speed. This compression method is lossless. |
49 | | /// Might be slightly faster but larger than `ZIP16`.
50 | | ZIP1, // TODO ZIP { individual_lines: bool, compression_level: Option<u8> } // TODO specify zip compression level? |
51 | | |
52 | | /// Uses ZIP compression to compress blocks of 16 lines. Slowly produces small images |
53 | | /// which can be read with moderate speed. This compression method is lossless. |
54 | | /// Might be slightly slower but smaller than `ZIP1`.
55 | | ZIP16, // TODO collapse with ZIP1 |
56 | | |
57 | | /// PIZ compression works well for noisy and natural images. Works better with larger tiles. |
58 | | /// Only supported for flat images, but not for deep data. |
59 | | /// This compression method is lossless. |
60 | | // A wavelet transform is applied to the pixel data, and the result is Huffman- |
61 | | // encoded. This scheme tends to provide the best compression ratio for the types of |
62 | | // images that are typically processed at Industrial Light & Magic. Files are |
63 | | // compressed and decompressed at roughly the same speed. For photographic |
64 | | // images with film grain, the files are reduced to between 35 and 55 percent of their |
65 | | // uncompressed size. |
66 | | // PIZ compression works well for scan-line based files, and also for tiled files with |
67 | | // large tiles, but small tiles do not shrink much. (PIZ-compressed data start with a |
68 | | // relatively long header; if the input to the compressor is short, adding the header |
69 | | // tends to offset any size reduction of the input.) |
70 | | PIZ, |
71 | | |
72 | | /// Like `ZIP1`, but reduces precision of `f32` images to `f24`. |
73 | | /// Therefore, this is lossless compression for `f16` and `u32` data, lossy compression for `f32` data. |
74 | | /// This compression method works well for depth |
75 | | /// buffers and similar images, where the possible range of values is very large, but |
76 | | /// where full 32-bit floating-point accuracy is not necessary. Rounding improves |
77 | | /// compression significantly by eliminating the pixels' 8 least significant bits, which |
78 | | /// tend to be very noisy, and therefore difficult to compress. |
79 | | /// This produces really small image files. Only supported for flat images, not for deep data. |
80 | | // After reducing 32-bit floating-point data to 24 bits by rounding (while leaving 16-bit |
81 | | // floating-point data unchanged), differences between horizontally adjacent pixels |
82 | | // are compressed with zlib, similar to ZIP. PXR24 compression preserves image |
83 | | // channels of type HALF and UINT exactly, but the relative error of FLOAT data |
84 | | // increases to about ???. |
85 | | PXR24, // TODO specify zip compression level? |
86 | | |
87 | | /// This is a lossy compression method for f16 images. |
88 | | /// It's the predecessor of the `B44A` compression, |
89 | | /// which has improved compression rates for uniformly colored areas. |
90 | | /// You should probably use `B44A` instead of the plain `B44`. |
91 | | /// |
92 | | /// Only supported for flat images, not for deep data. |
93 | | // lossy 4-by-4 pixel block compression, |
94 | | // flat fields are compressed more |
95 | | // Channels of type HALF are split into blocks of four by four pixels or 32 bytes. Each |
96 | | // block is then packed into 14 bytes, reducing the data to 44 percent of their |
97 | | // uncompressed size. When B44 compression is applied to RGB images in |
98 | | // combination with luminance/chroma encoding (see below), the size of the |
99 | | // compressed pixels is about 22 percent of the size of the original RGB data. |
100 | | // Channels of type UINT or FLOAT are not compressed. |
101 | | // Decoding is fast enough to allow real-time playback of B44-compressed OpenEXR |
102 | | // image sequences on commodity hardware. |
103 | | // The size of a B44-compressed file depends on the number of pixels in the image, |
104 | | // but not on the data in the pixels. All images with the same resolution and the same |
105 | | // set of channels have the same size. This can be advantageous for systems that |
106 | | // support real-time playback of image sequences; the predictable file size makes it |
107 | | // easier to allocate space on storage media efficiently. |
108 | | // B44 compression is only supported for flat images. |
109 | | B44, // TODO B44 { optimize_uniform_areas: bool } |
110 | | |
111 | | /// This is a lossy compression method for f16 images. |
112 | | /// All f32 and u32 channels will be stored without compression. |
113 | | /// All the f16 pixels are divided into 4x4 blocks. |
114 | | /// Each block is then compressed as a whole. |
115 | | /// |
116 | | /// The 32 bytes of a block will require only ~14 bytes after compression, |
117 | | /// independent of the actual pixel contents. With chroma subsampling, |
118 | | /// a block will be compressed to ~7 bytes. |
119 | | /// Uniformly colored blocks will be compressed to ~3 bytes. |
120 | | /// |
121 | | /// The 512 bytes of an f32 block will not be compressed at all. |
122 | | /// |
123 | | /// Should be fast enough for realtime playback. |
124 | | /// Only supported for flat images, not for deep data. |
125 | | B44A, // TODO collapse with B44 |
126 | | |
127 | | /// __This lossy compression is not yet supported by this implementation.__ |
128 | | // lossy DCT based compression, in blocks |
129 | | // of 32 scanlines. More efficient for partial buffer access. |
130 | | DWAA(Option<f32>), // TODO does this have a default value? make this non optional? default Compression Level setting is 45.0 |
131 | | |
132 | | /// __This lossy compression is not yet supported by this implementation.__ |
133 | | // lossy DCT based compression, in blocks |
134 | | // of 256 scanlines. More efficient space |
135 | | // wise and faster to decode full frames |
136 | | // than DWAA_COMPRESSION. |
137 | | DWAB(Option<f32>), // TODO collapse with B44. default Compression Level setting is 45.0 |
138 | | } |
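// Editorial sketch (assumption, not the crate's actual pxr24 code; names below are illustrative):
// the PXR24 doc above describes eliminating the 8 least significant bits of every f32 sample.
// Conceptually that is a mask over the 23-bit mantissa, as shown here; the real encoder also
// rounds before truncating.
#[cfg(test)]
mod pxr24_precision_example {
    #[test]
    fn rough_f24_truncation() {
        // keep sign, exponent and the upper 15 of the 23 mantissa bits
        let to_f24_ish = |x: f32| f32::from_bits(x.to_bits() & !0xFF);

        let depth = 1234.5678_f32;
        let truncated = to_f24_ish(depth);

        // the relative error stays tiny, which is why PXR24 suits depth-like data
        assert!((depth - truncated).abs() / depth < 1e-4);
    }
}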
139 | | |
140 | | impl std::fmt::Display for Compression { |
141 | 0 | fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { |
142 | 0 | write!(formatter, "{} compression", match self { |
143 | 0 | Compression::Uncompressed => "no", |
144 | 0 | Compression::RLE => "rle", |
145 | 0 | Compression::ZIP1 => "zip line", |
146 | 0 | Compression::ZIP16 => "zip block", |
147 | 0 | Compression::B44 => "b44", |
148 | 0 | Compression::B44A => "b44a", |
149 | 0 | Compression::DWAA(_) => "dwaa", |
150 | 0 | Compression::DWAB(_) => "dwab", |
151 | 0 | Compression::PIZ => "piz", |
152 | 0 | Compression::PXR24 => "pxr24", |
153 | | }) |
154 | 0 | } |
155 | | } |
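// Editorial example (illustration only; module and test names are made up): the `Display` impl
// above is what appears in the error messages further down in this file,
// e.g. "pixels cannot be compressed (piz compression)".
#[cfg(test)]
mod display_example {
    use super::*;

    #[test]
    fn compression_display() {
        assert_eq!(Compression::PIZ.to_string(), "piz compression");
        assert_eq!(Compression::Uncompressed.to_string(), "no compression");
    }
}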
156 | | |
157 | | |
158 | | |
159 | | impl Compression { |
160 | | |
161 | | /// Compress the image section of bytes. |
162 | 0 | pub fn compress_image_section(self, header: &Header, uncompressed_native_endian: ByteVec, pixel_section: IntegerBounds) -> Result<ByteVec> { |
163 | 0 | let max_tile_size = header.max_block_pixel_size(); |
164 | | |
165 | 0 | assert!(pixel_section.validate(Some(max_tile_size)).is_ok(), "compress tile coordinate bug");
166 | 0 | if header.deep { assert!(self.supports_deep_data()) } |
167 | | |
168 | | use self::Compression::*; |
169 | 0 | let compressed_little_endian = match self { |
170 | | Uncompressed => { |
171 | 0 | return Ok(convert_current_to_little_endian( |
172 | 0 | uncompressed_native_endian, &header.channels, pixel_section |
173 | 0 | )) |
174 | | }, |
175 | | |
176 | | // we need to clone here, because we might have to fall back to the uncompressed data later (when the compressed data turns out larger than the raw data)
177 | 0 | ZIP16 => zip::compress_bytes(&header.channels, uncompressed_native_endian.clone(), pixel_section), |
178 | 0 | ZIP1 => zip::compress_bytes(&header.channels, uncompressed_native_endian.clone(), pixel_section), |
179 | 0 | RLE => rle::compress_bytes(&header.channels, uncompressed_native_endian.clone(), pixel_section), |
180 | 0 | PIZ => piz::compress(&header.channels, uncompressed_native_endian.clone(), pixel_section), |
181 | 0 | PXR24 => pxr24::compress(&header.channels, uncompressed_native_endian.clone(), pixel_section), |
182 | 0 | B44 => b44::compress(&header.channels, uncompressed_native_endian.clone(), pixel_section, false), |
183 | 0 | B44A => b44::compress(&header.channels, uncompressed_native_endian.clone(), pixel_section, true), |
184 | 0 | _ => return Err(Error::unsupported(format!("yet unimplemented compression method: {}", self))) |
185 | | }; |
186 | | |
187 | 0 | let compressed_little_endian = compressed_little_endian.map_err(|_| |
188 | 0 | Error::invalid(format!("pixels cannot be compressed ({})", self)) |
189 | 0 | )?; |
190 | | |
191 | 0 | if self == Uncompressed || compressed_little_endian.len() < uncompressed_native_endian.len() { |
192 | | // only write compressed if it actually is smaller than raw |
193 | 0 | Ok(compressed_little_endian) |
194 | | } |
195 | | else { |
196 | | // if we do not use compression, manually convert uncompressed data |
197 | 0 | Ok(convert_current_to_little_endian(uncompressed_native_endian, &header.channels, pixel_section)) |
198 | | } |
199 | 0 | } |
200 | | |
201 | | /// Decompress the image section of bytes. |
202 | 0 | pub fn decompress_image_section(self, header: &Header, compressed: ByteVec, pixel_section: IntegerBounds, pedantic: bool) -> Result<ByteVec> { |
203 | 0 | let max_tile_size = header.max_block_pixel_size(); |
204 | | |
205 | 0 | assert!(pixel_section.validate(Some(max_tile_size)).is_ok(), "decompress tile coordinate bug"); |
206 | 0 | if header.deep { assert!(self.supports_deep_data()) } |
207 | | |
208 | 0 | let expected_byte_size = pixel_section.size.area() * header.channels.bytes_per_pixel; // FIXME this needs to account for subsampling anywhere |
209 | | |
210 | | // note: this is always true when self == Uncompressed
211 | 0 | if compressed.len() == expected_byte_size { |
212 | | // the compressed data would have been larger than the raw data, so the raw data was written instead
213 | 0 | Ok(convert_little_endian_to_current(compressed, &header.channels, pixel_section)) |
214 | | } |
215 | | else { |
216 | | use self::Compression::*; |
217 | 0 | let bytes = match self { |
218 | 0 | Uncompressed => Ok(convert_little_endian_to_current(compressed, &header.channels, pixel_section)), |
219 | 0 | ZIP16 => zip::decompress_bytes(&header.channels, compressed, pixel_section, expected_byte_size, pedantic), |
220 | 0 | ZIP1 => zip::decompress_bytes(&header.channels, compressed, pixel_section, expected_byte_size, pedantic), |
221 | 0 | RLE => rle::decompress_bytes(&header.channels, compressed, pixel_section, expected_byte_size, pedantic), |
222 | 0 | PIZ => piz::decompress(&header.channels, compressed, pixel_section, expected_byte_size, pedantic), |
223 | 0 | PXR24 => pxr24::decompress(&header.channels, compressed, pixel_section, expected_byte_size, pedantic), |
224 | 0 | B44 | B44A => b44::decompress(&header.channels, compressed, pixel_section, expected_byte_size, pedantic), |
225 | 0 | _ => return Err(Error::unsupported(format!("yet unimplemented compression method: {}", self))) |
226 | | }; |
227 | | |
228 | | // map all errors to compression errors |
229 | 0 | let bytes = bytes |
230 | 0 | .map_err(|decompression_error| match decompression_error { |
231 | 0 | Error::NotSupported(message) => |
232 | 0 | Error::unsupported(format!("yet unimplemented compression special case ({})", message)), |
233 | | |
234 | 0 | error => Error::invalid(format!( |
235 | 0 | "compressed {:?} data ({})", |
236 | 0 | self, error.to_string() |
237 | | )), |
238 | 0 | })?; |
239 | | |
240 | 0 | if bytes.len() != expected_byte_size { |
241 | 0 | Err(Error::invalid("decompressed data")) |
242 | | } |
243 | | |
244 | 0 | else { Ok(bytes) } |
245 | | } |
246 | 0 | } |
247 | | |
248 | | /// For scan line images and deep scan line images, one or more scan lines may be |
249 | | /// stored together as a scan line block. The number of scan lines per block |
250 | | /// depends on how the pixel data are compressed. |
251 | 0 | pub fn scan_lines_per_block(self) -> usize { |
252 | | use self::Compression::*; |
253 | 0 | match self { |
254 | 0 | Uncompressed | RLE | ZIP1 => 1, |
255 | 0 | ZIP16 | PXR24 => 16, |
256 | 0 | PIZ | B44 | B44A | DWAA(_) => 32, |
257 | 0 | DWAB(_) => 256, |
258 | | } |
259 | 0 | } |
260 | | |
261 | | /// Deep data can only be compressed using RLE or ZIP compression. |
262 | 0 | pub fn supports_deep_data(self) -> bool { |
263 | | use self::Compression::*; |
264 | 0 | match self { |
265 | 0 | Uncompressed | RLE | ZIP1 => true, |
266 | 0 | _ => false, |
267 | | } |
268 | 0 | } |
269 | | |
270 | | /// Most compression methods will reconstruct the exact pixel bytes, |
271 | | /// but some might throw away unimportant data for specific types of samples. |
272 | 0 | pub fn is_lossless_for(self, sample_type: SampleType) -> bool { |
273 | | use self::Compression::*; |
274 | 0 | match self { |
275 | 0 | PXR24 => sample_type != SampleType::F32, // pxr reduces f32 to f24 |
276 | 0 | B44 | B44A => sample_type != SampleType::F16, // b44 only compresses f16 values, others are left uncompressed |
277 | 0 | Uncompressed | RLE | ZIP1 | ZIP16 | PIZ => true, |
278 | 0 | DWAB(_) | DWAA(_) => false, |
279 | | } |
280 | 0 | } |
281 | | |
282 | | /// Most compression methods will reconstruct the exact pixel bytes, |
283 | | /// but some might throw away unimportant data in some cases. |
284 | 0 | pub fn may_loose_data(self) -> bool { |
285 | | use self::Compression::*; |
286 | 0 | match self { |
287 | 0 | Uncompressed | RLE | ZIP1 | ZIP16 | PIZ => false, |
288 | 0 | PXR24 | B44 | B44A | DWAB(_) | DWAA(_) => true, |
289 | | } |
290 | 0 | } |
291 | | |
292 | | /// Most compression methods will reconstruct the exact pixel bytes, |
293 | | /// but some might replace NaN with zeroes. |
294 | 0 | pub fn supports_nan(self) -> bool { |
295 | | use self::Compression::*; |
296 | 0 | match self { |
297 | 0 | B44 | B44A | DWAB(_) | DWAA(_) => false, // TODO dwa might support it? |
298 | 0 | _ => true |
299 | | } |
300 | 0 | } |
301 | | |
302 | | } |
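// Editorial example (illustration only; module and test names are made up): a minimal sketch of
// how the predicate methods above relate the variants, derived directly from the match arms
// in this impl block.
#[cfg(test)]
mod compression_property_examples {
    use super::*;

    #[test]
    fn block_sizes_and_losslessness() {
        // block sizes follow `scan_lines_per_block`
        assert_eq!(Compression::ZIP1.scan_lines_per_block(), 1);
        assert_eq!(Compression::ZIP16.scan_lines_per_block(), 16);
        assert_eq!(Compression::PIZ.scan_lines_per_block(), 32);

        // deep data only allows uncompressed, RLE, or per-line ZIP blocks
        assert!(Compression::RLE.supports_deep_data());
        assert!(!Compression::PIZ.supports_deep_data());

        // PXR24 only loses precision for f32 samples, B44 only touches f16 samples
        assert!(Compression::PXR24.is_lossless_for(SampleType::F16));
        assert!(!Compression::PXR24.is_lossless_for(SampleType::F32));
        assert!(Compression::B44.is_lossless_for(SampleType::F32));
        assert!(!Compression::B44.is_lossless_for(SampleType::F16));
    }
}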
303 | | |
304 | | // see https://github.com/AcademySoftwareFoundation/openexr/blob/6a9f8af6e89547bcd370ae3cec2b12849eee0b54/OpenEXR/IlmImf/ImfMisc.cpp#L1456-L1541 |
305 | | |
306 | | #[allow(unused)] // allows the extra parameters to be unused |
307 | 0 | fn convert_current_to_little_endian(mut bytes: ByteVec, channels: &ChannelList, rectangle: IntegerBounds) -> ByteVec { |
308 | | #[cfg(target_endian = "big")]
309 | | reverse_block_endianness(&mut bytes, channels, rectangle);
310 | | |
311 | 0 | bytes |
312 | 0 | } |
313 | | |
314 | | #[allow(unused)] // allows the extra parameters to be unused |
315 | 0 | fn convert_little_endian_to_current(mut bytes: ByteVec, channels: &ChannelList, rectangle: IntegerBounds) -> ByteVec { |
316 | | #[cfg(target_endian = "big")]
317 | | reverse_block_endianness(&mut bytes, channels, rectangle); |
318 | | |
319 | 0 | bytes |
320 | 0 | } |
321 | | |
322 | | #[allow(unused)] // unused when on little endian system |
323 | 0 | fn reverse_block_endianness(bytes: &mut [u8], channels: &ChannelList, rectangle: IntegerBounds){ |
324 | 0 | let mut remaining_bytes: &mut [u8] = bytes; |
325 | | |
326 | 0 | for y in rectangle.position.y() .. rectangle.end().y() { |
327 | 0 | for channel in &channels.list { |
328 | 0 | let line_is_subsampled = mod_p(y, usize_to_i32(channel.sampling.y())) != 0; |
329 | 0 | if line_is_subsampled { continue; } |
330 | | |
331 | 0 | let sample_count = rectangle.size.width() / channel.sampling.x(); |
332 | | |
333 | 0 | match channel.sample_type { |
334 | 0 | SampleType::F16 => remaining_bytes = chomp_convert_n::<f16>(reverse_2_bytes, remaining_bytes, sample_count), |
335 | 0 | SampleType::F32 => remaining_bytes = chomp_convert_n::<f32>(reverse_4_bytes, remaining_bytes, sample_count), |
336 | 0 | SampleType::U32 => remaining_bytes = chomp_convert_n::<u32>(reverse_4_bytes, remaining_bytes, sample_count), |
337 | | } |
338 | | } |
339 | | } |
340 | | |
341 | | #[inline] |
342 | 0 | fn chomp_convert_n<T>(convert_single_value: fn(&mut[u8]), mut bytes: &mut [u8], count: usize) -> &mut [u8] { |
343 | 0 | let type_size = size_of::<T>(); |
344 | 0 | let (line_bytes, rest) = bytes.split_at_mut(count * type_size); |
345 | 0 | let value_byte_chunks = line_bytes.chunks_exact_mut(type_size); |
346 | | |
347 | 0 | for value_bytes in value_byte_chunks { |
348 | 0 | convert_single_value(value_bytes); |
349 | 0 | } |
350 | | |
351 | 0 | rest |
352 | 0 | } |
353 | | |
354 | 0 | debug_assert!(remaining_bytes.is_empty(), "not all bytes were converted to little endian"); |
355 | 0 | } |
356 | | |
357 | | #[inline] |
358 | 0 | fn reverse_2_bytes(bytes: &mut [u8]){ |
359 | | // this code seems like it could be optimized easily by the compiler |
360 | 0 | let two_bytes: [u8; 2] = bytes.try_into().expect("invalid byte count"); |
361 | 0 | bytes.copy_from_slice(&[two_bytes[1], two_bytes[0]]); |
362 | 0 | } |
363 | | |
364 | | #[inline] |
365 | 0 | fn reverse_4_bytes(bytes: &mut [u8]){ |
366 | 0 | let four_bytes: [u8; 4] = bytes.try_into().expect("invalid byte count"); |
367 | 0 | bytes.copy_from_slice(&[four_bytes[3], four_bytes[2], four_bytes[1], four_bytes[0]]); |
368 | 0 | } |
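// Editorial example (illustration only): reversing the bytes of a value is exactly a
// little-endian/big-endian swap, which is all the per-sample conversion above relies on.
#[cfg(test)]
mod byte_reversal_example {
    #[test]
    fn reverse_equals_endianness_swap() {
        let mut bytes = 0x11223344_u32.to_le_bytes();
        super::reverse_4_bytes(&mut bytes);
        assert_eq!(bytes, 0x11223344_u32.to_be_bytes());
    }
}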
369 | | |
370 | | #[inline] |
371 | 0 | fn div_p (x: i32, y: i32) -> i32 { |
372 | 0 | if x >= 0 { |
373 | 0 | if y >= 0 { x / y } |
374 | 0 | else { -(x / -y) } |
375 | | } |
376 | | else { |
377 | 0 | if y >= 0 { -((y-1-x) / y) } |
378 | 0 | else { (-y-1-x) / -y } |
379 | | } |
380 | 0 | } |
381 | | |
382 | | #[inline] |
383 | 0 | fn mod_p(x: i32, y: i32) -> i32 { |
384 | 0 | x - y * div_p(x, y) |
385 | 0 | } |
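// Editorial example (illustration only): `div_p` rounds toward negative infinity, so `mod_p`
// always yields a remainder in `0..y` for positive `y`, unlike Rust's `%` operator, which keeps
// the sign of the dividend. The subsampling check above depends on this for negative coordinates.
#[cfg(test)]
mod floor_division_example {
    use super::{div_p, mod_p};

    #[test]
    fn negative_coordinates() {
        assert_eq!(div_p(-5, 4), -2); // truncating division `-5 / 4` would give -1
        assert_eq!(mod_p(-5, 4), 3);  // the `%` operator would give -1
        assert_eq!(mod_p(7, 4), 3);   // agrees with `%` for non-negative input
    }
}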
386 | | |
387 | | /// A collection of functions used to prepare data for compression. |
388 | | mod optimize_bytes { |
389 | | |
390 | | /// Integrate over all differences to the previous value in order to reconstruct sample values. |
391 | 0 | pub fn differences_to_samples(buffer: &mut [u8]) { |
392 | | // The naive implementation is very simple: |
393 | | // |
394 | | // for index in 1..buffer.len() { |
395 | | // buffer[index] = (buffer[index - 1] as i32 + buffer[index] as i32 - 128) as u8; |
396 | | // } |
397 | | // |
398 | | // But we process elements in pairs to take advantage of instruction-level parallelism. |
399 | | // When computations within a pair do not depend on each other, they can be processed in parallel. |
400 | | // Since this function is responsible for a very large chunk of execution time, |
401 | | // this tweak alone improves decoding performance of RLE images by 20%. |
402 | 0 | if let Some(first) = buffer.get(0) { |
403 | 0 | let mut previous = *first as i16; |
404 | 0 | for chunk in &mut buffer[1..].chunks_exact_mut(2) { |
405 | 0 | // no bounds checks here due to indices and chunk size being constant |
406 | 0 | let diff0 = chunk[0] as i16; |
407 | 0 | let diff1 = chunk[1] as i16; |
408 | 0 | // these two computations do not depend on each other, unlike in the naive version, |
409 | 0 | // so they can be executed by the CPU in parallel via instruction-level parallelism |
410 | 0 | let sample0 = (previous + diff0 - 128) as u8; |
411 | 0 | let sample1 = (previous + diff0 + diff1 - 128 * 2) as u8; |
412 | 0 | chunk[0] = sample0; |
413 | 0 | chunk[1] = sample1; |
414 | 0 | previous = sample1 as i16; |
415 | 0 | } |
416 | | // handle the remaining element at the end not processed by the loop over pairs, if present |
417 | 0 | for elem in &mut buffer[1..].chunks_exact_mut(2).into_remainder().iter_mut() { |
418 | 0 | let sample = (previous + *elem as i16 - 128) as u8; |
419 | 0 | *elem = sample; |
420 | 0 | previous = sample as i16; |
421 | 0 | } |
422 | 0 | } |
423 | 0 | } |
424 | | |
425 | | /// Turn sample values into differences to the previous value; the inverse of `differences_to_samples`.
426 | 0 | pub fn samples_to_differences(buffer: &mut [u8]){ |
427 | | // naive version: |
428 | | // for index in (1..buffer.len()).rev() { |
429 | | // buffer[index] = (buffer[index] as i32 - buffer[index - 1] as i32 + 128) as u8; |
430 | | // } |
431 | | // |
432 | | // But we process elements in batches to take advantage of autovectorization. |
433 | | // If the target platform has no vector instructions (e.g. 32-bit ARM without `-C target-cpu=native`) |
434 | | // this will instead take advantage of instruction-level parallelism. |
435 | 0 | if let Some(first) = buffer.get(0) { |
436 | 0 | let mut previous = *first as i16; |
437 | | // Chunk size is 16 because we process bytes (8 bits), |
438 | | // and 8*16 = 128 bits is the size of a typical SIMD register. |
439 | | // Even WASM has 128-bit SIMD registers. |
440 | 0 | for chunk in &mut buffer[1..].chunks_exact_mut(16) { |
441 | 0 | // no bounds checks here due to indices and chunk size being constant |
442 | 0 | let sample0 = chunk[0] as i16; |
443 | 0 | let sample1 = chunk[1] as i16; |
444 | 0 | let sample2 = chunk[2] as i16; |
445 | 0 | let sample3 = chunk[3] as i16; |
446 | 0 | let sample4 = chunk[4] as i16; |
447 | 0 | let sample5 = chunk[5] as i16; |
448 | 0 | let sample6 = chunk[6] as i16; |
449 | 0 | let sample7 = chunk[7] as i16; |
450 | 0 | let sample8 = chunk[8] as i16; |
451 | 0 | let sample9 = chunk[9] as i16; |
452 | 0 | let sample10 = chunk[10] as i16; |
453 | 0 | let sample11 = chunk[11] as i16; |
454 | 0 | let sample12 = chunk[12] as i16; |
455 | 0 | let sample13 = chunk[13] as i16; |
456 | 0 | let sample14 = chunk[14] as i16; |
457 | 0 | let sample15 = chunk[15] as i16; |
458 | 0 | // Unlike in decoding, computations in here are truly independent from each other, |
459 | 0 | // which enables the compiler to vectorize this loop. |
460 | 0 | // Even if the target platform has no vector instructions, |
461 | 0 | // using more parallelism doesn't imply doing more work,
462 | 0 | // and we're not really limited in how wide we can go. |
463 | 0 | chunk[0] = (sample0 - previous + 128) as u8; |
464 | 0 | chunk[1] = (sample1 - sample0 + 128) as u8; |
465 | 0 | chunk[2] = (sample2 - sample1 + 128) as u8; |
466 | 0 | chunk[3] = (sample3 - sample2 + 128) as u8; |
467 | 0 | chunk[4] = (sample4 - sample3 + 128) as u8; |
468 | 0 | chunk[5] = (sample5 - sample4 + 128) as u8; |
469 | 0 | chunk[6] = (sample6 - sample5 + 128) as u8; |
470 | 0 | chunk[7] = (sample7 - sample6 + 128) as u8; |
471 | 0 | chunk[8] = (sample8 - sample7 + 128) as u8; |
472 | 0 | chunk[9] = (sample9 - sample8 + 128) as u8; |
473 | 0 | chunk[10] = (sample10 - sample9 + 128) as u8; |
474 | 0 | chunk[11] = (sample11 - sample10 + 128) as u8; |
475 | 0 | chunk[12] = (sample12 - sample11 + 128) as u8; |
476 | 0 | chunk[13] = (sample13 - sample12 + 128) as u8; |
477 | 0 | chunk[14] = (sample14 - sample13 + 128) as u8; |
478 | 0 | chunk[15] = (sample15 - sample14 + 128) as u8; |
479 | 0 | previous = sample15; |
480 | 0 | } |
481 | | // Handle the remaining element at the end not processed by the loop over batches, if present |
482 | | // This is what the iterator-based version of this function would look like without vectorization |
483 | 0 | for elem in &mut buffer[1..].chunks_exact_mut(16).into_remainder().iter_mut() { |
484 | 0 | let diff = (*elem as i16 - previous + 128) as u8; |
485 | 0 | previous = *elem as i16; |
486 | 0 | *elem = diff; |
487 | 0 | } |
488 | 0 | } |
489 | 0 | } |
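// Editorial example (illustration only): concrete bytes for the bias-128 delta coding above.
// A smooth ramp becomes a run of near-identical bytes, which is what makes the subsequent
// RLE or ZIP stage effective; `differences_to_samples` restores the original values exactly.
#[cfg(test)]
mod delta_coding_example {
    #[test]
    fn biased_differences() {
        let mut bytes = vec![10_u8, 11, 12, 13];
        super::samples_to_differences(&mut bytes);
        assert_eq!(bytes, vec![10_u8, 129, 129, 129]);

        super::differences_to_samples(&mut bytes);
        assert_eq!(bytes, vec![10_u8, 11, 12, 13]);
    }
}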
490 | | |
491 | | use std::cell::Cell; |
492 | | thread_local! { |
493 | | // A buffer for reusing between invocations of interleaving and deinterleaving. |
494 | | // Allocating memory is cheap, but zeroing or otherwise initializing it is not. |
495 | | // Doing it hundreds of times (once per block) would be expensive. |
496 | | // This optimization brings down the time spent in interleaving from 15% to 5%. |
497 | | static SCRATCH_SPACE: Cell<Vec<u8>> = Cell::new(Vec::new()); |
498 | | } |
499 | | |
500 | 0 | fn with_reused_buffer<F>(length: usize, mut func: F) where F: FnMut(&mut [u8]) { |
501 | 0 | SCRATCH_SPACE.with(|scratch_space| { |
502 | | // reuse a buffer if we've already initialized one |
503 | 0 | let mut buffer = scratch_space.take(); |
504 | 0 | if buffer.len() < length { |
505 | 0 | // Efficiently create a zeroed Vec by requesting zeroed memory from the OS. |
506 | 0 | // This is slightly faster than a `memcpy()` plus `memset()` that would happen otherwise, |
507 | 0 | // but is not a big deal either way since it's not a hot codepath. |
508 | 0 | buffer = vec![0u8; length]; |
509 | 0 | } |
510 | | |
511 | | // call the function |
512 | 0 | func(&mut buffer[..length]); |
513 | | |
514 | | // save the internal buffer for reuse |
515 | 0 | scratch_space.set(buffer); |
516 | 0 | });
517 | 0 | }
518 | | |
519 | | /// Interleave the two halves of the array, alternating between bytes from the first half and bytes from the second half. Inverse of `separate_bytes_fragments`.
520 | 0 | pub fn interleave_byte_blocks(separated: &mut [u8]) { |
521 | 0 | with_reused_buffer(separated.len(), |interleaved| { |
522 | | |
523 | | // Split the two halves that we are going to interleave. |
524 | 0 | let (first_half, second_half) = separated.split_at((separated.len() + 1) / 2); |
525 | | // The first half can be 1 byte longer than the second if the length of the input is odd, |
526 | | // but the loop below only processes numbers in pairs. |
527 | | // To handle it, preserve the last element of the first slice, to be handled after the loop. |
528 | 0 | let first_half_last = first_half.last(); |
529 | | // Truncate the first half to match the length of the second one; more optimizer-friendly
530 | 0 | let first_half_iter = &first_half[..second_half.len()]; |
531 | | |
532 | | // Main loop that performs the interleaving |
533 | 0 | for ((first, second), interleaved) in first_half_iter.iter().zip(second_half.iter()) |
534 | 0 | .zip(interleaved.chunks_exact_mut(2)) { |
535 | 0 | // The length of each chunk is known to be 2 at compile time, |
536 | 0 | // and each index is also a constant. |
537 | 0 | // This allows the compiler to remove the bounds checks. |
538 | 0 | interleaved[0] = *first; |
539 | 0 | interleaved[1] = *second; |
540 | 0 | } |
541 | | |
542 | | // If the length of the slice was odd, restore the last element of the first half that we saved |
543 | 0 | if interleaved.len() % 2 == 1 { |
544 | 0 | if let Some(value) = first_half_last { |
545 | 0 | // we can unwrap() here because we just checked that the length is non-zero:
546 | 0 | // `% 2 == 1` will fail for zero |
547 | 0 | *interleaved.last_mut().unwrap() = *value; |
548 | 0 | } |
549 | 0 | } |
550 | | |
551 | | // write out the results |
552 | 0 | separated.copy_from_slice(&interleaved); |
553 | 0 | }); |
554 | 0 | } |
555 | | |
556 | | /// Separate the bytes such that the second half contains every other byte. |
557 | | /// This performs deinterleaving - the inverse of interleaving. |
558 | 0 | pub fn separate_bytes_fragments(source: &mut [u8]) { |
559 | 0 | with_reused_buffer(source.len(), |separated| { |
560 | | |
561 | | // Split the two halves that we are going to interleave. |
562 | 0 | let (first_half, second_half) = separated.split_at_mut((source.len() + 1) / 2); |
563 | | // The first half can be 1 byte longer than the second if the length of the input is odd, |
564 | | // but the loop below only processes numbers in pairs. |
565 | | // To handle it, preserve the last element of the input, to be handled after the loop. |
566 | 0 | let last = source.last(); |
567 | 0 | let first_half_iter = &mut first_half[..second_half.len()]; |
568 | | |
569 | | // Main loop that performs the deinterleaving |
570 | 0 | for ((first, second), interleaved) in first_half_iter.iter_mut().zip(second_half.iter_mut()) |
571 | 0 | .zip(source.chunks_exact(2)) { |
572 | 0 | // The length of each chunk is known to be 2 at compile time, |
573 | 0 | // and each index is also a constant. |
574 | 0 | // This allows the compiler to remove the bounds checks. |
575 | 0 | *first = interleaved[0]; |
576 | 0 | *second = interleaved[1]; |
577 | 0 | } |
578 | | |
579 | | // If the length of the slice was odd, restore the last element of the input that we saved |
580 | 0 | if source.len() % 2 == 1 { |
581 | 0 | if let Some(value) = last { |
582 | 0 | // we can unwrap() here because we just checked that the length is non-zero:
583 | 0 | // `% 2 == 1` will fail for zero |
584 | 0 | *first_half.last_mut().unwrap() = *value; |
585 | 0 | } |
586 | 0 | } |
587 | | |
588 | | // write out the results |
589 | 0 | source.copy_from_slice(&separated); |
590 | 0 | }); |
591 | 0 | } |
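// Editorial example (illustration only): concrete bytes for the (de)interleaving above.
// `separate_bytes_fragments` gathers the even-indexed bytes into the first half and the
// odd-indexed bytes into the second half; `interleave_byte_blocks` undoes this exactly.
#[cfg(test)]
mod interleaving_example {
    #[test]
    fn separate_then_interleave() {
        let mut bytes = vec![10_u8, 20, 30, 40, 50];
        super::separate_bytes_fragments(&mut bytes);
        assert_eq!(bytes, vec![10_u8, 30, 50, 20, 40]);

        super::interleave_byte_blocks(&mut bytes);
        assert_eq!(bytes, vec![10_u8, 20, 30, 40, 50]);
    }
}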
592 | | |
593 | | |
594 | | #[cfg(test)] |
595 | | pub mod test { |
596 | | |
597 | | #[test] |
598 | | fn roundtrip_interleave(){ |
599 | | let source = vec![ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ]; |
600 | | let mut modified = source.clone(); |
601 | | |
602 | | super::separate_bytes_fragments(&mut modified); |
603 | | super::interleave_byte_blocks(&mut modified); |
604 | | |
605 | | assert_eq!(source, modified); |
606 | | } |
607 | | |
608 | | #[test] |
609 | | fn roundtrip_derive(){ |
610 | | let source = vec![ 0, 1, 2, 7, 4, 5, 6, 7, 13, 9, 10 ]; |
611 | | let mut modified = source.clone(); |
612 | | |
613 | | super::samples_to_differences(&mut modified); |
614 | | super::differences_to_samples(&mut modified); |
615 | | |
616 | | assert_eq!(source, modified); |
617 | | } |
618 | | |
619 | | } |
620 | | } |
621 | | |
622 | | |
623 | | #[cfg(test)] |
624 | | pub mod test { |
625 | | use super::*; |
626 | | use crate::meta::attribute::ChannelDescription; |
627 | | use crate::block::samples::IntoNativeSample; |
628 | | |
629 | | #[test] |
630 | | fn roundtrip_endianness_mixed_channels(){ |
631 | | let a32 = ChannelDescription::new("A", SampleType::F32, true); |
632 | | let y16 = ChannelDescription::new("Y", SampleType::F16, true); |
633 | | let channels = ChannelList::new(smallvec![ a32, y16 ]); |
634 | | |
635 | | let data = vec![ |
636 | | 23582740683_f32.to_ne_bytes().as_slice(), |
637 | | 35827420683_f32.to_ne_bytes().as_slice(), |
638 | | 27406832358_f32.to_f16().to_ne_bytes().as_slice(), |
639 | | 74062358283_f32.to_f16().to_ne_bytes().as_slice(), |
640 | | |
641 | | 52582740683_f32.to_ne_bytes().as_slice(), |
642 | | 45827420683_f32.to_ne_bytes().as_slice(), |
643 | | 15406832358_f32.to_f16().to_ne_bytes().as_slice(), |
644 | | 65062358283_f32.to_f16().to_ne_bytes().as_slice(), |
645 | | ].into_iter().flatten().map(|x| *x).collect(); |
646 | | |
647 | | roundtrip_convert_endianness( |
648 | | data, &channels, |
649 | | IntegerBounds::from_dimensions((2, 2)) |
650 | | ); |
651 | | } |
652 | | |
653 | | fn roundtrip_convert_endianness( |
654 | | current_endian: ByteVec, channels: &ChannelList, rectangle: IntegerBounds |
655 | | ){ |
656 | | let little_endian = convert_current_to_little_endian( |
657 | | current_endian.clone(), channels, rectangle |
658 | | ); |
659 | | |
660 | | let current_endian_decoded = convert_little_endian_to_current( |
661 | | little_endian.clone(), channels, rectangle |
662 | | ); |
663 | | |
664 | | assert_eq!(current_endian, current_endian_decoded, "endianness conversion failed"); |
665 | | } |
666 | | } |