Coverage Report

Created: 2026-01-17 06:47

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/rust/registry/src/index.crates.io-1949cf8c6b5b557f/exr-1.74.0/src/compression/mod.rs
Line
Count
Source
1
2
//! Contains the compression attribute definition
3
//! and methods to compress and decompress data.
4
5
6
// private modules make non-breaking changes easier
7
mod zip;
8
mod rle;
9
mod piz;
10
mod pxr24;
11
mod b44;
12
13
14
use std::convert::TryInto;
15
use crate::meta::attribute::{IntegerBounds, SampleType, ChannelList};
16
use crate::error::{Result, Error, usize_to_i32, UnitResult};
17
use crate::meta::header::Header;
18
19
20
/// A byte vector.
21
pub type ByteVec = Vec<u8>;
22
23
/// A byte slice.
24
pub type Bytes<'s> = &'s [u8];
25
26
/// Specifies which compression method to use.
27
/// Use uncompressed data for fastest loading and writing speeds.
28
/// Use RLE compression for fast loading and writing with slight memory savings.
29
/// Use ZIP compression for slow processing with large memory savings.
30
#[derive(Debug, Clone, Copy, PartialEq)]
31
pub enum Compression {
32
33
    /// Store uncompressed values.
34
    /// Produces large files that can be read and written very quickly.
35
    /// Consider using RLE instead, as it provides some compression with almost equivalent speed.
36
    Uncompressed,
37
38
    /// Produces slightly smaller files
39
    /// that can still be read and written rather quickly.
40
    /// The compressed file size is usually between 60 and 75 percent of the uncompressed size.
41
    /// Works best for images with large flat areas, such as masks and abstract graphics.
42
    /// This compression method is lossless.
43
    RLE,
44
45
    /// Uses ZIP compression to compress each line. Slowly produces small images
46
    /// which can be read with moderate speed. This compression method is lossless.
47
    /// Might be slightly faster but larger than `ZIP16`.
48
    ZIP1,  // TODO ZIP { individual_lines: bool, compression_level: Option<u8> }  // TODO specify zip compression level?
49
50
    /// Uses ZIP compression to compress blocks of 16 lines. Slowly produces small images
51
    /// which can be read with moderate speed. This compression method is lossless.
52
    /// Might be slightly slower but smaller than `ZIP1`.
53
    ZIP16, // TODO collapse with ZIP1
54
55
    /// PIZ compression works well for noisy and natural images. Works better with larger tiles.
56
    /// Only supported for flat images, but not for deep data.
57
    /// This compression method is lossless.
58
    // A wavelet transform is applied to the pixel data, and the result is Huffman-
59
    // encoded. This scheme tends to provide the best compression ratio for the types of
60
    // images that are typically processed at Industrial Light & Magic. Files are
61
    // compressed and decompressed at roughly the same speed. For photographic
62
    // images with film grain, the files are reduced to between 35 and 55 percent of their
63
    // uncompressed size.
64
    // PIZ compression works well for scan-line based files, and also for tiled files with
65
    // large tiles, but small tiles do not shrink much. (PIZ-compressed data start with a
66
    // relatively long header; if the input to the compressor is short, adding the header
67
    // tends to offset any size reduction of the input.)
68
    PIZ,
69
70
    /// Like `ZIP1`, but reduces precision of `f32` images to `f24`.
71
    /// Therefore, this is lossless compression for `f16` and `u32` data, lossy compression for `f32` data.
72
    /// This compression method works well for depth
73
    /// buffers and similar images, where the possible range of values is very large, but
74
    /// where full 32-bit floating-point accuracy is not necessary. Rounding improves
75
    /// compression significantly by eliminating the pixels' 8 least significant bits, which
76
    /// tend to be very noisy, and therefore difficult to compress.
77
    /// This produces really small image files. Only supported for flat images, not for deep data.
78
    // After reducing 32-bit floating-point data to 24 bits by rounding (while leaving 16-bit
79
    // floating-point data unchanged), differences between horizontally adjacent pixels
80
    // are compressed with zlib, similar to ZIP. PXR24 compression preserves image
81
    // channels of type HALF and UINT exactly, but the relative error of FLOAT data
82
    // increases to about ???.
83
    PXR24, // TODO specify zip compression level?
84
85
    /// This is a lossy compression method for f16 images.
86
    /// It's the predecessor of the `B44A` compression,
87
    /// which has improved compression rates for uniformly colored areas.
88
    /// You should probably use `B44A` instead of the plain `B44`.
89
    ///
90
    /// Only supported for flat images, not for deep data.
91
    // lossy 4-by-4 pixel block compression,
92
    // flat fields are compressed more
93
    // Channels of type HALF are split into blocks of four by four pixels or 32 bytes. Each
94
    // block is then packed into 14 bytes, reducing the data to 44 percent of their
95
    // uncompressed size. When B44 compression is applied to RGB images in
96
    // combination with luminance/chroma encoding (see below), the size of the
97
    // compressed pixels is about 22 percent of the size of the original RGB data.
98
    // Channels of type UINT or FLOAT are not compressed.
99
    // Decoding is fast enough to allow real-time playback of B44-compressed OpenEXR
100
    // image sequences on commodity hardware.
101
    // The size of a B44-compressed file depends on the number of pixels in the image,
102
    // but not on the data in the pixels. All images with the same resolution and the same
103
    // set of channels have the same size. This can be advantageous for systems that
104
    // support real-time playback of image sequences; the predictable file size makes it
105
    // easier to allocate space on storage media efficiently.
106
    // B44 compression is only supported for flat images.
107
    B44, // TODO B44 { optimize_uniform_areas: bool }
108
109
    /// This is a lossy compression method for f16 images.
110
    /// All f32 and u32 channels will be stored without compression.
111
    /// All the f16 pixels are divided into 4x4 blocks.
112
    /// Each block is then compressed as a whole.
113
    ///
114
    /// The 32 bytes of a block will require only ~14 bytes after compression,
115
    /// independent of the actual pixel contents. With chroma subsampling,
116
    /// a block will be compressed to ~7 bytes.
117
    /// Uniformly colored blocks will be compressed to ~3 bytes.
118
    ///
119
    /// The 512 bytes of an f32 block will not be compressed at all.
120
    ///
121
    /// Should be fast enough for realtime playback.
122
    /// Only supported for flat images, not for deep data.
123
    B44A, // TODO collapse with B44
124
125
    /// __This lossy compression is not yet supported by this implementation.__
126
    // lossy DCT based compression, in blocks
127
    // of 32 scanlines. More efficient for partial buffer access.
128
    DWAA(Option<f32>), // TODO does this have a default value? make this non optional? default Compression Level setting is 45.0
129
130
    /// __This lossy compression is not yet supported by this implementation.__
131
    // lossy DCT based compression, in blocks
132
    // of 256 scanlines. More efficient space
133
    // wise and faster to decode full frames
134
    // than DWAA_COMPRESSION.
135
    DWAB(Option<f32>), // TODO collapse with DWAA. default Compression Level setting is 45.0
136
137
    /// __This lossy compression is not yet supported by this implementation.__
138
    // High-Throughput JPEG 2000 (32 lines)
139
    HTJ2K32,
140
141
    /// __This lossy compression is not yet supported by this implementation.__
142
    // High-Throughput JPEG 2000 (256 lines)
143
    HTJ2K256,
144
}
145
146
impl std::fmt::Display for Compression {
147
1
    fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
148
1
        write!(formatter, "{} compression", match self {
149
0
            Compression::Uncompressed => "no",
150
0
            Compression::RLE => "rle",
151
0
            Compression::ZIP1 => "zip line",
152
0
            Compression::ZIP16 => "zip block",
153
0
            Compression::B44 => "b44",
154
0
            Compression::B44A => "b44a",
155
0
            Compression::DWAA(_) => "dwaa",
156
1
            Compression::DWAB(_) => "dwab",
157
0
            Compression::PIZ => "piz",
158
0
            Compression::PXR24 => "pxr24",
159
0
            Compression::HTJ2K32 => "ht j2k 32",
160
0
            Compression::HTJ2K256 => "ht j2k 256",
161
        })
162
1
    }
163
}
164
165
166
167
impl Compression {
168
169
    /// Compress the image section, converting from native endian into little-endian format.
170
67.4k
    pub fn compress_image_section_to_le(self, header: &Header, uncompressed_native_endian: ByteVec, pixel_section: IntegerBounds) -> Result<ByteVec> {
171
67.4k
        let max_tile_size = header.max_block_pixel_size();
172
173
67.4k
        assert!(pixel_section.validate(Some(max_tile_size)).is_ok(), "decompress tile coordinate bug");
174
67.4k
        if header.deep { assert!(self.supports_deep_data()) }
175
176
        use self::Compression::*;
177
67.4k
        let compressed_little_endian = match self {
178
            Uncompressed => {
179
0
                return convert_current_to_little_endian(
180
0
                    uncompressed_native_endian, &header.channels, pixel_section
181
                )
182
            },
183
184
            // we need to clone here, because we might have to fallback to the uncompressed data later (when compressed data is larger than raw data)
185
0
            ZIP16 => zip::compress_bytes(&header.channels, uncompressed_native_endian.clone(), pixel_section),
186
0
            ZIP1 => zip::compress_bytes(&header.channels, uncompressed_native_endian.clone(), pixel_section),
187
67.4k
            RLE => rle::compress_bytes(&header.channels, uncompressed_native_endian.clone(), pixel_section),
188
0
            PIZ => piz::compress(&header.channels, uncompressed_native_endian.clone(), pixel_section),
189
0
            PXR24 => pxr24::compress(&header.channels, uncompressed_native_endian.clone(), pixel_section),
190
0
            B44 => b44::compress(&header.channels, uncompressed_native_endian.clone(), pixel_section, false),
191
0
            B44A => b44::compress(&header.channels, uncompressed_native_endian.clone(), pixel_section, true),
192
0
            _ => return Err(Error::unsupported(format!("yet unimplemented compression method: {}", self)))
193
        };
194
195
67.4k
        let compressed_little_endian = compressed_little_endian.map_err(|_|
196
0
            Error::invalid(format!("pixels cannot be compressed ({})", self))
197
0
        )?;
198
199
67.4k
        if self == Uncompressed || compressed_little_endian.len() < uncompressed_native_endian.len() {
200
            // only write compressed if it actually is smaller than raw
201
67.3k
            Ok(compressed_little_endian)
202
        }
203
        else {
204
            // if we do not use compression, manually convert uncompressed data
205
55
            convert_current_to_little_endian(uncompressed_native_endian, &header.channels, pixel_section)
206
        }
207
67.4k
    }
208
209
    /// Decompress the image section from bytes of little-endian format, returning native-endian format.
210
72.5k
    pub fn decompress_image_section_from_le(self, header: &Header, compressed_le: ByteVec, pixel_section: IntegerBounds, pedantic: bool) -> Result<ByteVec> {
211
72.5k
        let max_tile_size = header.max_block_pixel_size();
212
213
72.5k
        assert!(pixel_section.validate(Some(max_tile_size)).is_ok(), "decompress tile coordinate bug");
214
72.5k
        if header.deep { assert!(self.supports_deep_data()) }
215
216
72.5k
        let expected_byte_size = pixel_section.size.area() * header.channels.bytes_per_pixel; // FIXME this needs to account for subsampling anywhere
217
218
        // note: always true where self == Uncompressed
219
72.5k
        if compressed_le.len() == expected_byte_size {
220
            // the compressed data was larger than the raw data, so the small raw data has been written
221
72
            convert_little_endian_to_current(compressed_le, &header.channels, pixel_section)
222
        }
223
        else {
224
            use self::Compression::*;
225
72.4k
            let bytes_ne = match self {
226
2
                Uncompressed => convert_little_endian_to_current(compressed_le, &header.channels, pixel_section),
227
7
                ZIP16 => zip::decompress_bytes(&header.channels, compressed_le, pixel_section, expected_byte_size, pedantic),
228
0
                ZIP1 => zip::decompress_bytes(&header.channels, compressed_le, pixel_section, expected_byte_size, pedantic),
229
72.2k
                RLE => rle::decompress_bytes(&header.channels, compressed_le, pixel_section, expected_byte_size, pedantic),
230
63
                PIZ => piz::decompress(&header.channels, compressed_le, pixel_section, expected_byte_size, pedantic),
231
1
                PXR24 => pxr24::decompress(&header.channels, compressed_le, pixel_section, expected_byte_size, pedantic),
232
130
                B44 | B44A => b44::decompress(&header.channels, compressed_le, pixel_section, expected_byte_size, pedantic),
233
1
                _ => return Err(Error::unsupported(format!("yet unimplemented compression method: {}", self)))
234
            };
235
236
            // map all errors to compression errors
237
72.4k
            let bytes_ne = bytes_ne
238
72.4k
                .map_err(|decompression_error| match decompression_error {
239
0
                    Error::NotSupported(message) =>
240
0
                        Error::unsupported(format!("yet unimplemented compression special case ({})", message)),
241
242
204
                    error => Error::invalid(format!(
243
204
                        "compressed {:?} data ({})",
244
204
                        self, error.to_string()
245
                    )),
246
204
                })?;
247
248
72.2k
            if bytes_ne.len() != expected_byte_size {
249
146
                Err(Error::invalid("decompressed data"))
250
            }
251
252
72.1k
            else { Ok(bytes_ne) }
253
        }
254
72.5k
    }
255
256
    /// For scan line images and deep scan line images, one or more scan lines may be
257
    /// stored together as a scan line block. The number of scan lines per block
258
    /// depends on how the pixel data are compressed.
259
961k
    pub fn scan_lines_per_block(self) -> usize {
260
        use self::Compression::*;
261
961k
        match self {
262
349k
            Uncompressed | RLE     | ZIP1              => 1,
263
2.81k
            ZIP16   | PXR24                            => 16,
264
609k
            PIZ     | B44   | B44A | DWAA(_) | HTJ2K32 => 32,
265
0
            DWAB(_) | HTJ2K256                         => 256,
266
        }
267
961k
    }
268
269
    /// Deep data can only be compressed using RLE or ZIP compression.
270
0
    pub fn supports_deep_data(self) -> bool {
271
        use self::Compression::*;
272
0
        match self {
273
            Uncompressed | RLE | ZIP1 =>
274
0
                true,
275
276
            ZIP16 | PXR24 | PIZ | B44 | B44A |
277
            DWAA(_) | DWAB(_) | HTJ2K256 | HTJ2K32 =>
278
0
                false,
279
        }
280
0
    }
281
282
    /// Most compression methods will reconstruct the exact pixel bytes,
283
    /// but some might throw away unimportant data for specific types of samples.
284
0
    pub fn is_lossless_for(self, sample_type: SampleType) -> bool {
285
        use self::Compression::*;
286
0
        match self {
287
0
            PXR24 => sample_type != SampleType::F32, // pxr reduces f32 to f24
288
0
            B44 | B44A => sample_type != SampleType::F16, // b44 only compresses f16 values, others are left uncompressed
289
0
            Uncompressed | RLE | ZIP1 | ZIP16 | PIZ | HTJ2K32 | HTJ2K256 => true,
290
0
            DWAB(_) | DWAA(_) => false,
291
        }
292
0
    }
293
294
    /// Most compression methods will reconstruct the exact pixel bytes,
295
    /// but some might throw away unimportant data in some cases.
296
67.4k
    pub fn may_loose_data(self) -> bool {
297
        use self::Compression::*;
298
67.4k
        match self {
299
67.4k
            Uncompressed | RLE | ZIP1 | ZIP16 | PIZ | HTJ2K32 | HTJ2K256 => false,
300
0
            PXR24 | B44 | B44A | DWAB(_) | DWAA(_) => true,
301
        }
302
67.4k
    }
303
304
    /// Most compression methods will reconstruct the exact pixel bytes,
305
    /// but some might replace NaN with zeroes.
306
    /// This might also depend on the sample type of the pixels.
307
    /// Even a compression method that supports NaN might change the bit patterns of those NaNs.
308
0
    pub fn supports_nan(self) -> bool {
309
        use self::Compression::*;
310
0
        match self {
311
0
            B44A | DWAB(_) | DWAA(_) => false,
312
0
            Uncompressed | PXR24 | RLE | ZIP1 | ZIP16 | PIZ | B44 | HTJ2K32 | HTJ2K256 => true,
313
        }
314
0
    }
315
316
    /// Most compression methods will reconstruct the exact pixel and NaN bits,
317
    /// but some might replace NaN bits with other NaN bits.
318
    /// This might also depend on the sample type of the pixels.
319
0
    pub fn preserves_nan_bits(self) -> bool {
320
        use self::Compression::*;
321
0
        match self {
322
0
            B44A | PXR24 | DWAB(_) | DWAA(_) => false,
323
0
            B44 | Uncompressed | RLE | ZIP1 | ZIP16 | PIZ | HTJ2K32 | HTJ2K256 => true
324
        }
325
0
    }
326
327
}
328
329
// see https://github.com/AcademySoftwareFoundation/openexr/blob/6a9f8af6e89547bcd370ae3cec2b12849eee0b54/OpenEXR/IlmImf/ImfMisc.cpp#L1456-L1541
330
331
#[allow(unused)] // allows the extra parameters to be unused
332
67.4k
fn convert_current_to_little_endian(mut bytes: ByteVec, channels: &ChannelList, rectangle: IntegerBounds) -> Result<ByteVec> {
333
    #[cfg(target_endian = "big")]
334
    reverse_block_endianness(&mut bytes, channels, rectangle)?;
335
336
67.4k
    Ok(bytes)
337
67.4k
}
338
339
#[allow(unused)] // allows the extra parameters to be unused
340
72.3k
fn convert_little_endian_to_current(mut bytes: ByteVec, channels: &ChannelList, rectangle: IntegerBounds) -> Result<ByteVec> {
341
    #[cfg(target_endian = "big")]
342
    reverse_block_endianness(&mut bytes, channels, rectangle)?;
343
344
72.3k
    Ok(bytes)
345
72.3k
}
346
347
#[allow(unused)] // unused when on little endian system
348
0
fn reverse_block_endianness(bytes: &mut [u8], channels: &ChannelList, rectangle: IntegerBounds) -> UnitResult {
349
0
    let mut remaining_bytes: &mut [u8] = bytes;
350
351
0
    for y in rectangle.position.y() .. rectangle.end().y() {
352
0
        for channel in &channels.list {
353
0
            let line_is_subsampled = mod_p(y, usize_to_i32(channel.sampling.y(), "sampling")?) != 0;
354
0
            if line_is_subsampled { continue; }
355
356
0
            let sample_count = rectangle.size.width() / channel.sampling.x();
357
358
0
            match channel.sample_type {
359
                SampleType::F16 =>
360
0
                    remaining_bytes = convert_byte_chunks(reverse_2_bytes, 2, remaining_bytes, sample_count),
361
362
                SampleType::F32 =>
363
0
                    remaining_bytes = convert_byte_chunks(reverse_4_bytes, 4, remaining_bytes, sample_count),
364
365
                SampleType::U32 =>
366
0
                    remaining_bytes = convert_byte_chunks(reverse_4_bytes, 4, remaining_bytes, sample_count),
367
            }
368
        }
369
    }
370
371
    // Converts groups of bytes (e.g. 2 bytes), as many groups as specified. Returns a slice of the remaining bytes.
372
    #[inline]
373
0
    fn convert_byte_chunks(convert_single_value: fn(&mut[u8]), batch_size: usize, bytes: &mut [u8], batch_count: usize) -> &mut [u8] {
374
0
        let (line_bytes, rest) = bytes.split_at_mut(batch_count * batch_size);
375
0
        let value_byte_chunks = line_bytes.chunks_exact_mut(batch_size);
376
377
0
        for value_bytes in value_byte_chunks {
378
0
            convert_single_value(value_bytes);
379
0
        }
380
381
0
        rest
382
0
    }
383
384
0
    debug_assert!(remaining_bytes.is_empty(), "not all bytes were converted to little endian");
385
0
    Ok(())
386
0
}
387
388
#[inline]
389
0
fn reverse_2_bytes(bytes: &mut [u8]){
390
    // this code seems like it could be optimized easily by the compiler
391
0
    let two_bytes: [u8; 2] = bytes.try_into().expect("invalid byte count");
392
0
    bytes.copy_from_slice(&[two_bytes[1], two_bytes[0]]);
393
0
}
394
395
#[inline]
396
0
fn reverse_4_bytes(bytes: &mut [u8]){
397
0
    let four_bytes: [u8; 4] = bytes.try_into().expect("invalid byte count");
398
0
    bytes.copy_from_slice(&[four_bytes[3], four_bytes[2], four_bytes[1], four_bytes[0]]);
399
0
}
400
401
#[inline]
402
10.2k
fn div_p (x: i32, y: i32) -> i32 {
403
10.2k
    if x >= 0 {
404
10.2k
        if y >= 0 { x  / y }
405
0
        else { -(x  / -y) }
406
    }
407
    else {
408
0
        if y >= 0 { -((y-1-x) / y) }
409
0
        else { (-y-1-x) / -y }
410
    }
411
10.2k
}
412
413
#[inline]
414
10.2k
fn mod_p(x: i32, y: i32) -> i32 {
415
10.2k
    x - y * div_p(x, y)
416
10.2k
}
417
418
/// A collection of functions used to prepare data for compression.
419
mod optimize_bytes {
420
421
    /// Integrate over all differences to the previous value in order to reconstruct sample values.
422
72.2k
    pub fn differences_to_samples(buffer: &mut [u8]) {
423
        // The naive implementation is very simple:
424
        //
425
        // for index in 1..buffer.len() {
426
        //    buffer[index] = (buffer[index - 1] as i32 + buffer[index] as i32 - 128) as u8;
427
        // }
428
        //
429
        // But we process elements in pairs to take advantage of instruction-level parallelism.
430
        // When computations within a pair do not depend on each other, they can be processed in parallel.
431
        // Since this function is responsible for a very large chunk of execution time,
432
        // this tweak alone improves decoding performance of RLE images by 20%.
433
72.2k
        if let Some(first) = buffer.get(0) {
434
72.2k
            let mut previous = *first as i16;
435
243M
            for chunk in &mut buffer[1..].chunks_exact_mut(2) {
436
243M
                // no bounds checks here due to indices and chunk size being constant
437
243M
                let diff0 = chunk[0] as i16;
438
243M
                let diff1 = chunk[1] as i16;
439
243M
                // these two computations do not depend on each other, unlike in the naive version,
440
243M
                // so they can be executed by the CPU in parallel via instruction-level parallelism
441
243M
                let sample0 = (previous + diff0 - 128) as u8;
442
243M
                let sample1 = (previous + diff0 + diff1 - 128 * 2) as u8;
443
243M
                chunk[0] = sample0;
444
243M
                chunk[1] = sample1;
445
243M
                previous = sample1 as i16;
446
243M
            }
447
            // handle the remaining element at the end not processed by the loop over pairs, if present
448
72.2k
            for elem in &mut buffer[1..].chunks_exact_mut(2).into_remainder().iter_mut() {
449
72.1k
                let sample = (previous + *elem as i16 - 128) as u8;
450
72.1k
                *elem = sample;
451
72.1k
                previous = sample as i16;
452
72.1k
            }
453
3
        }
454
72.2k
    }
455
456
    /// Derive over all values in order to produce differences to the previous value.
457
67.4k
    pub fn samples_to_differences(buffer: &mut [u8]){
458
        // naive version:
459
        // for index in (1..buffer.len()).rev() {
460
        //     buffer[index] = (buffer[index] as i32 - buffer[index - 1] as i32 + 128) as u8;
461
        // }
462
        //
463
        // But we process elements in batches to take advantage of autovectorization.
464
        // If the target platform has no vector instructions (e.g. 32-bit ARM without `-C target-cpu=native`)
465
        // this will instead take advantage of instruction-level parallelism.
466
67.4k
        if let Some(first) = buffer.get(0) {
467
67.4k
            let mut previous = *first as i16;
468
            // Chunk size is 16 because we process bytes (8 bits),
469
            // and 8*16 = 128 bits is the size of a typical SIMD register.
470
            // Even WASM has 128-bit SIMD registers.
471
13.6M
            for chunk in &mut buffer[1..].chunks_exact_mut(16) {
472
13.6M
                // no bounds checks here due to indices and chunk size being constant
473
13.6M
                let sample0 = chunk[0] as i16;
474
13.6M
                let sample1 = chunk[1] as i16;
475
13.6M
                let sample2 = chunk[2] as i16;
476
13.6M
                let sample3 = chunk[3] as i16;
477
13.6M
                let sample4 = chunk[4] as i16;
478
13.6M
                let sample5 = chunk[5] as i16;
479
13.6M
                let sample6 = chunk[6] as i16;
480
13.6M
                let sample7 = chunk[7] as i16;
481
13.6M
                let sample8 = chunk[8] as i16;
482
13.6M
                let sample9 = chunk[9] as i16;
483
13.6M
                let sample10 = chunk[10] as i16;
484
13.6M
                let sample11 = chunk[11] as i16;
485
13.6M
                let sample12 = chunk[12] as i16;
486
13.6M
                let sample13 = chunk[13] as i16;
487
13.6M
                let sample14 = chunk[14] as i16;
488
13.6M
                let sample15 = chunk[15] as i16;
489
13.6M
                // Unlike in decoding, computations in here are truly independent from each other,
490
13.6M
                // which enables the compiler to vectorize this loop.
491
13.6M
                // Even if the target platform has no vector instructions,
492
13.6M
                // so using more parallelism doesn't imply doing more work,
493
13.6M
                // and we're not really limited in how wide we can go.
494
13.6M
                chunk[0] = (sample0 - previous + 128) as u8;
495
13.6M
                chunk[1] = (sample1 - sample0 + 128) as u8;
496
13.6M
                chunk[2] = (sample2 - sample1 + 128) as u8;
497
13.6M
                chunk[3] = (sample3 - sample2 + 128) as u8;
498
13.6M
                chunk[4] = (sample4 - sample3 + 128) as u8;
499
13.6M
                chunk[5] = (sample5 - sample4 + 128) as u8;
500
13.6M
                chunk[6] = (sample6 - sample5 + 128) as u8;
501
13.6M
                chunk[7] = (sample7 - sample6 + 128) as u8;
502
13.6M
                chunk[8] = (sample8 - sample7 + 128) as u8;
503
13.6M
                chunk[9] = (sample9 - sample8 + 128) as u8;
504
13.6M
                chunk[10] = (sample10 - sample9 + 128) as u8;
505
13.6M
                chunk[11] = (sample11 - sample10 + 128) as u8;
506
13.6M
                chunk[12] = (sample12 - sample11 + 128) as u8;
507
13.6M
                chunk[13] = (sample13 - sample12 + 128) as u8;
508
13.6M
                chunk[14] = (sample14 - sample13 + 128) as u8;
509
13.6M
                chunk[15] = (sample15 - sample14 + 128) as u8;
510
13.6M
                previous = sample15;
511
13.6M
            }
512
            // Handle the remaining element at the end not processed by the loop over batches, if present
513
            // This is what the iterator-based version of this function would look like without vectorization
514
1.01M
            for elem in &mut buffer[1..].chunks_exact_mut(16).into_remainder().iter_mut() {
515
1.01M
                let diff = (*elem as i16 - previous + 128) as u8;
516
1.01M
                previous = *elem as i16;
517
1.01M
                *elem = diff;
518
1.01M
            }
519
0
        }
520
67.4k
    }
521
522
    use std::cell::Cell;
523
    thread_local! {
524
        // A buffer for reusing between invocations of interleaving and deinterleaving.
525
        // Allocating memory is cheap, but zeroing or otherwise initializing it is not.
526
        // Doing it hundreds of times (once per block) would be expensive.
527
        // This optimization brings down the time spent in interleaving from 15% to 5%.
528
        static SCRATCH_SPACE: Cell<Vec<u8>> = Cell::new(Vec::new());
529
    }
530
531
139k
    fn with_reused_buffer<F>(length: usize, mut func: F) where F: FnMut(&mut [u8]) {
532
139k
        SCRATCH_SPACE.with(|scratch_space| {
533
            // reuse a buffer if we've already initialized one
534
139k
            let mut buffer = scratch_space.take();
535
139k
            if buffer.len() < length {
536
4.81k
                // Efficiently create a zeroed Vec by requesting zeroed memory from the OS.
537
4.81k
                // This is slightly faster than a `memcpy()` plus `memset()` that would happen otherwise,
538
4.81k
                // but is not a big deal either way since it's not a hot codepath.
539
4.81k
                buffer = vec![0u8; length];
540
134k
            }
541
542
            // call the function
543
139k
            func(&mut buffer[..length]);
544
545
            // save the internal buffer for reuse
546
139k
            scratch_space.set(buffer);
547
139k
        });
exr::compression::optimize_bytes::with_reused_buffer::<exr::compression::optimize_bytes::interleave_byte_blocks::{closure#0}>::{closure#0}
Line
Count
Source
532
72.2k
        SCRATCH_SPACE.with(|scratch_space| {
533
            // reuse a buffer if we've already initialized one
534
72.2k
            let mut buffer = scratch_space.take();
535
72.2k
            if buffer.len() < length {
536
3.50k
                // Efficiently create a zeroed Vec by requesting zeroed memory from the OS.
537
3.50k
                // This is slightly faster than a `memcpy()` plus `memset()` that would happen otherwise,
538
3.50k
                // but is not a big deal either way since it's not a hot codepath.
539
3.50k
                buffer = vec![0u8; length];
540
68.7k
            }
541
542
            // call the function
543
72.2k
            func(&mut buffer[..length]);
544
545
            // save the internal buffer for reuse
546
72.2k
            scratch_space.set(buffer);
547
72.2k
        });
exr::compression::optimize_bytes::with_reused_buffer::<exr::compression::optimize_bytes::separate_bytes_fragments::{closure#0}>::{closure#0}
Line
Count
Source
532
67.4k
        SCRATCH_SPACE.with(|scratch_space| {
533
            // reuse a buffer if we've already initialized one
534
67.4k
            let mut buffer = scratch_space.take();
535
67.4k
            if buffer.len() < length {
536
1.31k
                // Efficiently create a zeroed Vec by requesting zeroed memory from the OS.
537
1.31k
                // This is slightly faster than a `memcpy()` plus `memset()` that would happen otherwise,
538
1.31k
                // but is not a big deal either way since it's not a hot codepath.
539
1.31k
                buffer = vec![0u8; length];
540
66.1k
            }
541
542
            // call the function
543
67.4k
            func(&mut buffer[..length]);
544
545
            // save the internal buffer for reuse
546
67.4k
            scratch_space.set(buffer);
547
67.4k
        });
548
139k
    }
exr::compression::optimize_bytes::with_reused_buffer::<exr::compression::optimize_bytes::interleave_byte_blocks::{closure#0}>
Line
Count
Source
531
72.2k
    fn with_reused_buffer<F>(length: usize, mut func: F) where F: FnMut(&mut [u8]) {
532
72.2k
        SCRATCH_SPACE.with(|scratch_space| {
533
            // reuse a buffer if we've already initialized one
534
            let mut buffer = scratch_space.take();
535
            if buffer.len() < length {
536
                // Efficiently create a zeroed Vec by requesting zeroed memory from the OS.
537
                // This is slightly faster than a `memcpy()` plus `memset()` that would happen otherwise,
538
                // but is not a big deal either way since it's not a hot codepath.
539
                buffer = vec![0u8; length];
540
            }
541
542
            // call the function
543
            func(&mut buffer[..length]);
544
545
            // save the internal buffer for reuse
546
            scratch_space.set(buffer);
547
        });
548
72.2k
    }
exr::compression::optimize_bytes::with_reused_buffer::<exr::compression::optimize_bytes::separate_bytes_fragments::{closure#0}>
Line
Count
Source
531
67.4k
    fn with_reused_buffer<F>(length: usize, mut func: F) where F: FnMut(&mut [u8]) {
532
67.4k
        SCRATCH_SPACE.with(|scratch_space| {
533
            // reuse a buffer if we've already initialized one
534
            let mut buffer = scratch_space.take();
535
            if buffer.len() < length {
536
                // Efficiently create a zeroed Vec by requesting zeroed memory from the OS.
537
                // This is slightly faster than a `memcpy()` plus `memset()` that would happen otherwise,
538
                // but is not a big deal either way since it's not a hot codepath.
539
                buffer = vec![0u8; length];
540
            }
541
542
            // call the function
543
            func(&mut buffer[..length]);
544
545
            // save the internal buffer for reuse
546
            scratch_space.set(buffer);
547
        });
548
67.4k
    }
549
550
    /// Interleave the bytes such that the second half of the array is every other byte.
551
72.2k
    pub fn interleave_byte_blocks(separated: &mut [u8]) {
552
72.2k
        with_reused_buffer(separated.len(), |interleaved| {
553
554
            // Split the two halves that we are going to interleave.
555
72.2k
            let (first_half, second_half) = separated.split_at((separated.len() + 1) / 2);
556
            // The first half can be 1 byte longer than the second if the length of the input is odd,
557
            // but the loop below only processes numbers in pairs.
558
            // To handle it, preserve the last element of the first slice, to be handled after the loop.
559
72.2k
            let first_half_last = first_half.last();
560
            // Truncate the first half to match the length of the second one; more optimizer-friendly
561
72.2k
            let first_half_iter = &first_half[..second_half.len()];
562
563
            // Main loop that performs the interleaving
564
243M
            for ((first, second), interleaved) in first_half_iter.iter().zip(second_half.iter())
565
243M
                .zip(interleaved.chunks_exact_mut(2)) {
566
243M
                    // The length of each chunk is known to be 2 at compile time,
567
243M
                    // and each index is also a constant.
568
243M
                    // This allows the compiler to remove the bounds checks.
569
243M
                    interleaved[0] = *first;
570
243M
                    interleaved[1] = *second;
571
243M
            }
572
573
            // If the length of the slice was odd, restore the last element of the first half that we saved
574
72.2k
            if interleaved.len() % 2 == 1 {
575
71
                if let Some(value) = first_half_last {
576
71
                    // we can unwrap() here because we just checked that the length is non-zero:
577
71
                    // `% 2 == 1` will fail for zero
578
71
                    *interleaved.last_mut().unwrap() = *value;
579
71
                }
580
72.1k
            }
581
582
            // write out the results
583
72.2k
            separated.copy_from_slice(&interleaved);
584
72.2k
        });
585
72.2k
    }
586
587
    /// Separate the bytes such that the second half contains every other byte.
588
    /// This performs deinterleaving - the inverse of interleaving.
589
67.4k
    pub fn separate_bytes_fragments(source: &mut [u8]) {
590
67.4k
        with_reused_buffer(source.len(), |separated| {
591
592
            // Split the two halves that we are going to interleave.
593
67.4k
            let (first_half, second_half) = separated.split_at_mut((source.len() + 1) / 2);
594
            // The first half can be 1 byte longer than the second if the length of the input is odd,
595
            // but the loop below only processes numbers in pairs.
596
            // To handle it, preserve the last element of the input, to be handled after the loop.
597
67.4k
            let last = source.last();
598
67.4k
            let first_half_iter = &mut first_half[..second_half.len()];
599
600
            // Main loop that performs the deinterleaving
601
109M
            for ((first, second), interleaved) in first_half_iter.iter_mut().zip(second_half.iter_mut())
602
109M
                .zip(source.chunks_exact(2)) {
603
109M
                    // The length of each chunk is known to be 2 at compile time,
604
109M
                    // and each index is also a constant.
605
109M
                    // This allows the compiler to remove the bounds checks.
606
109M
                    *first = interleaved[0];
607
109M
                    *second = interleaved[1];
608
109M
            }
609
610
            // If the length of the slice was odd, restore the last element of the input that we saved
611
67.4k
            if source.len() % 2 == 1 {
612
0
                if let Some(value) = last {
613
0
                    // we can unwrap() here because we just checked that the length is non-zero:
614
0
                    // `% 2 == 1` will fail for zero
615
0
                    *first_half.last_mut().unwrap() = *value;
616
0
                }
617
67.4k
            }
618
619
            // write out the results
620
67.4k
            source.copy_from_slice(&separated);
621
67.4k
        });
622
67.4k
    }
623
624
625
    #[cfg(test)]
626
    pub mod test {
627
628
        #[test]
629
        fn roundtrip_interleave(){
630
            let source = vec![ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ];
631
            let mut modified = source.clone();
632
633
            super::separate_bytes_fragments(&mut modified);
634
            super::interleave_byte_blocks(&mut modified);
635
636
            assert_eq!(source, modified);
637
        }
638
639
        #[test]
640
        fn roundtrip_derive(){
641
            let source = vec![ 0, 1, 2, 7, 4, 5, 6, 7, 13, 9, 10 ];
642
            let mut modified = source.clone();
643
644
            super::samples_to_differences(&mut modified);
645
            super::differences_to_samples(&mut modified);
646
647
            assert_eq!(source, modified);
648
        }
649
650
    }
651
}
652
653
654
#[cfg(test)]
655
mod test {
656
    use super::*;
657
    use crate::meta::attribute::ChannelDescription;
658
    use crate::block::samples::IntoNativeSample;
659
660
    #[test]
661
    fn roundtrip_endianness_mixed_channels(){
662
        let a32 = ChannelDescription::new("A", SampleType::F32, true);
663
        let y16 = ChannelDescription::new("Y", SampleType::F16, true);
664
        let channels = ChannelList::new(smallvec![ a32, y16 ]);
665
666
        let data = vec![
667
            23582740683_f32.to_ne_bytes().as_slice(),
668
            35827420683_f32.to_ne_bytes().as_slice(),
669
            27406832358_f32.to_f16().to_ne_bytes().as_slice(),
670
            74062358283_f32.to_f16().to_ne_bytes().as_slice(),
671
672
            52582740683_f32.to_ne_bytes().as_slice(),
673
            45827420683_f32.to_ne_bytes().as_slice(),
674
            15406832358_f32.to_f16().to_ne_bytes().as_slice(),
675
            65062358283_f32.to_f16().to_ne_bytes().as_slice(),
676
        ].into_iter().flatten().map(|x| *x).collect();
677
678
        roundtrip_convert_endianness(
679
            data, &channels,
680
            IntegerBounds::from_dimensions((2, 2))
681
        );
682
    }
683
684
    fn roundtrip_convert_endianness(
685
        current_endian: ByteVec, channels: &ChannelList, rectangle: IntegerBounds
686
    ){
687
        let little_endian = convert_current_to_little_endian(
688
            current_endian.clone(), channels, rectangle
689
        ).unwrap();
690
691
        let current_endian_decoded = convert_little_endian_to_current(
692
            little_endian.clone(), channels, rectangle
693
        ).unwrap();
694
695
        assert_eq!(current_endian, current_endian_decoded, "endianness conversion failed");
696
    }
697
}