/rust/registry/src/index.crates.io-1949cf8c6b5b557f/rav1e-0.7.1/src/dist.rs

Source
// Copyright (c) 2019-2022, The rav1e contributors. All rights reserved
//
// This source code is subject to the terms of the BSD 2 Clause License and
// the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
// was not distributed with this source code in the LICENSE file, you can
// obtain it at www.aomedia.org/license/software. If the Alliance for Open
// Media Patent License 1.0 was not distributed with this source code in the
// PATENTS file, you can obtain it at www.aomedia.org/license/patent.

// Choose which implementation of the distortion functions this module
// re-exports: the items of `crate::asm::x86::dist` when `nasm_x86_64` is set,
// the items of `crate::asm::aarch64::dist` when `asm_neon` is set, and the
// plain Rust versions from the `rust` submodule below otherwise.
cfg_if::cfg_if! {
  if #[cfg(nasm_x86_64)] {
    pub use crate::asm::x86::dist::*;
  } else if #[cfg(asm_neon)] {
    pub use crate::asm::aarch64::dist::*;
  } else {
    pub use self::rust::*;
  }
}

pub(crate) mod rust {
  use crate::activity::apply_ssim_boost;
  use crate::cpu_features::CpuFeatureLevel;
  use crate::tiling::*;
  use crate::util::*;

  use crate::encoder::IMPORTANCE_BLOCK_SIZE;
  use crate::rdo::DistortionScale;

  /// Sum of absolute differences (SAD) between two plane regions.
  ///
  /// Only the top-left `w`x`h` area of each region is compared. `w` and `h`
  /// can be at most 128, the size of the largest block; this limit is only
  /// checked in debug builds. `_bit_depth` and `_cpu` are unused here.
  pub fn get_sad<T: Pixel>(
    plane_org: &PlaneRegion<'_, T>, plane_ref: &PlaneRegion<'_, T>, w: usize,
    h: usize, _bit_depth: usize, _cpu: CpuFeatureLevel,
  ) -> u32 {
    debug_assert!(w <= 128 && h <= 128);

    // Restrict both inputs to the block being measured.
    let block = Area::Rect { x: 0, y: 0, width: w, height: h };
    let org = plane_org.subregion(block);
    let reference = plane_ref.subregion(block);

    let mut total: u32 = 0;
    for (org_row, ref_row) in org.rows_iter().zip(reference.rows_iter()) {
      // Accumulate one row at a time, then fold it into the block total.
      let mut row_total: u32 = 0;
      for (&a, &b) in org_row.iter().zip(ref_row) {
        row_total += i32::cast_from(a).abs_diff(i32::cast_from(b));
      }
      total += row_total;
    }
    total
  }

  /// Butterfly step: returns `(a + b, a - b)`.
  #[inline(always)]
  const fn butterfly(a: i32, b: i32) -> (i32, i32) {
    let sum = a + b;
    let diff = a - b;
    (sum, diff)
  }

  /// Applies `N` independent 4-point Hadamard transforms to `data` in place.
  ///
  /// Transform number `i` starts at `data[i * STRIDE0]`; its four inputs (and
  /// outputs) are spaced `STRIDE1` elements apart. No normalization is
  /// applied.
  #[inline(always)]
  fn hadamard4_1d<
    const LEN: usize,
    const N: usize,
    const STRIDE0: usize,
    const STRIDE1: usize,
  >(
    data: &mut [i32; LEN],
  ) {
    for lane_idx in 0..N {
      let lane: &mut [i32] = &mut data[lane_idx * STRIDE0..];

      // First stage: combine neighbouring pairs.
      let (sum01, diff01) = butterfly(lane[0], lane[STRIDE1]);
      let (sum23, diff23) = butterfly(lane[2 * STRIDE1], lane[3 * STRIDE1]);

      // Second stage: combine the pair results.
      let (out0, out2) = butterfly(sum01, sum23);
      let (out1, out3) = butterfly(diff01, diff23);

      // Store the results back at the positions the inputs came from.
      for (k, &value) in [out0, out1, out2, out3].iter().enumerate() {
        lane[k * STRIDE1] = value;
      }
    }
  }

  /// Applies `N` independent 8-point Hadamard transforms to `data` in place.
  ///
  /// Transform number `i` starts at `data[i * STRIDE0]`; its eight inputs (and
  /// outputs) are spaced `STRIDE1` elements apart. No normalization is
  /// applied.
  #[inline(always)]
  fn hadamard8_1d<
    const LEN: usize,
    const N: usize,
    const STRIDE0: usize,
    const STRIDE1: usize,
  >(
    data: &mut [i32; LEN],
  ) {
    for lane_idx in 0..N {
      let lane: &mut [i32] = &mut data[lane_idx * STRIDE0..];

      // Stage 1: neighbouring pairs.
      let (p0, p1) = butterfly(lane[0], lane[STRIDE1]);
      let (p2, p3) = butterfly(lane[2 * STRIDE1], lane[3 * STRIDE1]);
      let (p4, p5) = butterfly(lane[4 * STRIDE1], lane[5 * STRIDE1]);
      let (p6, p7) = butterfly(lane[6 * STRIDE1], lane[7 * STRIDE1]);

      // Stage 2: partners two positions apart.
      let (q0, q2) = butterfly(p0, p2);
      let (q1, q3) = butterfly(p1, p3);
      let (q4, q6) = butterfly(p4, p6);
      let (q5, q7) = butterfly(p5, p7);

      // Stage 3: partners four positions apart.
      let (r0, r4) = butterfly(q0, q4);
      let (r1, r5) = butterfly(q1, q5);
      let (r2, r6) = butterfly(q2, q6);
      let (r3, r7) = butterfly(q3, q7);

      // Store the results back at the positions the inputs came from.
      for (k, &value) in [r0, r1, r2, r3, r4, r5, r6, r7].iter().enumerate() {
        lane[k * STRIDE1] = value;
      }
    }
  }

  #[inline(always)]
  fn hadamard2d<const LEN: usize, const W: usize, const H: usize>(
    data: &mut [i32; LEN],
  ) {
    /*Vertical transform.*/
    let vert_func = if H == 4 {
      hadamard4_1d::<LEN, W, 1, H>
    } else {
      hadamard8_1d::<LEN, W, 1, H>
    };
    vert_func(data);
    /*Horizontal transform.*/
    let horz_func = if W == 4 {
      hadamard4_1d::<LEN, H, W, 1>
    } else {
      hadamard8_1d::<LEN, H, W, 1>
    };
    horz_func(data);
  }

  /// In-place 4x4 Hadamard transform of the 16 values in `data`.
  ///
  /// # Safety
  ///
  /// `data` must hold exactly 16 elements: the slice is reinterpreted as a
  /// `[i32; 16]` without a runtime length check in release builds.
  unsafe fn hadamard4x4(data: &mut [i32]) {
    // Catch contract violations early in debug builds; release builds keep
    // the unchecked fast path.
    debug_assert_eq!(data.len(), 16, "hadamard4x4 requires 16 elements");
    // SAFETY: The caller guarantees that `data` has 16 elements, so the
    // pointer is valid for a `[i32; 16]`.
    hadamard2d::<{ 4 * 4 }, 4, 4>(&mut *(data.as_mut_ptr() as *mut [i32; 16]));
  }

  /// In-place 8x8 Hadamard transform of the 64 values in `data`.
  ///
  /// # Safety
  ///
  /// `data` must hold exactly 64 elements: the slice is reinterpreted as a
  /// `[i32; 64]` without a runtime length check in release builds.
  unsafe fn hadamard8x8(data: &mut [i32]) {
    // Catch contract violations early in debug builds; release builds keep
    // the unchecked fast path.
    debug_assert_eq!(data.len(), 64, "hadamard8x8 requires 64 elements");
    // SAFETY: The caller guarantees that `data` has 64 elements, so the
    // pointer is valid for a `[i32; 64]`.
    hadamard2d::<{ 8 * 8 }, 8, 8>(&mut *(data.as_mut_ptr() as *mut [i32; 64]));
  }

  /// Sum of absolute transformed differences (SATD) over a `w`x`h` block.
  ///
  /// `w` and `h` can be at most 128, the size of the largest block, and both
  /// regions must be at least `w`x`h`; these conditions are asserted.
  ///
  /// The block is tiled with square Hadamard transforms of side
  /// `min(w, h, 8)`: the 4x4 transform when that side is 4 and the 8x8
  /// transform otherwise. Tiles on the right or bottom edge that are smaller
  /// than the transform fall back to SAD. The accumulated sum is normalized
  /// with a rounded right shift by `msb` of the transform side.
  pub fn get_satd<T: Pixel>(
    plane_org: &PlaneRegion<'_, T>, plane_ref: &PlaneRegion<'_, T>, w: usize,
    h: usize, _bit_depth: usize, _cpu: CpuFeatureLevel,
  ) -> u32 {
    assert!(w <= 128 && h <= 128);
    assert!(plane_org.rect().width >= w && plane_org.rect().height >= h);
    assert!(plane_ref.rect().width >= w && plane_ref.rect().height >= h);

    // Side of the square transform: 4x* and *x4 blocks use 4x4, everything
    // else uses 8x8.
    let tx_size: usize = w.min(h).min(8);
    let transform = if tx_size == 4 { hadamard4x4 } else { hadamard8x8 };

    let mut total: u64 = 0;

    // Visit the block tile by tile, each tile being at most one transform.
    for y0 in (0..h).step_by(tx_size) {
      let tile_h = (h - y0).min(tx_size);
      for x0 in (0..w).step_by(tx_size) {
        let tile_w = (w - x0).min(tx_size);
        let tile: Area = Area::Rect {
          x: x0 as isize,
          y: y0 as isize,
          width: tile_w,
          height: tile_h,
        };
        let tile_org = plane_org.subregion(tile);
        let tile_ref = plane_ref.subregion(tile);

        total += if tile_w == tx_size && tile_h == tx_size {
          // Full tile: gather the pixel differences into a scratch buffer.
          let mut storage = [0i32; 8 * 8];
          let coeffs: &mut [i32] = &mut storage[..tx_size * tx_size];

          for ((diff_row, org_row), ref_row) in coeffs
            .chunks_mut(tx_size)
            .zip(tile_org.rows_iter())
            .zip(tile_ref.rows_iter())
          {
            for ((diff, &o), &r) in
              diff_row.iter_mut().zip(org_row.iter()).zip(ref_row.iter())
            {
              *diff = i32::cast_from(o) - i32::cast_from(r);
            }
          }

          // Transform the differences in place.
          // SAFETY: A sufficient number elements exist for the size of the transform.
          unsafe {
            transform(coeffs);
          }

          // Absolute sum of the transformed differences.
          coeffs.iter().map(|c| u64::from(c.unsigned_abs())).sum::<u64>()
        } else {
          // Partial tile (frame edge): the transform does not fit, use SAD.
          u64::from(get_sad(
            &tile_org, &tile_ref, tile_w, tile_h, _bit_depth, _cpu,
          ))
        };
      }
    }

    // Normalize the result with rounding.
    let shift = msb(tx_size as i32) as u64;
    let rounding: u64 = 1 << shift >> 1;
    ((total + rounding) >> shift) as u32
  }

  /// Number of bits rounded off before summing in `get_weighted_sse`.
  ///
  /// Each chunk's scaled squared-error sum is shifted right by this amount
  /// (with rounding) before the chunks are added together.
  pub const GET_WEIGHTED_SSE_SHIFT: u8 = 8;

  /// Weighted sum of squared error between two plane regions.
  ///
  /// The top-left `w`x`h` area is split into square chunks whose side is
  /// `IMPORTANCE_BLOCK_SIZE >> 1`. The squared error of each chunk is
  /// multiplied by the matching entry of `scale` (a fixed-point value,
  /// currently [`DistortionScale`]); `scale` is read as rows of
  /// `scale_stride` entries, one row per row of chunks.
  ///
  /// Implementations can require alignment (`bw` (block width) for `src1` and
  /// `src2` and `bw/4` for `scale`).
  #[inline(never)]
  pub fn get_weighted_sse<T: Pixel>(
    src1: &PlaneRegion<'_, T>, src2: &PlaneRegion<'_, T>, scale: &[u32],
    scale_stride: usize, w: usize, h: usize, _bit_depth: usize,
    _cpu: CpuFeatureLevel,
  ) -> u64 {
    let src1 = src1.subregion(Area::Rect { x: 0, y: 0, width: w, height: h });
    // Always chunk and apply scaling on the sse of squares the size of
    // decimated/sub-sampled importance block sizes.
    // Warning: Changing this will require changing/disabling assembly.
    let chunk_size: usize = IMPORTANCE_BLOCK_SIZE >> 1;

    // Rounding offset applied before each chunk's scaled sum is shifted down.
    let chunk_round: u64 = 1 << GET_WEIGHTED_SSE_SHIFT >> 1;

    // One row of scales per row of chunks.
    let scale_rows = scale.chunks_exact(scale_stride);

    let bands = src1
      .vert_windows(chunk_size)
      .step_by(chunk_size)
      .zip(src2.vert_windows(chunk_size).step_by(chunk_size))
      .zip(scale_rows);

    let mut sse: u64 = 0;
    for ((band1, band2), band_scales) in bands {
      let chunks = band1
        .horz_windows(chunk_size)
        .step_by(chunk_size)
        .zip(band2.horz_windows(chunk_size).step_by(chunk_size))
        .zip(band_scales);

      let mut band_sse: u64 = 0;
      for ((chunk1, chunk2), &weight) in chunks {
        // Plain squared error of this chunk.
        let mut chunk_sse: u32 = 0;
        for (line1, line2) in chunk1.rows_iter().zip(chunk2.rows_iter()) {
          let mut line_sse: u32 = 0;
          for (&a, &b) in line1.iter().zip(line2) {
            let delta = i32::cast_from(a) - i32::cast_from(b);
            line_sse += (delta * delta) as u32;
          }
          chunk_sse += line_sse;
        }

        // Apply the fixed-point weight and drop the low bits with rounding.
        band_sse += (chunk_sse as u64 * weight as u64 + chunk_round)
          >> GET_WEIGHTED_SSE_SHIFT;
      }
      sse += band_sse;
    }

    // Remove the remaining fixed-point scale with a rounded division.
    let den = DistortionScale::new(1, 1 << GET_WEIGHTED_SSE_SHIFT).0 as u64;
    (sse + (den >> 1)) / den
  }

  /// Number of bits of precision used in `AREA_DIVISORS`, i.e. the table
  /// entries are reciprocals scaled by 2^14.
  const AREA_DIVISOR_BITS: u8 = 14;

  /// Lookup table for 2^`AREA_DIVISOR_BITS` / (1 + x), rounded to the nearest
  /// integer.
  ///
  /// Indexing with `area - 1` yields a fixed-point reciprocal of the area; the
  /// 64 entries cover areas from 1 up to 64 (8x8).
  #[rustfmt::skip]
  const AREA_DIVISORS: [u16; 64] = [
    16384, 8192, 5461, 4096, 3277, 2731, 2341, 2048, 1820, 1638, 1489, 1365,
     1260, 1170, 1092, 1024,  964,  910,  862,  819,  780,  745,  712,  683,
      655,  630,  607,  585,  565,  546,  529,  512,  496,  482,  468,  455,
      443,  431,  420,  410,  400,  390,  381,  372,  364,  356,  349,  341,
      334,  328,  321,  315,  309,  303,  298,  293,  287,  282,  278,  273,
      269,  264,  260,  256,
  ];

  /// Distortion metric: sum of squared error weighted by activity.
  ///
  /// The squared error between the top-left `w`x`h` pixels of `src` and `dst`
  /// is passed to `apply_ssim_boost` together with the variance of each
  /// input, scaled to an 8x8 area. `w` and `h` should be <= 8, and neither
  /// plane may be decimated; both conditions are only checked in debug builds.
  #[inline(never)]
  pub fn cdef_dist_kernel<T: Pixel>(
    src: &PlaneRegion<'_, T>, dst: &PlaneRegion<'_, T>, w: usize, h: usize,
    bit_depth: usize, _cpu: CpuFeatureLevel,
  ) -> u32 {
    // TODO: Investigate using different constants in ssim boost for block sizes
    // smaller than 8x8.

    debug_assert!(src.plane_cfg.xdec == 0);
    debug_assert!(src.plane_cfg.ydec == 0);
    debug_assert!(dst.plane_cfg.xdec == 0);
    debug_assert!(dst.plane_cfg.ydec == 0);

    // Limit kernel to 8x8
    debug_assert!(w <= 8);
    debug_assert!(h <= 8);

    // Running totals over the block:
    //   sum(src), sum(dst), sum(src^2), sum(dst^2), sum(src * dst)
    let (mut total_s, mut total_d): (u32, u32) = (0, 0);
    let (mut total_s2, mut total_d2): (u32, u32) = (0, 0);
    let mut total_sd: u32 = 0;
    for (src_row, dst_row) in src.rows_iter().take(h).zip(dst.rows_iter()) {
      for (src_px, dst_px) in src_row[..w].iter().zip(dst_row) {
        let s: u32 = u32::cast_from(*src_px);
        let d: u32 = u32::cast_from(*dst_px);
        total_s += s;
        total_d += d;

        total_s2 += s * s;
        total_d2 += d * d;
        total_sd += s * d;
      }
    }

    // Sum of squared error, expanded as sum(d^2) + sum(s^2) - 2 * sum(s * d).
    let sse = total_d2 + total_s2 - 2 * total_sd;

    // Fixed-point reciprocal of the block area.
    let recip = AREA_DIVISORS[w * h - 1] as u64;
    let recip_shift = AREA_DIVISOR_BITS;

    // variance * area = sum(X^2) - sum(X)^2 / area
    // The square is taken in 64 bits to avoid overflow. Due to rounding, the
    // subtraction could go negative when w or h aren't powers of two, so it
    // saturates at zero.
    let area_variance = |total: u32, total_sq: u32| -> u32 {
      let total = total as u64;
      let mean_term =
        (total * total * recip + (1 << recip_shift >> 1)) >> recip_shift;
      total_sq.saturating_sub(mean_term as u32)
    };
    let svar = area_variance(total_s, total_s2);
    let dvar = area_variance(total_d, total_d2);

    // Scale variances up to 8x8 size:
    //   scaled variance = var * (8x8) / wxh
    // For 8x8, this is a nop.
    // TODO: It should be possible and faster to do this adjustment in ssim boost
    let scale_shift = AREA_DIVISOR_BITS - 6;
    let scale_to_8x8 = |var: u32| -> u32 {
      ((var as u64 * recip + (1 << scale_shift >> 1)) >> scale_shift) as u32
    };
    let svar = scale_to_8x8(svar);
    let dvar = scale_to_8x8(dvar);

    apply_ssim_boost(sse, svar, dvar, bit_depth)
  }
}

#[cfg(test)]
pub mod test {
  use super::*;
  use crate::cpu_features::CpuFeatureLevel;
  use crate::frame::*;
  use crate::tiling::Area;
  use crate::util::Pixel;

  // Build the two 640x480 test planes shared by the SAD and SATD tests.
  // The planes use different padding (and therefore different strides) and
  // are filled with two different diagonal patterns of values in 0..=255.
  fn setup_planes<T: Pixel>() -> (Plane<T>, Plane<T>) {
    let mut input_plane = Plane::new(640, 480, 0, 0, 128 + 8, 128 + 8);
    let mut rec_plane = Plane::new(640, 480, 0, 0, 2 * 128 + 8, 2 * 128 + 8);

    // Make the test pattern robust to data alignment
    let xpad_off =
      (input_plane.cfg.xorigin - input_plane.cfg.xpad) as i32 - 8i32;

    // Write `(base(column, row) - xpad_off) & 255` into every sample of the
    // plane's backing storage, padding included.
    let fill = |plane: &mut Plane<T>, base: fn(i32, i32) -> i32| {
      let stride = plane.cfg.stride;
      for (y, row) in plane.data.chunks_mut(stride).enumerate() {
        for (x, pixel) in row.iter_mut().enumerate() {
          let val = (base(x as i32, y as i32) - xpad_off) & 255i32;
          assert!(val >= u8::MIN.into() && val <= u8::MAX.into());
          *pixel = T::cast_from(val);
        }
      }
    };

    // Input: values grow along both axes. Reconstruction: values grow along
    // x and shrink along y.
    fill(&mut input_plane, |x, y| x + y);
    fill(&mut rec_plane, |x, y| x - y);

    (input_plane, rec_plane)
  }

  // Regression test for `get_sad`: each entry is (width, height, expected
  // SAD) for a block whose top-left corner is at (32, 40) in the test planes.
  fn get_sad_same_inner<T: Pixel>() {
    let expected: &[(usize, usize, u32)] = &[
      (4, 4, 1912),
      (4, 8, 4296),
      (8, 4, 3496),
      (8, 8, 7824),
      (8, 16, 16592),
      (16, 8, 14416),
      (16, 16, 31136),
      (16, 32, 60064),
      (32, 16, 59552),
      (32, 32, 120128),
      (32, 64, 186688),
      (64, 32, 250176),
      (64, 64, 438912),
      (64, 128, 654272),
      (128, 64, 1016768),
      (128, 128, 1689792),
      (4, 16, 8680),
      (16, 4, 6664),
      (8, 32, 31056),
      (32, 8, 27600),
      (16, 64, 93344),
      (64, 16, 116384),
    ];

    let bit_depth: usize = 8;
    let (input_plane, rec_plane) = setup_planes::<T>();

    // Every case starts at the same position, so one pair of regions is
    // enough.
    let origin = Area::StartingAt { x: 32, y: 40 };
    let input_region = input_plane.region(origin);
    let rec_region = rec_plane.region(origin);

    for &(w, h, distortion) in expected {
      let sad = get_sad(
        &input_region,
        &rec_region,
        w,
        h,
        bit_depth,
        CpuFeatureLevel::default(),
      );
      assert_eq!(distortion, sad);
    }
  }

  // Runs the SAD regression table with `u8` as the pixel type.
  #[test]
  fn get_sad_same_u8() {
    get_sad_same_inner::<u8>();
  }

  // Runs the SAD regression table with `u16` as the pixel type.
  #[test]
  fn get_sad_same_u16() {
    get_sad_same_inner::<u16>();
  }

  // Regression test for `get_satd`: each entry is (width, height, expected
  // SATD) for a block whose top-left corner is at (32, 40) in the test planes.
  fn get_satd_same_inner<T: Pixel>() {
    let expected: &[(usize, usize, u32)] = &[
      (4, 4, 1408),
      (4, 8, 2016),
      (8, 4, 1816),
      (8, 8, 3984),
      (8, 16, 5136),
      (16, 8, 4864),
      (16, 16, 9984),
      (16, 32, 13824),
      (32, 16, 13760),
      (32, 32, 27952),
      (32, 64, 37168),
      (64, 32, 45104),
      (64, 64, 84176),
      (64, 128, 127920),
      (128, 64, 173680),
      (128, 128, 321456),
      (4, 16, 3136),
      (16, 4, 2632),
      (8, 32, 7056),
      (32, 8, 6624),
      (16, 64, 18432),
      (64, 16, 21312),
    ];

    let bit_depth: usize = 8;
    let (input_plane, rec_plane) = setup_planes::<T>();

    // Every case starts at the same position, so one pair of regions is
    // enough.
    let origin = Area::StartingAt { x: 32, y: 40 };
    let input_region = input_plane.region(origin);
    let rec_region = rec_plane.region(origin);

    for &(w, h, distortion) in expected {
      let satd = get_satd(
        &input_region,
        &rec_region,
        w,
        h,
        bit_depth,
        CpuFeatureLevel::default(),
      );
      assert_eq!(distortion, satd);
    }
  }

  // Runs the SATD regression table with `u8` as the pixel type.
  #[test]
  fn get_satd_same_u8() {
    get_satd_same_inner::<u8>();
  }

  // Runs the SATD regression table with `u16` as the pixel type.
  #[test]
  fn get_satd_same_u16() {
    get_satd_same_inner::<u16>();
  }
}

Coverage Report

Created: 2025-11-05 08:08

Line	Count	Source
1		// Copyright (c) 2019-2022, The rav1e contributors. All rights reserved
2		//
3		// This source code is subject to the terms of the BSD 2 Clause License and
4		// the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
5		// was not distributed with this source code in the LICENSE file, you can
6		// obtain it at www.aomedia.org/license/software. If the Alliance for Open
7		// Media Patent License 1.0 was not distributed with this source code in the
8		// PATENTS file, you can obtain it at www.aomedia.org/license/patent.
9
10		cfg_if::cfg_if! {
11		if #[cfg(nasm_x86_64)] {
12		pub use crate::asm::x86::dist::*;
13		} else if #[cfg(asm_neon)] {
14		pub use crate::asm::aarch64::dist::*;
15		} else {
16		pub use self::rust::*;
17		}
18		}
19
20		pub(crate) mod rust {
21		use crate::activity::apply_ssim_boost;
22		use crate::cpu_features::CpuFeatureLevel;
23		use crate::tiling::*;
24		use crate::util::*;
25
26		use crate::encoder::IMPORTANCE_BLOCK_SIZE;
27		use crate::rdo::DistortionScale;
28
29		/// Compute the sum of absolute differences over a block.
30		/// w and h can be at most 128, the size of the largest block.
31	0	pub fn get_sad<T: Pixel>(
32	0	plane_org: &PlaneRegion<'_, T>, plane_ref: &PlaneRegion<'_, T>, w: usize,
33	0	h: usize, _bit_depth: usize, _cpu: CpuFeatureLevel,
34	0	) -> u32 {
35	0	debug_assert!(w <= 128 && h <= 128);
36	0	let plane_org =
37	0	plane_org.subregion(Area::Rect { x: 0, y: 0, width: w, height: h });
38	0	let plane_ref =
39	0	plane_ref.subregion(Area::Rect { x: 0, y: 0, width: w, height: h });
40
41	0	plane_org
42	0	.rows_iter()
43	0	.zip(plane_ref.rows_iter())
44	0	.map(\|(src, dst)\| {
45	0	src
46	0	.iter()
47	0	.zip(dst)
48	0	.map(\|(&p1, &p2)\| i32::cast_from(p1).abs_diff(i32::cast_from(p2))) Unexecuted instantiation: rav1e::dist::rust::get_sad::<u16>::{closure#0}::{closure#0} Unexecuted instantiation: rav1e::dist::rust::get_sad::<u8>::{closure#0}::{closure#0}
49	0	.sum::<u32>()
50	0	}) Unexecuted instantiation: rav1e::dist::rust::get_sad::<u16>::{closure#0} Unexecuted instantiation: rav1e::dist::rust::get_sad::<u8>::{closure#0}
51	0	.sum()
52	0	} Unexecuted instantiation: rav1e::dist::rust::get_sad::<u16> Unexecuted instantiation: rav1e::dist::rust::get_sad::<u8>
53
54		#[inline(always)]
55	0	const fn butterfly(a: i32, b: i32) -> (i32, i32) {
56	0	((a + b), (a - b))
57	0	}
58
59		#[inline(always)]
60		#[allow(clippy::identity_op, clippy::erasing_op)]
61	0	fn hadamard4_1d<
62	0	const LEN: usize,
63	0	const N: usize,
64	0	const STRIDE0: usize,
65	0	const STRIDE1: usize,
66	0	>(
67	0	data: &mut [i32; LEN],
68	0	) {
69	0	for i in 0..N {
70	0	let sub: &mut [i32] = &mut data[i * STRIDE0..];
71	0	let (a0, a1) = butterfly(sub[0 * STRIDE1], sub[1 * STRIDE1]);
72	0	let (a2, a3) = butterfly(sub[2 * STRIDE1], sub[3 * STRIDE1]);
73	0	let (b0, b2) = butterfly(a0, a2);
74	0	let (b1, b3) = butterfly(a1, a3);
75	0	sub[0 * STRIDE1] = b0;
76	0	sub[1 * STRIDE1] = b1;
77	0	sub[2 * STRIDE1] = b2;
78	0	sub[3 * STRIDE1] = b3;
79	0	}
80	0	} Unexecuted instantiation: rav1e::dist::rust::hadamard4_1d::<16, 4, 4, 1> Unexecuted instantiation: rav1e::dist::rust::hadamard4_1d::<16, 4, 1, 4> Unexecuted instantiation: rav1e::dist::rust::hadamard4_1d::<64, 8, 8, 1> Unexecuted instantiation: rav1e::dist::rust::hadamard4_1d::<64, 8, 1, 8>
81
82		#[inline(always)]
83		#[allow(clippy::identity_op, clippy::erasing_op)]
84	0	fn hadamard8_1d<
85	0	const LEN: usize,
86	0	const N: usize,
87	0	const STRIDE0: usize,
88	0	const STRIDE1: usize,
89	0	>(
90	0	data: &mut [i32; LEN],
91	0	) {
92	0	for i in 0..N {
93	0	let sub: &mut [i32] = &mut data[i * STRIDE0..];
94	0
95	0	let (a0, a1) = butterfly(sub[0 * STRIDE1], sub[1 * STRIDE1]);
96	0	let (a2, a3) = butterfly(sub[2 * STRIDE1], sub[3 * STRIDE1]);
97	0	let (a4, a5) = butterfly(sub[4 * STRIDE1], sub[5 * STRIDE1]);
98	0	let (a6, a7) = butterfly(sub[6 * STRIDE1], sub[7 * STRIDE1]);
99	0
100	0	let (b0, b2) = butterfly(a0, a2);
101	0	let (b1, b3) = butterfly(a1, a3);
102	0	let (b4, b6) = butterfly(a4, a6);
103	0	let (b5, b7) = butterfly(a5, a7);
104	0
105	0	let (c0, c4) = butterfly(b0, b4);
106	0	let (c1, c5) = butterfly(b1, b5);
107	0	let (c2, c6) = butterfly(b2, b6);
108	0	let (c3, c7) = butterfly(b3, b7);
109	0
110	0	sub[0 * STRIDE1] = c0;
111	0	sub[1 * STRIDE1] = c1;
112	0	sub[2 * STRIDE1] = c2;
113	0	sub[3 * STRIDE1] = c3;
114	0	sub[4 * STRIDE1] = c4;
115	0	sub[5 * STRIDE1] = c5;
116	0	sub[6 * STRIDE1] = c6;
117	0	sub[7 * STRIDE1] = c7;
118	0	}
119	0	} Unexecuted instantiation: rav1e::dist::rust::hadamard8_1d::<16, 4, 4, 1> Unexecuted instantiation: rav1e::dist::rust::hadamard8_1d::<16, 4, 1, 4> Unexecuted instantiation: rav1e::dist::rust::hadamard8_1d::<64, 8, 8, 1> Unexecuted instantiation: rav1e::dist::rust::hadamard8_1d::<64, 8, 1, 8>
120
121		#[inline(always)]
122	0	fn hadamard2d<const LEN: usize, const W: usize, const H: usize>(
123	0	data: &mut [i32; LEN],
124	0	) {
125		/*Vertical transform.*/
126	0	let vert_func = if H == 4 {
127	0	hadamard4_1d::<LEN, W, 1, H>
128	0	} else {
129	0	hadamard8_1d::<LEN, W, 1, H>
130	0	};
131	0	vert_func(data);
132		/*Horizontal transform.*/
133	0	let horz_func = if W == 4 {
134	0	hadamard4_1d::<LEN, H, W, 1>
135	0	} else {
136	0	hadamard8_1d::<LEN, H, W, 1>
137	0	};
138	0	horz_func(data);
139	0	} Unexecuted instantiation: rav1e::dist::rust::hadamard2d::<16, 4, 4> Unexecuted instantiation: rav1e::dist::rust::hadamard2d::<64, 8, 8>
140
141		// SAFETY: The length of data must be 16.
142	0	unsafe fn hadamard4x4(data: &mut [i32]) {
143	0	hadamard2d::<{ 4 * 4 }, 4, 4>(&mut *(data.as_mut_ptr() as *mut [i32; 16]));
144	0	}
145
146		// SAFETY: The length of data must be 64.
147	0	unsafe fn hadamard8x8(data: &mut [i32]) {
148	0	hadamard2d::<{ 8 * 8 }, 8, 8>(&mut *(data.as_mut_ptr() as *mut [i32; 64]));
149	0	}
150
151		/// Sum of absolute transformed differences over a block.
152		/// w and h can be at most 128, the size of the largest block.
153		/// Use the sum of 4x4 and 8x8 hadamard transforms for the transform, but
154		/// revert to sad on edges when these transforms do not fit into w and h.
155		/// 4x4 transforms instead of 8x8 transforms when width or height < 8.
156	0	pub fn get_satd<T: Pixel>(
157	0	plane_org: &PlaneRegion<'_, T>, plane_ref: &PlaneRegion<'_, T>, w: usize,
158	0	h: usize, _bit_depth: usize, _cpu: CpuFeatureLevel,
159	0	) -> u32 {
160	0	assert!(w <= 128 && h <= 128);
161	0	assert!(plane_org.rect().width >= w && plane_org.rect().height >= h);
162	0	assert!(plane_ref.rect().width >= w && plane_ref.rect().height >= h);
163
164		// Size of hadamard transform should be 4x4 or 8x8
165		// 4x* and *x4 use 4x4 and all other use 8x8
166	0	let size: usize = w.min(h).min(8);
167	0	let tx2d = if size == 4 { hadamard4x4 } else { hadamard8x8 };
168
169	0	let mut sum: u64 = 0;
170
171		// Loop over chunks the size of the chosen transform
172	0	for chunk_y in (0..h).step_by(size) {
173	0	let chunk_h = (h - chunk_y).min(size);
174	0	for chunk_x in (0..w).step_by(size) {
175	0	let chunk_w = (w - chunk_x).min(size);
176	0	let chunk_area: Area = Area::Rect {
177	0	x: chunk_x as isize,
178	0	y: chunk_y as isize,
179	0	width: chunk_w,
180	0	height: chunk_h,
181	0	};
182	0	let chunk_org = plane_org.subregion(chunk_area);
183	0	let chunk_ref = plane_ref.subregion(chunk_area);
184
185		// Revert to sad on edge blocks (frame edges)
186	0	if chunk_w != size \|\| chunk_h != size {
187	0	sum += get_sad(
188	0	&chunk_org, &chunk_ref, chunk_w, chunk_h, _bit_depth, _cpu,
189	0	) as u64;
190	0	continue;
191	0	}
192
193	0	let buf: &mut [i32] = &mut [0; 8 * 8][..size * size];
194
195		// Move the difference of the transforms to a buffer
196	0	for (row_diff, (row_org, row_ref)) in buf
197	0	.chunks_mut(size)
198	0	.zip(chunk_org.rows_iter().zip(chunk_ref.rows_iter()))
199		{
200	0	for (diff, (a, b)) in
201	0	row_diff.iter_mut().zip(row_org.iter().zip(row_ref.iter()))
202	0	{
203	0	*diff = i32::cast_from(*a) - i32::cast_from(*b);
204	0	}
205		}
206
207		// Perform the hadamard transform on the differences
208		// SAFETY: A sufficient number elements exist for the size of the transform.
209	0	unsafe {
210	0	tx2d(buf);
211	0	}
212
213		// Sum the absolute values of the transformed differences
214	0	sum += buf.iter().map(\|a\| a.unsigned_abs() as u64).sum::<u64>(); Unexecuted instantiation: rav1e::dist::rust::get_satd::<u16>::{closure#0} Unexecuted instantiation: rav1e::dist::rust::get_satd::<u8>::{closure#0}
215		}
216		}
217
218		// Normalize the results
219	0	let ln = msb(size as i32) as u64;
220	0	((sum + (1 << ln >> 1)) >> ln) as u32
221	0	} Unexecuted instantiation: rav1e::dist::rust::get_satd::<u16> Unexecuted instantiation: rav1e::dist::rust::get_satd::<u8>
222
223		/// Number of bits rounded off before summing in `get_weighted_sse`
224		pub const GET_WEIGHTED_SSE_SHIFT: u8 = 8;
225
226		/// Computes weighted sum of squared error.
227		///
228		/// Each scale is applied to a 4x4 region in the provided inputs. Each scale
229		/// value is a fixed point number, currently [`DistortionScale`].
230		///
231		/// Implementations can require alignment (`bw` (block width) for [`src1`] and
232		/// [`src2`] and `bw/4` for `scale`).
233		#[inline(never)]
234	0	pub fn get_weighted_sse<T: Pixel>(
235	0	src1: &PlaneRegion<'_, T>, src2: &PlaneRegion<'_, T>, scale: &[u32],
236	0	scale_stride: usize, w: usize, h: usize, _bit_depth: usize,
237	0	_cpu: CpuFeatureLevel,
238	0	) -> u64 {
239	0	let src1 = src1.subregion(Area::Rect { x: 0, y: 0, width: w, height: h });
240		// Always chunk and apply scaling on the sse of squares the size of
241		// decimated/sub-sampled importance block sizes.
242		// Warning: Changing this will require changing/disabling assembly.
243	0	let chunk_size: usize = IMPORTANCE_BLOCK_SIZE >> 1;
244
245		// Iterator of a row of scales, stretched out to be per row
246	0	let scales = scale.chunks_exact(scale_stride);
247
248	0	let sse = src1
249	0	.vert_windows(chunk_size)
250	0	.step_by(chunk_size)
251	0	.zip(src2.vert_windows(chunk_size).step_by(chunk_size))
252	0	.zip(scales)
253	0	.map(\|((row1, row2), scales)\| {
254	0	row1
255	0	.horz_windows(chunk_size)
256	0	.step_by(chunk_size)
257	0	.zip(row2.horz_windows(chunk_size).step_by(chunk_size))
258	0	.zip(scales)
259	0	.map(\|((chunk1, chunk2), &scale)\| {
260	0	let sum = chunk1
261	0	.rows_iter()
262	0	.zip(chunk2.rows_iter())
263	0	.map(\|(chunk_row1, chunk_row2)\| {
264	0	chunk_row1
265	0	.iter()
266	0	.zip(chunk_row2)
267	0	.map(\|(&a, &b)\| {
268	0	let c = i32::cast_from(a) - i32::cast_from(b);
269	0	(c * c) as u32
270	0	}) Unexecuted instantiation: rav1e::dist::rust::get_weighted_sse::<u16>::{closure#0}::{closure#0}::{closure#0}::{closure#0} Unexecuted instantiation: rav1e::dist::rust::get_weighted_sse::<u8>::{closure#0}::{closure#0}::{closure#0}::{closure#0}
271	0	.sum::<u32>()
272	0	}) Unexecuted instantiation: rav1e::dist::rust::get_weighted_sse::<u16>::{closure#0}::{closure#0}::{closure#0} Unexecuted instantiation: rav1e::dist::rust::get_weighted_sse::<u8>::{closure#0}::{closure#0}::{closure#0}
273	0	.sum::<u32>();
274	0	(sum as u64 * scale as u64 + (1 << GET_WEIGHTED_SSE_SHIFT >> 1))
275	0	>> GET_WEIGHTED_SSE_SHIFT
276	0	}) Unexecuted instantiation: rav1e::dist::rust::get_weighted_sse::<u16>::{closure#0}::{closure#0} Unexecuted instantiation: rav1e::dist::rust::get_weighted_sse::<u8>::{closure#0}::{closure#0}
277	0	.sum::<u64>()
278	0	}) Unexecuted instantiation: rav1e::dist::rust::get_weighted_sse::<u16>::{closure#0} Unexecuted instantiation: rav1e::dist::rust::get_weighted_sse::<u8>::{closure#0}
279	0	.sum::<u64>();
280
281	0	let den = DistortionScale::new(1, 1 << GET_WEIGHTED_SSE_SHIFT).0 as u64;
282	0	(sse + (den >> 1)) / den
283	0	} Unexecuted instantiation: rav1e::dist::rust::get_weighted_sse::<u16> Unexecuted instantiation: rav1e::dist::rust::get_weighted_sse::<u8>
284
285		/// Number of bits of precision used in `AREA_DIVISORS`
286		const AREA_DIVISOR_BITS: u8 = 14;
287
288		/// Lookup table for 2^`AREA_DIVISOR_BITS` / (1 + x)
289		#[rustfmt::skip]
290		const AREA_DIVISORS: [u16; 64] = [
291		16384, 8192, 5461, 4096, 3277, 2731, 2341, 2048, 1820, 1638, 1489, 1365,
292		1260, 1170, 1092, 1024, 964, 910, 862, 819, 780, 745, 712, 683,
293		655, 630, 607, 585, 565, 546, 529, 512, 496, 482, 468, 455,
294		443, 431, 420, 410, 400, 390, 381, 372, 364, 356, 349, 341,
295		334, 328, 321, 315, 309, 303, 298, 293, 287, 282, 278, 273,
296		269, 264, 260, 256,
297		];
298
299		/// Computes a distortion metric of the sum of squares weighted by activity.
300		/// w and h should be <= 8.
301		#[inline(never)]
302	0	pub fn cdef_dist_kernel<T: Pixel>(
303	0	src: &PlaneRegion<'_, T>, dst: &PlaneRegion<'_, T>, w: usize, h: usize,
304	0	bit_depth: usize, _cpu: CpuFeatureLevel,
305	0	) -> u32 {
306		// TODO: Investigate using different constants in ssim boost for block sizes
307		// smaller than 8x8.
308
309	0	debug_assert!(src.plane_cfg.xdec == 0);
310	0	debug_assert!(src.plane_cfg.ydec == 0);
311	0	debug_assert!(dst.plane_cfg.xdec == 0);
312	0	debug_assert!(dst.plane_cfg.ydec == 0);
313
314		// Limit kernel to 8x8
315	0	debug_assert!(w <= 8);
316	0	debug_assert!(h <= 8);
317
318		// Compute the following summations.
319	0	let mut sum_s: u32 = 0; // sum(src_{i,j})
320	0	let mut sum_d: u32 = 0; // sum(dst_{i,j})
321	0	let mut sum_s2: u32 = 0; // sum(src_{i,j}^2)
322	0	let mut sum_d2: u32 = 0; // sum(dst_{i,j}^2)
323	0	let mut sum_sd: u32 = 0; // sum(src_{i,j} * dst_{i,j})
324	0	for (row1, row2) in src.rows_iter().take(h).zip(dst.rows_iter()) {
325	0	for (s, d) in row1[..w].iter().zip(row2) {
326	0	let s: u32 = u32::cast_from(*s);
327	0	let d: u32 = u32::cast_from(*d);
328	0	sum_s += s;
329	0	sum_d += d;
330	0
331	0	sum_s2 += s * s;
332	0	sum_d2 += d * d;
333	0	sum_sd += s * d;
334	0	}
335		}
336
337		// To get the distortion, compute sum of squared error and apply a weight
338		// based on the variance of the two planes.
339	0	let sse = sum_d2 + sum_s2 - 2 * sum_sd;
340
341		// Convert to 64-bits to avoid overflow when squaring
342	0	let sum_s = sum_s as u64;
343	0	let sum_d = sum_d as u64;
344
345		// Calculate the variance (more accurately variance*area) of each plane.
346		// var[iance] = avg(X^2) - avg(X)^2 = sum(X^2) / n - sum(X)^2 / n^2
347		// (n = # samples i.e. area)
348		// var * n = sum(X^2) - sum(X)^2 / n
349		// When w and h are powers of two, this can be done via shifting.
350	0	let div = AREA_DIVISORS[w * h - 1] as u64;
351	0	let div_shift = AREA_DIVISOR_BITS;
352		// Due to rounding, negative values can occur when w or h aren't powers of
353		// two. Saturate to avoid underflow.
354	0	let mut svar = sum_s2.saturating_sub(
355	0	((sum_s * sum_s * div + (1 << div_shift >> 1)) >> div_shift) as u32,
356		);
357	0	let mut dvar = sum_d2.saturating_sub(
358	0	((sum_d * sum_d * div + (1 << div_shift >> 1)) >> div_shift) as u32,
359		);
360
361		// Scale variances up to 8x8 size.
362		// scaled variance = var * (8x8) / wxh
363		// For 8x8, this is a nop. For powers of 2, this is doable with shifting.
364		// TODO: It should be possible and faster to do this adjustment in ssim boost
365	0	let scale_shift = AREA_DIVISOR_BITS - 6;
366	0	svar =
367	0	((svar as u64 * div + (1 << scale_shift >> 1)) >> scale_shift) as u32;
368	0	dvar =
369	0	((dvar as u64 * div + (1 << scale_shift >> 1)) >> scale_shift) as u32;
370
371	0	apply_ssim_boost(sse, svar, dvar, bit_depth)
372	0	} Unexecuted instantiation: rav1e::dist::rust::cdef_dist_kernel::<u16> Unexecuted instantiation: rav1e::dist::rust::cdef_dist_kernel::<u8>
373		}
374
#[cfg(test)]
pub mod test {
  //! Regression tests that pin the SAD and SATD results for a fixed pair of
  //! synthetic planes across a table of block sizes.

  use super::*;
  use crate::cpu_features::CpuFeatureLevel;
  use crate::frame::*;
  use crate::tiling::Area;
  use crate::util::Pixel;

  /// Overwrites every sample in the backing buffer of `plane` with
  /// `pattern(row, column) & 255`.
  ///
  /// Rows are taken as consecutive `stride`-sized chunks of the buffer, so
  /// `column` counts from the start of each chunk, not from the visible
  /// origin of the plane.
  fn fill_plane<T: Pixel>(
    plane: &mut Plane<T>, pattern: impl Fn(usize, usize) -> i32,
  ) {
    let stride = plane.cfg.stride;
    for (i, row) in plane.data.chunks_mut(stride).enumerate() {
      for (j, pixel) in row.iter_mut().enumerate() {
        let val = pattern(i, j) & 255i32;
        // The same values are used for every pixel type, so they must fit
        // in the narrowest one.
        assert!((i32::from(u8::MIN)..=i32::from(u8::MAX)).contains(&val));
        *pixel = T::cast_from(val);
      }
    }
  }

  /// Generates the `(input, reconstruction)` plane pair shared by the SAD and
  /// SATD regression tests.
  ///
  /// The input plane holds the pattern `column + row` and the reconstruction
  /// plane holds `column - row`, both shifted by the same offset and wrapped
  /// to `0..=255`.
  fn setup_planes<T: Pixel>() -> (Plane<T>, Plane<T>) {
    // Two planes with different strides
    let mut input_plane = Plane::<T>::new(640, 480, 0, 0, 128 + 8, 128 + 8);
    let mut rec_plane =
      Plane::<T>::new(640, 480, 0, 0, 2 * 128 + 8, 2 * 128 + 8);

    // Make the test pattern robust to data alignment. The offset is derived
    // from the input plane and deliberately reused for both planes.
    let xpad_off =
      (input_plane.cfg.xorigin - input_plane.cfg.xpad) as i32 - 8i32;

    fill_plane(&mut input_plane, |i, j| (j + i) as i32 - xpad_off);
    fill_plane(&mut rec_plane, |i, j| j as i32 - i as i32 - xpad_off);

    (input_plane, rec_plane)
  }

  /// Regression and validation test for SAD computation.
  ///
  /// Each table entry is `(width, height, expected SAD)` for the block whose
  /// top-left corner is at `(32, 40)` in the planes from `setup_planes`.
  fn get_sad_same_inner<T: Pixel>() {
    let blocks: &[(usize, usize, u32)] = &[
      (4, 4, 1912),
      (4, 8, 4296),
      (8, 4, 3496),
      (8, 8, 7824),
      (8, 16, 16592),
      (16, 8, 14416),
      (16, 16, 31136),
      (16, 32, 60064),
      (32, 16, 59552),
      (32, 32, 120128),
      (32, 64, 186688),
      (64, 32, 250176),
      (64, 64, 438912),
      (64, 128, 654272),
      (128, 64, 1016768),
      (128, 128, 1689792),
      (4, 16, 8680),
      (16, 4, 6664),
      (8, 32, 31056),
      (32, 8, 27600),
      (16, 64, 93344),
      (64, 16, 116384),
    ];

    let bit_depth: usize = 8;
    let (input_plane, rec_plane) = setup_planes::<T>();

    // The regions do not depend on the block size, so build them once.
    let area = Area::StartingAt { x: 32, y: 40 };
    let input_region = input_plane.region(area);
    let rec_region = rec_plane.region(area);

    for &(w, h, distortion) in blocks {
      assert_eq!(
        distortion,
        get_sad(
          &input_region,
          &rec_region,
          w,
          h,
          bit_depth,
          CpuFeatureLevel::default()
        ),
        "SAD mismatch for {}x{} block",
        w,
        h
      );
    }
  }

  /// Runs the SAD regression table with 8-bit pixel storage.
  #[test]
  fn get_sad_same_u8() {
    get_sad_same_inner::<u8>();
  }

  /// Runs the SAD regression table with 16-bit pixel storage.
  #[test]
  fn get_sad_same_u16() {
    get_sad_same_inner::<u16>();
  }

  /// Regression and validation test for SATD computation.
  ///
  /// Each table entry is `(width, height, expected SATD)` for the block whose
  /// top-left corner is at `(32, 40)` in the planes from `setup_planes`.
  fn get_satd_same_inner<T: Pixel>() {
    let blocks: &[(usize, usize, u32)] = &[
      (4, 4, 1408),
      (4, 8, 2016),
      (8, 4, 1816),
      (8, 8, 3984),
      (8, 16, 5136),
      (16, 8, 4864),
      (16, 16, 9984),
      (16, 32, 13824),
      (32, 16, 13760),
      (32, 32, 27952),
      (32, 64, 37168),
      (64, 32, 45104),
      (64, 64, 84176),
      (64, 128, 127920),
      (128, 64, 173680),
      (128, 128, 321456),
      (4, 16, 3136),
      (16, 4, 2632),
      (8, 32, 7056),
      (32, 8, 6624),
      (16, 64, 18432),
      (64, 16, 21312),
    ];

    let bit_depth: usize = 8;
    let (input_plane, rec_plane) = setup_planes::<T>();

    // The regions do not depend on the block size, so build them once.
    let area = Area::StartingAt { x: 32, y: 40 };
    let input_region = input_plane.region(area);
    let rec_region = rec_plane.region(area);

    for &(w, h, distortion) in blocks {
      assert_eq!(
        distortion,
        get_satd(
          &input_region,
          &rec_region,
          w,
          h,
          bit_depth,
          CpuFeatureLevel::default()
        ),
        "SATD mismatch for {}x{} block",
        w,
        h
      );
    }
  }

  /// Runs the SATD regression table with 8-bit pixel storage.
  #[test]
  fn get_satd_same_u8() {
    get_satd_same_inner::<u8>();
  }

  /// Runs the SATD regression table with 16-bit pixel storage.
  #[test]
  fn get_satd_same_u16() {
    get_satd_same_inner::<u16>();
  }
}