/work/svt-av1/Source/Lib/Codec/cdef.c

Source
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at https://www.aomedia.org/license/software-license. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license.
 */

#include "cdef.h"
#include "common_dsp_rtcd.h"
#include "bitstream_unit.h"

/* Returns -1 when i is negative and +1 otherwise (zero maps to +1). */
static INLINE int32_t sign(int32_t i) {
    if (i < 0) {
        return -1;
    }
    return 1;
}

/* Limits a tap difference before it is weighted into the filter sum.
The result carries the sign of diff and has magnitude
min(|diff|, max(0, threshold - (|diff| >> shift))), where
shift = max(0, damping - get_msb(threshold)). A zero threshold always
yields 0. */
static INLINE int32_t constrain(int32_t diff, int32_t threshold, int32_t damping) {
    if (threshold == 0) {
        return 0;
    }

    const int32_t magnitude = abs(diff);
    const int32_t shift     = AOMMAX(0, damping - get_msb(threshold));
    /* The allowed magnitude shrinks as the difference grows and bottoms out at 0. */
    const int32_t ceiling = AOMMAX(0, threshold - (magnitude >> shift));
    const int32_t limited = AOMMIN(magnitude, ceiling);
    return sign(diff) * limited;
}

/*
This is Cdef_Directions (section 7.15.3) with 2 padding entries at the
beginning and end of the table. The cdef direction range is [0, 7] and the
first index is offset +/-2. This removes the need to constrain the first
index to the same range using e.g., & 7.

Each row holds two sample offsets (second index k = 0, 1) relative to the
current pixel, written as <rows> * CDEF_BSTRIDE + <columns>. The padding rows
repeat directions 6, 7 (front) and 0, 1 (back), so indexing with dir - 2 or
dir + 2 wraps around without a mask.
*/
DECLARE_ALIGNED(16, const int, eb_cdef_directions_padded[12][2]) = {
    /* Padding: svt_aom_eb_cdef_directions[6] */
    {1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 0},
    /* Padding: svt_aom_eb_cdef_directions[7] */
    {1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE - 1},

    /* Begin svt_aom_eb_cdef_directions */
    {-1 * CDEF_BSTRIDE + 1, -2 * CDEF_BSTRIDE + 2}, /* direction 0 */
    {0 * CDEF_BSTRIDE + 1, -1 * CDEF_BSTRIDE + 2}, /* direction 1 */
    {0 * CDEF_BSTRIDE + 1, 0 * CDEF_BSTRIDE + 2}, /* direction 2 */
    {0 * CDEF_BSTRIDE + 1, 1 * CDEF_BSTRIDE + 2}, /* direction 3 */
    {1 * CDEF_BSTRIDE + 1, 2 * CDEF_BSTRIDE + 2}, /* direction 4 */
    {1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 1}, /* direction 5 */
    {1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 0}, /* direction 6 */
    {1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE - 1}, /* direction 7 */
    /* End svt_aom_eb_cdef_directions */

    /* Padding: svt_aom_eb_cdef_directions[0] */
    {-1 * CDEF_BSTRIDE + 1, -2 * CDEF_BSTRIDE + 2},
    /* Padding: svt_aom_eb_cdef_directions[1] */
    {0 * CDEF_BSTRIDE + 1, -1 * CDEF_BSTRIDE + 2},
};

/* Points at row 2 of the padded table, so first indices -2..9 are all valid. */
const int (*const svt_aom_eb_cdef_directions)[2] = eb_cdef_directions_padded + 2;

/* Derive the primary filter strength of an 8x8 block from its directional
variance difference. A large difference indicates a strongly directional
pattern (e.g. a high contrast edge), where more deringing is safe. A small
one indicates a low contrast edge or a non-directional texture, where we
must be careful not to blur. A variance of zero disables the primary
filter entirely. */
static INLINE int32_t adjust_strength(int32_t strength, int32_t var) {
    if (!var) {
        return 0;
    }

    /* We use the variance of 8x8 blocks to adjust the strength: the scale
    factor is (4 + min(msb(var >> 6), 12)) / 16, rounded to nearest. */
    const int32_t scaled_var = var >> 6;
    int32_t       log_term   = 0;
    if (scaled_var) {
        log_term = AOMMIN(get_msb(scaled_var), 12);
    }
    return (strength * (4 + log_term) + 8) >> 4;
}

/* Widens a rectangle of 8-bit samples into 16-bit storage.
dst/dstride: destination buffer and its row pitch in uint16_t elements.
src/sstride: source buffer and its row pitch in bytes.
v, h: number of rows and columns to copy. */
void svt_aom_copy_rect8_8bit_to_16bit_c(uint16_t* dst, int32_t dstride, const uint8_t* src, int32_t sstride, int32_t v,
                                        int32_t h) {
    for (int32_t row = 0; row < v; row++) {
        uint16_t*      out_row = dst + row * dstride;
        const uint8_t* in_row  = src + row * sstride;
        for (int32_t col = 0; col < h; col++) {
            out_row[col] = in_row[col];
        }
    }
}

/* Detect the dominant direction of an 8x8 block. 0 means 45-degree up-right,
2 is horizontal, and so on.
The search minimizes the weighted variance along all the lines in a
particular direction, i.e. the squared error between the input and a
"predicted" block where each pixel is replaced by the average along a line
in a particular direction. Since every direction has the same sum(x^2) term,
that term is never computed, and the best direction is the one with the
largest cost. See Section 2, step 2, of:
http://jmvalin.ca/notes/intra_paint.pdf

img/stride:  top-left sample of the block and its row pitch in samples.
var:         receives (best cost - cost of the orthogonal direction) >> 10.
coeff_shift: right shift applied to every sample before use.
Returns the winning direction in [0, 7]; ties keep the lowest index. */
uint8_t svt_aom_cdef_find_dir_c(const uint16_t* img, int32_t stride, int32_t* var, int32_t coeff_shift) {
    /* Instead of dividing by n between 2 and 8, we multiply by 3*5*7*8/n.
    The output is then 840 times larger, but we don't care for finding
    the max. */
    static const int32_t div_table[] = {0, 840, 420, 280, 210, 168, 140, 120, 105};
    int32_t              partial[8][15] = {{0}};
    int32_t              cost[8]        = {0};

    /* Accumulate, for each of the eight directions, the pixel sum of every
    line running in that direction. */
    for (int32_t row = 0; row < 8; row++) {
        const uint16_t* line = img + row * stride;
        for (int32_t col = 0; col < 8; col++) {
            /* We subtract 128 here to reduce the maximum range of the squared
            partial sums. */
            const int32_t px = (line[col] >> coeff_shift) - 128;
            partial[0][row + col] += px;
            partial[1][row + col / 2] += px;
            partial[2][row] += px;
            partial[3][3 + row - col / 2] += px;
            partial[4][7 + row - col] += px;
            partial[5][3 - row / 2 + col] += px;
            partial[6][col] += px;
            partial[7][row / 2 + col] += px;
        }
    }

    /* Directions 2 and 6 (row sums and column sums): eight lines of eight pixels. */
    for (int32_t n = 0; n < 8; n++) {
        const int32_t row_sum = partial[2][n];
        const int32_t col_sum = partial[6][n];
        cost[2] += row_sum * row_sum;
        cost[6] += col_sum * col_sum;
    }
    cost[2] *= div_table[8];
    cost[6] *= div_table[8];

    /* Directions 0 and 4 (the two diagonals): line n and line 14 - n both hold
    n + 1 pixels; the centre line (index 7) holds eight. */
    for (int32_t n = 0; n < 7; n++) {
        const int32_t weight = div_table[n + 1];
        const int32_t d0_lo  = partial[0][n];
        const int32_t d0_hi  = partial[0][14 - n];
        const int32_t d4_lo  = partial[4][n];
        const int32_t d4_hi  = partial[4][14 - n];
        cost[0] += (d0_lo * d0_lo + d0_hi * d0_hi) * weight;
        cost[4] += (d4_lo * d4_lo + d4_hi * d4_hi) * weight;
    }
    cost[0] += partial[0][7] * partial[0][7] * div_table[8];
    cost[4] += partial[4][7] * partial[4][7] * div_table[8];

    /* Odd directions: five full lines in the middle (indices 3..7), then
    three shorter lines on each side weighted by div_table[2], [4] and [6]. */
    for (int32_t d = 1; d < 8; d += 2) {
        const int32_t* sums = partial[d];
        int32_t        acc  = 0;
        for (int32_t m = 3; m < 3 + 4 + 1; m++) {
            acc += sums[m] * sums[m];
        }
        acc *= div_table[8];
        for (int32_t m = 0; m < 4 - 1; m++) {
            acc += (sums[m] * sums[m] + sums[10 - m] * sums[10 - m]) * div_table[2 * m + 2];
        }
        cost[d] = acc;
    }

    /* Keep the first direction whose cost is strictly the largest. */
    int32_t best_cost = 0;
    uint8_t best_dir  = 0;
    for (uint8_t d = 0; d < 8; d++) {
        if (cost[d] > best_cost) {
            best_cost = cost[d];
            best_dir  = d;
        }
    }

    /* Difference between the optimal variance and the variance along the
    orthogonal direction. Again, the sum(x^2) terms cancel out.
    We'd normally divide by 840, but dividing by 1024 is close enough
    for what we're going to do with this. */
    const int32_t orthogonal_cost = cost[(best_dir + 4) & 7];
    *var                          = (best_cost - orthogonal_cost) >> 10;
    return best_dir;
}

/* Runs the direction search on two 8x8 blocks that share the same stride and
coeff_shift. The first block is fully processed (var1 and out1 written)
before the second one is started. */
void svt_aom_cdef_find_dir_dual_c(const uint16_t* img1, const uint16_t* img2, int stride, int32_t* var1, int32_t* var2,
                                  int32_t coeff_shift, uint8_t* out1, uint8_t* out2) {
    const uint8_t first_dir = svt_aom_cdef_find_dir_c(img1, stride, var1, coeff_shift);
    *out1                   = first_dir;

    const uint8_t second_dir = svt_aom_cdef_find_dir_c(img2, stride, var2, coeff_shift);
    *out2                    = second_dir;
}

/* Computes the direction and variance of every 8x8 block listed in dlist,
storing them in dir[by][bx] and var[by][bx]. Blocks are located in `in`
at 8 * by rows and 8 * bx columns with a row pitch of CDEF_BSTRIDE. */
static AOM_INLINE void cdef_find_dir(uint16_t* in, CdefList* dlist, int32_t var[CDEF_NBLOCKS][CDEF_NBLOCKS],
                                     int32_t cdef_count, int32_t coeff_shift, uint8_t dir[CDEF_NBLOCKS][CDEF_NBLOCKS]) {
    int idx = 0;

    // Consume the list in pairs so the dual kernel handles two 8x8 blocks per call.
    while (idx < cdef_count - 1) {
        const uint8_t row_a = dlist[idx].by;
        const uint8_t col_a = dlist[idx].bx;
        const uint8_t row_b = dlist[idx + 1].by;
        const uint8_t col_b = dlist[idx + 1].bx;
        const int     off_a = 8 * row_a * CDEF_BSTRIDE + 8 * col_a;
        const int     off_b = 8 * row_b * CDEF_BSTRIDE + 8 * col_b;
        svt_aom_cdef_find_dir_dual(&in[off_a],
                                   &in[off_b],
                                   CDEF_BSTRIDE,
                                   &var[row_a][col_a],
                                   &var[row_b][col_b],
                                   coeff_shift,
                                   &dir[row_a][col_a],
                                   &dir[row_b][col_b]);
        idx += 2;
    }

    // An odd count leaves exactly one trailing block for the single-block kernel.
    if (cdef_count % 2) {
        const uint8_t row = dlist[idx].by;
        const uint8_t col = dlist[idx].bx;
        const int     off = 8 * row * CDEF_BSTRIDE + 8 * col;
        dir[row][col]     = svt_aom_cdef_find_dir(&in[off], CDEF_BSTRIDE, &var[row][col], coeff_shift);
    }
}

/* Filter tap weights. The row is selected by the low bit of
(pri_strength >> coeff_shift) and the column by the tap index k used in
svt_cdef_filter_block_c(). Both secondary rows are identical, so the
secondary weights do not actually depend on the row chosen. */
const int32_t svt_aom_eb_cdef_pri_taps[2][2] = {{4, 2}, {3, 3}};
const int32_t svt_aom_eb_cdef_sec_taps[2][2] = {{2, 1}, {2, 1}};

/* Smooth in the direction detected.
For every processed pixel, two primary taps along `dir` and two secondary
taps along each of `dir + 2` and `dir - 2` are sampled on both sides of the
pixel. Their constrained differences are weighted and summed, and the
corrected value is clamped to the min/max of the pixel and its taps (taps
equal to CDEF_VERY_LARGE never raise the max).

dst8/dst16: exactly one is used; dst8 takes precedence when non-NULL.
dstride:    row pitch of the destination.
in:         source samples with row pitch CDEF_BSTRIDE.
bsize:      selects a 4- or 8-sample block height and width.
subsampling_factor: row step; rows in between are left unwritten. */
void svt_cdef_filter_block_c(uint8_t* dst8, uint16_t* dst16, int32_t dstride, const uint16_t* in, int32_t pri_strength,
                             int32_t sec_strength, int32_t dir, int32_t pri_damping, int32_t sec_damping, int32_t bsize,
                             int32_t coeff_shift, uint8_t subsampling_factor) {
    const int32_t  s        = CDEF_BSTRIDE;
    const int32_t  tap_row  = (pri_strength >> coeff_shift) & 1;
    const int32_t* pri_taps = svt_aom_eb_cdef_pri_taps[tap_row];
    const int32_t* sec_taps = svt_aom_eb_cdef_sec_taps[tap_row];
    const int32_t  rows     = 4 << (int32_t)(bsize == BLOCK_8X8 || bsize == BLOCK_4X8);
    const int32_t  cols     = 4 << (int32_t)(bsize == BLOCK_8X8 || bsize == BLOCK_8X4);

    for (int32_t row = 0; row < rows; row += subsampling_factor) {
        for (int32_t col = 0; col < cols; col++) {
            const int32_t centre = row * s + col;
            const int16_t x      = in[centre];
            int16_t       sum    = 0;
            int32_t       max    = x;
            int32_t       min    = x;

            for (int32_t k = 0; k < 2; k++) {
                const int32_t pri_off   = svt_aom_eb_cdef_directions[dir][k];
                const int32_t sec_off_a = svt_aom_eb_cdef_directions[(dir + 2)][k];
                const int32_t sec_off_b = svt_aom_eb_cdef_directions[(dir - 2)][k];

                /* Samples on both sides of the pixel, in accumulation order. */
                const int16_t pri[2] = {(int16_t)in[centre + pri_off], (int16_t)in[centre - pri_off]};
                const int16_t sec[4] = {(int16_t)in[centre + sec_off_a],
                                        (int16_t)in[centre - sec_off_a],
                                        (int16_t)in[centre + sec_off_b],
                                        (int16_t)in[centre - sec_off_b]};

                for (int32_t t = 0; t < 2; t++) {
                    sum += (int16_t)(pri_taps[k] * constrain(pri[t] - x, pri_strength, pri_damping));
                    if (pri[t] != CDEF_VERY_LARGE) {
                        max = AOMMAX(pri[t], max);
                    }
                    min = AOMMIN(pri[t], min);
                }

                for (int32_t t = 0; t < 4; t++) {
                    if (sec[t] != CDEF_VERY_LARGE) {
                        max = AOMMAX(sec[t], max);
                    }
                    min = AOMMIN(sec[t], min);
                    sum += (int16_t)(sec_taps[k] * constrain(sec[t] - x, sec_strength, sec_damping));
                }
            }

            /* Round the weighted sum (divide by 16, symmetric around zero) and
            keep the result inside the range spanned by the neighbourhood. */
            const int16_t y   = (int16_t)clamp((int16_t)x + ((8 + sum - (sum < 0)) >> 4), min, max);
            const int32_t out = row * dstride + col;
            if (dst8) {
                dst8[out] = (uint8_t)y;
            } else {
                dst16[out] = (uint16_t)y;
            }
        }
    }
}

/* Copies a vsize x hsize rectangle, starting at (src_voffset, src_hoffset)
in the source, into a 16-bit destination.
When is_16bit is false the source holds 8-bit samples, which are widened.
Otherwise src is reinterpreted as 16-bit samples and rows are copied as-is.
sstride and the offsets are counted in samples of the source's bit depth;
dstride is counted in uint16_t elements. */
void svt_aom_copy_sb8_16(uint16_t* dst, int32_t dstride, const uint8_t* src, int32_t src_voffset, int32_t src_hoffset,
                         int32_t sstride, int32_t vsize, int32_t hsize, bool is_16bit) {
    if (!is_16bit) {
        const uint8_t* base = &src[src_voffset * sstride + src_hoffset];
        svt_aom_copy_rect8_8bit_to_16bit(dst, dstride, base, sstride, vsize, hsize);
        return;
    }

    const uint16_t* in_row  = ((const uint16_t*)src) + (src_voffset * sstride + src_hoffset);
    uint16_t*       out_row = dst;
    for (int rows_left = vsize; rows_left > 0; rows_left--) {
        /* Two bytes per sample. */
        svt_memcpy(out_row, in_row, 2 * hsize);
        out_row += dstride;
        in_row += sstride;
    }
}

/*
 * Loop over the non-skip 8x8 blocks.  For each block, find the CDEF direction, then apply the specified filter.
 *
 * dst8/dst16:   destination; dst8 is used when non-NULL, otherwise dst16.
 * dstride:      destination row pitch. A value of 0 selects a packed layout in which
 *               block bi starts at bi << (bsizex + bsizey) with a row pitch of 1 << bsizex.
 * in:           source samples, row pitch CDEF_BSTRIDE.
 * xdec/ydec:    subsampling of the plane; they select a 4- or 8-sample block width/height.
 * dir/var:      per-block direction and variance, indexed [by][bx]; written for pli == 0
 *               and, for pli == 1 with xdec != ydec, dir is remapped in place.
 * dirinit:      optional flag; when non-NULL and already non-zero the direction search is skipped.
 * pli:          plane index; planes other than PLANE_Y get one less unit of damping.
 * dlist/cdef_count: list of blocks to process and its length.
 * level/sec_strength: primary and secondary strengths before scaling by coeff_shift.
 * subsampling_factor: row step used by the copy path and forwarded to the block filter.
*/
void svt_cdef_filter_fb(uint8_t* dst8, uint16_t* dst16, int32_t dstride, uint16_t* in, int32_t xdec, int32_t ydec,
                        uint8_t dir[CDEF_NBLOCKS][CDEF_NBLOCKS], int32_t* dirinit,
                        int32_t var[CDEF_NBLOCKS][CDEF_NBLOCKS], int32_t pli, CdefList* dlist, int32_t cdef_count,
                        int32_t level, int32_t sec_strength, int32_t pri_damping, int32_t sec_damping,
                        int32_t coeff_shift, uint8_t subsampling_factor) {
    int32_t bi;
    // Scale strengths and dampings by coeff_shift.
    int32_t pri_strength = level << coeff_shift;
    sec_strength <<= coeff_shift;
    sec_damping += coeff_shift - (pli != PLANE_Y);
    pri_damping += coeff_shift - (pli != PLANE_Y);

    // bsizex/bsizey are the log2 block width/height (3 - dec), matching bsize.
    int32_t bsize  = ydec ? (xdec ? BLOCK_4X4 : BLOCK_8X4) : (xdec ? BLOCK_4X8 : BLOCK_8X8);
    int32_t bsizex = 3 - xdec;
    int32_t bsizey = 3 - ydec;

    if (!dstride && pri_strength == 0 && sec_strength == 0) {
        // If we're here, both primary and secondary strengths are 0, and
        // we still haven't written anything to the destination yet, so we just copy
        // the input to the packed destination. This is necessary only for svt_av1_cdef_search()
        // and only svt_av1_cdef_search() sets dirinit.
        // NOTE(review): the condition tests dstride == 0 rather than dirinit; presumably
        // only the search path passes a zero dstride -- confirm against callers.
        for (bi = 0; bi < cdef_count; bi++) {
            int32_t   by = dlist[bi].by << bsizey;
            int32_t   bx = dlist[bi].bx << bsizex;
            int32_t   iy;
            uint16_t* src_16 = in + (by * CDEF_BSTRIDE + bx);
            if (dst8) {
                uint8_t* dst_8 = dst8 + (bi << (bsizex + bsizey));
                // Blocks are 1 << bsizex samples wide (log2 size 2 or 3), no gain to use SIMD.
                // Rows skipped by subsampling_factor are left unwritten.
                for (iy = 0; iy < 1 << bsizey; iy += subsampling_factor) {
                    for (int32_t ix = 0; ix < 1 << bsizex; ix++) {
                        dst_8[(iy << bsizex) + ix] = (uint8_t)src_16[iy * CDEF_BSTRIDE + ix];
                    }
                }
            } else {
                uint16_t* dst_16 = dst16 + (bi << (bsizex + bsizey));
                // Same copy for 16-bit output, one row at a time.
                for (iy = 0; iy < 1 << bsizey; iy += subsampling_factor) {
                    memcpy(dst_16 + (iy << bsizex),
                           src_16 + iy * CDEF_BSTRIDE,
                           (uint32_t)(1 << bsizex) * sizeof(uint16_t));
                }
            }
        }
        return;
    }

    if (pli == 0) {
        // Run the direction search unless the caller's dirinit flag says it was already done.
        if (!dirinit || !*dirinit) {
            cdef_find_dir(in, dlist, var, cdef_count, coeff_shift, dir);
            if (dirinit) {
                *dirinit = 1;
            }
        }
    } else if (pli == 1 && xdec != ydec) {
        // Unequal horizontal/vertical subsampling: remap each stored direction through
        // conv422 (xdec set) or conv440 (ydec set). dir[][] is overwritten in place.
        for (bi = 0; bi < cdef_count; bi++) {
            static const uint8_t conv422[8] = {7, 0, 2, 4, 5, 6, 6, 6};
            static const uint8_t conv440[8] = {1, 2, 2, 2, 3, 4, 6, 0};

            int32_t by  = dlist[bi].by;
            int32_t bx  = dlist[bi].bx;
            dir[by][bx] = (xdec ? conv422 : conv440)[dir[by][bx]];
        }
    }

    for (bi = 0; bi < cdef_count; bi++) {
        int32_t by = dlist[bi].by;
        int32_t bx = dlist[bi].bx;
        // Only plane 0 adapts the primary strength to the block's variance.
        int32_t t  = pli ? pri_strength : adjust_strength(pri_strength, var[by][bx]);
        // Destination offset: strided position when dstride != 0, packed block slot otherwise.
        int32_t k  = dstride ? (by << bsizey) * dstride + (bx << bsizex) : bi << (bsizex + bsizey);
        svt_cdef_filter_block(dst8 ? &dst8[k] : NULL,
                              dst8 ? NULL : &dst16[k],
                              dstride ? dstride : 1 << bsizex,
                              &in[(by * CDEF_BSTRIDE << bsizey) + (bx << bsizex)],
                              t,
                              sec_strength,
                              // Direction 0 is passed when the primary strength is zero.
                              pri_strength ? dir[by][bx] : 0,
                              pri_damping,
                              sec_damping,
                              bsize,
                              coeff_shift,
                              subsampling_factor);
    }
}

Coverage Report

Created: 2026-05-16 06:41

Line	Count	Source
1		/*
2		* Copyright (c) 2016, Alliance for Open Media. All rights reserved
3		*
4		* This source code is subject to the terms of the BSD 2 Clause License and
5		* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6		* was not distributed with this source code in the LICENSE file, you can
7		* obtain it at https://www.aomedia.org/license/software-license. If the Alliance for Open
8		* Media Patent License 1.0 was not distributed with this source code in the
9		* PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license.
10		*/
11
12		#include "cdef.h"
13		#include "common_dsp_rtcd.h"
14		#include "bitstream_unit.h"
15
16	0	static INLINE int32_t sign(int32_t i) {
17	0	return i < 0 ? -1 : 1;
18	0	}
19
20	0	static INLINE int32_t constrain(int32_t diff, int32_t threshold, int32_t damping) {
21	0	if (!threshold) {
22	0	return 0;
23	0	}
24
25	0	const int32_t shift = AOMMAX(0, damping - get_msb(threshold));
26	0	return sign(diff) * AOMMIN(abs(diff), AOMMAX(0, threshold - (abs(diff) >> shift)));
27	0	}
28
29		/*
30		This is Cdef_Directions (section 7.15.3) with 2 padding entries at the
31		beginning and end of the table. The cdef direction range is [0, 7] and the
32		first index is offset +/-2. This removes the need to constrain the first
33		index to the same range using e.g., & 7.
34		*/
35		DECLARE_ALIGNED(16, const int, eb_cdef_directions_padded[12][2]) = {
36		/* Padding: svt_aom_eb_cdef_directions[6] */
37		{1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 0},
38		/* Padding: svt_aom_eb_cdef_directions[7] */
39		{1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE - 1},
40
41		/* Begin svt_aom_eb_cdef_directions */
42		{-1 * CDEF_BSTRIDE + 1, -2 * CDEF_BSTRIDE + 2},
43		{0 * CDEF_BSTRIDE + 1, -1 * CDEF_BSTRIDE + 2},
44		{0 * CDEF_BSTRIDE + 1, 0 * CDEF_BSTRIDE + 2},
45		{0 * CDEF_BSTRIDE + 1, 1 * CDEF_BSTRIDE + 2},
46		{1 * CDEF_BSTRIDE + 1, 2 * CDEF_BSTRIDE + 2},
47		{1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 1},
48		{1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 0},
49		{1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE - 1},
50		/* End svt_aom_eb_cdef_directions */
51
52		/* Padding: svt_aom_eb_cdef_directions[0] */
53		{-1 * CDEF_BSTRIDE + 1, -2 * CDEF_BSTRIDE + 2},
54		/* Padding: svt_aom_eb_cdef_directions[1] */
55		{0 * CDEF_BSTRIDE + 1, -1 * CDEF_BSTRIDE + 2},
56		};
57
58		const int (*const svt_aom_eb_cdef_directions)[2] = eb_cdef_directions_padded + 2;
59
60		/* Compute the primary filter strength for an 8x8 block based on the
61		directional variance difference. A high variance difference means
62		that we have a highly directional pattern (e.g. a high contrast
63		edge), so we can apply more deringing. A low variance means that we
64		either have a low contrast edge, or a non-directional texture, so
65		we want to be careful not to blur. */
66	0	static INLINE int32_t adjust_strength(int32_t strength, int32_t var) {
67	0	const int32_t i = (var >> 6) ? AOMMIN(get_msb(var >> 6), 12) : 0;
68		/* We use the variance of 8x8 blocks to adjust the strength. */
69	0	return var ? (strength * (4 + i) + 8) >> 4 : 0;
70	0	}
71
72		void svt_aom_copy_rect8_8bit_to_16bit_c(uint16_t* dst, int32_t dstride, const uint8_t* src, int32_t sstride, int32_t v,
73	0	int32_t h) {
74	0	for (int32_t i = 0; i < v; i++) {
75	0	for (int32_t j = 0; j < h; j++) {
76	0	dst[i * dstride + j] = src[i * sstride + j];
77	0	}
78	0	}
79	0	}
80
81		/* Detect direction. 0 means 45-degree up-right, 2 is horizontal, and so on.
82		The search minimizes the weighted variance along all the lines in a
83		particular direction, i.e. the squared error between the input and a
84		"predicted" block where each pixel is replaced by the average along a line
85		in a particular direction. Since each direction have the same sum(x^2) term,
86		that term is never computed. See Section 2, step 2, of:
87		http://jmvalin.ca/notes/intra_paint.pdf */
88	0	uint8_t svt_aom_cdef_find_dir_c(const uint16_t* img, int32_t stride, int32_t* var, int32_t coeff_shift) {
89	0	int32_t cost[8] = {0};
90	0	int32_t partial[8][15] = {{0}};
91	0	int32_t best_cost = 0;
92	0	uint8_t i;
93	0	uint8_t best_dir = 0;
94		/* Instead of dividing by n between 2 and 8, we multiply by 3*5*7*8/n.
95		The output is then 840 times larger, but we don't care for finding
96		the max. */
97	0	static const int32_t div_table[] = {0, 840, 420, 280, 210, 168, 140, 120, 105};
98	0	for (i = 0; i < 8; i++) {
99	0	int32_t j;
100	0	for (j = 0; j < 8; j++) {
101	0	int32_t x;
102		/* We subtract 128 here to reduce the maximum range of the squared
103		partial sums. */
104	0	x = (img[i * stride + j] >> coeff_shift) - 128;
105	0	partial[0][i + j] += x;
106	0	partial[1][i + j / 2] += x;
107	0	partial[2][i] += x;
108	0	partial[3][3 + i - j / 2] += x;
109	0	partial[4][7 + i - j] += x;
110	0	partial[5][3 - i / 2 + j] += x;
111	0	partial[6][j] += x;
112	0	partial[7][i / 2 + j] += x;
113	0	}
114	0	}
115	0	for (i = 0; i < 8; i++) {
116	0	cost[2] += partial[2][i] * partial[2][i];
117	0	cost[6] += partial[6][i] * partial[6][i];
118	0	}
119	0	cost[2] *= div_table[8];
120	0	cost[6] *= div_table[8];
121	0	for (i = 0; i < 7; i++) {
122	0	cost[0] += (partial[0][i] * partial[0][i] + partial[0][14 - i] * partial[0][14 - i]) * div_table[i + 1];
123	0	cost[4] += (partial[4][i] * partial[4][i] + partial[4][14 - i] * partial[4][14 - i]) * div_table[i + 1];
124	0	}
125	0	cost[0] += partial[0][7] * partial[0][7] * div_table[8];
126	0	cost[4] += partial[4][7] * partial[4][7] * div_table[8];
127	0	for (i = 1; i < 8; i += 2) {
128	0	int32_t j;
129	0	for (j = 0; j < 4 + 1; j++) {
130	0	cost[i] += partial[i][3 + j] * partial[i][3 + j];
131	0	}
132	0	cost[i] *= div_table[8];
133	0	for (j = 0; j < 4 - 1; j++) {
134	0	cost[i] += (partial[i][j] * partial[i][j] + partial[i][10 - j] * partial[i][10 - j]) * div_table[2 * j + 2];
135	0	}
136	0	}
137	0	for (i = 0; i < 8; i++) {
138	0	if (cost[i] > best_cost) {
139	0	best_cost = cost[i];
140	0	best_dir = i;
141	0	}
142	0	}
143		/* Difference between the optimal variance and the variance along the
144		orthogonal direction. Again, the sum(x^2) terms cancel out. */
145	0	*var = best_cost - cost[(best_dir + 4) & 7];
146		/* We'd normally divide by 840, but dividing by 1024 is close enough
147		for what we're going to do with this. */
148	0	*var >>= 10;
149	0	return best_dir;
150	0	}
151
152		void svt_aom_cdef_find_dir_dual_c(const uint16_t* img1, const uint16_t* img2, int stride, int32_t* var1, int32_t* var2,
153	0	int32_t coeff_shift, uint8_t* out1, uint8_t* out2) {
154	0	*out1 = svt_aom_cdef_find_dir_c(img1, stride, var1, coeff_shift);
155	0	*out2 = svt_aom_cdef_find_dir_c(img2, stride, var2, coeff_shift);
156	0	}
157
158		static AOM_INLINE void cdef_find_dir(uint16_t* in, CdefList* dlist, int32_t var[CDEF_NBLOCKS][CDEF_NBLOCKS],
159	0	int32_t cdef_count, int32_t coeff_shift, uint8_t dir[CDEF_NBLOCKS][CDEF_NBLOCKS]) {
160	0	int bi;
161
162		// Find direction of two 8x8 blocks together.
163	0	for (bi = 0; bi < cdef_count - 1; bi += 2) {
164	0	const uint8_t by = dlist[bi].by;
165	0	const uint8_t bx = dlist[bi].bx;
166	0	const uint8_t by2 = dlist[bi + 1].by;
167	0	const uint8_t bx2 = dlist[bi + 1].bx;
168	0	const int pos1 = 8 * by * CDEF_BSTRIDE + 8 * bx;
169	0	const int pos2 = 8 * by2 * CDEF_BSTRIDE + 8 * bx2;
170	0	svt_aom_cdef_find_dir_dual(&in[pos1],
171	0	&in[pos2],
172	0	CDEF_BSTRIDE,
173	0	&var[by][bx],
174	0	&var[by2][bx2],
175	0	coeff_shift,
176	0	&dir[by][bx],
177	0	&dir[by2][bx2]);
178	0	}
179
180		// Process remaining 8x8 blocks here. One 8x8 at a time.
181	0	if (cdef_count % 2) {
182	0	const uint8_t by = dlist[bi].by;
183	0	const uint8_t bx = dlist[bi].bx;
184	0	dir[by][bx] = svt_aom_cdef_find_dir(
185	0	&in[8 * by * CDEF_BSTRIDE + 8 * bx], CDEF_BSTRIDE, &var[by][bx], coeff_shift);
186	0	}
187	0	}
188
189		const int32_t svt_aom_eb_cdef_pri_taps[2][2] = {{4, 2}, {3, 3}};
190		const int32_t svt_aom_eb_cdef_sec_taps[2][2] = {{2, 1}, {2, 1}};
191
192		/* Smooth in the direction detected. */
193		void svt_cdef_filter_block_c(uint8_t* dst8, uint16_t* dst16, int32_t dstride, const uint16_t* in, int32_t pri_strength,
194		int32_t sec_strength, int32_t dir, int32_t pri_damping, int32_t sec_damping, int32_t bsize,
195	0	int32_t coeff_shift, uint8_t subsampling_factor) {
196	0	int32_t i, j, k;
197	0	const int32_t s = CDEF_BSTRIDE;
198	0	const int32_t* pri_taps = svt_aom_eb_cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
199	0	const int32_t* sec_taps = svt_aom_eb_cdef_sec_taps[(pri_strength >> coeff_shift) & 1];
200
201	0	for (i = 0; i < (4 << (int32_t)(bsize == BLOCK_8X8 || bsize == BLOCK_4X8)); i += subsampling_factor) {
202	0	for (j = 0; j < (4 << (int32_t)(bsize == BLOCK_8X8 || bsize == BLOCK_8X4)); j++) {
203	0	int16_t sum = 0;
204	0	int16_t y;
205	0	int16_t x = in[i * s + j];
206	0	int32_t max = x;
207	0	int32_t min = x;
208	0	for (k = 0; k < 2; k++) {
209	0	int16_t p0 = in[i * s + j + svt_aom_eb_cdef_directions[dir][k]];
210	0	int16_t p1 = in[i * s + j - svt_aom_eb_cdef_directions[dir][k]];
211	0	sum += (int16_t)(pri_taps[k] * constrain(p0 - x, pri_strength, pri_damping));
212	0	sum += (int16_t)(pri_taps[k] * constrain(p1 - x, pri_strength, pri_damping));
213	0	if (p0 != CDEF_VERY_LARGE) {
214	0	max = AOMMAX(p0, max);
215	0	}
216	0	if (p1 != CDEF_VERY_LARGE) {
217	0	max = AOMMAX(p1, max);
218	0	}
219	0	min = AOMMIN(p0, min);
220	0	min = AOMMIN(p1, min);
221	0	int16_t s0 = in[i * s + j + svt_aom_eb_cdef_directions[(dir + 2)][k]];
222	0	int16_t s1 = in[i * s + j - svt_aom_eb_cdef_directions[(dir + 2)][k]];
223	0	int16_t s2 = in[i * s + j + svt_aom_eb_cdef_directions[(dir - 2)][k]];
224	0	int16_t s3 = in[i * s + j - svt_aom_eb_cdef_directions[(dir - 2)][k]];
225	0	if (s0 != CDEF_VERY_LARGE) {
226	0	max = AOMMAX(s0, max);
227	0	}
228	0	if (s1 != CDEF_VERY_LARGE) {
229	0	max = AOMMAX(s1, max);
230	0	}
231	0	if (s2 != CDEF_VERY_LARGE) {
232	0	max = AOMMAX(s2, max);
233	0	}
234	0	if (s3 != CDEF_VERY_LARGE) {
235	0	max = AOMMAX(s3, max);
236	0	}
237	0	min = AOMMIN(s0, min);
238	0	min = AOMMIN(s1, min);
239	0	min = AOMMIN(s2, min);
240	0	min = AOMMIN(s3, min);
241	0	sum += (int16_t)(sec_taps[k] * constrain(s0 - x, sec_strength, sec_damping));
242	0	sum += (int16_t)(sec_taps[k] * constrain(s1 - x, sec_strength, sec_damping));
243	0	sum += (int16_t)(sec_taps[k] * constrain(s2 - x, sec_strength, sec_damping));
244	0	sum += (int16_t)(sec_taps[k] * constrain(s3 - x, sec_strength, sec_damping));
245	0	}
246	0	y = (int16_t)clamp((int16_t)x + ((8 + sum - (sum < 0)) >> 4), min, max);
247	0	if (dst8) {
248	0	dst8[i * dstride + j] = (uint8_t)y;
249	0	} else {
250	0	dst16[i * dstride + j] = (uint16_t)y;
251	0	}
252	0	}
253	0	}
254	0	}
255
256		void svt_aom_copy_sb8_16(uint16_t* dst, int32_t dstride, const uint8_t* src, int32_t src_voffset, int32_t src_hoffset,
257	0	int32_t sstride, int32_t vsize, int32_t hsize, bool is_16bit) {
258	0	if (is_16bit) {
259	0	const uint16_t* base = ((uint16_t*)src) + (src_voffset * sstride + src_hoffset);
260	0	for (int r = 0; r < vsize; r++) {
261	0	svt_memcpy(dst, base, 2 * hsize);
262	0	dst += dstride;
263	0	base += sstride;
264	0	}
265	0	} else {
266	0	const uint8_t* base = &src[src_voffset * sstride + src_hoffset];
267	0	svt_aom_copy_rect8_8bit_to_16bit(dst, dstride, base, sstride, vsize, hsize);
268	0	}
269	0	}
270
271		/*
272		* Loop over the non-skip 8x8 blocks. For each block, find the CDEF direction, then apply the specified filter.
273		*/
274		void svt_cdef_filter_fb(uint8_t* dst8, uint16_t* dst16, int32_t dstride, uint16_t* in, int32_t xdec, int32_t ydec,
275		uint8_t dir[CDEF_NBLOCKS][CDEF_NBLOCKS], int32_t* dirinit,
276		int32_t var[CDEF_NBLOCKS][CDEF_NBLOCKS], int32_t pli, CdefList* dlist, int32_t cdef_count,
277		int32_t level, int32_t sec_strength, int32_t pri_damping, int32_t sec_damping,
278	0	int32_t coeff_shift, uint8_t subsampling_factor) {
279	0	int32_t bi;
280	0	int32_t pri_strength = level << coeff_shift;
281	0	sec_strength <<= coeff_shift;
282	0	sec_damping += coeff_shift - (pli != PLANE_Y);
283	0	pri_damping += coeff_shift - (pli != PLANE_Y);
284
285	0	int32_t bsize = ydec ? (xdec ? BLOCK_4X4 : BLOCK_8X4) : (xdec ? BLOCK_4X8 : BLOCK_8X8);
286	0	int32_t bsizex = 3 - xdec;
287	0	int32_t bsizey = 3 - ydec;
288
289	0	if (!dstride && pri_strength == 0 && sec_strength == 0) {
290		// If we're here, both primary and secondary strengths are 0, and
291		// we still haven't written anything to y[] yet, so we just copy
292		// the input to y[]. This is necessary only for svt_av1_cdef_search()
293		// and only svt_av1_cdef_search() sets dirinit.
294	0	for (bi = 0; bi < cdef_count; bi++) {
295	0	int32_t by = dlist[bi].by << bsizey;
296	0	int32_t bx = dlist[bi].bx << bsizex;
297	0	int32_t iy;
298	0	uint16_t* src_16 = in + (by * CDEF_BSTRIDE + bx);
299	0	if (dst8) {
300	0	uint8_t* dst_8 = dst8 + (bi << (bsizex + bsizey));
301		//size 2x2 and 3x3, no gain to use SIMD
302	0	for (iy = 0; iy < 1 << bsizey; iy += subsampling_factor) {
303	0	for (int32_t ix = 0; ix < 1 << bsizex; ix++) {
304	0	dst_8[(iy << bsizex) + ix] = (uint8_t)src_16[iy * CDEF_BSTRIDE + ix];
305	0	}
306	0	}
307	0	} else {
308	0	uint16_t* dst_16 = dst16 + (bi << (bsizex + bsizey));
309	0	for (iy = 0; iy < 1 << bsizey; iy += subsampling_factor) {
310	0	memcpy(dst_16 + (iy << bsizex),
311	0	src_16 + iy * CDEF_BSTRIDE,
312	0	(uint32_t)(1 << bsizex) * sizeof(uint16_t));
313	0	}
314	0	}
315	0	}
316	0	return;
317	0	}
318
319	0	if (pli == 0) {
320	0	if (!dirinit || !*dirinit) {
321	0	cdef_find_dir(in, dlist, var, cdef_count, coeff_shift, dir);
322	0	if (dirinit) {
323	0	*dirinit = 1;
324	0	}
325	0	}
326	0	} else if (pli == 1 && xdec != ydec) {
327	0	for (bi = 0; bi < cdef_count; bi++) {
328	0	static const uint8_t conv422[8] = {7, 0, 2, 4, 5, 6, 6, 6};
329	0	static const uint8_t conv440[8] = {1, 2, 2, 2, 3, 4, 6, 0};
330
331	0	int32_t by = dlist[bi].by;
332	0	int32_t bx = dlist[bi].bx;
333	0	dir[by][bx] = (xdec ? conv422 : conv440)[dir[by][bx]];
334	0	}
335	0	}
336
337	0	for (bi = 0; bi < cdef_count; bi++) {
338	0	int32_t by = dlist[bi].by;
339	0	int32_t bx = dlist[bi].bx;
340	0	int32_t t = pli ? pri_strength : adjust_strength(pri_strength, var[by][bx]);
341	0	int32_t k = dstride ? (by << bsizey) * dstride + (bx << bsizex) : bi << (bsizex + bsizey);
342	0	svt_cdef_filter_block(dst8 ? &dst8[k] : NULL,
343	0	dst8 ? NULL : &dst16[k],
344	0	dstride ? dstride : 1 << bsizex,
345	0	&in[(by * CDEF_BSTRIDE << bsizey) + (bx << bsizex)],
346	0	t,
347	0	sec_strength,
348	0	pri_strength ? dir[by][bx] : 0,
349	0	pri_damping,
350	0	sec_damping,
351	0	bsize,
352	0	coeff_shift,
353	0	subsampling_factor);
354	0	}
355	0	}