/work/openh264/codec/processing/src/downsample/downsamplefuncs.cpp

Source
/*!
 * \copy
 *     Copyright (c)  2008-2013, Cisco Systems
 *     All rights reserved.
 *
 *     Redistribution and use in source and binary forms, with or without
 *     modification, are permitted provided that the following conditions
 *     are met:
 *
 *        * Redistributions of source code must retain the above copyright
 *          notice, this list of conditions and the following disclaimer.
 *
 *        * Redistributions in binary form must reproduce the above copyright
 *          notice, this list of conditions and the following disclaimer in
 *          the documentation and/or other materials provided with the
 *          distribution.
 *
 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *     POSSIBILITY OF SUCH DAMAGE.
 *
 *  downsample_yuv.c
 *
 *  Abstract
 *      Implementation for source yuv data downsampling used before spatial encoding.
 *
 *  History
 *      10/24/2008 Created
 *
 *****************************************************************************/

#include "downsample.h"


WELSVP_NAMESPACE_BEGIN


void DyadicBilinearDownsampler_c (uint8_t* pDst, const int32_t kiDstStride,
                                  uint8_t* pSrc, const int32_t kiSrcStride,
                                  const int32_t kiSrcWidth, const int32_t kiSrcHeight)

{
  // 2:1 downsample in both directions: every destination pixel is produced
  // from one 2x2 cell of the source.
  //   pDst / kiDstStride       - destination plane and its row pitch in bytes
  //   pSrc / kiSrcStride       - source plane and its row pitch in bytes
  //   kiSrcWidth / kiSrcHeight - source size; the output is (width >> 1) x (height >> 1)
  const int32_t kiOutCols       = kiSrcWidth  >> 1;
  const int32_t kiOutRows       = kiSrcHeight >> 1;
  const int32_t kiSrcRowStep    = kiSrcStride << 1;   // two source rows per output row
  uint8_t* pOutRow              = pDst;
  uint8_t* pTopRow              = pSrc;

  for (int32_t iRow = 0; iRow < kiOutRows; ++iRow) {
    for (int32_t iCol = 0; iCol < kiOutCols; ++iCol) {
      const uint8_t* pTop    = pTopRow + (iCol << 1);
      const uint8_t* pBottom = pTop + kiSrcStride;

      // Round-to-nearest average of each horizontal pair, then of the two results.
      const int32_t kiAvgTop    = (pTop[0] + pTop[1] + 1) >> 1;
      const int32_t kiAvgBottom = (pBottom[0] + pBottom[1] + 1) >> 1;

      pOutRow[iCol] = (uint8_t) ((kiAvgTop + kiAvgBottom + 1) >> 1);
    }
    pTopRow += kiSrcRowStep;
    pOutRow += kiDstStride;
  }
}

void DyadicBilinearQuarterDownsampler_c (uint8_t* pDst, const int32_t kiDstStride,
    uint8_t* pSrc, const int32_t kiSrcStride,
    const int32_t kiSrcWidth, const int32_t kiSrcHeight)

{
  // 4:1 downsample in both directions. Only the top-left 2x2 samples of each
  // 4x4 source cell contribute to the output pixel; the remaining samples of
  // the cell are not read.
  //   pDst / kiDstStride       - destination plane and its row pitch in bytes
  //   pSrc / kiSrcStride       - source plane and its row pitch in bytes
  //   kiSrcWidth / kiSrcHeight - source size; the output is (width >> 2) x (height >> 2)
  const int32_t kiOutCols       = kiSrcWidth  >> 2;
  const int32_t kiOutRows       = kiSrcHeight >> 2;
  const int32_t kiSrcRowStep    = kiSrcStride << 2;   // four source rows per output row
  uint8_t* pOutRow              = pDst;
  uint8_t* pTopRow              = pSrc;

  for (int32_t iRow = 0; iRow < kiOutRows; ++iRow) {
    for (int32_t iCol = 0; iCol < kiOutCols; ++iCol) {
      const uint8_t* pTop    = pTopRow + (iCol << 2);
      const uint8_t* pBottom = pTop + kiSrcStride;

      // Round-to-nearest average of each horizontal pair, then of the two results.
      const int32_t kiAvgTop    = (pTop[0] + pTop[1] + 1) >> 1;
      const int32_t kiAvgBottom = (pBottom[0] + pBottom[1] + 1) >> 1;

      pOutRow[iCol] = (uint8_t) ((kiAvgTop + kiAvgBottom + 1) >> 1);
    }
    pTopRow += kiSrcRowStep;
    pOutRow += kiDstStride;
  }
}

void DyadicBilinearOneThirdDownsampler_c (uint8_t* pDst, const int32_t kiDstStride,
    uint8_t* pSrc, const int32_t kiSrcStride,
    const int32_t kiSrcWidth, const int32_t kiDstHeight)

{
  // 3:1 downsample in both directions. Only the top-left 2x2 samples of each
  // 3x3 source cell contribute to the output pixel.
  //   pDst / kiDstStride - destination plane and its row pitch in bytes
  //   pSrc / kiSrcStride - source plane and its row pitch in bytes
  //   kiSrcWidth         - source width; the output width is kiSrcWidth / 3
  //   kiDstHeight        - number of OUTPUT rows to produce (unlike the other
  //                        dyadic variants, this is not the source height)
  const int32_t kiOutCols       = kiSrcWidth / 3;
  const int32_t kiSrcRowStep    = kiSrcStride * 3;    // three source rows per output row
  uint8_t* pOutRow              = pDst;
  uint8_t* pTopRow              = pSrc;

  for (int32_t iRow = 0; iRow < kiDstHeight; ++iRow) {
    for (int32_t iCol = 0; iCol < kiOutCols; ++iCol) {
      const uint8_t* pTop    = pTopRow + iCol * 3;
      const uint8_t* pBottom = pTop + kiSrcStride;

      // Round-to-nearest average of each horizontal pair, then of the two results.
      const int32_t kiAvgTop    = (pTop[0] + pTop[1] + 1) >> 1;
      const int32_t kiAvgBottom = (pBottom[0] + pBottom[1] + 1) >> 1;

      pOutRow[iCol] = (uint8_t) ((kiAvgTop + kiAvgBottom + 1) >> 1);
    }
    pTopRow += kiSrcRowStep;
    pOutRow += kiDstStride;
  }
}

/*
 * Arbitrary-ratio bilinear downsampler using 32-bit fixed-point arithmetic.
 *
 *   pDst / kiDstStride         - destination plane and its row pitch in bytes
 *   kiDstWidth / kiDstHeight   - destination size in pixels
 *   pSrc / kiSrcStride         - source plane and its row pitch in bytes
 *   kiSrcWidth / kiSrcHeight   - source size in pixels
 *
 * Source positions are tracked with 16 fractional bits horizontally and 15
 * fractional bits vertically. All destination pixels except the last column
 * and the last row are a weighted blend of a 2x2 source neighbourhood; the
 * last column of every row and the whole last row are copied from the source
 * sample at the integer position without interpolation.
 *
 * Nothing is written when either destination dimension is not positive.
 */
void GeneralBilinearFastDownsampler_c (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
                                       const int32_t kiDstHeight,
                                       uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
  // An empty destination would otherwise divide by zero when computing the
  // scale factors below, and the unconditional last-column / last-row copies
  // would still store into pDst.
  if (kiDstWidth <= 0 || kiDstHeight <= 0)
    return;

  const uint32_t kuiScaleBitWidth = 16, kuiScaleBitHeight = 15;
  const uint32_t kuiScaleWidth = (1 << kuiScaleBitWidth), kuiScaleHeight = (1 << kuiScaleBitHeight);
  // Source step per destination pixel, in fixed point.
  int32_t fScalex = WELS_ROUND ((float)kiSrcWidth / (float)kiDstWidth * kuiScaleWidth);
  int32_t fScaley = WELS_ROUND ((float)kiSrcHeight / (float)kiDstHeight * kuiScaleHeight);
  uint32_t x;
  int32_t iYInverse, iXInverse;

  uint8_t* pByDst = pDst;
  uint8_t* pByLineDst = pDst;

  // Vertical source position starts at one half in fixed point.
  iYInverse = 1 << (kuiScaleBitHeight - 1);
  for (int32_t i = 0; i < kiDstHeight - 1; i++) {
    int32_t iYy = iYInverse >> kuiScaleBitHeight;          // integer source row
    int32_t fv = iYInverse & (kuiScaleHeight - 1);         // vertical fraction

    uint8_t* pBySrc = pSrc + iYy * kiSrcStride;

    pByDst = pByLineDst;
    // Horizontal source position also starts at one half in fixed point.
    iXInverse = 1 << (kuiScaleBitWidth - 1);
    for (int32_t j = 0; j < kiDstWidth - 1; j++) {
      int32_t iXx = iXInverse >> kuiScaleBitWidth;         // integer source column
      int32_t iFu = iXInverse & (kuiScaleWidth - 1);       // horizontal fraction

      uint8_t* pByCurrent = pBySrc + iXx;
      uint8_t a, b, c, d;

      // 2x2 neighbourhood: a b on the current row, c d on the row below.
      a = *pByCurrent;
      b = * (pByCurrent + 1);
      c = * (pByCurrent + kiSrcStride);
      d = * (pByCurrent + kiSrcStride + 1);

      // Each weight is the product of a horizontal and a vertical factor,
      // shifted down by the horizontal bit width BEFORE being multiplied by
      // the pixel so that the running sum fits in 32 bits. This trades a
      // little precision for avoiding 64-bit arithmetic.
      x  = (((uint32_t) (kuiScaleWidth - 1 - iFu)) * (kuiScaleHeight - 1 - fv) >> kuiScaleBitWidth) * a;
      x += (((uint32_t) (iFu)) * (kuiScaleHeight - 1 - fv) >> kuiScaleBitWidth) * b;
      x += (((uint32_t) (kuiScaleWidth - 1 - iFu)) * (fv) >> kuiScaleBitWidth) * c;
      x += (((uint32_t) (iFu)) * (fv) >> kuiScaleBitWidth) * d;
      // Remove the remaining vertical fractional bits with rounding:
      // shift by (bits - 1), add one, then shift by one more.
      x >>= (kuiScaleBitHeight - 1);
      x += 1;
      x >>= 1;
      x = WELS_CLAMP (x, 0, 255);
      *pByDst++ = (uint8_t)x;

      iXInverse += fScalex;
    }
    // Last column of the row: plain copy of the sample at the integer position.
    *pByDst = * (pBySrc + (iXInverse >> kuiScaleBitWidth));
    pByLineDst += kiDstStride;
    iYInverse += fScaley;
  }

  // last row special: every pixel is a plain copy, the row below is never read
  {
    int32_t iYy = iYInverse >> kuiScaleBitHeight;
    uint8_t* pBySrc = pSrc + iYy * kiSrcStride;

    pByDst = pByLineDst;
    iXInverse = 1 << (kuiScaleBitWidth - 1);
    for (int32_t j = 0; j < kiDstWidth; j++) {
      int32_t iXx = iXInverse >> kuiScaleBitWidth;
      *pByDst++ = * (pBySrc + iXx);

      iXInverse += fScalex;
    }
  }
}

/*
 * Arbitrary-ratio bilinear downsampler using 64-bit fixed-point arithmetic.
 *
 *   pDst / kiDstStride         - destination plane and its row pitch in bytes
 *   kiDstWidth / kiDstHeight   - destination size in pixels
 *   pSrc / kiSrcStride         - source plane and its row pitch in bytes
 *   kiSrcWidth / kiSrcHeight   - source size in pixels
 *
 * Source positions are tracked with 15 fractional bits in both directions.
 * All destination pixels except the last column and the last row are a
 * weighted blend of a 2x2 source neighbourhood, accumulated in 64 bits at
 * full weight precision; the last column of every row and the whole last row
 * are copied from the source sample at the integer position.
 *
 * Nothing is written when either destination dimension is not positive.
 */
void GeneralBilinearAccurateDownsampler_c (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
    const int32_t kiDstHeight,
    uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
  // An empty destination would otherwise divide by zero when computing the
  // scale factors below, and the unconditional last-column / last-row copies
  // would still store into pDst.
  if (kiDstWidth <= 0 || kiDstHeight <= 0)
    return;

  const int32_t kiScaleBit = 15;
  const int32_t kiScale = (1 << kiScaleBit);
  // Source step per destination pixel, in fixed point.
  int32_t iScalex = WELS_ROUND ((float)kiSrcWidth / (float)kiDstWidth * kiScale);
  int32_t iScaley = WELS_ROUND ((float)kiSrcHeight / (float)kiDstHeight * kiScale);
  int64_t x;
  int32_t iYInverse, iXInverse;

  uint8_t* pByDst = pDst;
  uint8_t* pByLineDst = pDst;

  // Vertical source position starts at one half in fixed point.
  iYInverse = 1 << (kiScaleBit - 1);
  for (int32_t i = 0; i < kiDstHeight - 1; i++) {
    int32_t iYy = iYInverse >> kiScaleBit;                 // integer source row
    int32_t iFv = iYInverse & (kiScale - 1);               // vertical fraction

    uint8_t* pBySrc = pSrc + iYy * kiSrcStride;

    pByDst = pByLineDst;
    // Horizontal source position also starts at one half in fixed point.
    iXInverse = 1 << (kiScaleBit - 1);
    for (int32_t j = 0; j < kiDstWidth - 1; j++) {
      int32_t iXx = iXInverse >> kiScaleBit;               // integer source column
      int32_t iFu = iXInverse & (kiScale - 1);             // horizontal fraction

      uint8_t* pByCurrent = pBySrc + iXx;
      uint8_t a, b, c, d;

      // 2x2 neighbourhood: a b on the current row, c d on the row below.
      a = *pByCurrent;
      b = * (pByCurrent + 1);
      c = * (pByCurrent + kiSrcStride);
      d = * (pByCurrent + kiSrcStride + 1);

      // Weighted sum of the four samples; the added constant is one half in
      // the 2 * kiScaleBit fixed-point format so the final shift rounds to
      // nearest.
      x = (((int64_t) (kiScale - 1 - iFu)) * (kiScale - 1 - iFv) * a + ((int64_t)iFu) * (kiScale - 1 - iFv) * b + ((int64_t) (
             kiScale - 1 - iFu)) * iFv * c +
           ((int64_t)iFu) * iFv * d + (int64_t) (1 << (2 * kiScaleBit - 1))) >> (2 * kiScaleBit);
      x = WELS_CLAMP (x, 0, 255);
      *pByDst++ = (uint8_t)x;

      iXInverse += iScalex;
    }
    // Last column of the row: plain copy of the sample at the integer position.
    *pByDst = * (pBySrc + (iXInverse >> kiScaleBit));
    pByLineDst += kiDstStride;
    iYInverse += iScaley;
  }

  // last row special: every pixel is a plain copy, the row below is never read
  {
    int32_t iYy = iYInverse >> kiScaleBit;
    uint8_t* pBySrc = pSrc + iYy * kiSrcStride;

    pByDst = pByLineDst;
    iXInverse = 1 << (kiScaleBit - 1);
    for (int32_t j = 0; j < kiDstWidth; j++) {
      int32_t iXx = iXInverse >> kiScaleBit;
      *pByDst++ = * (pBySrc + iXx);

      iXInverse += iScalex;
    }
  }
}

#if defined(X86_ASM) || defined(HAVE_NEON) || (defined(HAVE_NEON_AARCH64) && defined(__aarch64__))
// Adapter between the "source size" calling convention used by the C
// downsamplers and implementations that take precomputed fixed-point scale
// factors instead. It converts the source/destination size ratio to fixed
// point with the requested number of fractional bits per direction and
// forwards everything else to func unchanged.
static void GeneralBilinearDownsamplerWrap (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
    const int32_t kiDstHeight,
    uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight,
    const int32_t kiScaleBitWidth, const int32_t kiScaleBitHeight,
    void (*func) (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth, int32_t iDstHeight,
                  uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX, uint32_t uiScaleY)) {
  // Fixed-point representation of 1.0 in each direction.
  const uint32_t kuiUnitX = (1 << kiScaleBitWidth);
  const uint32_t kuiUnitY = (1 << kiScaleBitHeight);

  // Source step per destination pixel, rounded to fixed point.
  const uint32_t kuiStepX = WELS_ROUND ((float)kiSrcWidth / (float)kiDstWidth * kuiUnitX);
  const uint32_t kuiStepY = WELS_ROUND ((float)kiSrcHeight / (float)kiDstHeight * kuiUnitY);

  func (pDst, kiDstStride, kiDstWidth, kiDstHeight, pSrc, kiSrcStride, kuiStepX, kuiStepY);
}

// Defines GeneralBilinearFastDownsamplerWrap_<suffix>, a function with the
// same parameter list as GeneralBilinearFastDownsampler_c. It forwards to
// GeneralBilinearDownsamplerWrap with 16 horizontal and 15 vertical fractional
// bits (the same bit widths GeneralBilinearFastDownsampler_c uses) and
// GeneralBilinearFastDownsampler_<suffix> as the implementation to call.
#define DEFINE_GENERAL_BILINEAR_FAST_DOWNSAMPLER_WRAP(suffix) \
  void GeneralBilinearFastDownsamplerWrap_ ## suffix ( \
      uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight, \
      uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) { \
    GeneralBilinearDownsamplerWrap (pDst, kiDstStride, kiDstWidth, kiDstHeight, \
        pSrc, kiSrcStride, kiSrcWidth, kiSrcHeight, 16, 15, GeneralBilinearFastDownsampler_ ## suffix); \
  }

// Defines GeneralBilinearAccurateDownsamplerWrap_<suffix>, a function with the
// same parameter list as GeneralBilinearAccurateDownsampler_c. It forwards to
// GeneralBilinearDownsamplerWrap with 15 fractional bits in both directions
// (the same bit width GeneralBilinearAccurateDownsampler_c uses) and
// GeneralBilinearAccurateDownsampler_<suffix> as the implementation to call.
#define DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP(suffix) \
  void GeneralBilinearAccurateDownsamplerWrap_ ## suffix ( \
      uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight, \
      uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) { \
    GeneralBilinearDownsamplerWrap (pDst, kiDstStride, kiDstWidth, kiDstHeight, \
        pSrc, kiSrcStride, kiSrcWidth, kiSrcHeight, 15, 15, GeneralBilinearAccurateDownsampler_ ## suffix); \
  }
#endif

// Instantiate the wrapper functions for each SIMD implementation enabled at
// build time. Every line below expands to one function named
// GeneralBilinear{Fast,Accurate}DownsamplerWrap_<suffix>.
//
// x86: note that the fast and accurate variants are not instantiated for the
// same set of suffixes (fast: sse2, ssse3, avx2; accurate: sse2, sse41, avx2).
#ifdef X86_ASM
DEFINE_GENERAL_BILINEAR_FAST_DOWNSAMPLER_WRAP (sse2)
DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (sse2)
DEFINE_GENERAL_BILINEAR_FAST_DOWNSAMPLER_WRAP (ssse3)
DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (sse41)
#ifdef HAVE_AVX2
DEFINE_GENERAL_BILINEAR_FAST_DOWNSAMPLER_WRAP (avx2)
DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (avx2)
#endif
#endif //X86_ASM

// 32-bit ARM NEON: only the accurate variant is wrapped here.
#ifdef HAVE_NEON
DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (neon)
#endif

// AArch64 NEON: only the accurate variant is wrapped here.
#if defined(HAVE_NEON_AARCH64) && defined(__aarch64__)
DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (AArch64_neon)
#endif
WELSVP_NAMESPACE_END

Coverage Report

Created: 2026-05-30 06:10

Line	Count	Source
1		/*!
2		* \copy
3		* Copyright (c) 2008-2013, Cisco Systems
4		* All rights reserved.
5		*
6		* Redistribution and use in source and binary forms, with or without
7		* modification, are permitted provided that the following conditions
8		* are met:
9		*
10		* * Redistributions of source code must retain the above copyright
11		* notice, this list of conditions and the following disclaimer.
12		*
13		* * Redistributions in binary form must reproduce the above copyright
14		* notice, this list of conditions and the following disclaimer in
15		* the documentation and/or other materials provided with the
16		* distribution.
17		*
18		* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19		* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20		* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21		* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22		* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23		* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24		* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25		* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26		* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27		* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28		* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29		* POSSIBILITY OF SUCH DAMAGE.
30		*
31		* downsample_yuv.c
32		*
33		* Abstract
34		* Implementation for source yuv data downsampling used before spatial encoding.
35		*
36		* History
37		* 10/24/2008 Created
38		*
39		*****************************************************************************/
40
41		#include "downsample.h"
42
43
44		WELSVP_NAMESPACE_BEGIN
45
46
47		void DyadicBilinearDownsampler_c (uint8_t* pDst, const int32_t kiDstStride,
48		uint8_t* pSrc, const int32_t kiSrcStride,
49		const int32_t kiSrcWidth, const int32_t kiSrcHeight)
50
51	0	{
52	0	uint8_t* pDstLine = pDst;
53	0	uint8_t* pSrcLine = pSrc;
54	0	const int32_t kiSrcStridex2 = kiSrcStride << 1;
55	0	const int32_t kiDstWidth = kiSrcWidth >> 1;
56	0	const int32_t kiDstHeight = kiSrcHeight >> 1;
57
58	0	for (int32_t j = 0; j < kiDstHeight; j ++) {
59	0	for (int32_t i = 0; i < kiDstWidth; i ++) {
60	0	const int32_t kiSrcX = i << 1;
61	0	const int32_t kiTempRow1 = (pSrcLine[kiSrcX] + pSrcLine[kiSrcX + 1] + 1) >> 1;
62	0	const int32_t kiTempRow2 = (pSrcLine[kiSrcX + kiSrcStride] + pSrcLine[kiSrcX + kiSrcStride + 1] + 1) >> 1;
63
64	0	pDstLine[i] = (uint8_t) ((kiTempRow1 + kiTempRow2 + 1) >> 1);
65	0	}
66	0	pDstLine += kiDstStride;
67	0	pSrcLine += kiSrcStridex2;
68	0	}
69	0	}
70
71		void DyadicBilinearQuarterDownsampler_c (uint8_t* pDst, const int32_t kiDstStride,
72		uint8_t* pSrc, const int32_t kiSrcStride,
73		const int32_t kiSrcWidth, const int32_t kiSrcHeight)
74
75	0	{
76	0	uint8_t* pDstLine = pDst;
77	0	uint8_t* pSrcLine = pSrc;
78	0	const int32_t kiSrcStridex4 = kiSrcStride << 2;
79	0	const int32_t kiDstWidth = kiSrcWidth >> 2;
80	0	const int32_t kiDstHeight = kiSrcHeight >> 2;
81
82	0	for (int32_t j = 0; j < kiDstHeight; j ++) {
83	0	for (int32_t i = 0; i < kiDstWidth; i ++) {
84	0	const int32_t kiSrcX = i << 2;
85	0	const int32_t kiTempRow1 = (pSrcLine[kiSrcX] + pSrcLine[kiSrcX + 1] + 1) >> 1;
86	0	const int32_t kiTempRow2 = (pSrcLine[kiSrcX + kiSrcStride] + pSrcLine[kiSrcX + kiSrcStride + 1] + 1) >> 1;
87
88	0	pDstLine[i] = (uint8_t) ((kiTempRow1 + kiTempRow2 + 1) >> 1);
89	0	}
90	0	pDstLine += kiDstStride;
91	0	pSrcLine += kiSrcStridex4;
92	0	}
93	0	}
94
95		void DyadicBilinearOneThirdDownsampler_c (uint8_t* pDst, const int32_t kiDstStride,
96		uint8_t* pSrc, const int32_t kiSrcStride,
97		const int32_t kiSrcWidth, const int32_t kiDstHeight)
98
99	0	{
100	0	uint8_t* pDstLine = pDst;
101	0	uint8_t* pSrcLine = pSrc;
102	0	const int32_t kiSrcStridex3 = kiSrcStride * 3;
103	0	const int32_t kiDstWidth = kiSrcWidth / 3;
104
105	0	for (int32_t j = 0; j < kiDstHeight; j ++) {
106	0	for (int32_t i = 0; i < kiDstWidth; i ++) {
107	0	const int32_t kiSrcX = i * 3;
108	0	const int32_t kiTempRow1 = (pSrcLine[kiSrcX] + pSrcLine[kiSrcX + 1] + 1) >> 1;
109	0	const int32_t kiTempRow2 = (pSrcLine[kiSrcX + kiSrcStride] + pSrcLine[kiSrcX + kiSrcStride + 1] + 1) >> 1;
110
111	0	pDstLine[i] = (uint8_t) ((kiTempRow1 + kiTempRow2 + 1) >> 1);
112	0	}
113	0	pDstLine += kiDstStride;
114	0	pSrcLine += kiSrcStridex3;
115	0	}
116	0	}
117
118		void GeneralBilinearFastDownsampler_c (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
119		const int32_t kiDstHeight,
120	0	uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
121	0	const uint32_t kuiScaleBitWidth = 16, kuiScaleBitHeight = 15;
122	0	const uint32_t kuiScaleWidth = (1 << kuiScaleBitWidth), kuiScaleHeight = (1 << kuiScaleBitHeight);
123	0	int32_t fScalex = WELS_ROUND ((float)kiSrcWidth / (float)kiDstWidth * kuiScaleWidth);
124	0	int32_t fScaley = WELS_ROUND ((float)kiSrcHeight / (float)kiDstHeight * kuiScaleHeight);
125	0	uint32_t x;
126	0	int32_t iYInverse, iXInverse;
127
128	0	uint8_t* pByDst = pDst;
129	0	uint8_t* pByLineDst = pDst;
130
131	0	iYInverse = 1 << (kuiScaleBitHeight - 1);
132	0	for (int32_t i = 0; i < kiDstHeight - 1; i++) {
133	0	int32_t iYy = iYInverse >> kuiScaleBitHeight;
134	0	int32_t fv = iYInverse & (kuiScaleHeight - 1);
135
136	0	uint8_t* pBySrc = pSrc + iYy * kiSrcStride;
137
138	0	pByDst = pByLineDst;
139	0	iXInverse = 1 << (kuiScaleBitWidth - 1);
140	0	for (int32_t j = 0; j < kiDstWidth - 1; j++) {
141	0	int32_t iXx = iXInverse >> kuiScaleBitWidth;
142	0	int32_t iFu = iXInverse & (kuiScaleWidth - 1);
143
144	0	uint8_t* pByCurrent = pBySrc + iXx;
145	0	uint8_t a, b, c, d;
146
147	0	a = *pByCurrent;
148	0	b = * (pByCurrent + 1);
149	0	c = * (pByCurrent + kiSrcStride);
150	0	d = * (pByCurrent + kiSrcStride + 1);
151
152	0	x = (((uint32_t) (kuiScaleWidth - 1 - iFu)) * (kuiScaleHeight - 1 - fv) >> kuiScaleBitWidth) * a;
153	0	x += (((uint32_t) (iFu)) * (kuiScaleHeight - 1 - fv) >> kuiScaleBitWidth) * b;
154	0	x += (((uint32_t) (kuiScaleWidth - 1 - iFu)) * (fv) >> kuiScaleBitWidth) * c;
155	0	x += (((uint32_t) (iFu)) * (fv) >> kuiScaleBitWidth) * d;
156	0	x >>= (kuiScaleBitHeight - 1);
157	0	x += 1;
158	0	x >>= 1;
159		//x = (((__int64)(SCALE_BIG - 1 - iFu))*(SCALE_BIG - 1 - fv)*a + ((__int64)iFu)*(SCALE_BIG - 1 -fv)*b + ((__int64)(SCALE_BIG - 1 -iFu))*fv*c +
160		// ((__int64)iFu)*fv*d + (1 << (2*SCALE_BIT_BIG-1)) ) >> (2*SCALE_BIT_BIG);
161	0	x = WELS_CLAMP (x, 0, 255);
162	0	*pByDst++ = (uint8_t)x;
163
164	0	iXInverse += fScalex;
165	0	}
166	0	*pByDst = * (pBySrc + (iXInverse >> kuiScaleBitWidth));
167	0	pByLineDst += kiDstStride;
168	0	iYInverse += fScaley;
169	0	}
170
171		// last row special
172	0	{
173	0	int32_t iYy = iYInverse >> kuiScaleBitHeight;
174	0	uint8_t* pBySrc = pSrc + iYy * kiSrcStride;
175
176	0	pByDst = pByLineDst;
177	0	iXInverse = 1 << (kuiScaleBitWidth - 1);
178	0	for (int32_t j = 0; j < kiDstWidth; j++) {
179	0	int32_t iXx = iXInverse >> kuiScaleBitWidth;
180	0	*pByDst++ = * (pBySrc + iXx);
181
182	0	iXInverse += fScalex;
183	0	}
184	0	}
185	0	}
186
187		void GeneralBilinearAccurateDownsampler_c (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
188		const int32_t kiDstHeight,
189	0	uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
190	0	const int32_t kiScaleBit = 15;
191	0	const int32_t kiScale = (1 << kiScaleBit);
192	0	int32_t iScalex = WELS_ROUND ((float)kiSrcWidth / (float)kiDstWidth * kiScale);
193	0	int32_t iScaley = WELS_ROUND ((float)kiSrcHeight / (float)kiDstHeight * kiScale);
194	0	int64_t x;
195	0	int32_t iYInverse, iXInverse;
196
197	0	uint8_t* pByDst = pDst;
198	0	uint8_t* pByLineDst = pDst;
199
200	0	iYInverse = 1 << (kiScaleBit - 1);
201	0	for (int32_t i = 0; i < kiDstHeight - 1; i++) {
202	0	int32_t iYy = iYInverse >> kiScaleBit;
203	0	int32_t iFv = iYInverse & (kiScale - 1);
204
205	0	uint8_t* pBySrc = pSrc + iYy * kiSrcStride;
206
207	0	pByDst = pByLineDst;
208	0	iXInverse = 1 << (kiScaleBit - 1);
209	0	for (int32_t j = 0; j < kiDstWidth - 1; j++) {
210	0	int32_t iXx = iXInverse >> kiScaleBit;
211	0	int32_t iFu = iXInverse & (kiScale - 1);
212
213	0	uint8_t* pByCurrent = pBySrc + iXx;
214	0	uint8_t a, b, c, d;
215
216	0	a = *pByCurrent;
217	0	b = * (pByCurrent + 1);
218	0	c = * (pByCurrent + kiSrcStride);
219	0	d = * (pByCurrent + kiSrcStride + 1);
220
221	0	x = (((int64_t) (kiScale - 1 - iFu)) * (kiScale - 1 - iFv) * a + ((int64_t)iFu) * (kiScale - 1 - iFv) * b + ((int64_t) (
222	0	kiScale - 1 - iFu)) * iFv * c +
223	0	((int64_t)iFu) * iFv * d + (int64_t) (1 << (2 * kiScaleBit - 1))) >> (2 * kiScaleBit);
224	0	x = WELS_CLAMP (x, 0, 255);
225	0	*pByDst++ = (uint8_t)x;
226
227	0	iXInverse += iScalex;
228	0	}
229	0	*pByDst = * (pBySrc + (iXInverse >> kiScaleBit));
230	0	pByLineDst += kiDstStride;
231	0	iYInverse += iScaley;
232	0	}
233
234		// last row special
235	0	{
236	0	int32_t iYy = iYInverse >> kiScaleBit;
237	0	uint8_t* pBySrc = pSrc + iYy * kiSrcStride;
238
239	0	pByDst = pByLineDst;
240	0	iXInverse = 1 << (kiScaleBit - 1);
241	0	for (int32_t j = 0; j < kiDstWidth; j++) {
242	0	int32_t iXx = iXInverse >> kiScaleBit;
243	0	*pByDst++ = * (pBySrc + iXx);
244
245	0	iXInverse += iScalex;
246	0	}
247	0	}
248	0	}
249
250		#if defined(X86_ASM) || defined(HAVE_NEON) || (defined(HAVE_NEON_AARCH64) && defined(__aarch64__))
251		static void GeneralBilinearDownsamplerWrap (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
252		const int32_t kiDstHeight,
253		uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight,
254		const int32_t kiScaleBitWidth, const int32_t kiScaleBitHeight,
255		void (*func) (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth, int32_t iDstHeight,
256		uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX, uint32_t uiScaleY)) {
257		const uint32_t kuiScaleWidth = (1 << kiScaleBitWidth), kuiScaleHeight = (1 << kiScaleBitHeight);
258
259		uint32_t uiScalex = WELS_ROUND ((float)kiSrcWidth / (float)kiDstWidth * kuiScaleWidth);
260		uint32_t uiScaley = WELS_ROUND ((float)kiSrcHeight / (float)kiDstHeight * kuiScaleHeight);
261
262		func (pDst, kiDstStride, kiDstWidth, kiDstHeight, pSrc, kiSrcStride, uiScalex, uiScaley);
263		}
264
265		#define DEFINE_GENERAL_BILINEAR_FAST_DOWNSAMPLER_WRAP(suffix) \
266		void GeneralBilinearFastDownsamplerWrap_ ## suffix ( \
267		uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight, \
268		uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) { \
269		GeneralBilinearDownsamplerWrap (pDst, kiDstStride, kiDstWidth, kiDstHeight, \
270		pSrc, kiSrcStride, kiSrcWidth, kiSrcHeight, 16, 15, GeneralBilinearFastDownsampler_ ## suffix); \
271		}
272
273		#define DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP(suffix) \
274		void GeneralBilinearAccurateDownsamplerWrap_ ## suffix ( \
275		uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight, \
276		uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) { \
277		GeneralBilinearDownsamplerWrap (pDst, kiDstStride, kiDstWidth, kiDstHeight, \
278		pSrc, kiSrcStride, kiSrcWidth, kiSrcHeight, 15, 15, GeneralBilinearAccurateDownsampler_ ## suffix); \
279		}
280		#endif
281
282		#ifdef X86_ASM
283		DEFINE_GENERAL_BILINEAR_FAST_DOWNSAMPLER_WRAP (sse2)
284		DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (sse2)
285		DEFINE_GENERAL_BILINEAR_FAST_DOWNSAMPLER_WRAP (ssse3)
286		DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (sse41)
287		#ifdef HAVE_AVX2
288		DEFINE_GENERAL_BILINEAR_FAST_DOWNSAMPLER_WRAP (avx2)
289		DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (avx2)
290		#endif
291		#endif //X86_ASM
292
293		#ifdef HAVE_NEON
294		DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (neon)
295		#endif
296
297		#if defined(HAVE_NEON_AARCH64) && defined(__aarch64__)
298		DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (AArch64_neon)
299		#endif
300		WELSVP_NAMESPACE_END