/src/vvdec/source/Lib/FilmGrain/FilmGrainImpl.cpp

Source
/* -----------------------------------------------------------------------------
The copyright in this software is being made available under the Clear BSD
License, included below. No patent rights, trademark rights and/or
other Intellectual Property Rights other than the copyrights concerning
the Software are granted under this license.

The Clear BSD License

Copyright (c) 2018-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVdeC Authors.
All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted (subject to the limitations in the disclaimer below) provided that
the following conditions are met:

     * Redistributions of source code must retain the above copyright notice,
     this list of conditions and the following disclaimer.

     * Redistributions in binary form must reproduce the above copyright
     notice, this list of conditions and the following disclaimer in the
     documentation and/or other materials provided with the distribution.

     * Neither the name of the copyright holder nor the names of its
     contributors may be used to endorse or promote products derived from this
     software without specific prior written permission.

NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.


------------------------------------------------------------------------------------------- */

/* This file is based on VFGS, available on
 * https://github.com/InterDigitalInc/VersatileFilmGrain
 *
 * VFGS implements film grain synthesis as a hardware model: it simulates the
 * output of a cost-effective hardware implementation in a video display
 * pipeline. Also, the C code is split into "fw" (firmware) and "hw" (hardware)
 * parts, and as self-explanatory as possible. See VFGS github repository for
 * more details.
 *
 * The VFGS github repository also contains other tools to experiment with film
 * grain synthesis (e.g. a graphical display and tuning tool for FGC SEI
 * message).
 */

#include "FilmGrainImpl.h"

#include <cstring>   // memcpy
#include <algorithm>

#include <CommonDef.h>

namespace vvdec
{

/** Derive the luma (Y) sign flip and x/y pattern offsets from a (random) number
 *
 * Bit fields are designed to minimize overlaps across color channels, to
 * decorrelate them as much as possible.
 *
 * 10-bit for 12 or 13 bins makes a reasonably uniform distribution (1.2%
 * probability error).
 *
 * If 8-bit is requested to further simplify the multiplier, at the cost of less
 * uniform probability, the following bitfields can be considered:
 *
 * Y: sign = rnd[31], x = (rnd[7:0]*13 >> 8)*4,   y = (rnd[21:14]*12 >> 8)*4
 * U: sign = rnd[0],  x = (rnd[17:10]*13 >> 8)*2, y = (rnd[31:24]*12 >> 8)*2
 * V: sign = rnd[13], x = (rnd[27:20]*13 >> 8)*2, y = (rnd[11:4]*12 >> 8)*2
 *
 * Note: to fully support cross-component correlation within patterns, we would
 * need to align luma/chroma offsets.
 *
 * @param val  32-bit value the sign and both offsets are extracted from
 * @param s    receives -1 when val[31] is set, +1 otherwise
 * @param x    receives the horizontal offset (val[9:0] mapped to 13 bins, times 4: 0..48)
 * @param y    receives the vertical offset (val[23:14] mapped to 12 bins, times 4: 0..44)
 */
void FilmGrainImpl::get_offset_y( uint32_t val, int* s, uint8_t* x, uint8_t* y )
{
  const uint32_t xField = val & 0x3ff;             // val[9:0]
  const uint32_t yField = ( val >> 14 ) & 0x3ff;   // val[23:14]

  *s = ( val >> 31 ) ? -1 : 1;           // only val[31] survives the shift
  *x = ( ( xField * 13 ) >> 10 ) * 4;    // 13 = 8 + 4 + 1 (two adders)
  *y = ( ( yField * 12 ) >> 10 ) * 4;    // 12 = 8 + 4 (one adder)
                                         // Note: could shift 9 and * 2, to make a multiple of 2 and make use of all
                                         // pattern samples (when using overlap).
}

/** Derive the U (first chroma) sign flip and x/y pattern offsets from a (random) number
 *
 * Same scheme as get_offset_y(), with different bit fields and with the
 * offset step reduced from 4 to 4 / csubx (horizontal) and 4 / csuby (vertical).
 *
 * @param val  32-bit value the sign and both offsets are extracted from
 * @param s    receives -1 when val[2] is set, +1 otherwise
 * @param x    receives the horizontal offset, derived from val[19:10] (13 bins)
 * @param y    receives the vertical offset, derived from a 10-bit field whose low
 *             8 bits are val[31:24] and whose top 2 bits are val[1:0] (12 bins)
 */
void FilmGrainImpl::get_offset_u( uint32_t val, int* s, uint8_t* x, uint8_t* y ) const
{
  const uint32_t xField = ( val >> 10 ) & 0x3ff;
  // The vertical field wraps around the word: val[1:0] become bits 9:8, val[31:24] bits 7:0.
  const uint32_t yField = ( ( val << 8 ) & 0x300 ) | ( ( val >> 24 ) & 0x0ff );

  *s = ( val & 0x4 ) ? -1 : 1;
  *x = ( ( xField * 13 ) >> 10 ) * ( 4 / csubx );
  *y = ( ( yField * 12 ) >> 10 ) * ( 4 / csuby );
}

/** Derive the V (second chroma) sign flip and x/y pattern offsets from a (random) number
 *
 * Same scheme as get_offset_y(), with different bit fields and with the
 * offset step reduced from 4 to 4 / csubx (horizontal) and 4 / csuby (vertical).
 *
 * @param val  32-bit value the sign and both offsets are extracted from
 * @param s    receives -1 when val[15] is set, +1 otherwise
 * @param x    receives the horizontal offset, derived from val[29:20] (13 bins)
 * @param y    receives the vertical offset, derived from val[13:4] (12 bins)
 */
void FilmGrainImpl::get_offset_v( uint32_t val, int* s, uint8_t* x, uint8_t* y ) const
{
  const uint32_t xField = ( val >> 20 ) & 0x3ff;
  const uint32_t yField = ( val >> 4 ) & 0x3ff;

  *s = ( val & 0x8000 ) ? -1 : 1;
  *x = ( ( xField * 13 ) >> 10 ) * ( 4 / csubx );
  *y = ( ( yField * 12 ) >> 10 ) * ( 4 / csuby );
}

/** Generate and apply grain for one 16-luma-sample-wide block on one line of component c
 *
 * Steps: skip odd lines of vertically subsampled components, sanity-check the
 * configuration, pick the vertical overlap weights, derive the pattern offsets
 * and sign for the current and the upper block row, then hand over to
 * make_grain_pattern() and scale_and_output().
 *
 * @param I       line buffer of component c (8-bit samples when bs == 0, 16-bit otherwise)
 * @param c       component index (0 selects the luma paths, 1 the U offsets, anything else the V offsets)
 * @param x       horizontal block position in luma samples
 * @param y       line number in luma samples
 * @param width   line width, forwarded to scale_and_output()
 * @param rnd     random value for the current block row
 * @param rnd_up  random value for the block row above (used for overlap)
 * @param grain   grain pipeline buffer, forwarded to the two stages
 * @param scale   scale pipeline buffer, forwarded to the two stages
 */
void FilmGrainImpl::add_grain_block( void* I, int c, int x, int y, int width, uint32_t rnd, uint32_t rnd_up, int16_t grain[3][32], uint8_t scale[3][32] ) const
{
  const bool isChroma = c != 0;
  const int  subx     = isChroma ? csubx : 1;
  const int  suby     = isChroma ? csuby : 1;

  // Vertically subsampled components have no sample row on odd luma lines.
  if( suby > 1 && ( y & 1 ) )
  {
    return;
  }

  // CHECK is declared outside this file; presumably it reports an error when its condition is true.
  CHECK( x & 15, "x not a multiple of 16" );
  CHECK( width <= 128, "wrong width" );
  CHECK( bs != 0 && bs != 2, "wrong bs" );
  CHECK( scale_shift + bs < 8 || scale_shift + bs > 13, "wrong scale_shift" );

  // TODO: assert subx, suby, Y/C min/max, max pLUT values, etc

  const int rowLine = y & 0xf;   // line index inside the current 16-line block row

  // Overlapping coefficients: only the first two lines of a block row blend with the
  // row above, and never in the topmost block row (y <= 15).
  uint8_t oc1 = 0;   // weight of the current row
  uint8_t oc2 = 0;   // weight of the upper row
  if( y > 15 )
  {
    if( rowLine == 0 )        // first line of overlap
    {
      oc1 = ( suby > 1 ) ? 20 : 12;
      oc2 = ( suby > 1 ) ? 20 : 24;
    }
    else if( rowLine == 1 )   // second line of overlap
    {
      oc1 = 24;
      oc2 = 12;
    }
  }

  // Derive block offsets + sign, for the current row and for the upper row (overlap)
  int     s, s_up;        // random sign flips
  uint8_t ox, oy;         // random offset (current)
  uint8_t ox_up, oy_up;   // random offset (upper row)
  switch( c )
  {
  case 0:
    get_offset_y( rnd, &s, &ox, &oy );
    get_offset_y( rnd_up, &s_up, &ox_up, &oy_up );
    break;
  case 1:
    get_offset_u( rnd, &s, &ox, &oy );
    get_offset_u( rnd_up, &s_up, &ox_up, &oy_up );
    break;
  default:
    get_offset_v( rnd, &s, &ox, &oy );
    get_offset_v( rnd_up, &s_up, &ox_up, &oy_up );
    break;
  }

  // Advance to the pattern line of this picture line; the upper row's pattern is
  // read one block height (16 luma lines) further down.
  oy += rowLine / suby;
  oy_up += ( 16 + rowLine ) / suby;

  // Make grain pattern
  make_grain_pattern( I, c, x, subx, oc1, oc2, ox, ox_up, oy, oy_up, s, s_up, grain, scale );

  // Scale & output
  scale_and_output( I, c, x, subx, width, grain, scale );
}

/** Fill the "current block" half of the grain/scale pipeline for component c
 *
 * For each of the 16 / subx samples of the block, the input intensity (reduced
 * by bs bits for 16-bit input) selects a pattern through pLUT and a scale value
 * through sLUT. The signed pattern sample is optionally blended with the
 * sample of the upper block row (vertical overlap) and stored at
 * grain[c][16 / subx + i]; the scale value goes to scale[c][16 / subx + i].
 *
 * @param I             input line of component c (uint8_t samples when bs == 0, uint16_t otherwise)
 * @param c             component index; 0 uses pattern set 0, any other value uses pattern set 1
 * @param x             block position in luma samples
 * @param subx          horizontal subsampling factor of this component
 * @param oc1, oc2      overlap weights for the current / upper row; oc1 == 0 disables the overlap
 * @param ox, oy        pattern offsets of the current row
 * @param ox_up, oy_up  pattern offsets of the upper row (only read when oc1 != 0)
 * @param s, s_up       sign multipliers for the current / upper row
 * @param grain         grain pipeline buffer (written)
 * @param scale         scale pipeline buffer (written)
 */
void FilmGrainImpl::make_grain_pattern( const void* I,
                                        int         c,
                                        int         x,
                                        int         subx,
                                        uint8_t     oc1,
                                        uint8_t     oc2,
                                        uint8_t     ox,
                                        uint8_t     ox_up,
                                        uint8_t     oy,
                                        uint8_t     oy_up,
                                        int         s,
                                        int         s_up,
                                        int16_t     grain[3][32],
                                        uint8_t     scale[3][32] ) const
{
  const uint8_t*  src8  = static_cast<const uint8_t*>( I );
  const uint16_t* src16 = static_cast<const uint16_t*>( I );

  const int blkW    = 16 / subx;   // samples per block for this component
  const int srcBase = x / subx;    // first input sample of the block
  const int patSet  = c ? 1 : 0;   // both chroma components share pattern set 1

  for( int k = 0; k < blkW; ++k )
  {
    const uint8_t intensity = bs ? src16[srcBase + k] >> bs : src8[srcBase + k];
    const uint8_t patIdx    = pLUT[c][intensity] >> 4;   // pattern index (integer part)

    // Pattern sample (from current pattern index), sign applied by multiplication.
    // We could consider just XORing the sign bit
    int cur = pattern[patSet][patIdx][oy][ox + k] * s;
#if PATTERN_INTERPOLATION
    // Pattern index fractional part (interpolate with next) -- could restrict to less bits (e.g. 2)
    const uint8_t frac = pLUT[c][intensity] & 15;
    // Next-pattern sample (from pattern index+1)
    // But there are equivalent hw tricks, e.g. storing values as sign + amplitude instead of two's complement
    int nxt = pattern[patSet][patIdx + 1][oy][ox + k] * s;
#endif

    if( oc1 )   // overlap: weighted blend with the upper row, rescaled through round( ., 5 )
    {
      cur = round( cur * oc1 + pattern[patSet][patIdx][oy_up][ox_up + k] * oc2 * s_up, 5 );
#if PATTERN_INTERPOLATION
      nxt = round( nxt * oc1 + pattern[patSet][patIdx + 1][oy_up][ox_up + k] * oc2 * s_up, 5 );
#endif
    }

#if PATTERN_INTERPOLATION
    // Pattern interpolation: cur is current, nxt is next, frac is interpolation coefficient
    grain[c][blkW + k] = round( cur * ( 16 - frac ) + nxt * frac, 4 );
#else
    grain[c][blkW + k] = cur;
#endif
    // Scale sign already integrated above because of overlap
    scale[c][blkW + k] = sLUT[c][intensity];
  }
}

/** Deblock, scale and add the pipelined grain to the picture line
 *
 * The grain/scale buffers form a two-stage pipeline per component: indices
 * [0, 16 / subx) hold the previous block and [16 / subx, 2 * 16 / subx) hold
 * the block that was just generated. Each call
 *   1. smooths the two samples on either side of the boundary between both
 *      blocks (horizontal deblock),
 *   2. scales the previous block's grain, adds it to the picture samples at
 *      x - 16 and clamps the result to [I_min << bs, I_max << bs],
 *   3. shifts the current block into the "previous" slot.
 * For the last block of the line (x + 16 >= width) a second pass flushes the
 * block that has just been shifted, without deblocking or shifting again.
 *
 * @param I      line buffer of component c (uint8_t samples when bs == 0, uint16_t otherwise)
 * @param c      component index; 0 uses Y_min / Y_max, anything else C_min / C_max
 * @param x      position of the current block in luma samples
 * @param subx   horizontal subsampling factor of this component
 * @param width  line width in luma samples
 * @param grain  grain pipeline buffer (read and updated)
 * @param scale  scale pipeline buffer (read and updated)
 */
void FilmGrainImpl::scale_and_output( void* I, int c, int x, int subx, int width, int16_t grain[3][32], uint8_t scale[3][32] ) const
{
  uint8_t*  I8  = static_cast<uint8_t*>( I );
  uint16_t* I16 = static_cast<uint16_t*>( I );

  const uint8_t I_min = c ? C_min : Y_min;
  const uint8_t I_max = c ? C_max : Y_max;

  const int blkW = 16 / subx;   // samples per block for this component

  int flush = 0;
  do
  {
    if( x > 0 )   // the very first block of a line has no predecessor to output
    {
      if( !flush )
      {
        // Horizontal deblock (across previous block)
        const int16_t l1 = grain[c][blkW - 2];
        const int16_t l0 = grain[c][blkW - 1];
        const int16_t r0 = grain[c][blkW + 0];
        const int16_t r1 = grain[c][blkW + 1];

        grain[c][blkW - 1] = round( l1 + 3 * l0 + r0, 2 );
        grain[c][blkW + 0] = round( l0 + 3 * r0 + r1, 2 );
      }

      const int outBase = ( x - 16 ) / subx;
      for( int i = 0; i < blkW; i++ )
      {
        // Output previous block (or flush current)
        const int32_t g = round( scale[c][i] * grain[c][i], scale_shift );
        if( bs )
        {
          I16[outBase + i] = std::max<int32_t>( I_min << bs, std::min<int32_t>( I_max << bs, I16[outBase + i] + g ) );
        }
        else
        {
          I8[outBase + i] = std::max<int32_t>( I_min, std::min<int32_t>( I_max, I8[outBase + i] + g ) );
        }
      }
    }

    // Shift pipeline: move the current block into the "previous" slot.
    // The shift must cover the real block width: a fixed 8-sample chroma shift
    // is only right for subx == 2 and leaves stale grain in the output slot
    // when chroma is not horizontally subsampled (subx == 1).
    if( !flush )
    {
      for( int i = 0; i < blkW; i++ )
      {
        grain[c][i] = grain[c][i + blkW];
        scale[c][i] = scale[c][i + blkW];
      }
    }

    if( x + 16 >= width )
    {
      // Last block of the line: run one more pass to flush it.
      flush++;
      x += 16;
    }
  } while( flush == 1 );
}

/* Public interface ***********************************************************/

/** Load one luma grain pattern
 *
 * @param index  pattern slot, 0..7
 * @param P      source buffer; 64 * 64 bytes are copied from it into pattern[0][index]
 */
void FilmGrainImpl::set_luma_pattern( int index, int8_t* P )
{
  CHECK( index < 0 || index >= 8, "luma pattern index out of bounds" );

  const int patternBytes = 64 * 64;
  std::memcpy( pattern[0][index], P, patternBytes );
}

/** Load one chroma grain pattern, row by row
 *
 * Copies 64 / csuby rows of 64 / csubx bytes each into pattern[1][index].
 * Consecutive source rows are taken 64 / csuby bytes apart in P.
 * NOTE(review): source stride and row length only coincide when csubx == csuby;
 * confirm the layout of P that callers provide for other subsampling modes.
 *
 * @param index  pattern slot, 0..7
 * @param P      source buffer holding the pattern rows
 */
void FilmGrainImpl::set_chroma_pattern( int index, int8_t* P )
{
  CHECK( index < 0 || index >= 8, "chroma pattern index out of bounds" );

  const int rowCount  = 64 / csuby;
  const int rowBytes  = 64 / csubx;
  const int srcStride = 64 / csuby;

  for( int row = 0; row < rowCount; ++row )
  {
    std::memcpy( pattern[1][index][row], P + srcStride * row, rowBytes );
  }
}

/** Load the scale look-up table of one component
 *
 * @param c    component index, 0..2
 * @param lut  source table; 256 bytes are copied from it into sLUT[c]
 */
void FilmGrainImpl::set_scale_lut( int c, uint8_t lut[] )
{
  CHECK( c < 0 || c >= 3, "scale lut idx out of bounds" );

  const int lutBytes = 256;
  std::memcpy( sLUT[c], lut, lutBytes );
}

/** Load the pattern look-up table of one component
 *
 * @param c     component index, 0..2
 * @param lut   source table; 256 bytes are copied from it into pLUT[c]
 * @param all0  flag stored as-is in allZero[c]
 */
void FilmGrainImpl::set_pattern_lut( int c, uint8_t lut[], bool all0 )
{
  CHECK( c < 0 || c >= 3, "pattern lut idx out of bounds" );

  const int lutBytes = 256;
  std::memcpy( pLUT[c], lut, lutBytes );
  allZero[c] = all0;
}

/** Set the grain scale shift
 *
 * The stored value is shift + 6 reduced by the current bs, so that
 * scale_shift + bs equals shift + 6.
 *
 * @param shift  requested shift, 2..7
 */
void FilmGrainImpl::set_scale_shift( int shift )
{
  CHECK( shift < 2 || shift >= 8, "scale shift out of range" );

  const int shiftAt8Bit = shift + 6;
  scale_shift = shiftAt8Bit - bs;
}

/** Set the sample bit depth
 *
 * Stores bs = depth - 8 and, when bs switches between 0 and 2, moves
 * scale_shift by 2 in the opposite direction so that scale_shift + bs keeps
 * its value.
 *
 * @param depth  bit depth, 8 or 10
 */
void FilmGrainImpl::set_depth( int depth )
{
  CHECK( depth != 8 && depth != 10, "only bit depth 8 and 10 supported." );

  // Both tests read bs before it is updated below, so at most one of them can hold.
  const bool switchTo10Bit = ( bs == 0 ) && ( depth > 8 );
  const bool switchTo8Bit  = ( bs == 2 ) && ( depth == 8 );

  if( switchTo10Bit )
  {
    scale_shift -= 2;
  }
  else if( switchTo8Bit )
  {
    scale_shift += 2;
  }

  bs = depth - 8;
}

void FilmGrainImpl::set_chroma_subsampling( int subx, int suby )
{
  CHECK( subx != 1 && subx != 2, "chroma subsampling should be 1 or 2" );
  CHECK( suby != 1 && suby != 2, "chroma subsampling should be 1 or 2" );
  csubx = subx;
  csuby = suby;
}

/** Construct with all grain patterns and both look-up tables zero-filled. */
FilmGrainImpl::FilmGrainImpl()
{
  std::memset( pLUT, 0, sizeof pLUT );
  std::memset( sLUT, 0, sizeof sLUT );
  std::memset( pattern, 0, sizeof pattern );
}

}   // namespace vvdec

Coverage Report

Created: 2026-04-01 07:49

Line	Count	Source
1		/* -----------------------------------------------------------------------------
2		The copyright in this software is being made available under the Clear BSD
3		License, included below. No patent rights, trademark rights and/or
4		other Intellectual Property Rights other than the copyrights concerning
5		the Software are granted under this license.
6
7		The Clear BSD License
8
9		Copyright (c) 2018-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVdeC Authors.
10		All rights reserved.
11
12		Redistribution and use in source and binary forms, with or without modification,
13		are permitted (subject to the limitations in the disclaimer below) provided that
14		the following conditions are met:
15
16		* Redistributions of source code must retain the above copyright notice,
17		this list of conditions and the following disclaimer.
18
19		* Redistributions in binary form must reproduce the above copyright
20		notice, this list of conditions and the following disclaimer in the
21		documentation and/or other materials provided with the distribution.
22
23		* Neither the name of the copyright holder nor the names of its
24		contributors may be used to endorse or promote products derived from this
25		software without specific prior written permission.
26
27		NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
28		THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
29		CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
30		LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
31		PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
32		CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
33		EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
34		PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
35		BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
36		IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37		ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38		POSSIBILITY OF SUCH DAMAGE.
39
40
41		------------------------------------------------------------------------------------------- */
42
43		/* This file is based on VFGS, available on
44		* https://github.com/InterDigitalInc/VersatileFilmGrain
45		*
46		* VFGS implements film grain synthesis as a hardware model: it simulates the
47		* output of a cost-effective hardware implementation in a video display
48		* pipeline. Also, the C code is split into "fw" (firmware) and "hw" (hardware)
49		* parts, and as self-explanatory as possible. See VFGS github repository for
50		* more details.
51		*
52		* The VFGS github repository also contains other tools to experiment with film
53		* grain synthesis (e.g. a graphical display and tuning tool for FGC SEI
54		* message).
55		*/
56
57		#include "FilmGrainImpl.h"
58
59		#include <cstring> // memcpy
60		#include <algorithm>
61
62		#include <CommonDef.h>
63
64		namespace vvdec
65		{
66
67		/** Derive Y x/y offsets from (random) number
68		*
69		* Bit fields are designed to minimize overlaps across color channels, to
70		* decorrelate them as much as possible.
71		*
72		* 10-bit for 12 or 13 bins makes a reasonably uniform distribution (1.2%
73		* probability error).
74		*
75		* If 8-bit is requested to further simplify the multiplier, at the cost of less
76		* uniform probability, the following bitfields can be considered:
77		*
78		* Y: sign = rnd[31], x = (rnd[7:0]*13 >> 8)*4, y = (rnd[21:14]*12 >> 8)*4
79		* U: sign = rnd[0], x = (rnd[17:10]*13 >> 8)*2, y = (rnd[31:24]*12 >> 8)*2
80		* V: sign = rnd[13], x = (rnd[27:20]*13 >> 8)*2, y = (rnd[11:4]*12 >> 8)*2
81		*
82		* Note: to fully support cross-component correlation within patterns, we would
83		* need to align luma/chroma offsets.
84		*/
85		void FilmGrainImpl::get_offset_y( uint32_t val, int* s, uint8_t* x, uint8_t* y )
86	0	{
87	0	uint32_t bf; // bit field
88
89	0	*s = ( ( val >> 31 ) & 1 ) ? -1 : 1;
90
91	0	bf = ( val >> 0 ) & 0x3ff;
92	0	*x = ( ( bf * 13 ) >> 10 ) * 4; // 13 = 8 + 4 + 1 (two adders)
93
94	0	bf = ( val >> 14 ) & 0x3ff;
95	0	*y = ( ( bf * 12 ) >> 10 ) * 4; // 12 = 8 + 4 (one adder)
96		// Note: could shift 9 and * 2, to make a multiple of 2 and make use of all
97		// pattern samples (when using overlap).
98	0	}
99
100		void FilmGrainImpl::get_offset_u( uint32_t val, int* s, uint8_t* x, uint8_t* y ) const
101	0	{
102	0	uint32_t bf; // bit field
103
104	0	*s = ( ( val >> 2 ) & 1 ) ? -1 : 1;
105
106	0	bf = ( val >> 10 ) & 0x3ff;
107	0	*x = ( ( bf * 13 ) >> 10 ) * ( 4 / csubx );
108
109	0	bf = ( ( val >> 24 ) & 0x0ff ) | ( ( val << 8 ) & 0x300 );
110	0	*y = ( ( bf * 12 ) >> 10 ) * ( 4 / csuby );
111	0	}
112
113		void FilmGrainImpl::get_offset_v( uint32_t val, int* s, uint8_t* x, uint8_t* y ) const
114	0	{
115	0	uint32_t bf; // bit field
116
117	0	*s = ( ( val >> 15 ) & 1 ) ? -1 : 1;
118
119	0	bf = ( val >> 20 ) & 0x3ff;
120	0	*x = ( ( bf * 13 ) >> 10 ) * ( 4 / csubx );
121
122	0	bf = ( val >> 4 ) & 0x3ff;
123	0	*y = ( ( bf * 12 ) >> 10 ) * ( 4 / csuby );
124	0	}
125
126		void FilmGrainImpl::add_grain_block( void* I, int c, int x, int y, int width, uint32_t rnd, uint32_t rnd_up, int16_t grain[3][32], uint8_t scale[3][32] ) const
127	0	{
128	0	const int subx = c ? csubx : 1;
129	0	const int suby = c ? csuby : 1;
130
131	0	if( ( y & 1 ) && suby > 1 )
132	0	{
133	0	return;
134	0	}
135
136	0	CHECK( x & 15, "x not a multiple of 16" );
137	0	CHECK( width <= 128, "wrong width" );
138	0	CHECK( bs != 0 && bs != 2, "wrong bs" );
139	0	CHECK( scale_shift + bs < 8 || scale_shift + bs > 13, "wrong scale_shift" );
140
141		// TODO: assert subx, suby, Y/C min/max, max pLUT values, etc
142
143	0	const int j = y & 0xf;
144
145	0	uint8_t oc1, oc2; // overlapping coefficients
146	0	if( y > 15 && j == 0 ) // first line of overlap
147	0	{
148	0	oc1 = ( suby > 1 ) ? 20 : 12; // current
149	0	oc2 = ( suby > 1 ) ? 20 : 24; // upper
150	0	}
151	0	else if( y > 15 && j == 1 ) // second line of overlap
152	0	{
153	0	oc1 = 24;
154	0	oc2 = 12;
155	0	}
156	0	else
157	0	{
158	0	oc1 = oc2 = 0;
159	0	}
160
161		// Derive block offsets + sign
162	0	int s; // random sign flip (current)
163	0	uint8_t ox, oy; // random offset (current)
164	0	if( c == 0 )
165	0	{
166	0	get_offset_y( rnd, &s, &ox, &oy );
167	0	}
168	0	else if( c == 1 )
169	0	{
170	0	get_offset_u( rnd, &s, &ox, &oy );
171	0	}
172	0	else
173	0	{
174	0	get_offset_v( rnd, &s, &ox, &oy );
175	0	}
176	0	oy += j / suby;
177
178		// Same for upper block (overlap)
179	0	int s_up; // random sign flip (upper row)
180	0	uint8_t ox_up, oy_up; // random offset (upper row)
181	0	if( c == 0 )
182	0	{
183	0	get_offset_y( rnd_up, &s_up, &ox_up, &oy_up );
184	0	}
185	0	else if( c == 1 )
186	0	{
187	0	get_offset_u( rnd_up, &s_up, &ox_up, &oy_up );
188	0	}
189	0	else
190	0	{
191	0	get_offset_v( rnd_up, &s_up, &ox_up, &oy_up );
192	0	}
193	0	oy_up += ( 16 + j ) / suby;
194
195		// Make grain pattern
196	0	make_grain_pattern( I, c, x, subx, oc1, oc2, ox, ox_up, oy, oy_up, s, s_up, grain, scale );
197
198		// Scale & output
199	0	scale_and_output( I, c, x, subx, width, grain, scale );
200	0	}
201
202		void FilmGrainImpl::make_grain_pattern( const void* I,
203		int c,
204		int x,
205		int subx,
206		uint8_t oc1,
207		uint8_t oc2,
208		uint8_t ox,
209		uint8_t ox_up,
210		uint8_t oy,
211		uint8_t oy_up,
212		int s,
213		int s_up,
214		int16_t grain[3][32],
215		uint8_t scale[3][32] ) const
216	0	{
217	0	const uint8_t* I8 = (const uint8_t*) I;
218	0	const uint16_t* I16 = (const uint16_t*) I;
219	0	{
220	0	for( int i = 0; i < 16 / subx; i++ )
221	0	{
222	0	uint8_t intensity = bs ? I16[x / subx + i] >> bs : I8[x / subx + i];
223	0	uint8_t pi = pLUT[c][intensity] >> 4; // pattern index (integer part)
224	0	int P = pattern[c ? 1 : 0][pi][oy][ox + i] * s; // Pattern sample (from current pattern index)
225		// We could consider just XORing the sign bit
226		#if PATTERN_INTERPOLATION
227		uint8_t pf = pLUT[c][intensity] & 15; // pattern index fractional part (interpolate with next) -- could restrict to less bits (e.g. 2)
228		int Pn =
229		pattern[c ? 1 : 0][pi + 1][oy][ox + i] * s; // Next-pattern sample (from pattern index+1)
230		// But there are equivalent hw tricks, e.g. storing values as sign + amplitude instead of two's complement
231		#endif
232
233	0	if( oc1 ) // overlap
234	0	{
235	0	P = round( P * oc1 + pattern[c ? 1 : 0][pi][oy_up][ox_up + i] * oc2 * s_up, 5 );
236		#if PATTERN_INTERPOLATION
237		Pn = round( Pn * oc1 + pattern[c ? 1 : 0][pi + 1][oy_up][ox_up + i] * oc2 * s_up, 5 );
238		#endif
239	0	}
240		#if PATTERN_INTERPOLATION
241		// Pattern interpolation: P is current, Pn is next, pf is interpolation coefficient
242		grain[c][16 / subx + i] = round( P * ( 16 - pf ) + Pn * pf, 4 );
243		#else
244	0	grain[c][16 / subx + i] = P;
245	0	#endif
246		// Scale sign already integrated above because of overlap
247	0	scale[c][16 / subx + i] = sLUT[c][intensity];
248	0	}
249	0	}
250	0	}
251
252		void FilmGrainImpl::scale_and_output( void* I, int c, int x, int subx, int width, int16_t grain[3][32], uint8_t scale[3][32] ) const
253	0	{
254	0	uint8_t* I8 = (uint8_t*) I;
255	0	uint16_t* I16 = (uint16_t*) I;
256
257	0	const uint8_t I_min = c ? C_min : Y_min;
258	0	const uint8_t I_max = c ? C_max : Y_max;
259
260	0	int flush = 0;
261	0	do
262	0	{
263	0	if( x > 0 )
264	0	{
265	0	if( !flush )
266	0	{
267		// Horizontal deblock (across previous block)
268	0	int16_t l1, l0, r0, r1;
269
270	0	l1 = grain[c][16 / subx - 2];
271	0	l0 = grain[c][16 / subx - 1];
272	0	r0 = grain[c][16 / subx + 0];
273	0	r1 = grain[c][16 / subx + 1];
274
275	0	grain[c][16 / subx - 1] = round( l1 + 3 * l0 + r0, 2 );
276	0	grain[c][16 / subx + 0] = round( l0 + 3 * r0 + r1, 2 );
277	0	}
278	0	{
279	0	for( int i = 0; i < 16 / subx; i++ )
280	0	{
281		// Output previous block (or flush current)
282	0	int32_t g = round( scale[c][i] * (int16_t) grain[c][i], scale_shift );
283	0	if( bs )
284	0	{
285	0	I16[( x - 16 ) / subx + i] = std::max<int32_t>( I_min << bs, std::min<int32_t>( I_max << bs, I16[( x - 16 ) / subx + i] + g ) );
286	0	}
287	0	else
288	0	{
289	0	I8[( x - 16 ) / subx + i] = std::max<int32_t>( I_min, std::min<int32_t>( I_max, I8[( x - 16 ) / subx + i] + g ) );
290	0	}
291	0	}
292	0	}
293	0	}
294
295		// Shift pipeline
296	0	if( !flush )
297	0	{
298	0	if( c == 0 )
299	0	{
300	0	for( int i = 0; i < 16; i++ )
301	0	{
302	0	grain[0][i] = grain[0][i + 16];
303	0	scale[0][i] = scale[0][i + 16];
304	0	}
305	0	}
306	0	else
307	0	{
308	0	for( int i = 0; i < 8; i++ )
309	0	{
310	0	grain[c][i] = grain[c][i + 8];
311	0	scale[c][i] = scale[c][i + 8];
312	0	}
313	0	}
314	0	}
315
316	0	if( x + 16 >= width )
317	0	{
318	0	flush++;
319	0	x += 16;
320	0	}
321	0	} while( flush == 1 );
322	0	}
323
324		/* Public interface ***********************************************************/
325
326		void FilmGrainImpl::set_luma_pattern( int index, int8_t* P )
327	0	{
328	0	CHECK( index < 0 || index >= 8, "luma pattern index out of bounds" );
329	0	memcpy( pattern[0][index], P, 64 * 64 );
330	0	}
331
332		void FilmGrainImpl::set_chroma_pattern( int index, int8_t* P )
333	0	{
334	0	CHECK( index < 0 || index >= 8, "chroma pattern index out of bounds" );
335	0	for( int i = 0; i < 64 / csuby; i++ )
336	0	{
337	0	memcpy( pattern[1][index][i], P + ( 64 / csuby ) * i, 64 / csubx );
338	0	}
339	0	}
340
341		void FilmGrainImpl::set_scale_lut( int c, uint8_t lut[] )
342	0	{
343	0	CHECK( c < 0 || c >= 3, "scale lut idx out of bounds" );
344	0	memcpy( sLUT[c], lut, 256 );
345	0	}
346
347		void FilmGrainImpl::set_pattern_lut( int c, uint8_t lut[], bool all0 )
348	0	{
349	0	CHECK( c < 0 || c >= 3, "pattern lut idx out of bounds" );
350	0	allZero[c] = all0;
351	0	memcpy( pLUT[c], lut, 256 );
352	0	}
353
354		void FilmGrainImpl::set_scale_shift( int shift )
355	0	{
356	0	CHECK( shift < 2 || shift >= 8, "scale shift out of range" );
357	0	scale_shift = shift + 6 - bs;
358	0	}
359
360		void FilmGrainImpl::set_depth( int depth )
361	0	{
362	0	CHECK( depth != 8 && depth != 10, "only bit depth 8 and 10 supported." )
363
364	0	if( bs == 0 && depth > 8 )
365	0	{
366	0	scale_shift -= 2;
367	0	}
368	0	if( bs == 2 && depth == 8 )
369	0	{
370	0	scale_shift += 2;
371	0	}
372
373	0	bs = depth - 8;
374	0	}
375
376		void FilmGrainImpl::set_chroma_subsampling( int subx, int suby )
377	0	{
378	0	CHECK( subx != 1 && subx != 2, "chroma subsampling should be 1 or 2" );
379	0	CHECK( suby != 1 && suby != 2, "chroma subsampling should be 1 or 2" );
380	0	csubx = subx;
381	0	csuby = suby;
382	0	}
383
384		FilmGrainImpl::FilmGrainImpl()
385	0	{
386	0	memset( pattern, 0, sizeof( pattern ) );
387	0	memset( sLUT, 0, sizeof( sLUT ) );
388	0	memset( pLUT, 0, sizeof( pLUT ) );
389	0	}
390
391		} // namespace vvdec