Coverage Report

Created: 2024-11-21 06:35

/src/gfwx-fuzzers/gfwx.h
Line
Count
Source (jump to first uncovered line)
1
//  Good, Fast Wavelet Codec "GFWX" v1
2
//  ----------------------------------
3
//  December 1, 2015 [patched on Oct 20, 2017]
4
//  Author: Graham Fyffe <gfyffe@gmail.com> or <fyffe@google.com>, and Google, Inc.
5
//  Website: www.gfwx.org
6
//  Features:
7
//  - FAST
8
//  - compression ratio similar to JPEG 2000
9
//  - under 1000 lines of code, with no external libraries
10
//  - 100% lossless at max quality
11
//  - low quality looks interpolated instead of blocky
12
//  - progressive decoding with optional downsampling
13
//  - supports uint8_t, int8_t, uint16_t, int16_t
14
//  - supports 1 to 65536 interleaved channels
15
//  - supports 1 to 65536 non-interleaved layers
16
//  - optional Bayer mode to compress Bayer data more
17
//  - optional chroma downsampling, even in Bayer mode
18
//  - optional user-programmable color/channel transform
19
//  - optional slightly less fast mode to compress more
20
//  - imageData can be any class with a pointer-like interface
21
//  - thoroughly tested using several pictures of cats
22
//
23
//  GFWX is released under the 3-clause BSD license:
24
//
25
//  Copyright (c) 2015, University of Southern California. All rights reserved. Redistribution and use in source and binary forms,
26
//  with or without modification, are permitted provided that the following conditions are met:
27
//
28
//  1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
29
//
30
//  2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in
31
//     the documentation and/or other materials provided with the distribution.
32
//
33
//  3. Neither the name of the organization nor the names of its contributors may be used to endorse or promote products derived from
34
//     this software without specific prior written permission.
35
//
36
//  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
37
//  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
38
//  COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
39
//  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
40
//  HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
41
//  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
42
43
#pragma once
44
#include <algorithm>
45
#include <cstddef>
46
#include <cstdint>
47
#include <cstdlib>
48
#include <limits>
49
#include <type_traits>
50
#include <utility>
51
#include <vector>
52
#if defined(_OPENMP) && defined(_MSC_VER)
53
#define OMP_PARALLEL_FOR(X) __pragma(omp parallel for schedule(dynamic, X))
54
#elif defined(_OPENMP)
55
#include <omp.h>
56
#define STR(X) #X
57
#define STRINGIFY(X) STR(X)
58
#define TWO_ARGUMENTS(X,Y,Z) X(Y, Z)
59
#define OMP_PARALLEL_FOR(X) _Pragma(STRINGIFY(TWO_ARGUMENTS(omp parallel for schedule, dynamic, X)))
60
#else
61
#define OMP_PARALLEL_FOR(X)
62
#endif
63
64
namespace GFWX
65
{
66
  enum
67
  {
68
    QualityMax = 1024,    // compress with QualityMax for 100% lossless, or less than QualityMax for lossy
69
    ThreadIterations = 64,  // OMP settings tuned on my machine with large images
70
    BitDepthAuto = 0, BlockDefault = 7, BlockMax = 30,
71
    FilterLinear = 0, FilterCubic = 1, QuantizationScalar = 0, EncoderTurbo = 0, EncoderFast = 1, EncoderContextual = 2,
72
    IntentGeneric = 0, IntentMono = 1, IntentBayerRGGB = 2, IntentBayerBGGR = 3, IntentBayerGRBG = 4, IntentBayerGBRG = 5, IntentBayerGeneric = 6,
73
    IntentRGB = 7, IntentRGBA = 8, IntentRGBApremult = 9, IntentBGR = 10, IntentBGRA = 11, IntentBGRApremult = 12, IntentCMYK = 13,
74
    ResultOk = 0, ErrorOverflow = -1, ErrorMalformed = -2, ErrorTypeMismatch = -3
75
  };
76
77
  struct Header // use the empty constructor to fetch headers before decompressing, and use the parameterized constructor when compressing
78
  {
79
    int sizex, sizey, layers, channels, bitDepth, quality, chromaScale, blockSize, filter, quantization, encoder, intent, version, isSigned;
80
0
    Header() {}
81
    Header(int sizex, int sizey, int layers, int channels, int bitDepth,
82
      int quality, int chromaScale, int blockSize, int filter, int quantization, int encoder, int intent)
83
3.04k
      : sizex(sizex), sizey(sizey), layers(layers), channels(channels), bitDepth(bitDepth), quality(std::max(1, std::min(int(QualityMax), quality))),
84
3.04k
      chromaScale(std::max(1, std::min(256, chromaScale))), blockSize(std::min(30, std::max(2, blockSize))), filter(std::min(255, filter)),
85
3.04k
      quantization(std::min(255, quantization)), encoder(std::min(255, encoder)), intent(std::min(255, intent)) {}
86
    size_t bufferSize() const
87
0
    {
88
0
      size_t const part1 = static_cast<size_t>(sizex) * sizey;
89
0
      size_t const part2 = static_cast<size_t>(channels) * layers * ((bitDepth + 7) / 8);
90
0
      return std::log(part1) + std::log(part2) > std::log(std::numeric_limits<size_t>::max() - 1) ? 0 : part1 * part2;
91
0
    }
92
  };
93
94
  template<typename T> struct Image // handy wrapper for 2D image data
95
  {
96
    T * data;
97
    int sizex, sizey;
98
3.51M
    Image(T * data, int sizex, int sizey) : data(data), sizex(sizex), sizey(sizey) {}
99
739M
    T * operator[] (int y) { return data + static_cast<size_t>(y) * sizex; }
100
  };
101
102
  struct Bits // handy wrapper for treating an array of unsigned ints as a bit stream
103
  {
104
    uint32_t * buffer, * bufferEnd;
105
    uint32_t writeCache;
106
    int indexBits;  // -1 indicates buffer overflow
107
16.1k
    Bits(uint32_t * buffer, uint32_t * bufferEnd) : buffer(buffer), bufferEnd(bufferEnd), writeCache(0), indexBits(0) {}
108
    uint32_t getBits(int bits)
109
0
    {
110
0
      int newBits = indexBits + bits;
111
0
      if (buffer == bufferEnd)
112
0
        return indexBits = -1;  // signify overflow
113
0
      uint32_t x = *buffer << indexBits;
114
0
      if (newBits >= 32)
115
0
      {
116
0
        ++ buffer;
117
0
        if ((newBits -= 32) > 0)
118
0
        {
119
0
          if (buffer == bufferEnd)
120
0
            return indexBits = -1;  // signify overflow
121
0
          x |= *buffer >> (32 - indexBits);
122
0
        }
123
0
      }
124
0
      indexBits = newBits;
125
0
      return x >> (32 - bits);
126
0
    }
127
    void putBits(uint32_t x, int bits)
128
58.1M
    {
129
58.1M
      int newBits = indexBits + bits;
130
58.1M
      if (buffer == bufferEnd)
131
10.7M
        newBits = -1;  // signify overflow
132
47.3M
      else if (newBits < 32)
133
36.1M
        (writeCache <<= bits) |= x;
134
11.1M
      else if (bits == 32 && newBits == 32)
135
13.8k
      {
136
13.8k
        newBits = 0;
137
13.8k
        *(buffer ++) = x;
138
13.8k
      }
139
11.1M
      else
140
11.1M
      {
141
11.1M
        newBits -= 32;
142
11.1M
        *(buffer ++) = (writeCache << (bits - newBits)) | (x >> newBits);
143
11.1M
        writeCache = x;
144
11.1M
      }
145
58.1M
      indexBits = newBits;
146
58.1M
    }
147
    uint32_t getZeros(uint32_t maxZeros)
148
0
    {
149
0
      int newBits = indexBits;
150
0
      if (buffer == bufferEnd)
151
0
        return indexBits = -1;  // signify overflow
152
0
      uint32_t b = *buffer;
153
0
      uint32_t x = 0;
154
0
      while (true)
155
0
      {
156
0
        if (newBits == 31)
157
0
        {
158
0
          ++ buffer;
159
0
          if ((b & 1u) || (++ x == maxZeros))
160
0
          {
161
0
            indexBits = 0;
162
0
            return x;
163
0
          }
164
0
          if (buffer == bufferEnd)
165
0
            return indexBits = -1;  // signify overflow
166
0
          b = *buffer;
167
0
          newBits = 0;
168
0
          continue;
169
0
        }
170
0
        if (((b << newBits) & (1u << 31)) || (++ x == maxZeros))
171
0
        {
172
0
          indexBits = newBits + 1;
173
0
          return x;
174
0
        }
175
0
        ++ newBits;
176
0
      }
177
0
    }
178
    void flushWriteWord() // [NOTE] does not clear overflow
179
3.50M
    {
180
3.50M
      putBits(0, (32 - indexBits) % 32);
181
3.50M
    }
182
    void flushReadWord()  // [NOTE] does not clear overflow
183
0
    {
184
0
      if (indexBits <= 0)
185
0
        return;
186
0
      ++ buffer;
187
0
      indexBits = 0;
188
0
    }
189
  };
190
191
  template<int pot> void unsignedCode(uint32_t x, Bits & stream)  // limited length power-of-two Golomb-Rice code
192
43.9M
  {
193
43.9M
    uint32_t const y = x >> (pot);
194
43.9M
    if (y >= 12)
195
13.2M
    {
196
13.2M
      stream.putBits(0, 12);  // escape to larger code
197
13.2M
      unsignedCode<pot < 20 ? pot + 4 : 24>(x - (12 << (pot)), stream);
198
13.2M
    }
199
30.6M
    else
200
30.6M
      stream.putBits((1 << (pot)) | (x & ~(~0u << (pot))), y + 1 + pot); // encode x / 2^pot in unary followed by x % 2^pot in binary
201
43.9M
  }
void GFWX::unsignedCode<2>(unsigned int, GFWX::Bits&)
Line
Count
Source
192
725k
  {
193
725k
    uint32_t const y = x >> (pot);
194
725k
    if (y >= 12)
195
68.1k
    {
196
68.1k
      stream.putBits(0, 12);  // escape to larger code
197
68.1k
      unsignedCode<pot < 20 ? pot + 4 : 24>(x - (12 << (pot)), stream);
198
68.1k
    }
199
656k
    else
200
656k
      stream.putBits((1 << (pot)) | (x & ~(~0u << (pot))), y + 1 + pot); // encode x / 2^pot in unary followed by x % 2^pot in binary
201
725k
  }
void GFWX::unsignedCode<6>(unsigned int, GFWX::Bits&)
Line
Count
Source
192
68.1k
  {
193
68.1k
    uint32_t const y = x >> (pot);
194
68.1k
    if (y >= 12)
195
10.7k
    {
196
10.7k
      stream.putBits(0, 12);  // escape to larger code
197
10.7k
      unsignedCode<pot < 20 ? pot + 4 : 24>(x - (12 << (pot)), stream);
198
10.7k
    }
199
57.3k
    else
200
57.3k
      stream.putBits((1 << (pot)) | (x & ~(~0u << (pot))), y + 1 + pot); // encode x / 2^pot in unary followed by x % 2^pot in binary
201
68.1k
  }
void GFWX::unsignedCode<10>(unsigned int, GFWX::Bits&)
Line
Count
Source
192
10.7k
  {
193
10.7k
    uint32_t const y = x >> (pot);
194
10.7k
    if (y >= 12)
195
888
    {
196
888
      stream.putBits(0, 12);  // escape to larger code
197
888
      unsignedCode<pot < 20 ? pot + 4 : 24>(x - (12 << (pot)), stream);
198
888
    }
199
9.90k
    else
200
9.90k
      stream.putBits((1 << (pot)) | (x & ~(~0u << (pot))), y + 1 + pot); // encode x / 2^pot in unary followed by x % 2^pot in binary
201
10.7k
  }
void GFWX::unsignedCode<14>(unsigned int, GFWX::Bits&)
Line
Count
Source
192
888
  {
193
888
    uint32_t const y = x >> (pot);
194
888
    if (y >= 12)
195
36
    {
196
36
      stream.putBits(0, 12);  // escape to larger code
197
36
      unsignedCode<pot < 20 ? pot + 4 : 24>(x - (12 << (pot)), stream);
198
36
    }
199
852
    else
200
852
      stream.putBits((1 << (pot)) | (x & ~(~0u << (pot))), y + 1 + pot); // encode x / 2^pot in unary followed by x % 2^pot in binary
201
888
  }
void GFWX::unsignedCode<18>(unsigned int, GFWX::Bits&)
Line
Count
Source
192
36
  {
193
36
    uint32_t const y = x >> (pot);
194
36
    if (y >= 12)
195
0
    {
196
0
      stream.putBits(0, 12);  // escape to larger code
197
0
      unsignedCode<pot < 20 ? pot + 4 : 24>(x - (12 << (pot)), stream);
198
0
    }
199
36
    else
200
36
      stream.putBits((1 << (pot)) | (x & ~(~0u << (pot))), y + 1 + pot); // encode x / 2^pot in unary followed by x % 2^pot in binary
201
36
  }
Unexecuted instantiation: void GFWX::unsignedCode<22>(unsigned int, GFWX::Bits&)
Unexecuted instantiation: void GFWX::unsignedCode<24>(unsigned int, GFWX::Bits&)
void GFWX::unsignedCode<1>(unsigned int, GFWX::Bits&)
Line
Count
Source
192
14.1M
  {
193
14.1M
    uint32_t const y = x >> (pot);
194
14.1M
    if (y >= 12)
195
5.34M
    {
196
5.34M
      stream.putBits(0, 12);  // escape to larger code
197
5.34M
      unsignedCode<pot < 20 ? pot + 4 : 24>(x - (12 << (pot)), stream);
198
5.34M
    }
199
8.76M
    else
200
8.76M
      stream.putBits((1 << (pot)) | (x & ~(~0u << (pot))), y + 1 + pot); // encode x / 2^pot in unary followed by x % 2^pot in binary
201
14.1M
  }
void GFWX::unsignedCode<5>(unsigned int, GFWX::Bits&)
Line
Count
Source
192
5.34M
  {
193
5.34M
    uint32_t const y = x >> (pot);
194
5.34M
    if (y >= 12)
195
2.24M
    {
196
2.24M
      stream.putBits(0, 12);  // escape to larger code
197
2.24M
      unsignedCode<pot < 20 ? pot + 4 : 24>(x - (12 << (pot)), stream);
198
2.24M
    }
199
3.09M
    else
200
3.09M
      stream.putBits((1 << (pot)) | (x & ~(~0u << (pot))), y + 1 + pot); // encode x / 2^pot in unary followed by x % 2^pot in binary
201
5.34M
  }
void GFWX::unsignedCode<9>(unsigned int, GFWX::Bits&)
Line
Count
Source
192
2.24M
  {
193
2.24M
    uint32_t const y = x >> (pot);
194
2.24M
    if (y >= 12)
195
11.4k
    {
196
11.4k
      stream.putBits(0, 12);  // escape to larger code
197
11.4k
      unsignedCode<pot < 20 ? pot + 4 : 24>(x - (12 << (pot)), stream);
198
11.4k
    }
199
2.23M
    else
200
2.23M
      stream.putBits((1 << (pot)) | (x & ~(~0u << (pot))), y + 1 + pot); // encode x / 2^pot in unary followed by x % 2^pot in binary
201
2.24M
  }
void GFWX::unsignedCode<13>(unsigned int, GFWX::Bits&)
Line
Count
Source
192
11.4k
  {
193
11.4k
    uint32_t const y = x >> (pot);
194
11.4k
    if (y >= 12)
195
70
    {
196
70
      stream.putBits(0, 12);  // escape to larger code
197
70
      unsignedCode<pot < 20 ? pot + 4 : 24>(x - (12 << (pot)), stream);
198
70
    }
199
11.3k
    else
200
11.3k
      stream.putBits((1 << (pot)) | (x & ~(~0u << (pot))), y + 1 + pot); // encode x / 2^pot in unary followed by x % 2^pot in binary
201
11.4k
  }
void GFWX::unsignedCode<17>(unsigned int, GFWX::Bits&)
Line
Count
Source
192
70
  {
193
70
    uint32_t const y = x >> (pot);
194
70
    if (y >= 12)
195
0
    {
196
0
      stream.putBits(0, 12);  // escape to larger code
197
0
      unsignedCode<pot < 20 ? pot + 4 : 24>(x - (12 << (pot)), stream);
198
0
    }
199
70
    else
200
70
      stream.putBits((1 << (pot)) | (x & ~(~0u << (pot))), y + 1 + pot); // encode x / 2^pot in unary followed by x % 2^pot in binary
201
70
  }
Unexecuted instantiation: void GFWX::unsignedCode<21>(unsigned int, GFWX::Bits&)
void GFWX::unsignedCode<3>(unsigned int, GFWX::Bits&)
Line
Count
Source
192
1.05M
  {
193
1.05M
    uint32_t const y = x >> (pot);
194
1.05M
    if (y >= 12)
195
126k
    {
196
126k
      stream.putBits(0, 12);  // escape to larger code
197
126k
      unsignedCode<pot < 20 ? pot + 4 : 24>(x - (12 << (pot)), stream);
198
126k
    }
199
926k
    else
200
926k
      stream.putBits((1 << (pot)) | (x & ~(~0u << (pot))), y + 1 + pot); // encode x / 2^pot in unary followed by x % 2^pot in binary
201
1.05M
  }
void GFWX::unsignedCode<7>(unsigned int, GFWX::Bits&)
Line
Count
Source
192
126k
  {
193
126k
    uint32_t const y = x >> (pot);
194
126k
    if (y >= 12)
195
12.0k
    {
196
12.0k
      stream.putBits(0, 12);  // escape to larger code
197
12.0k
      unsignedCode<pot < 20 ? pot + 4 : 24>(x - (12 << (pot)), stream);
198
12.0k
    }
199
114k
    else
200
114k
      stream.putBits((1 << (pot)) | (x & ~(~0u << (pot))), y + 1 + pot); // encode x / 2^pot in unary followed by x % 2^pot in binary
201
126k
  }
void GFWX::unsignedCode<11>(unsigned int, GFWX::Bits&)
Line
Count
Source
192
12.0k
  {
193
12.0k
    uint32_t const y = x >> (pot);
194
12.0k
    if (y >= 12)
195
178
    {
196
178
      stream.putBits(0, 12);  // escape to larger code
197
178
      unsignedCode<pot < 20 ? pot + 4 : 24>(x - (12 << (pot)), stream);
198
178
    }
199
11.8k
    else
200
11.8k
      stream.putBits((1 << (pot)) | (x & ~(~0u << (pot))), y + 1 + pot); // encode x / 2^pot in unary followed by x % 2^pot in binary
201
12.0k
  }
void GFWX::unsignedCode<15>(unsigned int, GFWX::Bits&)
Line
Count
Source
192
178
  {
193
178
    uint32_t const y = x >> (pot);
194
178
    if (y >= 12)
195
0
    {
196
0
      stream.putBits(0, 12);  // escape to larger code
197
0
      unsignedCode<pot < 20 ? pot + 4 : 24>(x - (12 << (pot)), stream);
198
0
    }
199
178
    else
200
178
      stream.putBits((1 << (pot)) | (x & ~(~0u << (pot))), y + 1 + pot); // encode x / 2^pot in unary followed by x % 2^pot in binary
201
178
  }
Unexecuted instantiation: void GFWX::unsignedCode<19>(unsigned int, GFWX::Bits&)
Unexecuted instantiation: void GFWX::unsignedCode<23>(unsigned int, GFWX::Bits&)
void GFWX::unsignedCode<4>(unsigned int, GFWX::Bits&)
Line
Count
Source
192
12.6M
  {
193
12.6M
    uint32_t const y = x >> (pot);
194
12.6M
    if (y >= 12)
195
4.77M
    {
196
4.77M
      stream.putBits(0, 12);  // escape to larger code
197
4.77M
      unsignedCode<pot < 20 ? pot + 4 : 24>(x - (12 << (pot)), stream);
198
4.77M
    }
199
7.84M
    else
200
7.84M
      stream.putBits((1 << (pot)) | (x & ~(~0u << (pot))), y + 1 + pot); // encode x / 2^pot in unary followed by x % 2^pot in binary
201
12.6M
  }
void GFWX::unsignedCode<8>(unsigned int, GFWX::Bits&)
Line
Count
Source
192
4.77M
  {
193
4.77M
    uint32_t const y = x >> (pot);
194
4.77M
    if (y >= 12)
195
77.5k
    {
196
77.5k
      stream.putBits(0, 12);  // escape to larger code
197
77.5k
      unsignedCode<pot < 20 ? pot + 4 : 24>(x - (12 << (pot)), stream);
198
77.5k
    }
199
4.69M
    else
200
4.69M
      stream.putBits((1 << (pot)) | (x & ~(~0u << (pot))), y + 1 + pot); // encode x / 2^pot in unary followed by x % 2^pot in binary
201
4.77M
  }
void GFWX::unsignedCode<12>(unsigned int, GFWX::Bits&)
Line
Count
Source
192
77.5k
  {
193
77.5k
    uint32_t const y = x >> (pot);
194
77.5k
    if (y >= 12)
195
97
    {
196
97
      stream.putBits(0, 12);  // escape to larger code
197
97
      unsignedCode<pot < 20 ? pot + 4 : 24>(x - (12 << (pot)), stream);
198
97
    }
199
77.4k
    else
200
77.4k
      stream.putBits((1 << (pot)) | (x & ~(~0u << (pot))), y + 1 + pot); // encode x / 2^pot in unary followed by x % 2^pot in binary
201
77.5k
  }
void GFWX::unsignedCode<16>(unsigned int, GFWX::Bits&)
Line
Count
Source
192
97
  {
193
97
    uint32_t const y = x >> (pot);
194
97
    if (y >= 12)
195
0
    {
196
0
      stream.putBits(0, 12);  // escape to larger code
197
0
      unsignedCode<pot < 20 ? pot + 4 : 24>(x - (12 << (pot)), stream);
198
0
    }
199
97
    else
200
97
      stream.putBits((1 << (pot)) | (x & ~(~0u << (pot))), y + 1 + pot); // encode x / 2^pot in unary followed by x % 2^pot in binary
201
97
  }
Unexecuted instantiation: void GFWX::unsignedCode<20>(unsigned int, GFWX::Bits&)
void GFWX::unsignedCode<0>(unsigned int, GFWX::Bits&)
Line
Count
Source
192
2.73M
  {
193
2.73M
    uint32_t const y = x >> (pot);
194
2.73M
    if (y >= 12)
195
572k
    {
196
572k
      stream.putBits(0, 12);  // escape to larger code
197
572k
      unsignedCode<pot < 20 ? pot + 4 : 24>(x - (12 << (pot)), stream);
198
572k
    }
199
2.15M
    else
200
2.15M
      stream.putBits((1 << (pot)) | (x & ~(~0u << (pot))), y + 1 + pot); // encode x / 2^pot in unary followed by x % 2^pot in binary
201
2.73M
  }
202
203
  template<int pot> uint32_t unsignedDecode(Bits & stream)
204
  {
205
    uint32_t x = stream.getZeros(12);
206
    int const p = pot < 24 ? pot : 24;  // actual pot. The max 108 below is to prevent unlimited recursion in malformed files, yet admit 2^32 - 1.
207
    return (pot < 108 && x == 12) ? (12 << p) + unsignedDecode<pot < 108 ? pot + 4 : 108>(stream) : p ? (x << p) + stream.getBits(p) : x;
208
  }
209
210
  template<int pot> void interleavedCode(int x, Bits & stream)
211
12.2M
  {
212
12.2M
    unsignedCode<pot>(x <= 0 ? -2 * x : 2 * x - 1, stream);  // interleave positive and negative values
213
12.2M
  }
void GFWX::interleavedCode<1>(int, GFWX::Bits&)
Line
Count
Source
211
8.07M
  {
212
8.07M
    unsignedCode<pot>(x <= 0 ? -2 * x : 2 * x - 1, stream);  // interleave positive and negative values
213
8.07M
  }
void GFWX::interleavedCode<0>(int, GFWX::Bits&)
Line
Count
Source
211
2.73M
  {
212
2.73M
    unsignedCode<pot>(x <= 0 ? -2 * x : 2 * x - 1, stream);  // interleave positive and negative values
213
2.73M
  }
void GFWX::interleavedCode<2>(int, GFWX::Bits&)
Line
Count
Source
211
358k
  {
212
358k
    unsignedCode<pot>(x <= 0 ? -2 * x : 2 * x - 1, stream);  // interleave positive and negative values
213
358k
  }
void GFWX::interleavedCode<3>(int, GFWX::Bits&)
Line
Count
Source
211
424k
  {
212
424k
    unsignedCode<pot>(x <= 0 ? -2 * x : 2 * x - 1, stream);  // interleave positive and negative values
213
424k
  }
void GFWX::interleavedCode<4>(int, GFWX::Bits&)
Line
Count
Source
211
632k
  {
212
632k
    unsignedCode<pot>(x <= 0 ? -2 * x : 2 * x - 1, stream);  // interleave positive and negative values
213
632k
  }
214
215
  template<int pot> int interleavedDecode(Bits & stream)
216
  {
217
    int const x = unsignedDecode<pot>(stream);
218
    return (x & 1) ? (x + 1) / 2 : -x / 2;
219
  }
220
221
  template<int pot> void signedCode(int x, Bits & stream)
222
12.1M
  {
223
12.1M
    unsignedCode<pot>(abs(x), stream);
224
12.1M
    if (x)
225
10.6M
      stream.putBits(x > 0 ? 1 : 0, 1);
226
12.1M
  }
void GFWX::signedCode<2>(int, GFWX::Bits&)
Line
Count
Source
222
312k
  {
223
312k
    unsignedCode<pot>(abs(x), stream);
224
312k
    if (x)
225
227k
      stream.putBits(x > 0 ? 1 : 0, 1);
226
312k
  }
void GFWX::signedCode<4>(int, GFWX::Bits&)
Line
Count
Source
222
10.8M
  {
223
10.8M
    unsignedCode<pot>(abs(x), stream);
224
10.8M
    if (x)
225
9.77M
      stream.putBits(x > 0 ? 1 : 0, 1);
226
10.8M
  }
void GFWX::signedCode<1>(int, GFWX::Bits&)
Line
Count
Source
222
304k
  {
223
304k
    unsignedCode<pot>(abs(x), stream);
224
304k
    if (x)
225
165k
      stream.putBits(x > 0 ? 1 : 0, 1);
226
304k
  }
void GFWX::signedCode<3>(int, GFWX::Bits&)
Line
Count
Source
222
618k
  {
223
618k
    unsignedCode<pot>(abs(x), stream);
224
618k
    if (x)
225
460k
      stream.putBits(x > 0 ? 1 : 0, 1);
226
618k
  }
227
228
  template<int pot> int signedDecode(Bits & stream)
229
  {
230
    int x = unsignedDecode<pot>(stream);
231
    return x ? stream.getBits(1) ? x : -x : 0;
232
  }
233
234
  template<typename T> T median(T a, T b, T c)
235
152M
  {
236
152M
    return a < b ? c > b ? b : c < a ? a : c : c > a ? a : c < b ? b : c;
237
152M
  }
238
239
  template<typename T> T roundFraction(T num, T denom)
240
152M
  {
241
152M
    return num < 0 ? (num - denom / 2) / denom : (num + denom / 2) / denom;
242
152M
  }
243
244
  template<typename T> T cubic(T c0, T c1, T c2, T c3)
245
152M
  {
246
152M
    return median(T(roundFraction((-c0 + 9 * (c1 + c2) - c3), 16)), c1, c2);
247
152M
  }
248
249
  template<typename T> void lift(Image<T> & image, int x0, int y0, int x1, int y1, int step, int filter)
250
21.9k
  {
251
21.9k
    int const sizex = x1 - x0;
252
21.9k
    int const sizey = y1 - y0;
253
127k
    while (step < sizex || step < sizey)
254
105k
    {
255
105k
      if (step < sizex)  // horizontal lifting
256
51.2k
      {
257
51.2k
        OMP_PARALLEL_FOR(ThreadIterations)
258
22.6M
        for (int y = 0; y < sizey; y += step)
259
22.5M
        {
260
22.5M
          int x;
261
22.5M
          T * base = &image[y0 + y][x0], * base1 = base - step, * base2 = base + step, * base3 = base + step * 3;
262
22.5M
          if (filter == FilterCubic)
263
13.3M
          {
264
13.3M
            T c0 = *base, c1 = *base, c2 = step * 2 < sizex ? base[step * 2] : *base, c3;
265
28.9M
            for (x = step; x < sizex - step * 3; x += step * 2, c0 = c1, c1 = c2, c2 = c3)
266
15.6M
              base[x] -= cubic(c0, c1, c2, c3 = base3[x]);
267
30.3M
            for (; x < sizex; x += step * 2, c0 = c1, c1 = c2)
268
17.0M
              base[x] -= cubic(c0, c1, c2, c2);
269
13.3M
            T g0 = base[step], g1 = base[step], g2 = step * 3 < sizex ? base[step * 3] : base[step], g3;
270
28.2M
            for (x = step * 2; x < sizex - step * 3; x += step * 2, g0 = g1, g1 = g2, g2 = g3)
271
14.9M
              base[x] += cubic(g0, g1, g2, g3 = base3[x]) / 2;
272
19.6M
            for (; x < sizex; x += step * 2, g0 = g1, g1 = g2)
273
6.34M
              base[x] += cubic(g0, g1, g2, g2) / 2;
274
13.3M
          }
275
9.24M
          else
276
9.24M
          {
277
36.7M
            for (x = step; x < sizex - step; x += step * 2)
278
27.5M
              base[x] -= (base1[x] + base2[x]) / 2;
279
9.24M
            if (x < sizex)
280
7.70M
              base[x] -= base1[x];
281
35.2M
            for (x = step * 2; x < sizex - step; x += step * 2)
282
25.9M
              base[x] += (base1[x] + base2[x]) / 4;
283
9.24M
            if (x < sizex)
284
1.54M
              base[x] += base1[x] / 2;
285
9.24M
          }
286
22.5M
        }
287
51.2k
      }
288
105k
      if (step < sizey)  // vertical lifting
289
74.9k
      {
290
74.9k
        OMP_PARALLEL_FOR(ThreadIterations)
291
71.0M
        for (int y = step; y < sizey; y += step * 2)
292
70.9M
        {
293
70.9M
          T * const base = &image[y0 + y][x0];
294
70.9M
          T const * const c1base = &image[y0 + y - step][x0], * const c2base = y + step < sizey ? &image[y0 + y + step][x0] : c1base;
295
70.9M
          if (filter == FilterCubic)
296
37.9M
          {
297
37.9M
            T const * const c0base = y - step * 3 >= 0 ? &image[y0 + y - step * 3][x0] : c1base;
298
37.9M
            T const * const c3base = y + step * 3 < sizey ? &image[y0 + y + step * 3][x0] : c2base;
299
87.5M
            for (int x = 0; x < sizex; x += step)
300
49.6M
              base[x] -= cubic(c0base[x], c1base[x], c2base[x], c3base[x]);
301
37.9M
          }
302
75.8M
          else for (int x = 0; x < sizex; x += step)
303
42.8M
            base[x] -= (c1base[x] + c2base[x]) / 2;
304
70.9M
        }
305
74.9k
        OMP_PARALLEL_FOR(ThreadIterations)
306
70.9M
        for (int y = step * 2; y < sizey; y += step * 2)
307
70.9M
        {
308
70.9M
          T * const base = &image[y0 + y][x0];
309
70.9M
          T const * const g1base = &image[y0 + y - step][x0], * const g2base = y + step < sizey ? &image[y0 + y + step][x0] : g1base;
310
70.9M
          if (filter == FilterCubic)
311
37.8M
          {
312
37.8M
            T const * const g0base = y - step * 3 >= 0 ? &image[y0 + y - step * 3][x0] : g1base;
313
37.8M
            T const * const g3base = y + step * 3 < sizey ? &image[y0 + y + step * 3][x0] : g2base;
314
86.9M
            for (int x = 0; x < sizex; x += step)
315
49.0M
              base[x] += cubic(g0base[x], g1base[x], g2base[x], g3base[x]) / 2;
316
37.8M
          }
317
74.7M
          else for (int x = 0; x < sizex; x += step)
318
41.7M
            base[x] += (g1base[x] + g2base[x]) / 4;
319
70.9M
        }
320
74.9k
      }
321
105k
      step *= 2;
322
105k
    }
323
21.9k
  }
324
325
  template<typename T> void unlift(Image<T> & image, int x0, int y0, int x1, int y1, int minStep, int filter)
326
  {
327
    int const sizex = x1 - x0;
328
    int const sizey = y1 - y0;
329
    int step = minStep;
330
    while (step * 2 < sizex || step * 2 < sizey)
331
      step *= 2;
332
    while (step >= minStep)
333
    {
334
      if (step < sizey) // vertical unlifting
335
      {
336
        OMP_PARALLEL_FOR(ThreadIterations)
337
        for (int y = step * 2; y < sizey; y += step * 2)
338
        {
339
          T * const base = &image[y0 + y][x0];
340
          T const * const g1base = &image[y0 + y - step][x0], * const g2base = y + step < sizey ? &image[y0 + y + step][x0] : g1base;
341
          if (filter == FilterCubic)
342
          {
343
            T const * const g0base = y - step * 3 >= 0 ? &image[y0 + y - step * 3][x0] : g1base;
344
            T const * const g3base = y + step * 3 < sizey ? &image[y0 + y + step * 3][x0] : g2base;
345
            for (int x = 0; x < sizex; x += step)
346
              base[x] -= cubic(g0base[x], g1base[x], g2base[x], g3base[x]) / 2;
347
          }
348
          else for (int x = 0; x < sizex; x += step)
349
            base[x] -= (g1base[x] + g2base[x]) / 4;
350
        }
351
        OMP_PARALLEL_FOR(ThreadIterations)
352
        for (int y = step; y < sizey; y += step * 2)
353
        {
354
          T * const base = &image[y0 + y][x0];
355
          T const * const c1base = &image[y0 + y - step][x0], * const c2base = y + step < sizey ? &image[y0 + y + step][x0] : c1base;
356
          if (filter == FilterCubic)
357
          {
358
            T const * const c0base = y - step * 3 >= 0 ? &image[y0 + y - step * 3][x0] : c1base;
359
            T const * const c3base = y + step * 3 < sizey ? &image[y0 + y + step * 3][x0] : c2base;
360
            for (int x = 0; x < sizex; x += step)
361
              base[x] += cubic(c0base[x], c1base[x], c2base[x], c3base[x]);
362
          }
363
          else for (int x = 0; x < sizex; x += step)
364
            base[x] += (c1base[x] + c2base[x]) / 2;
365
        }
366
      }
367
      if (step < sizex) // horizontal unlifting
368
      {
369
        OMP_PARALLEL_FOR(ThreadIterations)
370
        for (int y = 0; y < sizey; y += step)
371
        {
372
          int x;
373
          T * base = &image[y0 + y][x0], * base1 = base - step, * base2 = base + step, * base3 = base + step * 3;
374
          if (filter == FilterCubic)
375
          {
376
            T g0 = base[step], g1 = base[step], g2 = step * 3 < sizex ? base[step * 3] : base[step], g3;
377
            for (x = step * 2; x < sizex - step * 3; x += step * 2, g0 = g1, g1 = g2, g2 = g3)
378
              base[x] -= cubic(g0, g1, g2, g3 = base3[x]) / 2;
379
            for (; x < sizex; x += step * 2, g0 = g1, g1 = g2)
380
              base[x] -= cubic(g0, g1, g2, g2) / 2;
381
            T c0 = *base, c1 = *base, c2 = step * 2 < sizex ? base[step * 2] : *base, c3;
382
            for (x = step; x < sizex - step * 3; x += step * 2, c0 = c1, c1 = c2, c2 = c3)
383
              base[x] += cubic(c0, c1, c2, c3 = base3[x]);
384
            for (; x < sizex; x += step * 2, c0 = c1, c1 = c2)
385
              base[x] += cubic(c0, c1, c2, c2);
386
          }
387
          else
388
          {
389
            for (x = step * 2; x < sizex - step; x += step * 2)
390
              base[x] -= (base1[x] + base2[x]) / 4;
391
            if (x < sizex)
392
              base[x] -= base1[x] / 2;
393
            for (x = step; x < sizex - step; x += step * 2)
394
              base[x] += (base1[x] + base2[x]) / 2;
395
            if (x < sizex)
396
              base[x] += base1[x];
397
          }
398
        }
399
      }
400
      step /= 2;
401
    }
402
  }
403
404
  template<typename T, bool dequantize> void quantize(Image<T> & image, int x0, int y0, int x1, int y1, int step, int quality, int minQ, int maxQ)
405
21.9k
  {
406
21.9k
    typedef typename std::conditional<sizeof(T) < 4, int32_t, int64_t>::type aux;
407
21.9k
    int const sizex = x1 - x0;
408
21.9k
    int const sizey = y1 - y0;
409
21.9k
    int skip = step;
410
38.7k
    while (skip < sizex && skip < sizey)
411
17.8k
    {
412
17.8k
      int const q = std::max(std::max(1, minQ), quality);
413
17.8k
      if (q >= maxQ) break;
414
16.7k
      OMP_PARALLEL_FOR(ThreadIterations)
415
21.4M
      for (int y = 0; y < sizey; y += skip)
416
21.4M
      {
417
21.4M
        T * base = &image[y0 + y][x0];
418
21.4M
        int const xStep = (y & skip) ? skip : skip * 2;
419
89.2M
        for (int x = xStep - skip; x < sizex; x += xStep)  // [NOTE] arranged so that (x | y) & skip == 1
420
67.7M
          base[x] = dequantize ? (aux(base[x]) * maxQ + (base[x] < 0 ? -maxQ / 2 : base[x] > 0 ? maxQ / 2 : 0)) / q : aux(base[x]) * q / maxQ;
421
21.4M
      }
422
16.7k
      skip *= 2;
423
16.7k
      quality = std::min(maxQ, quality * 2);  // [MAGIC] This approximates the JPEG 2000 baseline quantizer
424
16.7k
    }
425
21.9k
  }
426
427
  template<typename T> T square(T t)
428
48.4M
  {
429
48.4M
    return t * t;
430
48.4M
  }
431
432
  inline void addContext(int x, int w, uint32_t & sum, uint32_t & sum2, uint32_t & count)
433
26.9M
  {
434
26.9M
    sum += uint32_t(x = abs(x)) * w;
435
26.9M
    sum2 += square(std::min(uint32_t(x), 4096u)) * w; // [MAGIC] avoid overflow in last line of getContext
436
26.9M
    count += w;
437
26.9M
  }
438
439
  template<typename T> std::pair<uint32_t, uint32_t> getContext(Image<T> & image, int x0, int y0, int x1, int y1, int x, int y, int skip)
440
12.2M
  {
441
12.2M
    int px = x0 + (x & ~(skip * 2)) + (x & skip);
442
12.2M
    if (px >= x1)
443
927k
      px -= skip * 2;
444
12.2M
    int py = y0 + (y & ~(skip * 2)) + (y & skip);
445
12.2M
    if (py >= y1)
446
681k
      py -= skip * 2;
447
12.2M
    uint32_t count = 0, sum = 0, sum2 = 0;
448
12.2M
    addContext(abs(image[py][px]), 2, sum, sum2, count);  // ancestor
449
12.2M
    if ((y & skip) && (x | skip) < x1 - x0)
450
2.96M
    {
451
2.96M
      addContext(image[y0 + y - skip][x0 + (x | skip)], 2, sum, sum2, count); // upper sibling
452
2.96M
      if (x & skip)
453
1.47M
        addContext(image[y0 + y][x0 + x - skip], 2, sum, sum2, count); // left sibling
454
2.96M
    }
455
12.2M
    if (y >= skip * 2 && x >= skip * 2)  // neighbors
456
1.69M
    {
457
1.69M
      addContext(image[y0 + y - skip * 2][x0 + x], 4, sum, sum2, count);
458
1.69M
      addContext(image[y0 + y][x0 + x - skip * 2], 4, sum, sum2, count);
459
1.69M
      addContext(image[y0 + y - skip * 2][x0 + x - skip * 2], 2, sum, sum2, count);
460
1.69M
      if (x + skip * 2 < x1 - x0)
461
1.48M
        addContext(image[y0 + y - skip * 2][x0 + x + skip * 2], 2, sum, sum2, count);
462
1.69M
      if (y >= skip * 4 && x >= skip * 4)
463
983k
      {
464
983k
        addContext(image[y0 + y - skip * 4][x0 + x], 2, sum, sum2, count);
465
983k
        addContext(image[y0 + y][x0 + x - skip * 4], 2, sum, sum2, count);
466
983k
        addContext(image[y0 + y - skip * 4][x0 + x - skip * 4], 1, sum, sum2, count);
467
983k
        if (x + skip * 4 < x1 - x0)
468
794k
          addContext(image[y0 + y - skip * 4][x0 + x + skip * 4], 1, sum, sum2, count);
469
983k
      }
470
1.69M
    }
471
12.2M
    return std::make_pair((sum * 16u + count / 2u) / count, (sum2 * 16u + count / 2u) / count); // set sums relative to 16 count
472
12.2M
  }
473
474
  template<typename T> void encode(Image<T> & image, Bits & stream, int x0, int y0, int x1, int y1, int step, int scheme, int q, bool hasDC, bool isChroma)
475
8.32M
  {
476
8.32M
    int const sizex = x1 - x0;
477
8.32M
    int const sizey = y1 - y0;
478
8.32M
    if (hasDC && sizex > 0 && sizey > 0)
479
15.1k
      signedCode<4>(image[y0][x0], stream);
480
8.32M
    std::pair<uint32_t, uint32_t> context(0, 0);
481
8.32M
    int run = 0, runCoder = (scheme == EncoderTurbo ? (!q || (step < 2048 && q * step < 2048)) ? 1 : 0 : 0);  // avoid overflow checking q * step < 2048
482
99.7M
    for (int y = 0; y < sizey; y += step)
483
91.4M
    {
484
91.4M
      T * base = &image[y0 + y][x0];
485
91.4M
      int const xStep = (y & step) ? step : step * 2;
486
185M
      for (int x = xStep - step; x < sizex; x += xStep)  // [NOTE] arranged so that (x | y) & step == 1
487
94.3M
      {
488
94.3M
        T s = base[x];
489
94.3M
        if (runCoder && !s)  // run
490
70.0M
          ++ run;
491
24.3M
        else
492
24.3M
        {
493
24.3M
          if (scheme == EncoderTurbo)
494
7.48M
          {
495
7.48M
            if (runCoder)  // break the run
496
4.39M
            {
497
4.39M
              unsignedCode<1>(run, stream);
498
4.39M
              run = 0;
499
4.39M
              interleavedCode<1>(s < 0 ? s + 1 : s, stream);  // s can't be zero, so shift negatives by 1
500
4.39M
            }
501
3.08M
            else
502
3.08M
              interleavedCode<1>(s, stream);
503
7.48M
            continue;
504
7.48M
          }
505
16.8M
          if (runCoder)  // break the run
506
380k
          {
507
380k
            runCoder == 1 ? unsignedCode<1>(run, stream) : runCoder == 2 ? unsignedCode<2>(run, stream)
508
282k
              : runCoder == 3 ? unsignedCode<3>(run, stream) : unsignedCode<4>(run, stream);
509
380k
            run = 0;
510
380k
            if (s < 0)
511
191k
              ++ s; // s can't be zero, so shift negatives by 1
512
380k
          }
513
16.8M
          if (scheme == EncoderContextual)
514
12.2M
            context = getContext(image, x0, y0, x1, y1, x, y, step);
515
16.8M
          uint32_t const sumSq = square(context.first);
516
16.8M
          if (sumSq < 2u * context.second + (isChroma ? 250u : 100u))
517
2.73M
            interleavedCode<0>(s, stream);
518
14.1M
          else if (sumSq < 2u * context.second + 950u)
519
590k
            interleavedCode<1>(s, stream);
520
13.5M
          else if (sumSq < 3u * context.second + 3000u)
521
663k
          {
522
663k
            if (sumSq < 5u * context.second + 400u)
523
304k
              signedCode<1>(s, stream);
524
358k
            else
525
358k
              interleavedCode<2>(s, stream);
526
663k
          }
527
12.8M
          else if (sumSq < 3u * context.second + 12000u)
528
716k
          {
529
716k
            if (sumSq < 5u * context.second + 3000u)
530
292k
              signedCode<2>(s, stream);
531
424k
            else
532
424k
              interleavedCode<3>(s, stream);
533
716k
          }
534
12.1M
          else if (sumSq < 4u * context.second + 44000u)
535
1.25M
          {
536
1.25M
            if (sumSq < 6u * context.second + 12000u)
537
618k
              signedCode<3>(s, stream);
538
632k
            else
539
632k
              interleavedCode<4>(s, stream);
540
1.25M
          }
541
10.8M
          else
542
10.8M
            signedCode<4>(s, stream);
543
16.8M
          if (scheme == EncoderFast)  // use decaying first and second moment
544
4.61M
          {
545
4.61M
            uint32_t const t = abs(s);
546
4.61M
            context = std::make_pair(((context.first * 15u + 7u) >> 4) + t, ((context.second * 15u + 7u) >> 4) + square(std::min(t, 4096u)));
547
4.61M
            if (!s == !runCoder)
548
1.99M
              runCoder = context.first < 1 ? 4 : context.first < 2 ? 3 : context.first < 4 ? 2 : context.first < 8 ? 1 : 0;
549
4.61M
          }
550
12.2M
          else if (!s == !runCoder)
551
2.17M
            runCoder = q == 1024 ? context.first < 2u ? 1 : 0 : (context.first < 4u && context.second < 2u) ? 4 : (context.first < 8u
552
1.64M
            && context.second < 4u) ? 3 : (2u * sumSq < 3u * context.second + 48u) ? 2 : (2u * sumSq < 5u * context.second + 32u) ? 1 : 0;
553
16.8M
        }
554
94.3M
      }
555
91.4M
    }
556
8.32M
    if (run)  // flush run
557
1.53M
      runCoder == 1 ? unsignedCode<1>(run, stream) : runCoder == 2 ? unsignedCode<2>(run, stream)
558
300k
        : runCoder == 3 ? unsignedCode<3>(run, stream) : unsignedCode<4>(run, stream);
559
8.32M
  }
560
561
  template<typename T> void decode(Image<T> & image, Bits & stream, int x0, int y0, int x1, int y1, int step, int scheme, int q, bool hasDC, bool isChroma)
562
  {
563
    int const sizex = x1 - x0;
564
    int const sizey = y1 - y0;
565
    if (hasDC && sizex > 0 && sizey > 0)
566
      image[y0][x0] = signedDecode<4>(stream);
567
    std::pair<uint32_t, uint32_t> context(0, 0);
568
    int run = -1, runCoder = (scheme == EncoderTurbo ? (!q || (step < 2048 && q * step < 2048)) ? 1 : 0 : 0);  // avoid overflow checking q * step < 2048
569
    for (int y = 0; y < sizey; y += step)
570
    {
571
      T * base = &image[y0 + y][x0];
572
      int const xStep = (y & step) ? step : step * 2;
573
      for (int x = xStep - step; x < sizex; x += xStep) // [NOTE] arranged so that (x | y) & step == 1
574
      {
575
        T s = 0;
576
        if (runCoder && run == -1)
577
          run = runCoder == 1 ? unsignedDecode<1>(stream) : runCoder == 2 ? unsignedDecode<2>(stream)
578
            : runCoder == 3 ? unsignedDecode<3>(stream) : unsignedDecode<4>(stream);
579
        if (run > 0)
580
          -- run; // consume a zero
581
        else
582
        {
583
          if (scheme == EncoderTurbo)
584
            s = interleavedDecode<1>(stream);
585
          else
586
          {
587
            if (scheme == EncoderContextual)
588
              context = getContext(image, x0, y0, x1, y1, x, y, step);
589
            uint32_t const sumSq = square(context.first);
590
            if (sumSq < 2u * context.second + (isChroma ? 250u : 100u))
591
              s = interleavedDecode<0>(stream);
592
            else if (sumSq < 2u * context.second + 950u)
593
              s = interleavedDecode<1>(stream);
594
            else if (sumSq < 3u * context.second + 3000u)
595
            {
596
              if (sumSq < 5u * context.second + 400u)
597
                s = signedDecode<1>(stream);
598
              else
599
                s = interleavedDecode<2>(stream);
600
            }
601
            else if (sumSq < 3u * context.second + 12000u)
602
            {
603
              if (sumSq < 5u * context.second + 3000u)
604
                s = signedDecode<2>(stream);
605
              else
606
                s = interleavedDecode<3>(stream);
607
            }
608
            else if (sumSq < 4u * context.second + 44000u)
609
            {
610
              if (sumSq < 6u * context.second + 12000u)
611
                s = signedDecode<3>(stream);
612
              else
613
                s = interleavedDecode<4>(stream);
614
            }
615
            else
616
              s = signedDecode<4>(stream);
617
            if (scheme == EncoderFast)  // use decaying first and second moment
618
            {
619
              uint32_t const t = abs(s);
620
              context = std::make_pair(((context.first * 15u + 7u) >> 4) + t, ((context.second * 15u + 7u) >> 4) + square(std::min(t, 4096u)));
621
              if (!s == !runCoder)
622
                runCoder = context.first < 1 ? 4 : context.first < 2 ? 3 : context.first < 4 ? 2 : context.first < 8 ? 1 : 0;
623
            }
624
            else if (!s == !runCoder)
625
              runCoder = q == 1024 ? context.first < 2u ? 1 : 0 : (context.first < 4u && context.second < 2u) ? 4 : (context.first < 8u
626
              && context.second < 4u) ? 3 : (2u * sumSq < 3u * context.second + 48u) ? 2 : (2u * sumSq < 5u * context.second + 32u) ? 1 : 0;
627
          }
628
          if (run == 0 && s <= 0)
629
            -- s; // s can't be zero, so shift negatives by 1
630
          run = -1;
631
        }
632
        base[x] = s;
633
      }
634
    }
635
  }
636
637
  template<typename T> void shiftVector(T * data, int shift, int count)
638
1.05k
  {
639
1.05k
    OMP_PARALLEL_FOR(ThreadIterations * ThreadIterations)
640
34.1M
    for (int i = 0; i < count; ++ i)
641
34.1M
      data[i] >>= shift;
642
1.05k
  }
643
644
  template<typename I, typename A> void transformTerm(int const * & pc, A * destination, A const * auxData, int const bufferSize,
645
    I const & imageData, Header const & header, std::vector<int> const & isChroma, int boost)
646
2.41k
  {
647
5.88k
    while (*pc >= 0)
648
3.47k
    {
649
3.47k
      int const c = *(pc ++);
650
3.47k
      A const factor = *(pc ++);
651
3.47k
      if (isChroma[c] == -1)
652
1.61k
      {
653
1.61k
        auto layer = imageData + ((c / header.channels) * bufferSize * header.channels + c % header.channels);
654
1.61k
        A const boostFactor = boost * factor;
655
1.61k
        OMP_PARALLEL_FOR(ThreadIterations * ThreadIterations)
656
41.1M
        for (int i = 0; i < bufferSize; ++ i)
657
41.1M
          destination[i] += layer[i * header.channels] * boostFactor;
658
1.61k
      }
659
1.85k
      else
660
1.85k
      {
661
1.85k
        A const * auxDataC = auxData + c * bufferSize;
662
1.85k
        OMP_PARALLEL_FOR(ThreadIterations * ThreadIterations)
663
54.7M
        for (int i = 0; i < bufferSize; ++ i)
664
54.7M
          destination[i] += auxDataC[i] * factor;
665
1.85k
      }
666
3.47k
    }
667
2.41k
    A const denom = *((++ pc) ++);
668
2.41k
    if (denom == 2)
669
246
      shiftVector(destination, 1, bufferSize);
670
2.17k
    else if (denom == 4)
671
560
      shiftVector(destination, 2, bufferSize);
672
1.61k
    else if (denom == 8)
673
246
      shiftVector(destination, 3, bufferSize);
674
1.36k
    else if (denom > 1)  // [NOTE] disallow non-positive denominators
675
0
    {
676
0
      OMP_PARALLEL_FOR(ThreadIterations * ThreadIterations)
677
0
      for (int i = 0; i < bufferSize; ++ i)
678
0
        destination[i] /= denom;
679
0
    }
680
2.41k
  }
681
682
  // GFWX_TRANSFORM_UYV implements YUV (actually UYV) as R -= G (chroma); B -= G (chroma); G += (R + B) / 4 (luma)
683
3.25k
  #define GFWX_TRANSFORM_UYV { 0, 1, -1, -1, 1, 1, 2, 1, -1, -1, 1, 1, 1, 0, 1, 2, 1, -1, 4, 0, -1 }
684
  // GFWX_TRANSFORM_A710 implements A710 as R -= G (chroma); B -= (G * 2 + R) / 2 (chroma); G += (B * 2 + R * 3) / 8 (luma)
685
3.25k
  #define GFWX_TRANSFORM_A710_BGR { 2, 1, -1, -1, 1, 1, 0, 1, -2, 2, -1, -1, 2, 1, 1, 0, 2, 2, 3, -1, 8, 0, -1 }
686
3.25k
  #define GFWX_TRANSFORM_A710_RGB { 0, 1, -1, -1, 1, 1, 2, 1, -2, 0, -1, -1, 2, 1, 1, 2, 2, 0, 3, -1, 8, 0, -1 }
687
688
  template<typename I> ptrdiff_t compress(I const & imageData, Header & header, uint8_t * buffer, size_t size,
689
    int const * channelTransform, uint8_t * metaData, size_t metaDataSize)
690
3.02k
  {
691
3.02k
    typedef typename std::remove_reference<decltype(imageData[0])>::type base;
692
3.02k
    typedef typename std::conditional<sizeof(base) < 2, int16_t, int32_t>::type aux;
693
3.02k
    if (header.sizex > (1 << 30) || header.sizey > (1 << 30))  // [NOTE] current implementation can't go over 2^30
694
0
      return ErrorMalformed;
695
3.02k
    Bits stream(reinterpret_cast<uint32_t *>(buffer), reinterpret_cast<uint32_t *>(buffer) + size / 4);
696
3.02k
    stream.putBits('G' | ('F' << 8) | ('W' << 16) | ('X' << 24), 32);
697
3.02k
    stream.putBits(header.version = 1, 32);
698
3.02k
    stream.putBits(header.sizex, 32);
699
3.02k
    stream.putBits(header.sizey, 32);
700
3.02k
    stream.putBits(header.layers - 1, 16);
701
3.02k
    stream.putBits(header.channels - 1, 16);
702
3.02k
    stream.putBits((header.bitDepth ? header.bitDepth : (header.bitDepth = std::numeric_limits<base>::digits)) - 1, 8);
703
3.02k
    stream.putBits(header.isSigned = std::numeric_limits<base>::is_signed ? 1 : 0, 1);
704
3.02k
    stream.putBits(header.quality - 1, 10);
705
3.02k
    stream.putBits(header.chromaScale - 1, 8);
706
3.02k
    stream.putBits(header.blockSize - 2, 5);
707
3.02k
    stream.putBits(header.filter, 8);
708
3.02k
    stream.putBits(header.quantization, 8);
709
3.02k
    stream.putBits(header.encoder, 8);
710
3.02k
    stream.putBits(header.intent, 8);
711
3.02k
    stream.putBits(int(metaDataSize / 4), 32);
712
3.02k
    stream.buffer = std::copy(reinterpret_cast<uint32_t *>(metaData), reinterpret_cast<uint32_t *>(metaData) + metaDataSize / 4, stream.buffer);
713
3.02k
    int const bufferSize = header.sizex * header.sizey;
714
3.02k
    std::vector<aux> auxData((size_t)header.layers * header.channels * bufferSize, 0);
715
3.02k
    std::vector<int> isChroma(header.layers * header.channels, -1);
716
3.02k
    int const chromaQuality = std::max(1, (header.quality + header.chromaScale / 2) / header.chromaScale);
717
3.02k
    int const boost = header.quality == QualityMax ? 1 : 8; // [NOTE] due to Cubic lifting max multiplier of 20, boost * 20 must be less than 256
718
3.02k
    if (channelTransform)  // run color transform program (and also encode it to the file)
719
806
    {
720
806
      int const * pc = channelTransform;
721
3.22k
      while (*pc >= 0)
722
2.41k
      {
723
2.41k
        int const c = *(pc ++);
724
2.41k
        aux * destination = &auxData[c * bufferSize];
725
2.41k
        transformTerm(pc, destination, &auxData[0], bufferSize, imageData, header, isChroma, boost);
726
2.41k
        auto layer = imageData + ((c / header.channels) * bufferSize * header.channels + c % header.channels);
727
2.41k
        OMP_PARALLEL_FOR(ThreadIterations * ThreadIterations)
728
61.7M
        for (int i = 0; i < bufferSize; ++ i)
729
61.7M
          destination[i] += layer[i * header.channels] * boost;
730
2.41k
        isChroma[c] = *(pc ++);
731
2.41k
      }
732
18.2k
      for (int const * i = channelTransform; i <= pc; ++ i)
733
17.4k
        signedCode<2>(*i, stream);
734
806
    }
735
2.21k
    else
736
2.21k
      signedCode<2>(-1, stream);
737
3.02k
    stream.flushWriteWord();
738
12.1k
    for (int c = 0; c < header.layers * header.channels; ++ c) if (isChroma[c] == -1)  // copy channels having no transform
739
6.65k
    {
740
6.65k
      aux * destination = &auxData[c * bufferSize];
741
6.65k
      auto layer = imageData + ((c / header.channels) * bufferSize * header.channels + c % header.channels);
742
6.65k
      OMP_PARALLEL_FOR(ThreadIterations * ThreadIterations)
743
60.3M
      for (int i = 0; i < bufferSize; ++ i)
744
60.3M
        destination[i] = layer[i * header.channels] * boost;
745
6.65k
      isChroma[c] = 0;
746
6.65k
    }
747
12.1k
    for (int c = 0; c < header.layers * header.channels; ++ c)  // lift and quantize the channels
748
9.07k
    {
749
9.07k
      Image<aux> auxImage(&auxData[c * bufferSize], header.sizex, header.sizey);
750
9.07k
      lift(auxImage, 0, 0, header.sizex, header.sizey, 1, header.filter);
751
9.07k
      if (header.intent >= IntentBayerRGGB && header.intent <= IntentBayerGeneric)
752
4.30k
      {
753
21.5k
        for (int ox = 0; ox <= 1; ++ ox) for (int oy = 1 - ox; oy <= 1; ++ oy)
754
12.9k
          lift(auxImage, ox, oy, header.sizex, header.sizey, 2, header.filter);
755
25.8k
        for (int ox = 0; ox <= 1; ++ ox) for (int oy = 0; oy <= 1; ++ oy)
756
17.2k
          quantize<aux, false>(auxImage, ox, oy, header.sizex, header.sizey, 2,
757
17.2k
            (ox | oy) ? chromaQuality : header.quality, header.quality, QualityMax * boost);
758
4.30k
      }
759
4.76k
      else
760
4.76k
        quantize<aux, false>(auxImage, 0, 0, header.sizex, header.sizey, 1, isChroma[c] ? chromaQuality : header.quality, 0, QualityMax * boost);
761
9.07k
    }
762
3.02k
    int step = 1;
763
16.8k
    while (step * 2 < header.sizex || step * 2 < header.sizey)
764
13.7k
      step *= 2;
765
13.4k
    for (bool hasDC = true; step >= 1; hasDC = false)
766
13.1k
    {
767
13.1k
      int64_t const bs = int64_t(step) << header.blockSize;
768
13.1k
      int const blockCountX = (header.sizex + bs - 1) / bs;
769
13.1k
      int const blockCountY = (header.sizey + bs - 1) / bs;
770
13.1k
      int const blockCount = blockCountX * blockCountY * header.layers * header.channels;
771
13.1k
      std::vector<Bits> streamBlock(blockCount, Bits(0, 0));
772
13.1k
      uint32_t * blockBegin = stream.buffer + blockCount; // leave space for block sizes
773
13.1k
      if (blockBegin >= stream.bufferEnd)
774
687
        return ErrorOverflow;
775
3.51M
      for (int block = 0; block < blockCount; ++ block)  // partition buffer into temporary regions for each block
776
3.50M
        streamBlock[block].buffer = blockBegin + (stream.bufferEnd - blockBegin) * block / blockCount;
777
3.51M
      for (int block = 0; block < blockCount; ++ block)
778
3.50M
        streamBlock[block].bufferEnd = block + 1 < blockCount ? streamBlock[block + 1].buffer : stream.bufferEnd;
779
12.4k
      OMP_PARALLEL_FOR(4) // [MAGIC] for some reason, 4 is by far the best option here
780
3.51M
      for (int block = 0; block < blockCount; ++ block)
781
3.50M
      {
782
3.50M
        int const bx = block % blockCountX, by = (block / blockCountX) % blockCountY, c = block / (blockCountX * blockCountY);
783
3.50M
        Image<aux> auxImage(&auxData[c * bufferSize], header.sizex, header.sizey);
784
3.50M
        if (header.intent < IntentBayerRGGB || header.intent > IntentBayerGeneric)
785
1.90M
          encode(auxImage, streamBlock[block], bx * bs, by * bs,
786
1.90M
          int(std::min((bx + 1) * bs, int64_t(header.sizex))), int(std::min((by + 1) * bs, int64_t(header.sizey))),
787
1.90M
          step, header.encoder, isChroma[c] ? chromaQuality : header.quality, hasDC && !bx && !by, isChroma[c] != 0);
788
9.63M
        else for (int ox = 0; ox <= 1; ++ ox) for (int oy = 0; oy <= 1; ++ oy)
789
6.42M
          encode(auxImage, streamBlock[block], bx * bs + ox, by * bs + oy,
790
6.42M
          int(std::min((bx + 1) * bs, int64_t(header.sizex))), int(std::min((by + 1) * bs, int64_t(header.sizey))),
791
6.42M
          2 * step, header.encoder, (ox || oy) ? chromaQuality : header.quality, hasDC && !bx && !by, ox || oy);
792
3.50M
        streamBlock[block].flushWriteWord();
793
3.50M
      }
794
2.17M
      for (int block = 0; block < blockCount; ++ block)  // check streamBlocks for overflow
795
2.16M
        if (streamBlock[block].indexBits < 0)
796
2.05k
          return ErrorOverflow;
797
2.15M
      for (int block = 0; block < blockCount; ++ block)  // encode block lengths [NOTE] this 32-bit encoding limits the file size to < 16 GB
798
2.14M
        *(stream.buffer ++) = uint32_t(streamBlock[block].buffer - (block ? streamBlock[block - 1].bufferEnd : blockBegin));
799
2.15M
      for (int block = 0; block < blockCount; ++ block)  // pack the streamBlock data tightly, by word [NOTE] first block is already packed
800
2.14M
        stream.buffer = block ? std::copy(streamBlock[block - 1].bufferEnd, streamBlock[block].buffer, stream.buffer) : streamBlock[0].buffer;
801
10.4k
      step /= 2;
802
10.4k
    }
803
284
    return reinterpret_cast<uint8_t *>(stream.buffer) - buffer; // return size in bytes
804
3.02k
  }
805
806
  template<typename I> ptrdiff_t decompress(I const & imageData, Header & header, uint8_t const * data, size_t size, int downsampling, bool test)
807
  {
808
    typedef typename std::remove_reference<decltype(imageData[0])>::type base;
809
    typedef typename std::conditional<sizeof(base) < 2, int16_t, int32_t>::type aux;
810
    Bits stream(reinterpret_cast<uint32_t *>(const_cast<uint8_t *>(data)), reinterpret_cast<uint32_t *>(const_cast<uint8_t *>(data)) + size / 4);
811
    if (size < 28)  // at least load the header
812
      return 28;
813
    if (stream.getBits(32) != uint32_t('G' | ('F' << 8) | ('W' << 16) | ('X' << 24)))
814
      return ErrorMalformed;
815
    header.version = stream.getBits(32);
816
    header.sizex = stream.getBits(32);
817
    header.sizey = stream.getBits(32);
818
    header.layers = stream.getBits(16) + 1;
819
    header.channels = stream.getBits(16) + 1;
820
    header.bitDepth = stream.getBits(8) + 1;
821
    header.isSigned = stream.getBits(1);
822
    header.quality = stream.getBits(10) + 1;
823
    header.chromaScale = stream.getBits(8) + 1;
824
    header.blockSize = stream.getBits(5) + 2;
825
    header.filter = stream.getBits(8);
826
    header.quantization = stream.getBits(8);
827
    header.encoder = stream.getBits(8);
828
    header.intent = stream.getBits(8);
829
    if (header.sizex < 0 || header.sizex > (1 << 30) || header.sizey < 0 || header.sizey > (1 << 30) || header.bufferSize() == 0)
830
      return ErrorMalformed;  // [NOTE] current implementation can't go over 2^30
831
    if (!imageData)   // just header
832
      return ResultOk;
833
    if (header.isSigned != (std::numeric_limits<base>::is_signed ? 1 : 0) || header.bitDepth > std::numeric_limits<base>::digits)
834
      return ErrorTypeMismatch; // check for correct buffer type (though doesn't test the buffer size)
835
    // [NOTE] clients can read metadata themselves by accessing the size (in words) at word[7] and the metadata at word[8+]
836
    if ((stream.buffer += stream.getBits(32)) >= stream.bufferEnd) // skip metadata
837
      return reinterpret_cast<uint8_t *>(stream.buffer) - data; // suggest point of interest to skip metadata
838
    int const sizexDown = (header.sizex + (1 << downsampling) - 1) >> downsampling, sizeyDown = (header.sizey + (1 << downsampling) - 1) >> downsampling;
839
    int const bufferSize = sizexDown * sizeyDown;
840
    std::vector<aux> auxData((size_t)header.layers * header.channels * bufferSize, 0);
841
    std::vector<int> isChroma(header.layers * header.channels, 0), transformProgram, transformSteps;
842
    size_t nextPointOfInterest = size + 1024; // guess next point of interest [NOTE] may be larger than the complete file
843
    while (true)  // decode color transform program (including isChroma flags)
844
    {
845
      transformProgram.push_back(signedDecode<2>(stream));  // channel
846
      if (transformProgram.back() >= static_cast<int>(isChroma.size()))
847
          return ErrorMalformed;
848
      if (transformProgram.back() < 0)
849
        break;
850
      transformSteps.push_back(int(transformProgram.size()) - 1);
851
      while (true)
852
      {
853
        if (stream.indexBits < 0) // test for truncation
854
          return nextPointOfInterest; // need more data
855
        transformProgram.push_back(signedDecode<2>(stream));  // other channel
856
        if (transformProgram.back() >= static_cast<int>(isChroma.size()))
857
            return ErrorMalformed;
858
        if (transformProgram.back() < 0)
859
          break;
860
        transformProgram.push_back(signedDecode<2>(stream));  // factor
861
      }
862
      transformProgram.push_back(signedDecode<2>(stream));  // denominator
863
      transformProgram.push_back(signedDecode<2>(stream));  // chroma flag
864
      isChroma[transformProgram[transformSteps.back()]] = transformProgram.back();
865
    }
866
    stream.flushReadWord();
867
    int const chromaQuality = std::max(1, (header.quality + header.chromaScale / 2) / header.chromaScale);
868
    int const boost = header.quality == QualityMax ? 1 : 8; // [NOTE] due to Cubic lifting max multiplier of 20, boost * 20 must be less than 256
869
    bool isTruncated = false;
870
    int step = 1;
871
    while (step * 2 < header.sizex || step * 2 < header.sizey)
872
      step *= 2;
873
    for (bool hasDC = true; (step >> downsampling) >= 1; hasDC = false) // decode just enough coefficients for downsampled image
874
    {
875
      int64_t const bs = int64_t(step) << header.blockSize;
876
      int const blockCountX = int((header.sizex + bs - 1) / bs);
877
      int const blockCountY = int((header.sizey + bs - 1) / bs);
878
      int const blockCount = blockCountX * blockCountY * header.layers * header.channels;
879
      isTruncated = true;
880
      if (stream.buffer + 1 + blockCount > stream.bufferEnd)  // check for enough buffer to read block sizes
881
        break;
882
      std::vector<Bits> streamBlock(blockCount, Bits(0, 0));
883
      for (int block = 0; block < blockCount; ++ block) // first, read sizes into bufferEnd pointers
884
        streamBlock[block].bufferEnd = static_cast<uint32_t *>(0) + *(stream.buffer ++);
885
      for (int block = 0; block < blockCount; ++ block) // then convert sizes to true buffer pointers
886
        streamBlock[block].bufferEnd = (streamBlock[block].buffer = block ? streamBlock[block - 1].bufferEnd : stream.buffer)
887
                        + (streamBlock[block].bufferEnd - static_cast<uint32_t *>(0));
888
      stream.buffer = streamBlock[blockCount - 1].bufferEnd;
889
      nextPointOfInterest = reinterpret_cast<uint8_t *>(stream.buffer + ((step >> downsampling) > 1 ? blockCount * 4 : 0)) - data;
890
      if (stream.buffer <= stream.bufferEnd)
891
        isTruncated = false;
892
      int const stepDown = step >> downsampling;
893
      int64_t const bsDown = int64_t(stepDown) << header.blockSize;
894
      OMP_PARALLEL_FOR(4) // [MAGIC] for some reason, 4 is by far the best option here
895
      for (int block = 0; block < blockCount; ++ block) if (!test && streamBlock[block].bufferEnd <= stream.bufferEnd)
896
      {
897
        int const bx = block % blockCountX, by = (block / blockCountX) % blockCountY, c = block / (blockCountX * blockCountY);
898
        Image<aux> auxImage(&auxData[c * bufferSize], sizexDown, sizeyDown);
899
        if (header.intent < IntentBayerRGGB || header.intent > IntentBayerGeneric)
900
          decode(auxImage, streamBlock[block], int(bx * bsDown), int(by * bsDown),
901
          int(std::min((bx + 1) * bsDown, int64_t(sizexDown))), int(std::min((by + 1) * bsDown, int64_t(sizeyDown))),
902
          stepDown, header.encoder, isChroma[c] ? chromaQuality : header.quality, hasDC && !bx && !by, isChroma[c] != 0);
903
        else for (int ox = 0; ox <= 1; ++ ox) for (int oy = 0; oy <= 1; ++ oy)
904
          decode(auxImage, streamBlock[block], int(bx * bsDown + ox), int(by * bsDown + oy),
905
          int(std::min((bx + 1) * bsDown, int64_t(sizexDown))), int(std::min((by + 1) * bsDown, int64_t(sizeyDown))),
906
          2 * stepDown, header.encoder, (ox || oy) ? chromaQuality : header.quality, hasDC && !bx && !by, ox || oy);
907
      }
908
      for (int block = 0; block < blockCount; ++ block) // check if any blocks ran out of buffer, which should not happen on valid files
909
        if (streamBlock[block].indexBits < 0)
910
          return ErrorMalformed;
911
      step /= 2;
912
    }
913
    if (test)
914
      return isTruncated ? nextPointOfInterest : ResultOk;  // return next point of interest if the data was truncated prior to completing request
915
    for (int c = 0; c < header.layers * header.channels; ++ c)  // dequantize and unlift the channels
916
    {
917
      Image<aux> auxImage(&auxData[c * bufferSize], sizexDown, sizeyDown);
918
      if (header.intent >= IntentBayerRGGB && header.intent <= IntentBayerGeneric)
919
      {
920
        for (int ox = 0; ox <= 1; ++ ox) for (int oy = 0; oy <= 1; ++ oy)
921
          quantize<aux, true>(auxImage, ox, oy, sizexDown, sizeyDown, 2,
922
            ((ox | oy) ? chromaQuality : header.quality) << downsampling, header.quality, QualityMax * boost);
923
        for (int ox = 0; ox <= 1; ++ ox) for (int oy = 1 - ox; oy <= 1; ++ oy)
924
          unlift(auxImage, ox, oy, sizexDown, sizeyDown, 2, header.filter);
925
      }
926
      else
927
        quantize<aux, true>(auxImage, 0, 0, sizexDown, sizeyDown, 1,
928
          (isChroma[c] ? chromaQuality : header.quality) << downsampling, 0, QualityMax * boost);
929
      unlift(auxImage, 0, 0, sizexDown, sizeyDown, 1, header.filter);
930
    }
931
    for (int s = (int)transformSteps.size() - 1; s >= 0; -- s)  // run color transform program in reverse
932
    {
933
      int const * pc = &transformProgram[transformSteps[s]];
934
      int const c = *(pc ++);
935
      std::vector<aux> transformTemp(bufferSize, 0);
936
      transformTerm(pc, &transformTemp[0], &auxData[0], bufferSize, imageData, header, isChroma, boost);
937
      aux * destination = &auxData[c * bufferSize];
938
      OMP_PARALLEL_FOR(ThreadIterations * ThreadIterations)
939
      for (int i = 0; i < bufferSize; ++ i)
940
        destination[i] -= transformTemp[i];
941
    }
942
    for (int c = 0; c < header.layers * header.channels; ++ c)  // copy the channels to the destination buffer
943
    {
944
      aux * destination = &auxData[c * bufferSize];
945
      auto layer = imageData + ((c / header.channels) * bufferSize * header.channels + c % header.channels);
946
      if (boost == 1)
947
      {
948
        OMP_PARALLEL_FOR(ThreadIterations * ThreadIterations)
949
        for (int i = 0; i < bufferSize; ++ i)
950
          layer[i * header.channels] = static_cast<base>(std::max(static_cast<aux>(std::numeric_limits<base>::lowest()),
951
            std::min(static_cast<aux>(std::numeric_limits<base>::max()), static_cast<aux>(destination[i]))));
952
      }
953
      else
954
      {
955
        OMP_PARALLEL_FOR(ThreadIterations * ThreadIterations)
956
        for (int i = 0; i < bufferSize; ++ i)
957
          layer[i * header.channels] = static_cast<base>(std::max(static_cast<aux>(std::numeric_limits<base>::lowest()),
958
            std::min(static_cast<aux>(std::numeric_limits<base>::max()), static_cast<aux>(destination[i] / boost))));
959
      }
960
      if (header.quality < QualityMax && header.intent >= IntentBayerRGGB && header.intent <= IntentBayerGBRG)  // check if Bayer cleanup is required
961
      {
962
        int const bayerNoiseThresh = ((QualityMax + header.quality / 2) / header.quality + (QualityMax + chromaQuality / 2) / chromaQuality) * 2;
963
        Image<aux> auxImage(&auxData[c * bufferSize], sizexDown, sizeyDown);
964
        OMP_PARALLEL_FOR(ThreadIterations)
965
        for (int y = 1; y < sizeyDown - 1; ++ y)
966
          for (int x = 1 + (y + (header.intent == IntentBayerGBRG || header.intent == IntentBayerGRBG ? 1 : 0)) % 2; x < sizexDown - 1; x += 2)
967
          {
968
            aux s = auxImage[y][x];
969
            aux sum = s * 4;
970
            int count = 4;
971
            for (int oy = -1; oy <= 1; oy += 2) for (int ox = -1; ox <= 1; ox += 2)
972
            {
973
              aux t = auxImage[y + oy][x + ox];
974
              if (abs(s - t) > bayerNoiseThresh)
975
                continue;
976
              sum += t;
977
              ++ count;
978
            }
979
            layer[(y * sizexDown + x) * header.channels]
980
              = static_cast<base>(std::max(static_cast<aux>(std::numeric_limits<base>::lowest()),
981
                std::min(static_cast<aux>(std::numeric_limits<base>::max()), aux((sum + count / 2) / (count * boost)))));
982
          }
983
      }
984
    }
985
    return isTruncated ? nextPointOfInterest : ResultOk;  // return next point of interest if the data was truncated prior to completing request
986
  }
987
}