Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | //  Good, Fast Wavelet Codec "GFWX" v1  | 
2  |  | //  ----------------------------------  | 
3  |  | //  December 1, 2015 [patched on Oct 20, 2017]  | 
4  |  | //  Author: Graham Fyffe <gfyffe@gmail.com> or <fyffe@google.com>, and Google, Inc.  | 
5  |  | //  Website: www.gfwx.org  | 
6  |  | //  Features:  | 
7  |  | //  - FAST  | 
8  |  | //  - compression ratio similar to JPEG 2000  | 
9  |  | //  - under 1000 lines of code, with no external libraries  | 
10  |  | //  - 100% lossless at max quality  | 
11  |  | //  - low quality looks interpolated instead of blocky  | 
12  |  | //  - progressive decoding with optional downsampling  | 
13  |  | //  - supports uint8_t, int8_t, uint16_t, int16_t  | 
14  |  | //  - supports 1 to 65536 interleaved channels  | 
15  |  | //  - supports 1 to 65536 non-interleaved layers  | 
16  |  | //  - optional Bayer mode to compress Bayer data more  | 
17  |  | //  - optional chroma downsampling, even in Bayer mode  | 
18  |  | //  - optional user-programmable color/channel transform  | 
19  |  | //  - optional slightly less fast mode to compress more  | 
20  |  | //  - imageData can be any class with a pointer-like interface  | 
21  |  | //  - thoroughly tested using several pictures of cats  | 
22  |  | //  | 
23  |  | //  GFWX is released under the 3-clause BSD license:  | 
24  |  | //  | 
25  |  | //  Copyright (c) 2015, University of Southern California. All rights reserved. Redistribution and use in source and binary forms,  | 
26  |  | //  with or without modification, are permitted provided that the following conditions are met:  | 
27  |  | //  | 
28  |  | //  1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.  | 
29  |  | //  | 
30  |  | //  2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in  | 
31  |  | //     the documentation and/or other materials provided with the distribution.  | 
32  |  | //  | 
33  |  | //  3. Neither the name of the organization nor the names of its contributors may be used to endorse or promote products derived from  | 
34  |  | //     this software without specific prior written permission.  | 
35  |  | //  | 
36  |  | //  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT  | 
37  |  | //  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE  | 
38  |  | //  COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES  | 
39  |  | //  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)  | 
40  |  | //  HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)  | 
41  |  | //  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  | 
42  |  |  | 
43  |  | #pragma once  | 
44  |  | #include <algorithm>  | 
45  |  | #include <cstddef>  | 
46  |  | #include <cstdint>  | 
47  |  | #include <cstdlib>  | 
48  |  | #include <limits>  | 
49  |  | #include <type_traits>  | 
50  |  | #include <utility>  | 
51  |  | #include <vector>  | 
52  |  | #if defined(_OPENMP) && defined(_MSC_VER)  | 
53  |  | #define OMP_PARALLEL_FOR(X) __pragma(omp parallel for schedule(dynamic, X))  | 
54  |  | #elif defined(_OPENMP)  | 
55  |  | #include <omp.h>  | 
56  |  | #define STR(X) #X  | 
57  |  | #define STRINGIFY(X) STR(X)  | 
58  |  | #define TWO_ARGUMENTS(X,Y,Z) X(Y, Z)  | 
59  |  | #define OMP_PARALLEL_FOR(X) _Pragma(STRINGIFY(TWO_ARGUMENTS(omp parallel for schedule, dynamic, X)))  | 
60  |  | #else  | 
61  |  | #define OMP_PARALLEL_FOR(X)  | 
62  |  | #endif  | 
63  |  |  | 
64  |  | namespace GFWX  | 
65  |  | { | 
66  |  |   enum  | 
67  |  |   { | 
68  |  |     QualityMax = 1024,    // compress with QualityMax for 100% lossless, or less than QualityMax for lossy  | 
69  |  |     ThreadIterations = 64,  // OMP settings tuned on my machine with large images  | 
70  |  |     BitDepthAuto = 0, BlockDefault = 7, BlockMax = 30,  | 
71  |  |     FilterLinear = 0, FilterCubic = 1, QuantizationScalar = 0, EncoderTurbo = 0, EncoderFast = 1, EncoderContextual = 2,  | 
72  |  |     IntentGeneric = 0, IntentMono = 1, IntentBayerRGGB = 2, IntentBayerBGGR = 3, IntentBayerGRBG = 4, IntentBayerGBRG = 5, IntentBayerGeneric = 6,  | 
73  |  |     IntentRGB = 7, IntentRGBA = 8, IntentRGBApremult = 9, IntentBGR = 10, IntentBGRA = 11, IntentBGRApremult = 12, IntentCMYK = 13,  | 
74  |  |     ResultOk = 0, ErrorOverflow = -1, ErrorMalformed = -2, ErrorTypeMismatch = -3  | 
75  |  |   };  | 
76  |  |  | 
77  |  |   struct Header // use the empty constructor to fetch headers before decompressing, and use the parameterized constructor when compressing  | 
78  |  |   { | 
79  |  |     int sizex, sizey, layers, channels, bitDepth, quality, chromaScale, blockSize, filter, quantization, encoder, intent, version, isSigned;  | 
80  | 0  |     Header() {} | 
81  |  |     Header(int sizex, int sizey, int layers, int channels, int bitDepth,  | 
82  |  |       int quality, int chromaScale, int blockSize, int filter, int quantization, int encoder, int intent)  | 
83  | 3.04k  |       : sizex(sizex), sizey(sizey), layers(layers), channels(channels), bitDepth(bitDepth), quality(std::max(1, std::min(int(QualityMax), quality))),  | 
84  | 3.04k  |       chromaScale(std::max(1, std::min(256, chromaScale))), blockSize(std::min(30, std::max(2, blockSize))), filter(std::min(255, filter)),  | 
85  | 3.04k  |       quantization(std::min(255, quantization)), encoder(std::min(255, encoder)), intent(std::min(255, intent)) {} | 
86  |  |     size_t bufferSize() const  | 
87  | 0  |     { | 
88  | 0  |       size_t const part1 = static_cast<size_t>(sizex) * sizey;  | 
89  | 0  |       size_t const part2 = static_cast<size_t>(channels) * layers * ((bitDepth + 7) / 8);  | 
90  | 0  |       return std::log(part1) + std::log(part2) > std::log(std::numeric_limits<size_t>::max() - 1) ? 0 : part1 * part2;  | 
91  | 0  |     }  | 
92  |  |   };  | 
93  |  |  | 
94  |  |   template<typename T> struct Image // handy wrapper for 2D image data  | 
95  |  |   { | 
96  |  |     T * data;  | 
97  |  |     int sizex, sizey;  | 
98  | 3.51M  |     Image(T * data, int sizex, int sizey) : data(data), sizex(sizex), sizey(sizey) {} | 
99  | 739M  |     T * operator[] (int y) { return data + static_cast<size_t>(y) * sizex; } | 
100  |  |   };  | 
101  |  |  | 
102  |  |   struct Bits // handy wrapper for treating an array of unsigned ints as a bit stream  | 
103  |  |   { | 
104  |  |     uint32_t * buffer, * bufferEnd;  | 
105  |  |     uint32_t writeCache;  | 
106  |  |     int indexBits;  // -1 indicates buffer overflow  | 
107  | 16.1k  |     Bits(uint32_t * buffer, uint32_t * bufferEnd) : buffer(buffer), bufferEnd(bufferEnd), writeCache(0), indexBits(0) {} | 
108  |  |     uint32_t getBits(int bits)  | 
109  | 0  |     { | 
110  | 0  |       int newBits = indexBits + bits;  | 
111  | 0  |       if (buffer == bufferEnd)  | 
112  | 0  |         return indexBits = -1;  // signify overflow  | 
113  | 0  |       uint32_t x = *buffer << indexBits;  | 
114  | 0  |       if (newBits >= 32)  | 
115  | 0  |       { | 
116  | 0  |         ++ buffer;  | 
117  | 0  |         if ((newBits -= 32) > 0)  | 
118  | 0  |         { | 
119  | 0  |           if (buffer == bufferEnd)  | 
120  | 0  |             return indexBits = -1;  // signify overflow  | 
121  | 0  |           x |= *buffer >> (32 - indexBits);  | 
122  | 0  |         }  | 
123  | 0  |       }  | 
124  | 0  |       indexBits = newBits;  | 
125  | 0  |       return x >> (32 - bits);  | 
126  | 0  |     }  | 
127  |  |     void putBits(uint32_t x, int bits)  | 
128  | 58.1M  |     { | 
129  | 58.1M  |       int newBits = indexBits + bits;  | 
130  | 58.1M  |       if (buffer == bufferEnd)  | 
131  | 10.7M  |         newBits = -1;  // signify overflow  | 
132  | 47.3M  |       else if (newBits < 32)  | 
133  | 36.1M  |         (writeCache <<= bits) |= x;  | 
134  | 11.1M  |       else if (bits == 32 && newBits == 32)  | 
135  | 13.8k  |       { | 
136  | 13.8k  |         newBits = 0;  | 
137  | 13.8k  |         *(buffer ++) = x;  | 
138  | 13.8k  |       }  | 
139  | 11.1M  |       else  | 
140  | 11.1M  |       { | 
141  | 11.1M  |         newBits -= 32;  | 
142  | 11.1M  |         *(buffer ++) = (writeCache << (bits - newBits)) | (x >> newBits);  | 
143  | 11.1M  |         writeCache = x;  | 
144  | 11.1M  |       }  | 
145  | 58.1M  |       indexBits = newBits;  | 
146  | 58.1M  |     }  | 
147  |  |     uint32_t getZeros(uint32_t maxZeros)  | 
148  | 0  |     { | 
149  | 0  |       int newBits = indexBits;  | 
150  | 0  |       if (buffer == bufferEnd)  | 
151  | 0  |         return indexBits = -1;  // signify overflow  | 
152  | 0  |       uint32_t b = *buffer;  | 
153  | 0  |       uint32_t x = 0;  | 
154  | 0  |       while (true)  | 
155  | 0  |       { | 
156  | 0  |         if (newBits == 31)  | 
157  | 0  |         { | 
158  | 0  |           ++ buffer;  | 
159  | 0  |           if ((b & 1u) || (++ x == maxZeros))  | 
160  | 0  |           { | 
161  | 0  |             indexBits = 0;  | 
162  | 0  |             return x;  | 
163  | 0  |           }  | 
164  | 0  |           if (buffer == bufferEnd)  | 
165  | 0  |             return indexBits = -1;  // signify overflow  | 
166  | 0  |           b = *buffer;  | 
167  | 0  |           newBits = 0;  | 
168  | 0  |           continue;  | 
169  | 0  |         }  | 
170  | 0  |         if (((b << newBits) & (1u << 31)) || (++ x == maxZeros))  | 
171  | 0  |         { | 
172  | 0  |           indexBits = newBits + 1;  | 
173  | 0  |           return x;  | 
174  | 0  |         }  | 
175  | 0  |         ++ newBits;  | 
176  | 0  |       }  | 
177  | 0  |     }  | 
178  |  |     void flushWriteWord() // [NOTE] does not clear overflow  | 
179  | 3.50M  |     { | 
180  | 3.50M  |       putBits(0, (32 - indexBits) % 32);  | 
181  | 3.50M  |     }  | 
182  |  |     void flushReadWord()  // [NOTE] does not clear overflow  | 
183  | 0  |     { | 
184  | 0  |       if (indexBits <= 0)  | 
185  | 0  |         return;  | 
186  | 0  |       ++ buffer;  | 
187  | 0  |       indexBits = 0;  | 
188  | 0  |     }  | 
189  |  |   };  | 
190  |  |  | 
191  |  |   template<int pot> void unsignedCode(uint32_t x, Bits & stream)  // limited length power-of-two Golomb-Rice code  | 
192  | 43.9M  |   { | 
193  | 43.9M  |     uint32_t const y = x >> (pot);  | 
194  | 43.9M  |     if (y >= 12)  | 
195  | 13.2M  |     { | 
196  | 13.2M  |       stream.putBits(0, 12);  // escape to larger code  | 
197  | 13.2M  |       unsignedCode<pot < 20 ? pot + 4 : 24>(x - (12 << (pot)), stream);  | 
198  | 13.2M  |     }  | 
199  | 30.6M  |     else  | 
200  | 30.6M  |       stream.putBits((1 << (pot)) | (x & ~(~0u << (pot))), y + 1 + pot); // encode x / 2^pot in unary followed by x % 2^pot in binary  | 
201  | 43.9M  |   } void GFWX::unsignedCode<2>(unsigned int, GFWX::Bits&) Line  | Count  | Source  |  192  | 725k  |   { |  193  | 725k  |     uint32_t const y = x >> (pot);  |  194  | 725k  |     if (y >= 12)  |  195  | 68.1k  |     { |  196  | 68.1k  |       stream.putBits(0, 12);  // escape to larger code  |  197  | 68.1k  |       unsignedCode<pot < 20 ? pot + 4 : 24>(x - (12 << (pot)), stream);  |  198  | 68.1k  |     }  |  199  | 656k  |     else  |  200  | 656k  |       stream.putBits((1 << (pot)) | (x & ~(~0u << (pot))), y + 1 + pot); // encode x / 2^pot in unary followed by x % 2^pot in binary  |  201  | 725k  |   }  |  
 void GFWX::unsignedCode<6>(unsigned int, GFWX::Bits&) Line  | Count  | Source  |  192  | 68.1k  |   { |  193  | 68.1k  |     uint32_t const y = x >> (pot);  |  194  | 68.1k  |     if (y >= 12)  |  195  | 10.7k  |     { |  196  | 10.7k  |       stream.putBits(0, 12);  // escape to larger code  |  197  | 10.7k  |       unsignedCode<pot < 20 ? pot + 4 : 24>(x - (12 << (pot)), stream);  |  198  | 10.7k  |     }  |  199  | 57.3k  |     else  |  200  | 57.3k  |       stream.putBits((1 << (pot)) | (x & ~(~0u << (pot))), y + 1 + pot); // encode x / 2^pot in unary followed by x % 2^pot in binary  |  201  | 68.1k  |   }  |  
 void GFWX::unsignedCode<10>(unsigned int, GFWX::Bits&) Line  | Count  | Source  |  192  | 10.7k  |   { |  193  | 10.7k  |     uint32_t const y = x >> (pot);  |  194  | 10.7k  |     if (y >= 12)  |  195  | 888  |     { |  196  | 888  |       stream.putBits(0, 12);  // escape to larger code  |  197  | 888  |       unsignedCode<pot < 20 ? pot + 4 : 24>(x - (12 << (pot)), stream);  |  198  | 888  |     }  |  199  | 9.90k  |     else  |  200  | 9.90k  |       stream.putBits((1 << (pot)) | (x & ~(~0u << (pot))), y + 1 + pot); // encode x / 2^pot in unary followed by x % 2^pot in binary  |  201  | 10.7k  |   }  |  
 void GFWX::unsignedCode<14>(unsigned int, GFWX::Bits&) Line  | Count  | Source  |  192  | 888  |   { |  193  | 888  |     uint32_t const y = x >> (pot);  |  194  | 888  |     if (y >= 12)  |  195  | 36  |     { |  196  | 36  |       stream.putBits(0, 12);  // escape to larger code  |  197  | 36  |       unsignedCode<pot < 20 ? pot + 4 : 24>(x - (12 << (pot)), stream);  |  198  | 36  |     }  |  199  | 852  |     else  |  200  | 852  |       stream.putBits((1 << (pot)) | (x & ~(~0u << (pot))), y + 1 + pot); // encode x / 2^pot in unary followed by x % 2^pot in binary  |  201  | 888  |   }  |  
 void GFWX::unsignedCode<18>(unsigned int, GFWX::Bits&) Line  | Count  | Source  |  192  | 36  |   { |  193  | 36  |     uint32_t const y = x >> (pot);  |  194  | 36  |     if (y >= 12)  |  195  | 0  |     { |  196  | 0  |       stream.putBits(0, 12);  // escape to larger code  |  197  | 0  |       unsignedCode<pot < 20 ? pot + 4 : 24>(x - (12 << (pot)), stream);  |  198  | 0  |     }  |  199  | 36  |     else  |  200  | 36  |       stream.putBits((1 << (pot)) | (x & ~(~0u << (pot))), y + 1 + pot); // encode x / 2^pot in unary followed by x % 2^pot in binary  |  201  | 36  |   }  |  
 Unexecuted instantiation: void GFWX::unsignedCode<22>(unsigned int, GFWX::Bits&) Unexecuted instantiation: void GFWX::unsignedCode<24>(unsigned int, GFWX::Bits&) void GFWX::unsignedCode<1>(unsigned int, GFWX::Bits&) Line  | Count  | Source  |  192  | 14.1M  |   { |  193  | 14.1M  |     uint32_t const y = x >> (pot);  |  194  | 14.1M  |     if (y >= 12)  |  195  | 5.34M  |     { |  196  | 5.34M  |       stream.putBits(0, 12);  // escape to larger code  |  197  | 5.34M  |       unsignedCode<pot < 20 ? pot + 4 : 24>(x - (12 << (pot)), stream);  |  198  | 5.34M  |     }  |  199  | 8.76M  |     else  |  200  | 8.76M  |       stream.putBits((1 << (pot)) | (x & ~(~0u << (pot))), y + 1 + pot); // encode x / 2^pot in unary followed by x % 2^pot in binary  |  201  | 14.1M  |   }  |  
 void GFWX::unsignedCode<5>(unsigned int, GFWX::Bits&) Line  | Count  | Source  |  192  | 5.34M  |   { |  193  | 5.34M  |     uint32_t const y = x >> (pot);  |  194  | 5.34M  |     if (y >= 12)  |  195  | 2.24M  |     { |  196  | 2.24M  |       stream.putBits(0, 12);  // escape to larger code  |  197  | 2.24M  |       unsignedCode<pot < 20 ? pot + 4 : 24>(x - (12 << (pot)), stream);  |  198  | 2.24M  |     }  |  199  | 3.09M  |     else  |  200  | 3.09M  |       stream.putBits((1 << (pot)) | (x & ~(~0u << (pot))), y + 1 + pot); // encode x / 2^pot in unary followed by x % 2^pot in binary  |  201  | 5.34M  |   }  |  
 void GFWX::unsignedCode<9>(unsigned int, GFWX::Bits&) Line  | Count  | Source  |  192  | 2.24M  |   { |  193  | 2.24M  |     uint32_t const y = x >> (pot);  |  194  | 2.24M  |     if (y >= 12)  |  195  | 11.4k  |     { |  196  | 11.4k  |       stream.putBits(0, 12);  // escape to larger code  |  197  | 11.4k  |       unsignedCode<pot < 20 ? pot + 4 : 24>(x - (12 << (pot)), stream);  |  198  | 11.4k  |     }  |  199  | 2.23M  |     else  |  200  | 2.23M  |       stream.putBits((1 << (pot)) | (x & ~(~0u << (pot))), y + 1 + pot); // encode x / 2^pot in unary followed by x % 2^pot in binary  |  201  | 2.24M  |   }  |  
 void GFWX::unsignedCode<13>(unsigned int, GFWX::Bits&) Line  | Count  | Source  |  192  | 11.4k  |   { |  193  | 11.4k  |     uint32_t const y = x >> (pot);  |  194  | 11.4k  |     if (y >= 12)  |  195  | 70  |     { |  196  | 70  |       stream.putBits(0, 12);  // escape to larger code  |  197  | 70  |       unsignedCode<pot < 20 ? pot + 4 : 24>(x - (12 << (pot)), stream);  |  198  | 70  |     }  |  199  | 11.3k  |     else  |  200  | 11.3k  |       stream.putBits((1 << (pot)) | (x & ~(~0u << (pot))), y + 1 + pot); // encode x / 2^pot in unary followed by x % 2^pot in binary  |  201  | 11.4k  |   }  |  
 void GFWX::unsignedCode<17>(unsigned int, GFWX::Bits&) Line  | Count  | Source  |  192  | 70  |   { |  193  | 70  |     uint32_t const y = x >> (pot);  |  194  | 70  |     if (y >= 12)  |  195  | 0  |     { |  196  | 0  |       stream.putBits(0, 12);  // escape to larger code  |  197  | 0  |       unsignedCode<pot < 20 ? pot + 4 : 24>(x - (12 << (pot)), stream);  |  198  | 0  |     }  |  199  | 70  |     else  |  200  | 70  |       stream.putBits((1 << (pot)) | (x & ~(~0u << (pot))), y + 1 + pot); // encode x / 2^pot in unary followed by x % 2^pot in binary  |  201  | 70  |   }  |  
 Unexecuted instantiation: void GFWX::unsignedCode<21>(unsigned int, GFWX::Bits&) void GFWX::unsignedCode<3>(unsigned int, GFWX::Bits&) Line  | Count  | Source  |  192  | 1.05M  |   { |  193  | 1.05M  |     uint32_t const y = x >> (pot);  |  194  | 1.05M  |     if (y >= 12)  |  195  | 126k  |     { |  196  | 126k  |       stream.putBits(0, 12);  // escape to larger code  |  197  | 126k  |       unsignedCode<pot < 20 ? pot + 4 : 24>(x - (12 << (pot)), stream);  |  198  | 126k  |     }  |  199  | 926k  |     else  |  200  | 926k  |       stream.putBits((1 << (pot)) | (x & ~(~0u << (pot))), y + 1 + pot); // encode x / 2^pot in unary followed by x % 2^pot in binary  |  201  | 1.05M  |   }  |  
 void GFWX::unsignedCode<7>(unsigned int, GFWX::Bits&) Line  | Count  | Source  |  192  | 126k  |   { |  193  | 126k  |     uint32_t const y = x >> (pot);  |  194  | 126k  |     if (y >= 12)  |  195  | 12.0k  |     { |  196  | 12.0k  |       stream.putBits(0, 12);  // escape to larger code  |  197  | 12.0k  |       unsignedCode<pot < 20 ? pot + 4 : 24>(x - (12 << (pot)), stream);  |  198  | 12.0k  |     }  |  199  | 114k  |     else  |  200  | 114k  |       stream.putBits((1 << (pot)) | (x & ~(~0u << (pot))), y + 1 + pot); // encode x / 2^pot in unary followed by x % 2^pot in binary  |  201  | 126k  |   }  |  
 void GFWX::unsignedCode<11>(unsigned int, GFWX::Bits&) Line  | Count  | Source  |  192  | 12.0k  |   { |  193  | 12.0k  |     uint32_t const y = x >> (pot);  |  194  | 12.0k  |     if (y >= 12)  |  195  | 178  |     { |  196  | 178  |       stream.putBits(0, 12);  // escape to larger code  |  197  | 178  |       unsignedCode<pot < 20 ? pot + 4 : 24>(x - (12 << (pot)), stream);  |  198  | 178  |     }  |  199  | 11.8k  |     else  |  200  | 11.8k  |       stream.putBits((1 << (pot)) | (x & ~(~0u << (pot))), y + 1 + pot); // encode x / 2^pot in unary followed by x % 2^pot in binary  |  201  | 12.0k  |   }  |  
 void GFWX::unsignedCode<15>(unsigned int, GFWX::Bits&) Line  | Count  | Source  |  192  | 178  |   { |  193  | 178  |     uint32_t const y = x >> (pot);  |  194  | 178  |     if (y >= 12)  |  195  | 0  |     { |  196  | 0  |       stream.putBits(0, 12);  // escape to larger code  |  197  | 0  |       unsignedCode<pot < 20 ? pot + 4 : 24>(x - (12 << (pot)), stream);  |  198  | 0  |     }  |  199  | 178  |     else  |  200  | 178  |       stream.putBits((1 << (pot)) | (x & ~(~0u << (pot))), y + 1 + pot); // encode x / 2^pot in unary followed by x % 2^pot in binary  |  201  | 178  |   }  |  
 Unexecuted instantiation: void GFWX::unsignedCode<19>(unsigned int, GFWX::Bits&) Unexecuted instantiation: void GFWX::unsignedCode<23>(unsigned int, GFWX::Bits&) void GFWX::unsignedCode<4>(unsigned int, GFWX::Bits&) Line  | Count  | Source  |  192  | 12.6M  |   { |  193  | 12.6M  |     uint32_t const y = x >> (pot);  |  194  | 12.6M  |     if (y >= 12)  |  195  | 4.77M  |     { |  196  | 4.77M  |       stream.putBits(0, 12);  // escape to larger code  |  197  | 4.77M  |       unsignedCode<pot < 20 ? pot + 4 : 24>(x - (12 << (pot)), stream);  |  198  | 4.77M  |     }  |  199  | 7.84M  |     else  |  200  | 7.84M  |       stream.putBits((1 << (pot)) | (x & ~(~0u << (pot))), y + 1 + pot); // encode x / 2^pot in unary followed by x % 2^pot in binary  |  201  | 12.6M  |   }  |  
 void GFWX::unsignedCode<8>(unsigned int, GFWX::Bits&) Line  | Count  | Source  |  192  | 4.77M  |   { |  193  | 4.77M  |     uint32_t const y = x >> (pot);  |  194  | 4.77M  |     if (y >= 12)  |  195  | 77.5k  |     { |  196  | 77.5k  |       stream.putBits(0, 12);  // escape to larger code  |  197  | 77.5k  |       unsignedCode<pot < 20 ? pot + 4 : 24>(x - (12 << (pot)), stream);  |  198  | 77.5k  |     }  |  199  | 4.69M  |     else  |  200  | 4.69M  |       stream.putBits((1 << (pot)) | (x & ~(~0u << (pot))), y + 1 + pot); // encode x / 2^pot in unary followed by x % 2^pot in binary  |  201  | 4.77M  |   }  |  
 void GFWX::unsignedCode<12>(unsigned int, GFWX::Bits&) Line  | Count  | Source  |  192  | 77.5k  |   { |  193  | 77.5k  |     uint32_t const y = x >> (pot);  |  194  | 77.5k  |     if (y >= 12)  |  195  | 97  |     { |  196  | 97  |       stream.putBits(0, 12);  // escape to larger code  |  197  | 97  |       unsignedCode<pot < 20 ? pot + 4 : 24>(x - (12 << (pot)), stream);  |  198  | 97  |     }  |  199  | 77.4k  |     else  |  200  | 77.4k  |       stream.putBits((1 << (pot)) | (x & ~(~0u << (pot))), y + 1 + pot); // encode x / 2^pot in unary followed by x % 2^pot in binary  |  201  | 77.5k  |   }  |  
 void GFWX::unsignedCode<16>(unsigned int, GFWX::Bits&) Line  | Count  | Source  |  192  | 97  |   { |  193  | 97  |     uint32_t const y = x >> (pot);  |  194  | 97  |     if (y >= 12)  |  195  | 0  |     { |  196  | 0  |       stream.putBits(0, 12);  // escape to larger code  |  197  | 0  |       unsignedCode<pot < 20 ? pot + 4 : 24>(x - (12 << (pot)), stream);  |  198  | 0  |     }  |  199  | 97  |     else  |  200  | 97  |       stream.putBits((1 << (pot)) | (x & ~(~0u << (pot))), y + 1 + pot); // encode x / 2^pot in unary followed by x % 2^pot in binary  |  201  | 97  |   }  |  
 Unexecuted instantiation: void GFWX::unsignedCode<20>(unsigned int, GFWX::Bits&) void GFWX::unsignedCode<0>(unsigned int, GFWX::Bits&) Line  | Count  | Source  |  192  | 2.73M  |   { |  193  | 2.73M  |     uint32_t const y = x >> (pot);  |  194  | 2.73M  |     if (y >= 12)  |  195  | 572k  |     { |  196  | 572k  |       stream.putBits(0, 12);  // escape to larger code  |  197  | 572k  |       unsignedCode<pot < 20 ? pot + 4 : 24>(x - (12 << (pot)), stream);  |  198  | 572k  |     }  |  199  | 2.15M  |     else  |  200  | 2.15M  |       stream.putBits((1 << (pot)) | (x & ~(~0u << (pot))), y + 1 + pot); // encode x / 2^pot in unary followed by x % 2^pot in binary  |  201  | 2.73M  |   }  |  
  | 
202  |  |  | 
203  |  |   template<int pot> uint32_t unsignedDecode(Bits & stream)  | 
204  |  |   { | 
205  |  |     uint32_t x = stream.getZeros(12);  | 
206  |  |     int const p = pot < 24 ? pot : 24;  // actual pot. The max 108 below is to prevent unlimited recursion in malformed files, yet admit 2^32 - 1.  | 
207  |  |     return (pot < 108 && x == 12) ? (12 << p) + unsignedDecode<pot < 108 ? pot + 4 : 108>(stream) : p ? (x << p) + stream.getBits(p) : x;  | 
208  |  |   }  | 
209  |  |  | 
210  |  |   template<int pot> void interleavedCode(int x, Bits & stream)  | 
211  | 12.2M  |   { | 
212  | 12.2M  |     unsignedCode<pot>(x <= 0 ? -2 * x : 2 * x - 1, stream);  // interleave positive and negative values  | 
213  | 12.2M  |   } void GFWX::interleavedCode<1>(int, GFWX::Bits&) Line  | Count  | Source  |  211  | 8.07M  |   { |  212  | 8.07M  |     unsignedCode<pot>(x <= 0 ? -2 * x : 2 * x - 1, stream);  // interleave positive and negative values  |  213  | 8.07M  |   }  |  
 void GFWX::interleavedCode<0>(int, GFWX::Bits&) Line  | Count  | Source  |  211  | 2.73M  |   { |  212  | 2.73M  |     unsignedCode<pot>(x <= 0 ? -2 * x : 2 * x - 1, stream);  // interleave positive and negative values  |  213  | 2.73M  |   }  |  
 void GFWX::interleavedCode<2>(int, GFWX::Bits&) Line  | Count  | Source  |  211  | 358k  |   { |  212  | 358k  |     unsignedCode<pot>(x <= 0 ? -2 * x : 2 * x - 1, stream);  // interleave positive and negative values  |  213  | 358k  |   }  |  
 void GFWX::interleavedCode<3>(int, GFWX::Bits&) Line  | Count  | Source  |  211  | 424k  |   { |  212  | 424k  |     unsignedCode<pot>(x <= 0 ? -2 * x : 2 * x - 1, stream);  // interleave positive and negative values  |  213  | 424k  |   }  |  
 void GFWX::interleavedCode<4>(int, GFWX::Bits&) Line  | Count  | Source  |  211  | 632k  |   { |  212  | 632k  |     unsignedCode<pot>(x <= 0 ? -2 * x : 2 * x - 1, stream);  // interleave positive and negative values  |  213  | 632k  |   }  |  
  | 
214  |  |  | 
215  |  |   template<int pot> int interleavedDecode(Bits & stream)  | 
216  |  |   { | 
217  |  |     int const x = unsignedDecode<pot>(stream);  | 
218  |  |     return (x & 1) ? (x + 1) / 2 : -x / 2;  | 
219  |  |   }  | 
220  |  |  | 
221  |  |   template<int pot> void signedCode(int x, Bits & stream)  | 
222  | 12.1M  |   { | 
223  | 12.1M  |     unsignedCode<pot>(abs(x), stream);  | 
224  | 12.1M  |     if (x)  | 
225  | 10.6M  |       stream.putBits(x > 0 ? 1 : 0, 1);  | 
226  | 12.1M  |   } void GFWX::signedCode<2>(int, GFWX::Bits&) Line  | Count  | Source  |  222  | 312k  |   { |  223  | 312k  |     unsignedCode<pot>(abs(x), stream);  |  224  | 312k  |     if (x)  |  225  | 227k  |       stream.putBits(x > 0 ? 1 : 0, 1);  |  226  | 312k  |   }  |  
 void GFWX::signedCode<4>(int, GFWX::Bits&) Line  | Count  | Source  |  222  | 10.8M  |   { |  223  | 10.8M  |     unsignedCode<pot>(abs(x), stream);  |  224  | 10.8M  |     if (x)  |  225  | 9.77M  |       stream.putBits(x > 0 ? 1 : 0, 1);  |  226  | 10.8M  |   }  |  
 void GFWX::signedCode<1>(int, GFWX::Bits&) Line  | Count  | Source  |  222  | 304k  |   { |  223  | 304k  |     unsignedCode<pot>(abs(x), stream);  |  224  | 304k  |     if (x)  |  225  | 165k  |       stream.putBits(x > 0 ? 1 : 0, 1);  |  226  | 304k  |   }  |  
 void GFWX::signedCode<3>(int, GFWX::Bits&) Line  | Count  | Source  |  222  | 618k  |   { |  223  | 618k  |     unsignedCode<pot>(abs(x), stream);  |  224  | 618k  |     if (x)  |  225  | 460k  |       stream.putBits(x > 0 ? 1 : 0, 1);  |  226  | 618k  |   }  |  
  | 
227  |  |  | 
228  |  |   template<int pot> int signedDecode(Bits & stream)  | 
229  |  |   { | 
230  |  |     int x = unsignedDecode<pot>(stream);  | 
231  |  |     return x ? stream.getBits(1) ? x : -x : 0;  | 
232  |  |   }  | 
233  |  |  | 
234  |  |   template<typename T> T median(T a, T b, T c)  | 
235  | 152M  |   { | 
236  | 152M  |     return a < b ? c > b ? b : c < a ? a : c : c > a ? a : c < b ? b : c;  | 
237  | 152M  |   }  | 
238  |  |  | 
239  |  |   template<typename T> T roundFraction(T num, T denom)  | 
240  | 152M  |   { | 
241  | 152M  |     return num < 0 ? (num - denom / 2) / denom : (num + denom / 2) / denom;  | 
242  | 152M  |   }  | 
243  |  |  | 
244  |  |   template<typename T> T cubic(T c0, T c1, T c2, T c3)  | 
245  | 152M  |   { | 
246  | 152M  |     return median(T(roundFraction((-c0 + 9 * (c1 + c2) - c3), 16)), c1, c2);  | 
247  | 152M  |   }  | 
248  |  |  | 
249  |  |   template<typename T> void lift(Image<T> & image, int x0, int y0, int x1, int y1, int step, int filter)  | 
250  | 21.9k  |   { | 
251  | 21.9k  |     int const sizex = x1 - x0;  | 
252  | 21.9k  |     int const sizey = y1 - y0;  | 
253  | 127k  |     while (step < sizex || step < sizey)  | 
254  | 105k  |     { | 
255  | 105k  |       if (step < sizex)  // horizontal lifting  | 
256  | 51.2k  |       { | 
257  | 51.2k  |         OMP_PARALLEL_FOR(ThreadIterations)  | 
258  | 22.6M  |         for (int y = 0; y < sizey; y += step)  | 
259  | 22.5M  |         { | 
260  | 22.5M  |           int x;  | 
261  | 22.5M  |           T * base = &image[y0 + y][x0], * base1 = base - step, * base2 = base + step, * base3 = base + step * 3;  | 
262  | 22.5M  |           if (filter == FilterCubic)  | 
263  | 13.3M  |           { | 
264  | 13.3M  |             T c0 = *base, c1 = *base, c2 = step * 2 < sizex ? base[step * 2] : *base, c3;  | 
265  | 28.9M  |             for (x = step; x < sizex - step * 3; x += step * 2, c0 = c1, c1 = c2, c2 = c3)  | 
266  | 15.6M  |               base[x] -= cubic(c0, c1, c2, c3 = base3[x]);  | 
267  | 30.3M  |             for (; x < sizex; x += step * 2, c0 = c1, c1 = c2)  | 
268  | 17.0M  |               base[x] -= cubic(c0, c1, c2, c2);  | 
269  | 13.3M  |             T g0 = base[step], g1 = base[step], g2 = step * 3 < sizex ? base[step * 3] : base[step], g3;  | 
270  | 28.2M  |             for (x = step * 2; x < sizex - step * 3; x += step * 2, g0 = g1, g1 = g2, g2 = g3)  | 
271  | 14.9M  |               base[x] += cubic(g0, g1, g2, g3 = base3[x]) / 2;  | 
272  | 19.6M  |             for (; x < sizex; x += step * 2, g0 = g1, g1 = g2)  | 
273  | 6.34M  |               base[x] += cubic(g0, g1, g2, g2) / 2;  | 
274  | 13.3M  |           }  | 
275  | 9.24M  |           else  | 
276  | 9.24M  |           { | 
277  | 36.7M  |             for (x = step; x < sizex - step; x += step * 2)  | 
278  | 27.5M  |               base[x] -= (base1[x] + base2[x]) / 2;  | 
279  | 9.24M  |             if (x < sizex)  | 
280  | 7.70M  |               base[x] -= base1[x];  | 
281  | 35.2M  |             for (x = step * 2; x < sizex - step; x += step * 2)  | 
282  | 25.9M  |               base[x] += (base1[x] + base2[x]) / 4;  | 
283  | 9.24M  |             if (x < sizex)  | 
284  | 1.54M  |               base[x] += base1[x] / 2;  | 
285  | 9.24M  |           }  | 
286  | 22.5M  |         }  | 
287  | 51.2k  |       }  | 
288  | 105k  |       if (step < sizey)  // vertical lifting  | 
289  | 74.9k  |       { | 
290  | 74.9k  |         OMP_PARALLEL_FOR(ThreadIterations)  | 
291  | 71.0M  |         for (int y = step; y < sizey; y += step * 2)  | 
292  | 70.9M  |         { | 
293  | 70.9M  |           T * const base = &image[y0 + y][x0];  | 
294  | 70.9M  |           T const * const c1base = &image[y0 + y - step][x0], * const c2base = y + step < sizey ? &image[y0 + y + step][x0] : c1base;  | 
295  | 70.9M  |           if (filter == FilterCubic)  | 
296  | 37.9M  |           { | 
297  | 37.9M  |             T const * const c0base = y - step * 3 >= 0 ? &image[y0 + y - step * 3][x0] : c1base;  | 
298  | 37.9M  |             T const * const c3base = y + step * 3 < sizey ? &image[y0 + y + step * 3][x0] : c2base;  | 
299  | 87.5M  |             for (int x = 0; x < sizex; x += step)  | 
300  | 49.6M  |               base[x] -= cubic(c0base[x], c1base[x], c2base[x], c3base[x]);  | 
301  | 37.9M  |           }  | 
302  | 75.8M  |           else for (int x = 0; x < sizex; x += step)  | 
303  | 42.8M  |             base[x] -= (c1base[x] + c2base[x]) / 2;  | 
304  | 70.9M  |         }  | 
305  | 74.9k  |         OMP_PARALLEL_FOR(ThreadIterations)  | 
306  | 70.9M  |         for (int y = step * 2; y < sizey; y += step * 2)  | 
307  | 70.9M  |         { | 
308  | 70.9M  |           T * const base = &image[y0 + y][x0];  | 
309  | 70.9M  |           T const * const g1base = &image[y0 + y - step][x0], * const g2base = y + step < sizey ? &image[y0 + y + step][x0] : g1base;  | 
310  | 70.9M  |           if (filter == FilterCubic)  | 
311  | 37.8M  |           { | 
312  | 37.8M  |             T const * const g0base = y - step * 3 >= 0 ? &image[y0 + y - step * 3][x0] : g1base;  | 
313  | 37.8M  |             T const * const g3base = y + step * 3 < sizey ? &image[y0 + y + step * 3][x0] : g2base;  | 
314  | 86.9M  |             for (int x = 0; x < sizex; x += step)  | 
315  | 49.0M  |               base[x] += cubic(g0base[x], g1base[x], g2base[x], g3base[x]) / 2;  | 
316  | 37.8M  |           }  | 
317  | 74.7M  |           else for (int x = 0; x < sizex; x += step)  | 
318  | 41.7M  |             base[x] += (g1base[x] + g2base[x]) / 4;  | 
319  | 70.9M  |         }  | 
320  | 74.9k  |       }  | 
321  | 105k  |       step *= 2;  | 
322  | 105k  |     }  | 
323  | 21.9k  |   }  | 
324  |  |  | 
325  |  |   template<typename T> void unlift(Image<T> & image, int x0, int y0, int x1, int y1, int minStep, int filter)  | 
326  |  |   { | 
327  |  |     int const sizex = x1 - x0;  | 
328  |  |     int const sizey = y1 - y0;  | 
329  |  |     int step = minStep;  | 
330  |  |     while (step * 2 < sizex || step * 2 < sizey)  | 
331  |  |       step *= 2;  | 
332  |  |     while (step >= minStep)  | 
333  |  |     { | 
334  |  |       if (step < sizey) // vertical unlifting  | 
335  |  |       { | 
336  |  |         OMP_PARALLEL_FOR(ThreadIterations)  | 
337  |  |         for (int y = step * 2; y < sizey; y += step * 2)  | 
338  |  |         { | 
339  |  |           T * const base = &image[y0 + y][x0];  | 
340  |  |           T const * const g1base = &image[y0 + y - step][x0], * const g2base = y + step < sizey ? &image[y0 + y + step][x0] : g1base;  | 
341  |  |           if (filter == FilterCubic)  | 
342  |  |           { | 
343  |  |             T const * const g0base = y - step * 3 >= 0 ? &image[y0 + y - step * 3][x0] : g1base;  | 
344  |  |             T const * const g3base = y + step * 3 < sizey ? &image[y0 + y + step * 3][x0] : g2base;  | 
345  |  |             for (int x = 0; x < sizex; x += step)  | 
346  |  |               base[x] -= cubic(g0base[x], g1base[x], g2base[x], g3base[x]) / 2;  | 
347  |  |           }  | 
348  |  |           else for (int x = 0; x < sizex; x += step)  | 
349  |  |             base[x] -= (g1base[x] + g2base[x]) / 4;  | 
350  |  |         }  | 
351  |  |         OMP_PARALLEL_FOR(ThreadIterations)  | 
352  |  |         for (int y = step; y < sizey; y += step * 2)  | 
353  |  |         { | 
354  |  |           T * const base = &image[y0 + y][x0];  | 
355  |  |           T const * const c1base = &image[y0 + y - step][x0], * const c2base = y + step < sizey ? &image[y0 + y + step][x0] : c1base;  | 
356  |  |           if (filter == FilterCubic)  | 
357  |  |           { | 
358  |  |             T const * const c0base = y - step * 3 >= 0 ? &image[y0 + y - step * 3][x0] : c1base;  | 
359  |  |             T const * const c3base = y + step * 3 < sizey ? &image[y0 + y + step * 3][x0] : c2base;  | 
360  |  |             for (int x = 0; x < sizex; x += step)  | 
361  |  |               base[x] += cubic(c0base[x], c1base[x], c2base[x], c3base[x]);  | 
362  |  |           }  | 
363  |  |           else for (int x = 0; x < sizex; x += step)  | 
364  |  |             base[x] += (c1base[x] + c2base[x]) / 2;  | 
365  |  |         }  | 
366  |  |       }  | 
367  |  |       if (step < sizex) // horizontal unlifting  | 
368  |  |       { | 
369  |  |         OMP_PARALLEL_FOR(ThreadIterations)  | 
370  |  |         for (int y = 0; y < sizey; y += step)  | 
371  |  |         { | 
372  |  |           int x;  | 
373  |  |           T * base = &image[y0 + y][x0], * base1 = base - step, * base2 = base + step, * base3 = base + step * 3;  | 
374  |  |           if (filter == FilterCubic)  | 
375  |  |           { | 
376  |  |             T g0 = base[step], g1 = base[step], g2 = step * 3 < sizex ? base[step * 3] : base[step], g3;  | 
377  |  |             for (x = step * 2; x < sizex - step * 3; x += step * 2, g0 = g1, g1 = g2, g2 = g3)  | 
378  |  |               base[x] -= cubic(g0, g1, g2, g3 = base3[x]) / 2;  | 
379  |  |             for (; x < sizex; x += step * 2, g0 = g1, g1 = g2)  | 
380  |  |               base[x] -= cubic(g0, g1, g2, g2) / 2;  | 
381  |  |             T c0 = *base, c1 = *base, c2 = step * 2 < sizex ? base[step * 2] : *base, c3;  | 
382  |  |             for (x = step; x < sizex - step * 3; x += step * 2, c0 = c1, c1 = c2, c2 = c3)  | 
383  |  |               base[x] += cubic(c0, c1, c2, c3 = base3[x]);  | 
384  |  |             for (; x < sizex; x += step * 2, c0 = c1, c1 = c2)  | 
385  |  |               base[x] += cubic(c0, c1, c2, c2);  | 
386  |  |           }  | 
387  |  |           else  | 
388  |  |           { | 
389  |  |             for (x = step * 2; x < sizex - step; x += step * 2)  | 
390  |  |               base[x] -= (base1[x] + base2[x]) / 4;  | 
391  |  |             if (x < sizex)  | 
392  |  |               base[x] -= base1[x] / 2;  | 
393  |  |             for (x = step; x < sizex - step; x += step * 2)  | 
394  |  |               base[x] += (base1[x] + base2[x]) / 2;  | 
395  |  |             if (x < sizex)  | 
396  |  |               base[x] += base1[x];  | 
397  |  |           }  | 
398  |  |         }  | 
399  |  |       }  | 
400  |  |       step /= 2;  | 
401  |  |     }  | 
402  |  |   }  | 
403  |  |  | 
404  |  |   template<typename T, bool dequantize> void quantize(Image<T> & image, int x0, int y0, int x1, int y1, int step, int quality, int minQ, int maxQ)  | 
405  | 21.9k  |   { | 
406  | 21.9k  |     typedef typename std::conditional<sizeof(T) < 4, int32_t, int64_t>::type aux;  | 
407  | 21.9k  |     int const sizex = x1 - x0;  | 
408  | 21.9k  |     int const sizey = y1 - y0;  | 
409  | 21.9k  |     int skip = step;  | 
410  | 38.7k  |     while (skip < sizex && skip < sizey)  | 
411  | 17.8k  |     { | 
412  | 17.8k  |       int const q = std::max(std::max(1, minQ), quality);  | 
413  | 17.8k  |       if (q >= maxQ) break;  | 
414  | 16.7k  |       OMP_PARALLEL_FOR(ThreadIterations)  | 
415  | 21.4M  |       for (int y = 0; y < sizey; y += skip)  | 
416  | 21.4M  |       { | 
417  | 21.4M  |         T * base = &image[y0 + y][x0];  | 
418  | 21.4M  |         int const xStep = (y & skip) ? skip : skip * 2;  | 
419  | 89.2M  |         for (int x = xStep - skip; x < sizex; x += xStep)  // [NOTE] arranged so that (x | y) & skip == 1  | 
420  | 67.7M  |           base[x] = dequantize ? (aux(base[x]) * maxQ + (base[x] < 0 ? -maxQ / 2 : base[x] > 0 ? maxQ / 2 : 0)) / q : aux(base[x]) * q / maxQ;  | 
421  | 21.4M  |       }  | 
422  | 16.7k  |       skip *= 2;  | 
423  | 16.7k  |       quality = std::min(maxQ, quality * 2);  // [MAGIC] This approximates the JPEG 2000 baseline quantizer  | 
424  | 16.7k  |     }  | 
425  | 21.9k  |   }  | 
426  |  |  | 
427  |  |   template<typename T> T square(T t)  | 
428  | 48.4M  |   { | 
429  | 48.4M  |     return t * t;  | 
430  | 48.4M  |   }  | 
431  |  |  | 
432  |  |   inline void addContext(int x, int w, uint32_t & sum, uint32_t & sum2, uint32_t & count)  | 
433  | 26.9M  |   { | 
434  | 26.9M  |     sum += uint32_t(x = abs(x)) * w;  | 
435  | 26.9M  |     sum2 += square(std::min(uint32_t(x), 4096u)) * w; // [MAGIC] avoid overflow in last line of getContext  | 
436  | 26.9M  |     count += w;  | 
437  | 26.9M  |   }  | 
438  |  |  | 
439  |  |   template<typename T> std::pair<uint32_t, uint32_t> getContext(Image<T> & image, int x0, int y0, int x1, int y1, int x, int y, int skip)  | 
440  | 12.2M  |   { | 
441  | 12.2M  |     int px = x0 + (x & ~(skip * 2)) + (x & skip);  | 
442  | 12.2M  |     if (px >= x1)  | 
443  | 927k  |       px -= skip * 2;  | 
444  | 12.2M  |     int py = y0 + (y & ~(skip * 2)) + (y & skip);  | 
445  | 12.2M  |     if (py >= y1)  | 
446  | 681k  |       py -= skip * 2;  | 
447  | 12.2M  |     uint32_t count = 0, sum = 0, sum2 = 0;  | 
448  | 12.2M  |     addContext(abs(image[py][px]), 2, sum, sum2, count);  // ancestor  | 
449  | 12.2M  |     if ((y & skip) && (x | skip) < x1 - x0)  | 
450  | 2.96M  |     { | 
451  | 2.96M  |       addContext(image[y0 + y - skip][x0 + (x | skip)], 2, sum, sum2, count); // upper sibling  | 
452  | 2.96M  |       if (x & skip)  | 
453  | 1.47M  |         addContext(image[y0 + y][x0 + x - skip], 2, sum, sum2, count); // left sibling  | 
454  | 2.96M  |     }  | 
455  | 12.2M  |     if (y >= skip * 2 && x >= skip * 2)  // neighbors  | 
456  | 1.69M  |     { | 
457  | 1.69M  |       addContext(image[y0 + y - skip * 2][x0 + x], 4, sum, sum2, count);  | 
458  | 1.69M  |       addContext(image[y0 + y][x0 + x - skip * 2], 4, sum, sum2, count);  | 
459  | 1.69M  |       addContext(image[y0 + y - skip * 2][x0 + x - skip * 2], 2, sum, sum2, count);  | 
460  | 1.69M  |       if (x + skip * 2 < x1 - x0)  | 
461  | 1.48M  |         addContext(image[y0 + y - skip * 2][x0 + x + skip * 2], 2, sum, sum2, count);  | 
462  | 1.69M  |       if (y >= skip * 4 && x >= skip * 4)  | 
463  | 983k  |       { | 
464  | 983k  |         addContext(image[y0 + y - skip * 4][x0 + x], 2, sum, sum2, count);  | 
465  | 983k  |         addContext(image[y0 + y][x0 + x - skip * 4], 2, sum, sum2, count);  | 
466  | 983k  |         addContext(image[y0 + y - skip * 4][x0 + x - skip * 4], 1, sum, sum2, count);  | 
467  | 983k  |         if (x + skip * 4 < x1 - x0)  | 
468  | 794k  |           addContext(image[y0 + y - skip * 4][x0 + x + skip * 4], 1, sum, sum2, count);  | 
469  | 983k  |       }  | 
470  | 1.69M  |     }  | 
471  | 12.2M  |     return std::make_pair((sum * 16u + count / 2u) / count, (sum2 * 16u + count / 2u) / count); // set sums relative to 16 count  | 
472  | 12.2M  |   }  | 
473  |  |  | 
474  |  |   template<typename T> void encode(Image<T> & image, Bits & stream, int x0, int y0, int x1, int y1, int step, int scheme, int q, bool hasDC, bool isChroma)  | 
475  | 8.32M  |   { | 
476  | 8.32M  |     int const sizex = x1 - x0;  | 
477  | 8.32M  |     int const sizey = y1 - y0;  | 
478  | 8.32M  |     if (hasDC && sizex > 0 && sizey > 0)  | 
479  | 15.1k  |       signedCode<4>(image[y0][x0], stream);  | 
480  | 8.32M  |     std::pair<uint32_t, uint32_t> context(0, 0);  | 
481  | 8.32M  |     int run = 0, runCoder = (scheme == EncoderTurbo ? (!q || (step < 2048 && q * step < 2048)) ? 1 : 0 : 0);  // avoid overflow checking q * step < 2048  | 
482  | 99.7M  |     for (int y = 0; y < sizey; y += step)  | 
483  | 91.4M  |     { | 
484  | 91.4M  |       T * base = &image[y0 + y][x0];  | 
485  | 91.4M  |       int const xStep = (y & step) ? step : step * 2;  | 
486  | 185M  |       for (int x = xStep - step; x < sizex; x += xStep)  // [NOTE] arranged so that (x | y) & step == 1  | 
487  | 94.3M  |       { | 
488  | 94.3M  |         T s = base[x];  | 
489  | 94.3M  |         if (runCoder && !s)  // run  | 
490  | 70.0M  |           ++ run;  | 
491  | 24.3M  |         else  | 
492  | 24.3M  |         { | 
493  | 24.3M  |           if (scheme == EncoderTurbo)  | 
494  | 7.48M  |           { | 
495  | 7.48M  |             if (runCoder)  // break the run  | 
496  | 4.39M  |             { | 
497  | 4.39M  |               unsignedCode<1>(run, stream);  | 
498  | 4.39M  |               run = 0;  | 
499  | 4.39M  |               interleavedCode<1>(s < 0 ? s + 1 : s, stream);  // s can't be zero, so shift negatives by 1  | 
500  | 4.39M  |             }  | 
501  | 3.08M  |             else  | 
502  | 3.08M  |               interleavedCode<1>(s, stream);  | 
503  | 7.48M  |             continue;  | 
504  | 7.48M  |           }  | 
505  | 16.8M  |           if (runCoder)  // break the run  | 
506  | 380k  |           { | 
507  | 380k  |             runCoder == 1 ? unsignedCode<1>(run, stream) : runCoder == 2 ? unsignedCode<2>(run, stream)  | 
508  | 282k  |               : runCoder == 3 ? unsignedCode<3>(run, stream) : unsignedCode<4>(run, stream);  | 
509  | 380k  |             run = 0;  | 
510  | 380k  |             if (s < 0)  | 
511  | 191k  |               ++ s; // s can't be zero, so shift negatives by 1  | 
512  | 380k  |           }  | 
513  | 16.8M  |           if (scheme == EncoderContextual)  | 
514  | 12.2M  |             context = getContext(image, x0, y0, x1, y1, x, y, step);  | 
515  | 16.8M  |           uint32_t const sumSq = square(context.first);  | 
516  | 16.8M  |           if (sumSq < 2u * context.second + (isChroma ? 250u : 100u))  | 
517  | 2.73M  |             interleavedCode<0>(s, stream);  | 
518  | 14.1M  |           else if (sumSq < 2u * context.second + 950u)  | 
519  | 590k  |             interleavedCode<1>(s, stream);  | 
520  | 13.5M  |           else if (sumSq < 3u * context.second + 3000u)  | 
521  | 663k  |           { | 
522  | 663k  |             if (sumSq < 5u * context.second + 400u)  | 
523  | 304k  |               signedCode<1>(s, stream);  | 
524  | 358k  |             else  | 
525  | 358k  |               interleavedCode<2>(s, stream);  | 
526  | 663k  |           }  | 
527  | 12.8M  |           else if (sumSq < 3u * context.second + 12000u)  | 
528  | 716k  |           { | 
529  | 716k  |             if (sumSq < 5u * context.second + 3000u)  | 
530  | 292k  |               signedCode<2>(s, stream);  | 
531  | 424k  |             else  | 
532  | 424k  |               interleavedCode<3>(s, stream);  | 
533  | 716k  |           }  | 
534  | 12.1M  |           else if (sumSq < 4u * context.second + 44000u)  | 
535  | 1.25M  |           { | 
536  | 1.25M  |             if (sumSq < 6u * context.second + 12000u)  | 
537  | 618k  |               signedCode<3>(s, stream);  | 
538  | 632k  |             else  | 
539  | 632k  |               interleavedCode<4>(s, stream);  | 
540  | 1.25M  |           }  | 
541  | 10.8M  |           else  | 
542  | 10.8M  |             signedCode<4>(s, stream);  | 
543  | 16.8M  |           if (scheme == EncoderFast)  // use decaying first and second moment  | 
544  | 4.61M  |           { | 
545  | 4.61M  |             uint32_t const t = abs(s);  | 
546  | 4.61M  |             context = std::make_pair(((context.first * 15u + 7u) >> 4) + t, ((context.second * 15u + 7u) >> 4) + square(std::min(t, 4096u)));  | 
547  | 4.61M  |             if (!s == !runCoder)  | 
548  | 1.99M  |               runCoder = context.first < 1 ? 4 : context.first < 2 ? 3 : context.first < 4 ? 2 : context.first < 8 ? 1 : 0;  | 
549  | 4.61M  |           }  | 
550  | 12.2M  |           else if (!s == !runCoder)  | 
551  | 2.17M  |             runCoder = q == 1024 ? context.first < 2u ? 1 : 0 : (context.first < 4u && context.second < 2u) ? 4 : (context.first < 8u  | 
552  | 1.64M  |             && context.second < 4u) ? 3 : (2u * sumSq < 3u * context.second + 48u) ? 2 : (2u * sumSq < 5u * context.second + 32u) ? 1 : 0;  | 
553  | 16.8M  |         }  | 
554  | 94.3M  |       }  | 
555  | 91.4M  |     }  | 
556  | 8.32M  |     if (run)  // flush run  | 
557  | 1.53M  |       runCoder == 1 ? unsignedCode<1>(run, stream) : runCoder == 2 ? unsignedCode<2>(run, stream)  | 
558  | 300k  |         : runCoder == 3 ? unsignedCode<3>(run, stream) : unsignedCode<4>(run, stream);  | 
559  | 8.32M  |   }  | 
560  |  |  | 
561  |  |   template<typename T> void decode(Image<T> & image, Bits & stream, int x0, int y0, int x1, int y1, int step, int scheme, int q, bool hasDC, bool isChroma)  | 
562  |  |   { | 
563  |  |     int const sizex = x1 - x0;  | 
564  |  |     int const sizey = y1 - y0;  | 
565  |  |     if (hasDC && sizex > 0 && sizey > 0)  | 
566  |  |       image[y0][x0] = signedDecode<4>(stream);  | 
567  |  |     std::pair<uint32_t, uint32_t> context(0, 0);  | 
568  |  |     int run = -1, runCoder = (scheme == EncoderTurbo ? (!q || (step < 2048 && q * step < 2048)) ? 1 : 0 : 0);  // avoid overflow checking q * step < 2048  | 
569  |  |     for (int y = 0; y < sizey; y += step)  | 
570  |  |     { | 
571  |  |       T * base = &image[y0 + y][x0];  | 
572  |  |       int const xStep = (y & step) ? step : step * 2;  | 
573  |  |       for (int x = xStep - step; x < sizex; x += xStep) // [NOTE] arranged so that (x | y) & step == 1  | 
574  |  |       { | 
575  |  |         T s = 0;  | 
576  |  |         if (runCoder && run == -1)  | 
577  |  |           run = runCoder == 1 ? unsignedDecode<1>(stream) : runCoder == 2 ? unsignedDecode<2>(stream)  | 
578  |  |             : runCoder == 3 ? unsignedDecode<3>(stream) : unsignedDecode<4>(stream);  | 
579  |  |         if (run > 0)  | 
580  |  |           -- run; // consume a zero  | 
581  |  |         else  | 
582  |  |         { | 
583  |  |           if (scheme == EncoderTurbo)  | 
584  |  |             s = interleavedDecode<1>(stream);  | 
585  |  |           else  | 
586  |  |           { | 
587  |  |             if (scheme == EncoderContextual)  | 
588  |  |               context = getContext(image, x0, y0, x1, y1, x, y, step);  | 
589  |  |             uint32_t const sumSq = square(context.first);  | 
590  |  |             if (sumSq < 2u * context.second + (isChroma ? 250u : 100u))  | 
591  |  |               s = interleavedDecode<0>(stream);  | 
592  |  |             else if (sumSq < 2u * context.second + 950u)  | 
593  |  |               s = interleavedDecode<1>(stream);  | 
594  |  |             else if (sumSq < 3u * context.second + 3000u)  | 
595  |  |             { | 
596  |  |               if (sumSq < 5u * context.second + 400u)  | 
597  |  |                 s = signedDecode<1>(stream);  | 
598  |  |               else  | 
599  |  |                 s = interleavedDecode<2>(stream);  | 
600  |  |             }  | 
601  |  |             else if (sumSq < 3u * context.second + 12000u)  | 
602  |  |             { | 
603  |  |               if (sumSq < 5u * context.second + 3000u)  | 
604  |  |                 s = signedDecode<2>(stream);  | 
605  |  |               else  | 
606  |  |                 s = interleavedDecode<3>(stream);  | 
607  |  |             }  | 
608  |  |             else if (sumSq < 4u * context.second + 44000u)  | 
609  |  |             { | 
610  |  |               if (sumSq < 6u * context.second + 12000u)  | 
611  |  |                 s = signedDecode<3>(stream);  | 
612  |  |               else  | 
613  |  |                 s = interleavedDecode<4>(stream);  | 
614  |  |             }  | 
615  |  |             else  | 
616  |  |               s = signedDecode<4>(stream);  | 
617  |  |             if (scheme == EncoderFast)  // use decaying first and second moment  | 
618  |  |             { | 
619  |  |               uint32_t const t = abs(s);  | 
620  |  |               context = std::make_pair(((context.first * 15u + 7u) >> 4) + t, ((context.second * 15u + 7u) >> 4) + square(std::min(t, 4096u)));  | 
621  |  |               if (!s == !runCoder)  | 
622  |  |                 runCoder = context.first < 1 ? 4 : context.first < 2 ? 3 : context.first < 4 ? 2 : context.first < 8 ? 1 : 0;  | 
623  |  |             }  | 
624  |  |             else if (!s == !runCoder)  | 
625  |  |               runCoder = q == 1024 ? context.first < 2u ? 1 : 0 : (context.first < 4u && context.second < 2u) ? 4 : (context.first < 8u  | 
626  |  |               && context.second < 4u) ? 3 : (2u * sumSq < 3u * context.second + 48u) ? 2 : (2u * sumSq < 5u * context.second + 32u) ? 1 : 0;  | 
627  |  |           }  | 
628  |  |           if (run == 0 && s <= 0)  | 
629  |  |             -- s; // s can't be zero, so shift negatives by 1  | 
630  |  |           run = -1;  | 
631  |  |         }  | 
632  |  |         base[x] = s;  | 
633  |  |       }  | 
634  |  |     }  | 
635  |  |   }  | 
636  |  |  | 
637  |  |   template<typename T> void shiftVector(T * data, int shift, int count)  | 
638  | 1.05k  |   { | 
639  | 1.05k  |     OMP_PARALLEL_FOR(ThreadIterations * ThreadIterations)  | 
640  | 34.1M  |     for (int i = 0; i < count; ++ i)  | 
641  | 34.1M  |       data[i] >>= shift;  | 
642  | 1.05k  |   }  | 
643  |  |  | 
644  |  |   template<typename I, typename A> void transformTerm(int const * & pc, A * destination, A const * auxData, int const bufferSize,  | 
645  |  |     I const & imageData, Header const & header, std::vector<int> const & isChroma, int boost)  | 
646  | 2.41k  |   { | 
647  | 5.88k  |     while (*pc >= 0)  | 
648  | 3.47k  |     { | 
649  | 3.47k  |       int const c = *(pc ++);  | 
650  | 3.47k  |       A const factor = *(pc ++);  | 
651  | 3.47k  |       if (isChroma[c] == -1)  | 
652  | 1.61k  |       { | 
653  | 1.61k  |         auto layer = imageData + ((c / header.channels) * bufferSize * header.channels + c % header.channels);  | 
654  | 1.61k  |         A const boostFactor = boost * factor;  | 
655  | 1.61k  |         OMP_PARALLEL_FOR(ThreadIterations * ThreadIterations)  | 
656  | 41.1M  |         for (int i = 0; i < bufferSize; ++ i)  | 
657  | 41.1M  |           destination[i] += layer[i * header.channels] * boostFactor;  | 
658  | 1.61k  |       }  | 
659  | 1.85k  |       else  | 
660  | 1.85k  |       { | 
661  | 1.85k  |         A const * auxDataC = auxData + c * bufferSize;  | 
662  | 1.85k  |         OMP_PARALLEL_FOR(ThreadIterations * ThreadIterations)  | 
663  | 54.7M  |         for (int i = 0; i < bufferSize; ++ i)  | 
664  | 54.7M  |           destination[i] += auxDataC[i] * factor;  | 
665  | 1.85k  |       }  | 
666  | 3.47k  |     }  | 
667  | 2.41k  |     A const denom = *((++ pc) ++);  | 
668  | 2.41k  |     if (denom == 2)  | 
669  | 246  |       shiftVector(destination, 1, bufferSize);  | 
670  | 2.17k  |     else if (denom == 4)  | 
671  | 560  |       shiftVector(destination, 2, bufferSize);  | 
672  | 1.61k  |     else if (denom == 8)  | 
673  | 246  |       shiftVector(destination, 3, bufferSize);  | 
674  | 1.36k  |     else if (denom > 1)  // [NOTE] disallow non-positive denominators  | 
675  | 0  |     { | 
676  | 0  |       OMP_PARALLEL_FOR(ThreadIterations * ThreadIterations)  | 
677  | 0  |       for (int i = 0; i < bufferSize; ++ i)  | 
678  | 0  |         destination[i] /= denom;  | 
679  | 0  |     }  | 
680  | 2.41k  |   }  | 
681  |  |  | 
682  |  |   // GFWX_TRANSFORM_UYV implements YUV (actually UYV) as R -= G (chroma); B -= G (chroma); G += (R + B) / 4 (luma)  | 
683  | 3.25k  |   #define GFWX_TRANSFORM_UYV { 0, 1, -1, -1, 1, 1, 2, 1, -1, -1, 1, 1, 1, 0, 1, 2, 1, -1, 4, 0, -1 } | 
684  |  |   // GFWX_TRANSFORM_A710 implements A710 as R -= G (chroma); B -= (G * 2 + R) / 2 (chroma); G += (B * 2 + R * 3) / 8 (luma)  | 
685  | 3.25k  |   #define GFWX_TRANSFORM_A710_BGR { 2, 1, -1, -1, 1, 1, 0, 1, -2, 2, -1, -1, 2, 1, 1, 0, 2, 2, 3, -1, 8, 0, -1 } | 
686  | 3.25k  |   #define GFWX_TRANSFORM_A710_RGB { 0, 1, -1, -1, 1, 1, 2, 1, -2, 0, -1, -1, 2, 1, 1, 2, 2, 0, 3, -1, 8, 0, -1 } | 
687  |  |  | 
688  |  |   template<typename I> ptrdiff_t compress(I const & imageData, Header & header, uint8_t * buffer, size_t size,  | 
689  |  |     int const * channelTransform, uint8_t * metaData, size_t metaDataSize)  | 
690  | 3.02k  |   { | 
691  | 3.02k  |     typedef typename std::remove_reference<decltype(imageData[0])>::type base;  | 
692  | 3.02k  |     typedef typename std::conditional<sizeof(base) < 2, int16_t, int32_t>::type aux;  | 
693  | 3.02k  |     if (header.sizex > (1 << 30) || header.sizey > (1 << 30))  // [NOTE] current implementation can't go over 2^30  | 
694  | 0  |       return ErrorMalformed;  | 
695  | 3.02k  |     Bits stream(reinterpret_cast<uint32_t *>(buffer), reinterpret_cast<uint32_t *>(buffer) + size / 4);  | 
696  | 3.02k  |     stream.putBits('G' | ('F' << 8) | ('W' << 16) | ('X' << 24), 32); | 
697  | 3.02k  |     stream.putBits(header.version = 1, 32);  | 
698  | 3.02k  |     stream.putBits(header.sizex, 32);  | 
699  | 3.02k  |     stream.putBits(header.sizey, 32);  | 
700  | 3.02k  |     stream.putBits(header.layers - 1, 16);  | 
701  | 3.02k  |     stream.putBits(header.channels - 1, 16);  | 
702  | 3.02k  |     stream.putBits((header.bitDepth ? header.bitDepth : (header.bitDepth = std::numeric_limits<base>::digits)) - 1, 8);  | 
703  | 3.02k  |     stream.putBits(header.isSigned = std::numeric_limits<base>::is_signed ? 1 : 0, 1);  | 
704  | 3.02k  |     stream.putBits(header.quality - 1, 10);  | 
705  | 3.02k  |     stream.putBits(header.chromaScale - 1, 8);  | 
706  | 3.02k  |     stream.putBits(header.blockSize - 2, 5);  | 
707  | 3.02k  |     stream.putBits(header.filter, 8);  | 
708  | 3.02k  |     stream.putBits(header.quantization, 8);  | 
709  | 3.02k  |     stream.putBits(header.encoder, 8);  | 
710  | 3.02k  |     stream.putBits(header.intent, 8);  | 
711  | 3.02k  |     stream.putBits(int(metaDataSize / 4), 32);  | 
712  | 3.02k  |     stream.buffer = std::copy(reinterpret_cast<uint32_t *>(metaData), reinterpret_cast<uint32_t *>(metaData) + metaDataSize / 4, stream.buffer);  | 
713  | 3.02k  |     int const bufferSize = header.sizex * header.sizey;  | 
714  | 3.02k  |     std::vector<aux> auxData((size_t)header.layers * header.channels * bufferSize, 0);  | 
715  | 3.02k  |     std::vector<int> isChroma(header.layers * header.channels, -1);  | 
716  | 3.02k  |     int const chromaQuality = std::max(1, (header.quality + header.chromaScale / 2) / header.chromaScale);  | 
717  | 3.02k  |     int const boost = header.quality == QualityMax ? 1 : 8; // [NOTE] due to Cubic lifting max multiplier of 20, boost * 20 must be less than 256  | 
718  | 3.02k  |     if (channelTransform)  // run color transform program (and also encode it to the file)  | 
719  | 806  |     { | 
720  | 806  |       int const * pc = channelTransform;  | 
721  | 3.22k  |       while (*pc >= 0)  | 
722  | 2.41k  |       { | 
723  | 2.41k  |         int const c = *(pc ++);  | 
724  | 2.41k  |         aux * destination = &auxData[c * bufferSize];  | 
725  | 2.41k  |         transformTerm(pc, destination, &auxData[0], bufferSize, imageData, header, isChroma, boost);  | 
726  | 2.41k  |         auto layer = imageData + ((c / header.channels) * bufferSize * header.channels + c % header.channels);  | 
727  | 2.41k  |         OMP_PARALLEL_FOR(ThreadIterations * ThreadIterations)  | 
728  | 61.7M  |         for (int i = 0; i < bufferSize; ++ i)  | 
729  | 61.7M  |           destination[i] += layer[i * header.channels] * boost;  | 
730  | 2.41k  |         isChroma[c] = *(pc ++);  | 
731  | 2.41k  |       }  | 
732  | 18.2k  |       for (int const * i = channelTransform; i <= pc; ++ i)  | 
733  | 17.4k  |         signedCode<2>(*i, stream);  | 
734  | 806  |     }  | 
735  | 2.21k  |     else  | 
736  | 2.21k  |       signedCode<2>(-1, stream);  | 
737  | 3.02k  |     stream.flushWriteWord();  | 
738  | 12.1k  |     for (int c = 0; c < header.layers * header.channels; ++ c) if (isChroma[c] == -1)  // copy channels having no transform  | 
739  | 6.65k  |     { | 
740  | 6.65k  |       aux * destination = &auxData[c * bufferSize];  | 
741  | 6.65k  |       auto layer = imageData + ((c / header.channels) * bufferSize * header.channels + c % header.channels);  | 
742  | 6.65k  |       OMP_PARALLEL_FOR(ThreadIterations * ThreadIterations)  | 
743  | 60.3M  |       for (int i = 0; i < bufferSize; ++ i)  | 
744  | 60.3M  |         destination[i] = layer[i * header.channels] * boost;  | 
745  | 6.65k  |       isChroma[c] = 0;  | 
746  | 6.65k  |     }  | 
747  | 12.1k  |     for (int c = 0; c < header.layers * header.channels; ++ c)  // lift and quantize the channels  | 
748  | 9.07k  |     { | 
749  | 9.07k  |       Image<aux> auxImage(&auxData[c * bufferSize], header.sizex, header.sizey);  | 
750  | 9.07k  |       lift(auxImage, 0, 0, header.sizex, header.sizey, 1, header.filter);  | 
751  | 9.07k  |       if (header.intent >= IntentBayerRGGB && header.intent <= IntentBayerGeneric)  | 
752  | 4.30k  |       { | 
753  | 21.5k  |         for (int ox = 0; ox <= 1; ++ ox) for (int oy = 1 - ox; oy <= 1; ++ oy)  | 
754  | 12.9k  |           lift(auxImage, ox, oy, header.sizex, header.sizey, 2, header.filter);  | 
755  | 25.8k  |         for (int ox = 0; ox <= 1; ++ ox) for (int oy = 0; oy <= 1; ++ oy)  | 
756  | 17.2k  |           quantize<aux, false>(auxImage, ox, oy, header.sizex, header.sizey, 2,  | 
757  | 17.2k  |             (ox | oy) ? chromaQuality : header.quality, header.quality, QualityMax * boost);  | 
758  | 4.30k  |       }  | 
759  | 4.76k  |       else  | 
760  | 4.76k  |         quantize<aux, false>(auxImage, 0, 0, header.sizex, header.sizey, 1, isChroma[c] ? chromaQuality : header.quality, 0, QualityMax * boost);  | 
761  | 9.07k  |     }  | 
762  | 3.02k  |     int step = 1;  | 
763  | 16.8k  |     while (step * 2 < header.sizex || step * 2 < header.sizey)  | 
764  | 13.7k  |       step *= 2;  | 
765  | 13.4k  |     for (bool hasDC = true; step >= 1; hasDC = false)  | 
766  | 13.1k  |     { | 
767  | 13.1k  |       int64_t const bs = int64_t(step) << header.blockSize;  | 
768  | 13.1k  |       int const blockCountX = (header.sizex + bs - 1) / bs;  | 
769  | 13.1k  |       int const blockCountY = (header.sizey + bs - 1) / bs;  | 
770  | 13.1k  |       int const blockCount = blockCountX * blockCountY * header.layers * header.channels;  | 
771  | 13.1k  |       std::vector<Bits> streamBlock(blockCount, Bits(0, 0));  | 
772  | 13.1k  |       uint32_t * blockBegin = stream.buffer + blockCount; // leave space for block sizes  | 
773  | 13.1k  |       if (blockBegin >= stream.bufferEnd)  | 
774  | 687  |         return ErrorOverflow;  | 
775  | 3.51M  |       for (int block = 0; block < blockCount; ++ block)  // partition buffer into temporary regions for each block  | 
776  | 3.50M  |         streamBlock[block].buffer = blockBegin + (stream.bufferEnd - blockBegin) * block / blockCount;  | 
777  | 3.51M  |       for (int block = 0; block < blockCount; ++ block)  | 
778  | 3.50M  |         streamBlock[block].bufferEnd = block + 1 < blockCount ? streamBlock[block + 1].buffer : stream.bufferEnd;  | 
779  | 12.4k  |       OMP_PARALLEL_FOR(4) // [MAGIC] for some reason, 4 is by far the best option here  | 
780  | 3.51M  |       for (int block = 0; block < blockCount; ++ block)  | 
781  | 3.50M  |       { | 
782  | 3.50M  |         int const bx = block % blockCountX, by = (block / blockCountX) % blockCountY, c = block / (blockCountX * blockCountY);  | 
783  | 3.50M  |         Image<aux> auxImage(&auxData[c * bufferSize], header.sizex, header.sizey);  | 
784  | 3.50M  |         if (header.intent < IntentBayerRGGB || header.intent > IntentBayerGeneric)  | 
785  | 1.90M  |           encode(auxImage, streamBlock[block], bx * bs, by * bs,  | 
786  | 1.90M  |           int(std::min((bx + 1) * bs, int64_t(header.sizex))), int(std::min((by + 1) * bs, int64_t(header.sizey))),  | 
787  | 1.90M  |           step, header.encoder, isChroma[c] ? chromaQuality : header.quality, hasDC && !bx && !by, isChroma[c] != 0);  | 
788  | 9.63M  |         else for (int ox = 0; ox <= 1; ++ ox) for (int oy = 0; oy <= 1; ++ oy)  | 
789  | 6.42M  |           encode(auxImage, streamBlock[block], bx * bs + ox, by * bs + oy,  | 
790  | 6.42M  |           int(std::min((bx + 1) * bs, int64_t(header.sizex))), int(std::min((by + 1) * bs, int64_t(header.sizey))),  | 
791  | 6.42M  |           2 * step, header.encoder, (ox || oy) ? chromaQuality : header.quality, hasDC && !bx && !by, ox || oy);  | 
792  | 3.50M  |         streamBlock[block].flushWriteWord();  | 
793  | 3.50M  |       }  | 
794  | 2.17M  |       for (int block = 0; block < blockCount; ++ block)  // check streamBlocks for overflow  | 
795  | 2.16M  |         if (streamBlock[block].indexBits < 0)  | 
796  | 2.05k  |           return ErrorOverflow;  | 
797  | 2.15M  |       for (int block = 0; block < blockCount; ++ block)  // encode block lengths [NOTE] this 32-bit encoding limits the file size to < 16 GB  | 
798  | 2.14M  |         *(stream.buffer ++) = uint32_t(streamBlock[block].buffer - (block ? streamBlock[block - 1].bufferEnd : blockBegin));  | 
799  | 2.15M  |       for (int block = 0; block < blockCount; ++ block)  // pack the streamBlock data tightly, by word [NOTE] first block is already packed  | 
800  | 2.14M  |         stream.buffer = block ? std::copy(streamBlock[block - 1].bufferEnd, streamBlock[block].buffer, stream.buffer) : streamBlock[0].buffer;  | 
801  | 10.4k  |       step /= 2;  | 
802  | 10.4k  |     }  | 
803  | 284  |     return reinterpret_cast<uint8_t *>(stream.buffer) - buffer; // return size in bytes  | 
804  | 3.02k  |   }  | 
805  |  |  | 
806  |  |   template<typename I> ptrdiff_t decompress(I const & imageData, Header & header, uint8_t const * data, size_t size, int downsampling, bool test)  | 
807  |  |   { | 
808  |  |     typedef typename std::remove_reference<decltype(imageData[0])>::type base;  | 
809  |  |     typedef typename std::conditional<sizeof(base) < 2, int16_t, int32_t>::type aux;  | 
810  |  |     Bits stream(reinterpret_cast<uint32_t *>(const_cast<uint8_t *>(data)), reinterpret_cast<uint32_t *>(const_cast<uint8_t *>(data)) + size / 4);  | 
811  |  |     if (size < 28)  // at least load the header  | 
812  |  |       return 28;  | 
813  |  |     if (stream.getBits(32) != uint32_t('G' | ('F' << 8) | ('W' << 16) | ('X' << 24))) | 
814  |  |       return ErrorMalformed;  | 
815  |  |     header.version = stream.getBits(32);  | 
816  |  |     header.sizex = stream.getBits(32);  | 
817  |  |     header.sizey = stream.getBits(32);  | 
818  |  |     header.layers = stream.getBits(16) + 1;  | 
819  |  |     header.channels = stream.getBits(16) + 1;  | 
820  |  |     header.bitDepth = stream.getBits(8) + 1;  | 
821  |  |     header.isSigned = stream.getBits(1);  | 
822  |  |     header.quality = stream.getBits(10) + 1;  | 
823  |  |     header.chromaScale = stream.getBits(8) + 1;  | 
824  |  |     header.blockSize = stream.getBits(5) + 2;  | 
825  |  |     header.filter = stream.getBits(8);  | 
826  |  |     header.quantization = stream.getBits(8);  | 
827  |  |     header.encoder = stream.getBits(8);  | 
828  |  |     header.intent = stream.getBits(8);  | 
829  |  |     if (header.sizex < 0 || header.sizex > (1 << 30) || header.sizey < 0 || header.sizey > (1 << 30) || header.bufferSize() == 0)  | 
830  |  |       return ErrorMalformed;  // [NOTE] current implementation can't go over 2^30  | 
831  |  |     if (!imageData)   // just header  | 
832  |  |       return ResultOk;  | 
833  |  |     if (header.isSigned != (std::numeric_limits<base>::is_signed ? 1 : 0) || header.bitDepth > std::numeric_limits<base>::digits)  | 
834  |  |       return ErrorTypeMismatch; // check for correct buffer type (though doesn't test the buffer size)  | 
835  |  |     // [NOTE] clients can read metadata themselves by accessing the size (in words) at word[7] and the metadata at word[8+]  | 
836  |  |     if ((stream.buffer += stream.getBits(32)) >= stream.bufferEnd) // skip metadata  | 
837  |  |       return reinterpret_cast<uint8_t *>(stream.buffer) - data; // suggest point of interest to skip metadata  | 
838  |  |     int const sizexDown = (header.sizex + (1 << downsampling) - 1) >> downsampling, sizeyDown = (header.sizey + (1 << downsampling) - 1) >> downsampling;  | 
839  |  |     int const bufferSize = sizexDown * sizeyDown;  | 
840  |  |     std::vector<aux> auxData((size_t)header.layers * header.channels * bufferSize, 0);  | 
841  |  |     std::vector<int> isChroma(header.layers * header.channels, 0), transformProgram, transformSteps;  | 
842  |  |     size_t nextPointOfInterest = size + 1024; // guess next point of interest [NOTE] may be larger than the complete file  | 
843  |  |     while (true)  // decode color transform program (including isChroma flags)  | 
844  |  |     { | 
845  |  |       transformProgram.push_back(signedDecode<2>(stream));  // channel  | 
846  |  |       if (transformProgram.back() >= static_cast<int>(isChroma.size()))  | 
847  |  |           return ErrorMalformed;  | 
848  |  |       if (transformProgram.back() < 0)  | 
849  |  |         break;  | 
850  |  |       transformSteps.push_back(int(transformProgram.size()) - 1);  | 
851  |  |       while (true)  | 
852  |  |       { | 
853  |  |         if (stream.indexBits < 0) // test for truncation  | 
854  |  |           return nextPointOfInterest; // need more data  | 
855  |  |         transformProgram.push_back(signedDecode<2>(stream));  // other channel  | 
856  |  |         if (transformProgram.back() >= static_cast<int>(isChroma.size()))  | 
857  |  |             return ErrorMalformed;  | 
858  |  |         if (transformProgram.back() < 0)  | 
859  |  |           break;  | 
860  |  |         transformProgram.push_back(signedDecode<2>(stream));  // factor  | 
861  |  |       }  | 
862  |  |       transformProgram.push_back(signedDecode<2>(stream));  // denominator  | 
863  |  |       transformProgram.push_back(signedDecode<2>(stream));  // chroma flag  | 
864  |  |       isChroma[transformProgram[transformSteps.back()]] = transformProgram.back();  | 
865  |  |     }  | 
866  |  |     stream.flushReadWord();  | 
867  |  |     int const chromaQuality = std::max(1, (header.quality + header.chromaScale / 2) / header.chromaScale);  | 
868  |  |     int const boost = header.quality == QualityMax ? 1 : 8; // [NOTE] due to Cubic lifting max multiplier of 20, boost * 20 must be less than 256  | 
869  |  |     bool isTruncated = false;  | 
870  |  |     int step = 1;  | 
871  |  |     while (step * 2 < header.sizex || step * 2 < header.sizey)  | 
872  |  |       step *= 2;  | 
873  |  |     for (bool hasDC = true; (step >> downsampling) >= 1; hasDC = false) // decode just enough coefficients for downsampled image  | 
874  |  |     { | 
875  |  |       int64_t const bs = int64_t(step) << header.blockSize;  | 
876  |  |       int const blockCountX = int((header.sizex + bs - 1) / bs);  | 
877  |  |       int const blockCountY = int((header.sizey + bs - 1) / bs);  | 
878  |  |       int const blockCount = blockCountX * blockCountY * header.layers * header.channels;  | 
879  |  |       isTruncated = true;  | 
880  |  |       if (stream.buffer + 1 + blockCount > stream.bufferEnd)  // check for enough buffer to read block sizes  | 
881  |  |         break;  | 
882  |  |       std::vector<Bits> streamBlock(blockCount, Bits(0, 0));  | 
883  |  |       for (int block = 0; block < blockCount; ++ block) // first, read sizes into bufferEnd pointers  | 
884  |  |         streamBlock[block].bufferEnd = static_cast<uint32_t *>(0) + *(stream.buffer ++);  | 
885  |  |       for (int block = 0; block < blockCount; ++ block) // then convert sizes to true buffer pointers  | 
886  |  |         streamBlock[block].bufferEnd = (streamBlock[block].buffer = block ? streamBlock[block - 1].bufferEnd : stream.buffer)  | 
887  |  |                         + (streamBlock[block].bufferEnd - static_cast<uint32_t *>(0));  | 
888  |  |       stream.buffer = streamBlock[blockCount - 1].bufferEnd;  | 
889  |  |       nextPointOfInterest = reinterpret_cast<uint8_t *>(stream.buffer + ((step >> downsampling) > 1 ? blockCount * 4 : 0)) - data;  | 
890  |  |       if (stream.buffer <= stream.bufferEnd)  | 
891  |  |         isTruncated = false;  | 
892  |  |       int const stepDown = step >> downsampling;  | 
893  |  |       int64_t const bsDown = int64_t(stepDown) << header.blockSize;  | 
894  |  |       OMP_PARALLEL_FOR(4) // [MAGIC] for some reason, 4 is by far the best option here  | 
895  |  |       for (int block = 0; block < blockCount; ++ block) if (!test && streamBlock[block].bufferEnd <= stream.bufferEnd)  | 
896  |  |       { | 
897  |  |         int const bx = block % blockCountX, by = (block / blockCountX) % blockCountY, c = block / (blockCountX * blockCountY);  | 
898  |  |         Image<aux> auxImage(&auxData[c * bufferSize], sizexDown, sizeyDown);  | 
899  |  |         if (header.intent < IntentBayerRGGB || header.intent > IntentBayerGeneric)  | 
900  |  |           decode(auxImage, streamBlock[block], int(bx * bsDown), int(by * bsDown),  | 
901  |  |           int(std::min((bx + 1) * bsDown, int64_t(sizexDown))), int(std::min((by + 1) * bsDown, int64_t(sizeyDown))),  | 
902  |  |           stepDown, header.encoder, isChroma[c] ? chromaQuality : header.quality, hasDC && !bx && !by, isChroma[c] != 0);  | 
903  |  |         else for (int ox = 0; ox <= 1; ++ ox) for (int oy = 0; oy <= 1; ++ oy)  | 
904  |  |           decode(auxImage, streamBlock[block], int(bx * bsDown + ox), int(by * bsDown + oy),  | 
905  |  |           int(std::min((bx + 1) * bsDown, int64_t(sizexDown))), int(std::min((by + 1) * bsDown, int64_t(sizeyDown))),  | 
906  |  |           2 * stepDown, header.encoder, (ox || oy) ? chromaQuality : header.quality, hasDC && !bx && !by, ox || oy);  | 
907  |  |       }  | 
908  |  |       for (int block = 0; block < blockCount; ++ block) // check if any blocks ran out of buffer, which should not happen on valid files  | 
909  |  |         if (streamBlock[block].indexBits < 0)  | 
910  |  |           return ErrorMalformed;  | 
911  |  |       step /= 2;  | 
912  |  |     }  | 
913  |  |     if (test)  | 
914  |  |       return isTruncated ? nextPointOfInterest : ResultOk;  // return next point of interest if the data was truncated prior to completing request  | 
915  |  |     for (int c = 0; c < header.layers * header.channels; ++ c)  // dequantize and unlift the channels  | 
916  |  |     { | 
917  |  |       Image<aux> auxImage(&auxData[c * bufferSize], sizexDown, sizeyDown);  | 
918  |  |       if (header.intent >= IntentBayerRGGB && header.intent <= IntentBayerGeneric)  | 
919  |  |       { | 
920  |  |         for (int ox = 0; ox <= 1; ++ ox) for (int oy = 0; oy <= 1; ++ oy)  | 
921  |  |           quantize<aux, true>(auxImage, ox, oy, sizexDown, sizeyDown, 2,  | 
922  |  |             ((ox | oy) ? chromaQuality : header.quality) << downsampling, header.quality, QualityMax * boost);  | 
923  |  |         for (int ox = 0; ox <= 1; ++ ox) for (int oy = 1 - ox; oy <= 1; ++ oy)  | 
924  |  |           unlift(auxImage, ox, oy, sizexDown, sizeyDown, 2, header.filter);  | 
925  |  |       }  | 
926  |  |       else  | 
927  |  |         quantize<aux, true>(auxImage, 0, 0, sizexDown, sizeyDown, 1,  | 
928  |  |           (isChroma[c] ? chromaQuality : header.quality) << downsampling, 0, QualityMax * boost);  | 
929  |  |       unlift(auxImage, 0, 0, sizexDown, sizeyDown, 1, header.filter);  | 
930  |  |     }  | 
931  |  |     for (int s = (int)transformSteps.size() - 1; s >= 0; -- s)  // run color transform program in reverse  | 
932  |  |     { | 
933  |  |       int const * pc = &transformProgram[transformSteps[s]];  | 
934  |  |       int const c = *(pc ++);  | 
935  |  |       std::vector<aux> transformTemp(bufferSize, 0);  | 
936  |  |       transformTerm(pc, &transformTemp[0], &auxData[0], bufferSize, imageData, header, isChroma, boost);  | 
937  |  |       aux * destination = &auxData[c * bufferSize];  | 
938  |  |       OMP_PARALLEL_FOR(ThreadIterations * ThreadIterations)  | 
939  |  |       for (int i = 0; i < bufferSize; ++ i)  | 
940  |  |         destination[i] -= transformTemp[i];  | 
941  |  |     }  | 
942  |  |     for (int c = 0; c < header.layers * header.channels; ++ c)  // copy the channels to the destination buffer  | 
943  |  |     { | 
944  |  |       aux * destination = &auxData[c * bufferSize];  | 
945  |  |       auto layer = imageData + ((c / header.channels) * bufferSize * header.channels + c % header.channels);  | 
946  |  |       if (boost == 1)  | 
947  |  |       { | 
948  |  |         OMP_PARALLEL_FOR(ThreadIterations * ThreadIterations)  | 
949  |  |         for (int i = 0; i < bufferSize; ++ i)  | 
950  |  |           layer[i * header.channels] = static_cast<base>(std::max(static_cast<aux>(std::numeric_limits<base>::lowest()),  | 
951  |  |             std::min(static_cast<aux>(std::numeric_limits<base>::max()), static_cast<aux>(destination[i]))));  | 
952  |  |       }  | 
953  |  |       else  | 
954  |  |       { | 
955  |  |         OMP_PARALLEL_FOR(ThreadIterations * ThreadIterations)  | 
956  |  |         for (int i = 0; i < bufferSize; ++ i)  | 
957  |  |           layer[i * header.channels] = static_cast<base>(std::max(static_cast<aux>(std::numeric_limits<base>::lowest()),  | 
958  |  |             std::min(static_cast<aux>(std::numeric_limits<base>::max()), static_cast<aux>(destination[i] / boost))));  | 
959  |  |       }  | 
960  |  |       if (header.quality < QualityMax && header.intent >= IntentBayerRGGB && header.intent <= IntentBayerGBRG)  // check if Bayer cleanup is required  | 
961  |  |       { | 
962  |  |         int const bayerNoiseThresh = ((QualityMax + header.quality / 2) / header.quality + (QualityMax + chromaQuality / 2) / chromaQuality) * 2;  | 
963  |  |         Image<aux> auxImage(&auxData[c * bufferSize], sizexDown, sizeyDown);  | 
964  |  |         OMP_PARALLEL_FOR(ThreadIterations)  | 
965  |  |         for (int y = 1; y < sizeyDown - 1; ++ y)  | 
966  |  |           for (int x = 1 + (y + (header.intent == IntentBayerGBRG || header.intent == IntentBayerGRBG ? 1 : 0)) % 2; x < sizexDown - 1; x += 2)  | 
967  |  |           { | 
968  |  |             aux s = auxImage[y][x];  | 
969  |  |             aux sum = s * 4;  | 
970  |  |             int count = 4;  | 
971  |  |             for (int oy = -1; oy <= 1; oy += 2) for (int ox = -1; ox <= 1; ox += 2)  | 
972  |  |             { | 
973  |  |               aux t = auxImage[y + oy][x + ox];  | 
974  |  |               if (abs(s - t) > bayerNoiseThresh)  | 
975  |  |                 continue;  | 
976  |  |               sum += t;  | 
977  |  |               ++ count;  | 
978  |  |             }  | 
979  |  |             layer[(y * sizexDown + x) * header.channels]  | 
980  |  |               = static_cast<base>(std::max(static_cast<aux>(std::numeric_limits<base>::lowest()),  | 
981  |  |                 std::min(static_cast<aux>(std::numeric_limits<base>::max()), aux((sum + count / 2) / (count * boost)))));  | 
982  |  |           }  | 
983  |  |       }  | 
984  |  |     }  | 
985  |  |     return isTruncated ? nextPointOfInterest : ResultOk;  // return next point of interest if the data was truncated prior to completing request  | 
986  |  |   }  | 
987  |  | }  |