/src/libjpeg-turbo.main/jchuff.c
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | /*  | 
2  |  |  * jchuff.c  | 
3  |  |  *  | 
4  |  |  * This file was part of the Independent JPEG Group's software:  | 
5  |  |  * Copyright (C) 1991-1997, Thomas G. Lane.  | 
6  |  |  * Lossless JPEG Modifications:  | 
7  |  |  * Copyright (C) 1999, Ken Murchison.  | 
8  |  |  * libjpeg-turbo Modifications:  | 
9  |  |  * Copyright (C) 2009-2011, 2014-2016, 2018-2022, D. R. Commander.  | 
10  |  |  * Copyright (C) 2015, Matthieu Darbois.  | 
11  |  |  * Copyright (C) 2018, Matthias Räncker.  | 
12  |  |  * Copyright (C) 2020, Arm Limited.  | 
13  |  |  * Copyright (C) 2022, Felix Hanau.  | 
14  |  |  * For conditions of distribution and use, see the accompanying README.ijg  | 
15  |  |  * file.  | 
16  |  |  *  | 
17  |  |  * This file contains Huffman entropy encoding routines.  | 
18  |  |  *  | 
19  |  |  * Much of the complexity here has to do with supporting output suspension.  | 
20  |  |  * If the data destination module demands suspension, we want to be able to  | 
21  |  |  * back up to the start of the current MCU.  To do this, we copy state  | 
22  |  |  * variables into local working storage, and update them back to the  | 
23  |  |  * permanent JPEG objects only upon successful completion of an MCU.  | 
24  |  |  *  | 
25  |  |  * NOTE: All referenced figures are from  | 
26  |  |  * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.  | 
27  |  |  */  | 
28  |  |  | 
29  |  | #define JPEG_INTERNALS  | 
30  |  | #include "jinclude.h"  | 
31  |  | #include "jpeglib.h"  | 
32  |  | #ifdef WITH_SIMD  | 
33  |  | #include "jsimd.h"  | 
34  |  | #else  | 
35  |  | #include "jchuff.h"             /* Declarations shared with jc*huff.c */  | 
36  |  | #endif  | 
37  |  | #include <limits.h>  | 
38  |  |  | 
39  |  | /*  | 
40  |  |  * NOTE: If USE_CLZ_INTRINSIC is defined, then clz/bsr instructions will be  | 
41  |  |  * used for bit counting rather than the lookup table.  This will reduce the  | 
42  |  |  * memory footprint by 64k, which is important for some mobile applications  | 
43  |  |  * that create many isolated instances of libjpeg-turbo (web browsers, for  | 
44  |  |  * instance.)  This may improve performance on some mobile platforms as well.  | 
45  |  |  * This feature is enabled by default only on Arm processors, because some x86  | 
46  |  |  * chips have a slow implementation of bsr, and the use of clz/bsr cannot be  | 
47  |  |  * shown to have a significant performance impact even on the x86 chips that  | 
48  |  |  * have a fast implementation of it.  When building for Armv6, you can  | 
49  |  |  * explicitly disable the use of clz/bsr by adding -mthumb to the compiler  | 
50  |  |  * flags (this defines __thumb__).  | 
51  |  |  */  | 
52  |  |  | 
53  |  | /* NOTE: Both GCC and Clang define __GNUC__ */  | 
54  |  | #if (defined(__GNUC__) && (defined(__arm__) || defined(__aarch64__))) || \  | 
55  |  |     defined(_M_ARM) || defined(_M_ARM64)  | 
56  |  | #if !defined(__thumb__) || defined(__thumb2__)  | 
57  |  | #define USE_CLZ_INTRINSIC  | 
58  |  | #endif  | 
59  |  | #endif  | 
60  |  |  | 
61  |  | #ifdef USE_CLZ_INTRINSIC  | 
62  |  | #if defined(_MSC_VER) && !defined(__clang__)  | 
63  |  | #define JPEG_NBITS_NONZERO(x)  (32 - _CountLeadingZeros(x))  | 
64  |  | #else  | 
65  |  | #define JPEG_NBITS_NONZERO(x)  (32 - __builtin_clz(x))  | 
66  |  | #endif  | 
67  |  | #define JPEG_NBITS(x)          (x ? JPEG_NBITS_NONZERO(x) : 0)  | 
68  |  | #else  | 
69  |  | #include "jpeg_nbits_table.h"  | 
70  | 0  | #define JPEG_NBITS(x)          (jpeg_nbits_table[x])  | 
71  | 0  | #define JPEG_NBITS_NONZERO(x)  JPEG_NBITS(x)  | 
72  |  | #endif  | 
73  |  |  | 
74  |  |  | 
75  |  | /* Expanded entropy encoder object for Huffman encoding.  | 
76  |  |  *  | 
77  |  |  * The savable_state subrecord contains fields that change within an MCU,  | 
78  |  |  * but must not be updated permanently until we complete the MCU.  | 
79  |  |  */  | 
80  |  |  | 
81  |  | #if defined(__x86_64__) && defined(__ILP32__)  | 
82  |  | typedef unsigned long long bit_buf_type;  | 
83  |  | #else  | 
84  |  | typedef size_t bit_buf_type;  | 
85  |  | #endif  | 
86  |  |  | 
87  |  | /* NOTE: The more optimal Huffman encoding algorithm is only used by the  | 
88  |  |  * intrinsics implementation of the Arm Neon SIMD extensions, which is why we  | 
89  |  |  * retain the old Huffman encoder behavior when using the GAS implementation.  | 
90  |  |  */  | 
91  |  | #if defined(WITH_SIMD) && !(defined(__arm__) || defined(__aarch64__) || \  | 
92  |  |                             defined(_M_ARM) || defined(_M_ARM64))  | 
93  |  | typedef unsigned long long simd_bit_buf_type;  | 
94  |  | #else  | 
95  |  | typedef bit_buf_type simd_bit_buf_type;  | 
96  |  | #endif  | 
97  |  |  | 
98  |  | #if (defined(SIZEOF_SIZE_T) && SIZEOF_SIZE_T == 8) || defined(_WIN64) || \  | 
99  |  |     (defined(__x86_64__) && defined(__ILP32__))  | 
100  | 0  | #define BIT_BUF_SIZE  64  | 
101  |  | #elif (defined(SIZEOF_SIZE_T) && SIZEOF_SIZE_T == 4) || defined(_WIN32)  | 
102  |  | #define BIT_BUF_SIZE  32  | 
103  |  | #else  | 
104  |  | #error Cannot determine word size  | 
105  |  | #endif  | 
106  | 29.3k  | #define SIMD_BIT_BUF_SIZE  (sizeof(simd_bit_buf_type) * 8)  | 
107  |  |  | 
108  |  | typedef struct { | 
109  |  |   union { | 
110  |  |     bit_buf_type c;  | 
111  |  |     simd_bit_buf_type simd;  | 
112  |  |   } put_buffer;                         /* current bit accumulation buffer */  | 
113  |  |   int free_bits;                        /* # of bits available in it */  | 
114  |  |                                         /* (Neon GAS: # of bits now in it) */  | 
115  |  |   int last_dc_val[MAX_COMPS_IN_SCAN];   /* last DC coef for each component */  | 
116  |  | } savable_state;  | 
117  |  |  | 
118  |  | typedef struct { | 
119  |  |   struct jpeg_entropy_encoder pub; /* public fields */  | 
120  |  |  | 
121  |  |   savable_state saved;          /* Bit buffer & DC state at start of MCU */  | 
122  |  |  | 
123  |  |   /* These fields are NOT loaded into local working state. */  | 
124  |  |   unsigned int restarts_to_go;  /* MCUs left in this restart interval */  | 
125  |  |   int next_restart_num;         /* next restart number to write (0-7) */  | 
126  |  |  | 
127  |  |   /* Pointers to derived tables (these workspaces have image lifespan) */  | 
128  |  |   c_derived_tbl *dc_derived_tbls[NUM_HUFF_TBLS];  | 
129  |  |   c_derived_tbl *ac_derived_tbls[NUM_HUFF_TBLS];  | 
130  |  |  | 
131  |  | #ifdef ENTROPY_OPT_SUPPORTED    /* Statistics tables for optimization */  | 
132  |  |   long *dc_count_ptrs[NUM_HUFF_TBLS];  | 
133  |  |   long *ac_count_ptrs[NUM_HUFF_TBLS];  | 
134  |  | #endif  | 
135  |  |  | 
136  |  |   int simd;  | 
137  |  | } huff_entropy_encoder;  | 
138  |  |  | 
139  |  | typedef huff_entropy_encoder *huff_entropy_ptr;  | 
140  |  |  | 
141  |  | /* Working state while writing an MCU.  | 
142  |  |  * This struct contains all the fields that are needed by subroutines.  | 
143  |  |  */  | 
144  |  |  | 
145  |  | typedef struct { | 
146  |  |   JOCTET *next_output_byte;     /* => next byte to write in buffer */  | 
147  |  |   size_t free_in_buffer;        /* # of byte spaces remaining in buffer */  | 
148  |  |   savable_state cur;            /* Current bit buffer & DC state */  | 
149  |  |   j_compress_ptr cinfo;         /* dump_buffer needs access to this */  | 
150  |  |   int simd;  | 
151  |  | } working_state;  | 
152  |  |  | 
153  |  |  | 
154  |  | /* Forward declarations */  | 
155  |  | METHODDEF(boolean) encode_mcu_huff(j_compress_ptr cinfo, JBLOCKROW *MCU_data);  | 
156  |  | METHODDEF(void) finish_pass_huff(j_compress_ptr cinfo);  | 
157  |  | #ifdef ENTROPY_OPT_SUPPORTED  | 
158  |  | METHODDEF(boolean) encode_mcu_gather(j_compress_ptr cinfo,  | 
159  |  |                                      JBLOCKROW *MCU_data);  | 
160  |  | METHODDEF(void) finish_pass_gather(j_compress_ptr cinfo);  | 
161  |  | #endif  | 
162  |  |  | 
163  |  |  | 
164  |  | /*  | 
165  |  |  * Initialize for a Huffman-compressed scan.  | 
166  |  |  * If gather_statistics is TRUE, we do not output anything during the scan,  | 
167  |  |  * just count the Huffman symbols used and generate Huffman code tables.  | 
168  |  |  */  | 
169  |  |  | 
170  |  | METHODDEF(void)  | 
171  |  | start_pass_huff(j_compress_ptr cinfo, boolean gather_statistics)  | 
172  | 14.6k  | { | 
173  | 14.6k  |   huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;  | 
174  | 14.6k  |   int ci, dctbl, actbl;  | 
175  | 14.6k  |   jpeg_component_info *compptr;  | 
176  |  |  | 
177  | 14.6k  |   if (gather_statistics) { | 
178  | 7.33k  | #ifdef ENTROPY_OPT_SUPPORTED  | 
179  | 7.33k  |     entropy->pub.encode_mcu = encode_mcu_gather;  | 
180  | 7.33k  |     entropy->pub.finish_pass = finish_pass_gather;  | 
181  |  | #else  | 
182  |  |     ERREXIT(cinfo, JERR_NOT_COMPILED);  | 
183  |  | #endif  | 
184  | 7.33k  |   } else { | 
185  | 7.33k  |     entropy->pub.encode_mcu = encode_mcu_huff;  | 
186  | 7.33k  |     entropy->pub.finish_pass = finish_pass_huff;  | 
187  | 7.33k  |   }  | 
188  |  |  | 
189  | 14.6k  | #ifdef WITH_SIMD  | 
190  | 14.6k  |   entropy->simd = jsimd_can_huff_encode_one_block();  | 
191  | 14.6k  | #endif  | 
192  |  |  | 
193  | 48.4k  |   for (ci = 0; ci < cinfo->comps_in_scan; ci++) { | 
194  | 33.8k  |     compptr = cinfo->cur_comp_info[ci];  | 
195  | 33.8k  |     dctbl = compptr->dc_tbl_no;  | 
196  | 33.8k  |     actbl = compptr->ac_tbl_no;  | 
197  | 33.8k  |     if (gather_statistics) { | 
198  | 16.9k  | #ifdef ENTROPY_OPT_SUPPORTED  | 
199  |  |       /* Check for invalid table indexes */  | 
200  |  |       /* (make_c_derived_tbl does this in the other path) */  | 
201  | 16.9k  |       if (dctbl < 0 || dctbl >= NUM_HUFF_TBLS)  | 
202  | 0  |         ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, dctbl);  | 
203  | 16.9k  |       if (actbl < 0 || actbl >= NUM_HUFF_TBLS)  | 
204  | 0  |         ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, actbl);  | 
205  |  |       /* Allocate and zero the statistics tables */  | 
206  |  |       /* Note that jpeg_gen_optimal_table expects 257 entries in each table! */  | 
207  | 16.9k  |       if (entropy->dc_count_ptrs[dctbl] == NULL)  | 
208  | 11.1k  |         entropy->dc_count_ptrs[dctbl] = (long *)  | 
209  | 11.1k  |           (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,  | 
210  | 11.1k  |                                       257 * sizeof(long));  | 
211  | 16.9k  |       memset(entropy->dc_count_ptrs[dctbl], 0, 257 * sizeof(long));  | 
212  | 16.9k  |       if (entropy->ac_count_ptrs[actbl] == NULL)  | 
213  | 11.1k  |         entropy->ac_count_ptrs[actbl] = (long *)  | 
214  | 11.1k  |           (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,  | 
215  | 11.1k  |                                       257 * sizeof(long));  | 
216  | 16.9k  |       memset(entropy->ac_count_ptrs[actbl], 0, 257 * sizeof(long));  | 
217  | 16.9k  | #endif  | 
218  | 16.9k  |     } else { | 
219  |  |       /* Compute derived values for Huffman tables */  | 
220  |  |       /* We may do this more than once for a table, but it's not expensive */  | 
221  | 16.9k  |       jpeg_make_c_derived_tbl(cinfo, TRUE, dctbl,  | 
222  | 16.9k  |                               &entropy->dc_derived_tbls[dctbl]);  | 
223  | 16.9k  |       jpeg_make_c_derived_tbl(cinfo, FALSE, actbl,  | 
224  | 16.9k  |                               &entropy->ac_derived_tbls[actbl]);  | 
225  | 16.9k  |     }  | 
226  |  |     /* Initialize DC predictions to 0 */  | 
227  | 33.8k  |     entropy->saved.last_dc_val[ci] = 0;  | 
228  | 33.8k  |   }  | 
229  |  |  | 
230  |  |   /* Initialize bit buffer to empty */  | 
231  | 14.6k  | #ifdef WITH_SIMD  | 
232  | 14.6k  |   if (entropy->simd) { | 
233  | 14.6k  |     entropy->saved.put_buffer.simd = 0;  | 
234  |  | #if defined(__aarch64__) && !defined(NEON_INTRINSICS)  | 
235  |  |     entropy->saved.free_bits = 0;  | 
236  |  | #else  | 
237  | 14.6k  |     entropy->saved.free_bits = SIMD_BIT_BUF_SIZE;  | 
238  | 14.6k  | #endif  | 
239  | 14.6k  |   } else  | 
240  | 0  | #endif  | 
241  | 0  |   { | 
242  | 0  |     entropy->saved.put_buffer.c = 0;  | 
243  | 0  |     entropy->saved.free_bits = BIT_BUF_SIZE;  | 
244  | 0  |   }  | 
245  |  |  | 
246  |  |   /* Initialize restart stuff */  | 
247  | 14.6k  |   entropy->restarts_to_go = cinfo->restart_interval;  | 
248  | 14.6k  |   entropy->next_restart_num = 0;  | 
249  | 14.6k  | }  | 
250  |  |  | 
251  |  |  | 
252  |  | /*  | 
253  |  |  * Compute the derived values for a Huffman table.  | 
254  |  |  * This routine also performs some validation checks on the table.  | 
255  |  |  *  | 
256  |  |  * Note this is also used by jcphuff.c and jclhuff.c.  | 
257  |  |  */  | 
258  |  |  | 
259  |  | GLOBAL(void)  | 
260  |  | jpeg_make_c_derived_tbl(j_compress_ptr cinfo, boolean isDC, int tblno,  | 
261  |  |                         c_derived_tbl **pdtbl)  | 
262  | 54.9k  | { | 
263  | 54.9k  |   JHUFF_TBL *htbl;  | 
264  | 54.9k  |   c_derived_tbl *dtbl;  | 
265  | 54.9k  |   int p, i, l, lastp, si, maxsymbol;  | 
266  | 54.9k  |   char huffsize[257];  | 
267  | 54.9k  |   unsigned int huffcode[257];  | 
268  | 54.9k  |   unsigned int code;  | 
269  |  |  | 
270  |  |   /* Note that huffsize[] and huffcode[] are filled in code-length order,  | 
271  |  |    * paralleling the order of the symbols themselves in htbl->huffval[].  | 
272  |  |    */  | 
273  |  |  | 
274  |  |   /* Find the input Huffman table */  | 
275  | 54.9k  |   if (tblno < 0 || tblno >= NUM_HUFF_TBLS)  | 
276  | 0  |     ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, tblno);  | 
277  | 54.9k  |   htbl =  | 
278  | 54.9k  |     isDC ? cinfo->dc_huff_tbl_ptrs[tblno] : cinfo->ac_huff_tbl_ptrs[tblno];  | 
279  | 54.9k  |   if (htbl == NULL)  | 
280  | 0  |     ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, tblno);  | 
281  |  |  | 
282  |  |   /* Allocate a workspace if we haven't already done so. */  | 
283  | 54.9k  |   if (*pdtbl == NULL)  | 
284  | 26.1k  |     *pdtbl = (c_derived_tbl *)  | 
285  | 26.1k  |       (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,  | 
286  | 26.1k  |                                   sizeof(c_derived_tbl));  | 
287  | 54.9k  |   dtbl = *pdtbl;  | 
288  |  |  | 
289  |  |   /* Figure C.1: make table of Huffman code length for each symbol */  | 
290  |  |  | 
291  | 54.9k  |   p = 0;  | 
292  | 933k  |   for (l = 1; l <= 16; l++) { | 
293  | 878k  |     i = (int)htbl->bits[l];  | 
294  | 878k  |     if (i < 0 || p + i > 256)   /* protect against table overrun */  | 
295  | 0  |       ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);  | 
296  | 1.34M  |     while (i--)  | 
297  | 468k  |       huffsize[p++] = (char)l;  | 
298  | 878k  |   }  | 
299  | 54.9k  |   huffsize[p] = 0;  | 
300  | 54.9k  |   lastp = p;  | 
301  |  |  | 
302  |  |   /* Figure C.2: generate the codes themselves */  | 
303  |  |   /* We also validate that the counts represent a legal Huffman code tree. */  | 
304  |  |  | 
305  | 54.9k  |   code = 0;  | 
306  | 54.9k  |   si = huffsize[0];  | 
307  | 54.9k  |   p = 0;  | 
308  | 246k  |   while (huffsize[p]) { | 
309  | 660k  |     while (((int)huffsize[p]) == si) { | 
310  | 468k  |       huffcode[p++] = code;  | 
311  | 468k  |       code++;  | 
312  | 468k  |     }  | 
313  |  |     /* code is now 1 more than the last code used for codelength si; but  | 
314  |  |      * it must still fit in si bits, since no code is allowed to be all ones.  | 
315  |  |      */  | 
316  | 191k  |     if (((JLONG)code) >= (((JLONG)1) << si))  | 
317  | 0  |       ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);  | 
318  | 191k  |     code <<= 1;  | 
319  | 191k  |     si++;  | 
320  | 191k  |   }  | 
321  |  |  | 
322  |  |   /* Figure C.3: generate encoding tables */  | 
323  |  |   /* These are code and size indexed by symbol value */  | 
324  |  |  | 
325  |  |   /* Set all codeless symbols to have code length 0;  | 
326  |  |    * this lets us detect duplicate VAL entries here, and later  | 
327  |  |    * allows emit_bits to detect any attempt to emit such symbols.  | 
328  |  |    */  | 
329  | 54.9k  |   memset(dtbl->ehufco, 0, sizeof(dtbl->ehufco));  | 
330  | 54.9k  |   memset(dtbl->ehufsi, 0, sizeof(dtbl->ehufsi));  | 
331  |  |  | 
332  |  |   /* This is also a convenient place to check for out-of-range and duplicated  | 
333  |  |    * VAL entries.  We allow 0..255 for AC symbols but only 0..15 for DC in  | 
334  |  |    * lossy mode and 0..16 for DC in lossless mode.  (We could constrain them  | 
335  |  |    * further based on data depth and mode, but this seems enough.)  | 
336  |  |    */  | 
337  | 54.9k  |   maxsymbol = isDC ? (cinfo->master->lossless ? 16 : 15) : 255;  | 
338  |  |  | 
339  | 523k  |   for (p = 0; p < lastp; p++) { | 
340  | 468k  |     i = htbl->huffval[p];  | 
341  | 468k  |     if (i < 0 || i > maxsymbol || dtbl->ehufsi[i])  | 
342  | 0  |       ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);  | 
343  | 468k  |     dtbl->ehufco[i] = huffcode[p];  | 
344  | 468k  |     dtbl->ehufsi[i] = huffsize[p];  | 
345  | 468k  |   }  | 
346  | 54.9k  | }  | 
347  |  |  | 
348  |  |  | 
349  |  | /* Outputting bytes to the file */  | 
350  |  |  | 
351  |  | /* Emit a byte, taking 'action' if must suspend. */  | 
352  | 0  | #define emit_byte(state, val, action) { \ | 
353  | 0  |   *(state)->next_output_byte++ = (JOCTET)(val); \  | 
354  | 0  |   if (--(state)->free_in_buffer == 0) \  | 
355  | 0  |     if (!dump_buffer(state)) \  | 
356  | 0  |       { action; } \ | 
357  | 0  | }  | 
358  |  |  | 
359  |  |  | 
360  |  | LOCAL(boolean)  | 
361  |  | dump_buffer(working_state *state)  | 
362  |  | /* Empty the output buffer; return TRUE if successful, FALSE if must suspend */  | 
363  | 0  | { | 
364  | 0  |   struct jpeg_destination_mgr *dest = state->cinfo->dest;  | 
365  |  | 
  | 
366  | 0  |   if (!(*dest->empty_output_buffer) (state->cinfo))  | 
367  | 0  |     return FALSE;  | 
368  |  |   /* After a successful buffer dump, must reset buffer pointers */  | 
369  | 0  |   state->next_output_byte = dest->next_output_byte;  | 
370  | 0  |   state->free_in_buffer = dest->free_in_buffer;  | 
371  | 0  |   return TRUE;  | 
372  | 0  | }  | 
373  |  |  | 
374  |  |  | 
375  |  | /* Outputting bits to the file */  | 
376  |  |  | 
377  |  | /* Output byte b and, speculatively, an additional 0 byte.  0xFF must be  | 
378  |  |  * encoded as 0xFF 0x00, so the output buffer pointer is advanced by 2 if the  | 
379  |  |  * byte is 0xFF.  Otherwise, the output buffer pointer is advanced by 1, and  | 
380  |  |  * the speculative 0 byte will be overwritten by the next byte.  | 
381  |  |  */  | 
382  | 31.5k  | #define EMIT_BYTE(b) { \ | 
383  | 31.5k  |   buffer[0] = (JOCTET)(b); \  | 
384  | 31.5k  |   buffer[1] = 0; \  | 
385  | 31.5k  |   buffer -= -2 + ((JOCTET)(b) < 0xFF); \  | 
386  | 31.5k  | }  | 
387  |  |  | 
388  |  | /* Output the entire bit buffer.  If there are no 0xFF bytes in it, then write  | 
389  |  |  * directly to the output buffer.  Otherwise, use the EMIT_BYTE() macro to  | 
390  |  |  * encode 0xFF as 0xFF 0x00.  | 
391  |  |  */  | 
392  |  | #if BIT_BUF_SIZE == 64  | 
393  |  |  | 
394  | 0  | #define FLUSH() { \ | 
395  | 0  |   if (put_buffer & 0x8080808080808080 & ~(put_buffer + 0x0101010101010101)) { \ | 
396  | 0  |     EMIT_BYTE(put_buffer >> 56) \  | 
397  | 0  |     EMIT_BYTE(put_buffer >> 48) \  | 
398  | 0  |     EMIT_BYTE(put_buffer >> 40) \  | 
399  | 0  |     EMIT_BYTE(put_buffer >> 32) \  | 
400  | 0  |     EMIT_BYTE(put_buffer >> 24) \  | 
401  | 0  |     EMIT_BYTE(put_buffer >> 16) \  | 
402  | 0  |     EMIT_BYTE(put_buffer >>  8) \  | 
403  | 0  |     EMIT_BYTE(put_buffer      ) \  | 
404  | 0  |   } else { \ | 
405  | 0  |     buffer[0] = (JOCTET)(put_buffer >> 56); \  | 
406  | 0  |     buffer[1] = (JOCTET)(put_buffer >> 48); \  | 
407  | 0  |     buffer[2] = (JOCTET)(put_buffer >> 40); \  | 
408  | 0  |     buffer[3] = (JOCTET)(put_buffer >> 32); \  | 
409  | 0  |     buffer[4] = (JOCTET)(put_buffer >> 24); \  | 
410  | 0  |     buffer[5] = (JOCTET)(put_buffer >> 16); \  | 
411  | 0  |     buffer[6] = (JOCTET)(put_buffer >> 8); \  | 
412  | 0  |     buffer[7] = (JOCTET)(put_buffer); \  | 
413  | 0  |     buffer += 8; \  | 
414  | 0  |   } \  | 
415  | 0  | }  | 
416  |  |  | 
417  |  | #else  | 
418  |  |  | 
419  |  | #define FLUSH() { \ | 
420  |  |   if (put_buffer & 0x80808080 & ~(put_buffer + 0x01010101)) { \ | 
421  |  |     EMIT_BYTE(put_buffer >> 24) \  | 
422  |  |     EMIT_BYTE(put_buffer >> 16) \  | 
423  |  |     EMIT_BYTE(put_buffer >>  8) \  | 
424  |  |     EMIT_BYTE(put_buffer      ) \  | 
425  |  |   } else { \ | 
426  |  |     buffer[0] = (JOCTET)(put_buffer >> 24); \  | 
427  |  |     buffer[1] = (JOCTET)(put_buffer >> 16); \  | 
428  |  |     buffer[2] = (JOCTET)(put_buffer >> 8); \  | 
429  |  |     buffer[3] = (JOCTET)(put_buffer); \  | 
430  |  |     buffer += 4; \  | 
431  |  |   } \  | 
432  |  | }  | 
433  |  |  | 
434  |  | #endif  | 
435  |  |  | 
436  |  | /* Fill the bit buffer to capacity with the leading bits from code, then output  | 
437  |  |  * the bit buffer and put the remaining bits from code into the bit buffer.  | 
438  |  |  */  | 
439  | 0  | #define PUT_AND_FLUSH(code, size) { \ | 
440  | 0  |   put_buffer = (put_buffer << (size + free_bits)) | (code >> -free_bits); \  | 
441  | 0  |   FLUSH() \  | 
442  | 0  |   free_bits += BIT_BUF_SIZE; \  | 
443  | 0  |   put_buffer = code; \  | 
444  | 0  | }  | 
445  |  |  | 
446  |  | /* Insert code into the bit buffer and output the bit buffer if needed.  | 
447  |  |  * NOTE: We can't flush with free_bits == 0, since the left shift in  | 
448  |  |  * PUT_AND_FLUSH() would have undefined behavior.  | 
449  |  |  */  | 
450  | 0  | #define PUT_BITS(code, size) { \ | 
451  | 0  |   free_bits -= size; \  | 
452  | 0  |   if (free_bits < 0) \  | 
453  | 0  |     PUT_AND_FLUSH(code, size) \  | 
454  | 0  |   else \  | 
455  | 0  |     put_buffer = (put_buffer << size) | code; \  | 
456  | 0  | }  | 
457  |  |  | 
458  | 0  | #define PUT_CODE(code, size) { \ | 
459  | 0  |   temp &= (((JLONG)1) << nbits) - 1; \  | 
460  | 0  |   temp |= code << nbits; \  | 
461  | 0  |   nbits += size; \  | 
462  | 0  |   PUT_BITS(temp, nbits) \  | 
463  | 0  | }  | 
464  |  |  | 
465  |  |  | 
466  |  | /* Although it is exceedingly rare, it is possible for a Huffman-encoded  | 
467  |  |  * coefficient block to be larger than the 128-byte unencoded block.  For each  | 
468  |  |  * of the 64 coefficients, PUT_BITS is invoked twice, and each invocation can  | 
469  |  |  * theoretically store 16 bits (for a maximum of 2048 bits or 256 bytes per  | 
470  |  |  * encoded block.)  If, for instance, one artificially sets the AC  | 
471  |  |  * coefficients to alternating values of 32767 and -32768 (using the JPEG  | 
472  |  |  * scanning order-- 1, 8, 16, etc.), then this will produce an encoded block  | 
473  |  |  * larger than 200 bytes.  | 
474  |  |  */  | 
475  | 10.1M  | #define BUFSIZE  (DCTSIZE2 * 8)  | 
476  |  |  | 
477  | 10.1M  | #define LOAD_BUFFER() { \ | 
478  | 10.1M  |   if (state->free_in_buffer < BUFSIZE) { \ | 
479  | 0  |     localbuf = 1; \  | 
480  | 0  |     buffer = _buffer; \  | 
481  | 0  |   } else \  | 
482  | 10.1M  |     buffer = state->next_output_byte; \  | 
483  | 10.1M  | }  | 
484  |  |  | 
485  | 10.1M  | #define STORE_BUFFER() { \ | 
486  | 10.1M  |   if (localbuf) { \ | 
487  | 0  |     size_t bytes, bytestocopy; \  | 
488  | 0  |     bytes = buffer - _buffer; \  | 
489  | 0  |     buffer = _buffer; \  | 
490  | 0  |     while (bytes > 0) { \ | 
491  | 0  |       bytestocopy = MIN(bytes, state->free_in_buffer); \  | 
492  | 0  |       memcpy(state->next_output_byte, buffer, bytestocopy); \  | 
493  | 0  |       state->next_output_byte += bytestocopy; \  | 
494  | 0  |       buffer += bytestocopy; \  | 
495  | 0  |       state->free_in_buffer -= bytestocopy; \  | 
496  | 0  |       if (state->free_in_buffer == 0) \  | 
497  | 0  |         if (!dump_buffer(state)) return FALSE; \  | 
498  | 0  |       bytes -= bytestocopy; \  | 
499  | 0  |     } \  | 
500  | 10.1M  |   } else { \ | 
501  | 10.1M  |     state->free_in_buffer -= (buffer - state->next_output_byte); \  | 
502  | 10.1M  |     state->next_output_byte = buffer; \  | 
503  | 10.1M  |   } \  | 
504  | 10.1M  | }  | 
505  |  |  | 
506  |  |  | 
507  |  | LOCAL(boolean)  | 
508  |  | flush_bits(working_state *state)  | 
509  | 7.33k  | { | 
510  | 7.33k  |   JOCTET _buffer[BUFSIZE], *buffer, temp;  | 
511  | 7.33k  |   simd_bit_buf_type put_buffer;  int put_bits;  | 
512  | 7.33k  |   int localbuf = 0;  | 
513  |  |  | 
514  | 7.33k  |   if (state->simd) { | 
515  |  | #if defined(__aarch64__) && !defined(NEON_INTRINSICS)  | 
516  |  |     put_bits = state->cur.free_bits;  | 
517  |  | #else  | 
518  | 7.33k  |     put_bits = SIMD_BIT_BUF_SIZE - state->cur.free_bits;  | 
519  | 7.33k  | #endif  | 
520  | 7.33k  |     put_buffer = state->cur.put_buffer.simd;  | 
521  | 7.33k  |   } else { | 
522  | 0  |     put_bits = BIT_BUF_SIZE - state->cur.free_bits;  | 
523  | 0  |     put_buffer = state->cur.put_buffer.c;  | 
524  | 0  |   }  | 
525  |  |  | 
526  | 7.33k  |   LOAD_BUFFER()  | 
527  |  |  | 
528  | 32.5k  |   while (put_bits >= 8) { | 
529  | 25.1k  |     put_bits -= 8;  | 
530  | 25.1k  |     temp = (JOCTET)(put_buffer >> put_bits);  | 
531  | 25.1k  |     EMIT_BYTE(temp)  | 
532  | 25.1k  |   }  | 
533  | 7.33k  |   if (put_bits) { | 
534  |  |     /* fill partial byte with ones */  | 
535  | 6.39k  |     temp = (JOCTET)((put_buffer << (8 - put_bits)) | (0xFF >> put_bits));  | 
536  | 6.39k  |     EMIT_BYTE(temp)  | 
537  | 6.39k  |   }  | 
538  |  |  | 
539  | 7.33k  |   if (state->simd) {                    /* and reset bit buffer to empty */ | 
540  | 7.33k  |     state->cur.put_buffer.simd = 0;  | 
541  |  | #if defined(__aarch64__) && !defined(NEON_INTRINSICS)  | 
542  |  |     state->cur.free_bits = 0;  | 
543  |  | #else  | 
544  | 7.33k  |     state->cur.free_bits = SIMD_BIT_BUF_SIZE;  | 
545  | 7.33k  | #endif  | 
546  | 7.33k  |   } else { | 
547  | 0  |     state->cur.put_buffer.c = 0;  | 
548  | 0  |     state->cur.free_bits = BIT_BUF_SIZE;  | 
549  | 0  |   }  | 
550  | 7.33k  |   STORE_BUFFER()  | 
551  |  |  | 
552  | 7.33k  |   return TRUE;  | 
553  | 7.33k  | }  | 
554  |  |  | 
555  |  |  | 
556  |  | #ifdef WITH_SIMD  | 
557  |  |  | 
558  |  | /* Encode a single block's worth of coefficients */  | 
559  |  |  | 
560  |  | LOCAL(boolean)  | 
561  |  | encode_one_block_simd(working_state *state, JCOEFPTR block, int last_dc_val,  | 
562  |  |                       c_derived_tbl *dctbl, c_derived_tbl *actbl)  | 
563  | 10.1M  | { | 
564  | 10.1M  |   JOCTET _buffer[BUFSIZE], *buffer;  | 
565  | 10.1M  |   int localbuf = 0;  | 
566  |  |  | 
567  | 10.1M  |   LOAD_BUFFER()  | 
568  |  |  | 
569  | 10.1M  |   buffer = jsimd_huff_encode_one_block(state, buffer, block, last_dc_val,  | 
570  | 10.1M  |                                        dctbl, actbl);  | 
571  |  |  | 
572  | 10.1M  |   STORE_BUFFER()  | 
573  |  |  | 
574  | 10.1M  |   return TRUE;  | 
575  | 10.1M  | }  | 
576  |  |  | 
577  |  | #endif  | 
578  |  |  | 
579  |  | LOCAL(boolean)  | 
580  |  | encode_one_block(working_state *state, JCOEFPTR block, int last_dc_val,  | 
581  |  |                  c_derived_tbl *dctbl, c_derived_tbl *actbl)  | 
582  | 0  | { | 
583  | 0  |   int temp, nbits, free_bits;  | 
584  | 0  |   bit_buf_type put_buffer;  | 
585  | 0  |   JOCTET _buffer[BUFSIZE], *buffer;  | 
586  | 0  |   int localbuf = 0;  | 
587  |  | 
  | 
588  | 0  |   free_bits = state->cur.free_bits;  | 
589  | 0  |   put_buffer = state->cur.put_buffer.c;  | 
590  | 0  |   LOAD_BUFFER()  | 
591  |  |  | 
592  |  |   /* Encode the DC coefficient difference per section F.1.2.1 */  | 
593  |  | 
  | 
594  | 0  |   temp = block[0] - last_dc_val;  | 
595  |  |  | 
596  |  |   /* This is a well-known technique for obtaining the absolute value without a  | 
597  |  |    * branch.  It is derived from an assembly language technique presented in  | 
598  |  |    * "How to Optimize for the Pentium Processors", Copyright (c) 1996, 1997 by  | 
599  |  |    * Agner Fog.  This code assumes we are on a two's complement machine.  | 
600  |  |    */  | 
601  | 0  |   nbits = temp >> (CHAR_BIT * sizeof(int) - 1);  | 
602  | 0  |   temp += nbits;  | 
603  | 0  |   nbits ^= temp;  | 
604  |  |  | 
605  |  |   /* Find the number of bits needed for the magnitude of the coefficient */  | 
606  | 0  |   nbits = JPEG_NBITS(nbits);  | 
607  |  |  | 
608  |  |   /* Emit the Huffman-coded symbol for the number of bits.  | 
609  |  |    * Emit that number of bits of the value, if positive,  | 
610  |  |    * or the complement of its magnitude, if negative.  | 
611  |  |    */  | 
612  | 0  |   PUT_CODE(dctbl->ehufco[nbits], dctbl->ehufsi[nbits])  | 
613  |  |  | 
614  |  |   /* Encode the AC coefficients per section F.1.2.2 */  | 
615  |  | 
  | 
616  | 0  |   { | 
617  | 0  |     int r = 0;                  /* r = run length of zeros */  | 
618  |  |  | 
619  |  | /* Manually unroll the k loop to eliminate the counter variable.  This  | 
620  |  |  * improves performance greatly on systems with a limited number of  | 
621  |  |  * registers (such as x86.)  | 
622  |  |  */  | 
623  | 0  | #define kloop(jpeg_natural_order_of_k) { \ | 
624  | 0  |   if ((temp = block[jpeg_natural_order_of_k]) == 0) { \ | 
625  | 0  |     r += 16; \  | 
626  | 0  |   } else { \ | 
627  |  |     /* Branch-less absolute value, bitwise complement, etc., same as above */ \  | 
628  | 0  |     nbits = temp >> (CHAR_BIT * sizeof(int) - 1); \  | 
629  | 0  |     temp += nbits; \  | 
630  | 0  |     nbits ^= temp; \  | 
631  | 0  |     nbits = JPEG_NBITS_NONZERO(nbits); \  | 
632  |  |     /* if run length > 15, must emit special run-length-16 codes (0xF0) */ \  | 
633  | 0  |     while (r >= 16 * 16) { \ | 
634  | 0  |       r -= 16 * 16; \  | 
635  | 0  |       PUT_BITS(actbl->ehufco[0xf0], actbl->ehufsi[0xf0]) \  | 
636  | 0  |     } \  | 
637  |  |     /* Emit Huffman symbol for run length / number of bits */ \  | 
638  | 0  |     r += nbits; \  | 
639  | 0  |     PUT_CODE(actbl->ehufco[r], actbl->ehufsi[r]) \  | 
640  | 0  |     r = 0; \  | 
641  | 0  |   } \  | 
642  | 0  | }  | 
643  |  |  | 
644  |  |     /* One iteration for each value in jpeg_natural_order[] */  | 
645  | 0  |     kloop(1);   kloop(8);   kloop(16);  kloop(9);   kloop(2);   kloop(3);  | 
646  | 0  |     kloop(10);  kloop(17);  kloop(24);  kloop(32);  kloop(25);  kloop(18);  | 
647  | 0  |     kloop(11);  kloop(4);   kloop(5);   kloop(12);  kloop(19);  kloop(26);  | 
648  | 0  |     kloop(33);  kloop(40);  kloop(48);  kloop(41);  kloop(34);  kloop(27);  | 
649  | 0  |     kloop(20);  kloop(13);  kloop(6);   kloop(7);   kloop(14);  kloop(21);  | 
650  | 0  |     kloop(28);  kloop(35);  kloop(42);  kloop(49);  kloop(56);  kloop(57);  | 
651  | 0  |     kloop(50);  kloop(43);  kloop(36);  kloop(29);  kloop(22);  kloop(15);  | 
652  | 0  |     kloop(23);  kloop(30);  kloop(37);  kloop(44);  kloop(51);  kloop(58);  | 
653  | 0  |     kloop(59);  kloop(52);  kloop(45);  kloop(38);  kloop(31);  kloop(39);  | 
654  | 0  |     kloop(46);  kloop(53);  kloop(60);  kloop(61);  kloop(54);  kloop(47);  | 
655  | 0  |     kloop(55);  kloop(62);  kloop(63);  | 
656  |  |  | 
657  |  |     /* If the last coef(s) were zero, emit an end-of-block code */  | 
658  | 0  |     if (r > 0) { | 
659  | 0  |       PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0])  | 
660  | 0  |     }  | 
661  | 0  |   }  | 
662  |  | 
  | 
663  | 0  |   state->cur.put_buffer.c = put_buffer;  | 
664  | 0  |   state->cur.free_bits = free_bits;  | 
665  | 0  |   STORE_BUFFER()  | 
666  |  |  | 
667  | 0  |   return TRUE;  | 
668  | 0  | }  | 
669  |  |  | 
670  |  |  | 
671  |  | /*  | 
672  |  |  * Emit a restart marker & resynchronize predictions.  | 
673  |  |  */  | 
674  |  |  | 
675  |  | LOCAL(boolean)  | 
676  |  | emit_restart(working_state *state, int restart_num)  | 
677  | 0  | { | 
678  | 0  |   int ci;  | 
679  |  | 
  | 
680  | 0  |   if (!flush_bits(state))  | 
681  | 0  |     return FALSE;  | 
682  |  |  | 
683  | 0  |   emit_byte(state, 0xFF, return FALSE);  | 
684  | 0  |   emit_byte(state, JPEG_RST0 + restart_num, return FALSE);  | 
685  |  |  | 
686  |  |   /* Re-initialize DC predictions to 0 */  | 
687  | 0  |   for (ci = 0; ci < state->cinfo->comps_in_scan; ci++)  | 
688  | 0  |     state->cur.last_dc_val[ci] = 0;  | 
689  |  |  | 
690  |  |   /* The restart counter is not updated until we successfully write the MCU. */  | 
691  |  | 
  | 
692  | 0  |   return TRUE;  | 
693  | 0  | }  | 
694  |  |  | 
695  |  |  | 
696  |  | /*  | 
697  |  |  * Encode and output one MCU's worth of Huffman-compressed coefficients.  | 
698  |  |  */  | 
699  |  |  | 
700  |  | METHODDEF(boolean)  | 
701  |  | encode_mcu_huff(j_compress_ptr cinfo, JBLOCKROW *MCU_data)  | 
702  | 4.22M  | { | 
703  | 4.22M  |   huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;  | 
704  | 4.22M  |   working_state state;  | 
705  | 4.22M  |   int blkn, ci;  | 
706  | 4.22M  |   jpeg_component_info *compptr;  | 
707  |  |  | 
708  |  |   /* Load up working state */  | 
709  | 4.22M  |   state.next_output_byte = cinfo->dest->next_output_byte;  | 
710  | 4.22M  |   state.free_in_buffer = cinfo->dest->free_in_buffer;  | 
711  | 4.22M  |   state.cur = entropy->saved;  | 
712  | 4.22M  |   state.cinfo = cinfo;  | 
713  | 4.22M  |   state.simd = entropy->simd;  | 
714  |  |  | 
715  |  |   /* Emit restart marker if needed */  | 
716  | 4.22M  |   if (cinfo->restart_interval) { | 
717  | 0  |     if (entropy->restarts_to_go == 0)  | 
718  | 0  |       if (!emit_restart(&state, entropy->next_restart_num))  | 
719  | 0  |         return FALSE;  | 
720  | 0  |   }  | 
721  |  |  | 
722  |  |   /* Encode the MCU data blocks */  | 
723  | 4.22M  | #ifdef WITH_SIMD  | 
724  | 4.22M  |   if (entropy->simd) { | 
725  | 14.4M  |     for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) { | 
726  | 10.1M  |       ci = cinfo->MCU_membership[blkn];  | 
727  | 10.1M  |       compptr = cinfo->cur_comp_info[ci];  | 
728  | 10.1M  |       if (!encode_one_block_simd(&state,  | 
729  | 10.1M  |                                  MCU_data[blkn][0], state.cur.last_dc_val[ci],  | 
730  | 10.1M  |                                  entropy->dc_derived_tbls[compptr->dc_tbl_no],  | 
731  | 10.1M  |                                  entropy->ac_derived_tbls[compptr->ac_tbl_no]))  | 
732  | 0  |         return FALSE;  | 
733  |  |       /* Update last_dc_val */  | 
734  | 10.1M  |       state.cur.last_dc_val[ci] = MCU_data[blkn][0][0];  | 
735  | 10.1M  |     }  | 
736  | 4.22M  |   } else  | 
737  | 0  | #endif  | 
738  | 0  |   { | 
739  | 0  |     for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) { | 
740  | 0  |       ci = cinfo->MCU_membership[blkn];  | 
741  | 0  |       compptr = cinfo->cur_comp_info[ci];  | 
742  | 0  |       if (!encode_one_block(&state,  | 
743  | 0  |                             MCU_data[blkn][0], state.cur.last_dc_val[ci],  | 
744  | 0  |                             entropy->dc_derived_tbls[compptr->dc_tbl_no],  | 
745  | 0  |                             entropy->ac_derived_tbls[compptr->ac_tbl_no]))  | 
746  | 0  |         return FALSE;  | 
747  |  |       /* Update last_dc_val */  | 
748  | 0  |       state.cur.last_dc_val[ci] = MCU_data[blkn][0][0];  | 
749  | 0  |     }  | 
750  | 0  |   }  | 
751  |  |  | 
752  |  |   /* Completed MCU, so update state */  | 
753  | 4.22M  |   cinfo->dest->next_output_byte = state.next_output_byte;  | 
754  | 4.22M  |   cinfo->dest->free_in_buffer = state.free_in_buffer;  | 
755  | 4.22M  |   entropy->saved = state.cur;  | 
756  |  |  | 
757  |  |   /* Update restart-interval state too */  | 
758  | 4.22M  |   if (cinfo->restart_interval) { | 
759  | 0  |     if (entropy->restarts_to_go == 0) { | 
760  | 0  |       entropy->restarts_to_go = cinfo->restart_interval;  | 
761  | 0  |       entropy->next_restart_num++;  | 
762  | 0  |       entropy->next_restart_num &= 7;  | 
763  | 0  |     }  | 
764  | 0  |     entropy->restarts_to_go--;  | 
765  | 0  |   }  | 
766  |  |  | 
767  | 4.22M  |   return TRUE;  | 
768  | 4.22M  | }  | 
769  |  |  | 
770  |  |  | 
771  |  | /*  | 
772  |  |  * Finish up at the end of a Huffman-compressed scan.  | 
773  |  |  */  | 
774  |  |  | 
775  |  | METHODDEF(void)  | 
776  |  | finish_pass_huff(j_compress_ptr cinfo)  | 
777  | 7.33k  | { | 
778  | 7.33k  |   huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;  | 
779  | 7.33k  |   working_state state;  | 
780  |  |  | 
781  |  |   /* Load up working state ... flush_bits needs it */  | 
782  | 7.33k  |   state.next_output_byte = cinfo->dest->next_output_byte;  | 
783  | 7.33k  |   state.free_in_buffer = cinfo->dest->free_in_buffer;  | 
784  | 7.33k  |   state.cur = entropy->saved;  | 
785  | 7.33k  |   state.cinfo = cinfo;  | 
786  | 7.33k  |   state.simd = entropy->simd;  | 
787  |  |  | 
788  |  |   /* Flush out the last data */  | 
789  | 7.33k  |   if (!flush_bits(&state))  | 
790  | 0  |     ERREXIT(cinfo, JERR_CANT_SUSPEND);  | 
791  |  |  | 
792  |  |   /* Update state */  | 
793  | 7.33k  |   cinfo->dest->next_output_byte = state.next_output_byte;  | 
794  | 7.33k  |   cinfo->dest->free_in_buffer = state.free_in_buffer;  | 
795  | 7.33k  |   entropy->saved = state.cur;  | 
796  | 7.33k  | }  | 
797  |  |  | 
798  |  |  | 
799  |  | /*  | 
800  |  |  * Huffman coding optimization.  | 
801  |  |  *  | 
802  |  |  * We first scan the supplied data and count the number of uses of each symbol  | 
803  |  |  * that is to be Huffman-coded. (This process MUST agree with the code above.)  | 
804  |  |  * Then we build a Huffman coding tree for the observed counts.  | 
805  |  |  * Symbols which are not needed at all for the particular image are not  | 
806  |  |  * assigned any code, which saves space in the DHT marker as well as in  | 
807  |  |  * the compressed data.  | 
808  |  |  */  | 
809  |  |  | 
810  |  | #ifdef ENTROPY_OPT_SUPPORTED  | 
811  |  |  | 
812  |  |  | 
813  |  | /* Process a single block's worth of coefficients */  | 
814  |  |  | 
815  |  | LOCAL(void)  | 
816  |  | htest_one_block(j_compress_ptr cinfo, JCOEFPTR block, int last_dc_val,  | 
817  |  |                 long dc_counts[], long ac_counts[])  | 
818  | 10.1M  | { | 
819  | 10.1M  |   register int temp;  | 
820  | 10.1M  |   register int nbits;  | 
821  | 10.1M  |   register int k, r;  | 
822  | 10.1M  |   int max_coef_bits = cinfo->data_precision + 2;  | 
823  |  |  | 
824  |  |   /* Encode the DC coefficient difference per section F.1.2.1 */  | 
825  |  |  | 
826  | 10.1M  |   temp = block[0] - last_dc_val;  | 
827  | 10.1M  |   if (temp < 0)  | 
828  | 1.01M  |     temp = -temp;  | 
829  |  |  | 
830  |  |   /* Find the number of bits needed for the magnitude of the coefficient */  | 
831  | 10.1M  |   nbits = 0;  | 
832  | 27.5M  |   while (temp) { | 
833  | 17.3M  |     nbits++;  | 
834  | 17.3M  |     temp >>= 1;  | 
835  | 17.3M  |   }  | 
836  |  |   /* Check for out-of-range coefficient values.  | 
837  |  |    * Since we're encoding a difference, the range limit is twice as much.  | 
838  |  |    */  | 
839  | 10.1M  |   if (nbits > max_coef_bits + 1)  | 
840  | 0  |     ERREXIT(cinfo, JERR_BAD_DCT_COEF);  | 
841  |  |  | 
842  |  |   /* Count the Huffman symbol for the number of bits */  | 
843  | 10.1M  |   dc_counts[nbits]++;  | 
844  |  |  | 
845  |  |   /* Encode the AC coefficients per section F.1.2.2 */  | 
846  |  |  | 
847  | 10.1M  |   r = 0;                        /* r = run length of zeros */  | 
848  |  |  | 
849  | 651M  |   for (k = 1; k < DCTSIZE2; k++) { | 
850  | 641M  |     if ((temp = block[jpeg_natural_order[k]]) == 0) { | 
851  | 616M  |       r++;  | 
852  | 616M  |     } else { | 
853  |  |       /* if run length > 15, must emit special run-length-16 codes (0xF0) */  | 
854  | 24.8M  |       while (r > 15) { | 
855  | 819  |         ac_counts[0xF0]++;  | 
856  | 819  |         r -= 16;  | 
857  | 819  |       }  | 
858  |  |  | 
859  |  |       /* Find the number of bits needed for the magnitude of the coefficient */  | 
860  | 24.8M  |       if (temp < 0)  | 
861  | 12.5M  |         temp = -temp;  | 
862  |  |  | 
863  |  |       /* Find the number of bits needed for the magnitude of the coefficient */  | 
864  | 24.8M  |       nbits = 1;                /* there must be at least one 1 bit */  | 
865  | 152M  |       while ((temp >>= 1))  | 
866  | 127M  |         nbits++;  | 
867  |  |       /* Check for out-of-range coefficient values */  | 
868  | 24.8M  |       if (nbits > max_coef_bits)  | 
869  | 0  |         ERREXIT(cinfo, JERR_BAD_DCT_COEF);  | 
870  |  |  | 
871  |  |       /* Count Huffman symbol for run length / number of bits */  | 
872  | 24.8M  |       ac_counts[(r << 4) + nbits]++;  | 
873  |  |  | 
874  | 24.8M  |       r = 0;  | 
875  | 24.8M  |     }  | 
876  | 641M  |   }  | 
877  |  |  | 
878  |  |   /* If the last coef(s) were zero, emit an end-of-block code */  | 
879  | 10.1M  |   if (r > 0)  | 
880  | 9.99M  |     ac_counts[0]++;  | 
881  | 10.1M  | }  | 
882  |  |  | 
883  |  |  | 
884  |  | /*  | 
885  |  |  * Trial-encode one MCU's worth of Huffman-compressed coefficients.  | 
886  |  |  * No data is actually output, so no suspension return is possible.  | 
887  |  |  */  | 
888  |  |  | 
889  |  | METHODDEF(boolean)  | 
890  |  | encode_mcu_gather(j_compress_ptr cinfo, JBLOCKROW *MCU_data)  | 
891  | 4.22M  | { | 
892  | 4.22M  |   huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;  | 
893  | 4.22M  |   int blkn, ci;  | 
894  | 4.22M  |   jpeg_component_info *compptr;  | 
895  |  |  | 
896  |  |   /* Take care of restart intervals if needed */  | 
897  | 4.22M  |   if (cinfo->restart_interval) { | 
898  | 0  |     if (entropy->restarts_to_go == 0) { | 
899  |  |       /* Re-initialize DC predictions to 0 */  | 
900  | 0  |       for (ci = 0; ci < cinfo->comps_in_scan; ci++)  | 
901  | 0  |         entropy->saved.last_dc_val[ci] = 0;  | 
902  |  |       /* Update restart state */  | 
903  | 0  |       entropy->restarts_to_go = cinfo->restart_interval;  | 
904  | 0  |     }  | 
905  | 0  |     entropy->restarts_to_go--;  | 
906  | 0  |   }  | 
907  |  |  | 
908  | 14.4M  |   for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) { | 
909  | 10.1M  |     ci = cinfo->MCU_membership[blkn];  | 
910  | 10.1M  |     compptr = cinfo->cur_comp_info[ci];  | 
911  | 10.1M  |     htest_one_block(cinfo, MCU_data[blkn][0], entropy->saved.last_dc_val[ci],  | 
912  | 10.1M  |                     entropy->dc_count_ptrs[compptr->dc_tbl_no],  | 
913  | 10.1M  |                     entropy->ac_count_ptrs[compptr->ac_tbl_no]);  | 
914  | 10.1M  |     entropy->saved.last_dc_val[ci] = MCU_data[blkn][0][0];  | 
915  | 10.1M  |   }  | 
916  |  |  | 
917  | 4.22M  |   return TRUE;  | 
918  | 4.22M  | }  | 
919  |  |  | 
920  |  |  | 
921  |  | /*  | 
922  |  |  * Generate the best Huffman code table for the given counts, fill htbl.  | 
923  |  |  * Note this is also used by jcphuff.c and jclhuff.c.  | 
924  |  |  *  | 
925  |  |  * The JPEG standard requires that no symbol be assigned a codeword of all  | 
926  |  |  * one bits (so that padding bits added at the end of a compressed segment  | 
927  |  |  * can't look like a valid code).  Because of the canonical ordering of  | 
928  |  |  * codewords, this just means that there must be an unused slot in the  | 
929  |  |  * longest codeword length category.  Annex K (Clause K.2) of  | 
930  |  |  * Rec. ITU-T T.81 (1992) | ISO/IEC 10918-1:1994 suggests reserving such a slot  | 
931  |  |  * by pretending that symbol 256 is a valid symbol with count 1.  In theory  | 
932  |  |  * that's not optimal; giving it count zero but including it in the symbol set  | 
933  |  |  * anyway should give a better Huffman code.  But the theoretically better code  | 
934  |  |  * actually seems to come out worse in practice, because it produces more  | 
935  |  |  * all-ones bytes (which incur stuffed zero bytes in the final file).  In any  | 
936  |  |  * case the difference is tiny.  | 
937  |  |  *  | 
938  |  |  * The JPEG standard requires Huffman codes to be no more than 16 bits long.  | 
939  |  |  * If some symbols have a very small but nonzero probability, the Huffman tree  | 
940  |  |  * must be adjusted to meet the code length restriction.  We currently use  | 
941  |  |  * the adjustment method suggested in JPEG section K.2.  This method is *not*  | 
942  |  |  * optimal; it may not choose the best possible limited-length code.  But  | 
943  |  |  * typically only very-low-frequency symbols will be given less-than-optimal  | 
944  |  |  * lengths, so the code is almost optimal.  Experimental comparisons against  | 
945  |  |  * an optimal limited-length-code algorithm indicate that the difference is  | 
946  |  |  * microscopic --- usually less than a hundredth of a percent of total size.  | 
947  |  |  * So the extra complexity of an optimal algorithm doesn't seem worthwhile.  | 
948  |  |  */  | 
949  |  |  | 
950  |  | GLOBAL(void)  | 
951  |  | jpeg_gen_optimal_table(j_compress_ptr cinfo, JHUFF_TBL *htbl, long freq[])  | 
952  | 41.4k  | { | 
953  | 1.83M  | #define MAX_CLEN  32            /* assumed maximum initial code length */  | 
954  | 41.4k  |   UINT8 bits[MAX_CLEN + 1];     /* bits[k] = # of symbols with code length k */  | 
955  | 41.4k  |   int bit_pos[MAX_CLEN + 1];    /* # of symbols with smaller code length */  | 
956  | 41.4k  |   int codesize[257];            /* codesize[k] = code length of symbol k */  | 
957  | 41.4k  |   int nz_index[257];            /* index of nonzero symbol in the original freq  | 
958  |  |                                    array */  | 
959  | 41.4k  |   int others[257];              /* next symbol in current branch of tree */  | 
960  | 41.4k  |   int c1, c2;  | 
961  | 41.4k  |   int p, i, j;  | 
962  | 41.4k  |   int num_nz_symbols;  | 
963  | 41.4k  |   long v, v2;  | 
964  |  |  | 
965  |  |   /* This algorithm is explained in section K.2 of the JPEG standard */  | 
966  |  |  | 
967  | 41.4k  |   memset(bits, 0, sizeof(bits));  | 
968  | 41.4k  |   memset(codesize, 0, sizeof(codesize));  | 
969  | 10.7M  |   for (i = 0; i < 257; i++)  | 
970  | 10.6M  |     others[i] = -1;             /* init links to empty */  | 
971  |  |  | 
972  | 41.4k  |   freq[256] = 1;                /* make sure 256 has a nonzero count */  | 
973  |  |   /* Including the pseudo-symbol 256 in the Huffman procedure guarantees  | 
974  |  |    * that no real symbol is given code-value of all ones, because 256  | 
975  |  |    * will be placed last in the largest codeword category.  | 
976  |  |    */  | 
977  |  |  | 
978  |  |   /* Group nonzero frequencies together so we can more easily find the  | 
979  |  |    * smallest.  | 
980  |  |    */  | 
981  | 41.4k  |   num_nz_symbols = 0;  | 
982  | 10.7M  |   for (i = 0; i < 257; i++) { | 
983  | 10.6M  |     if (freq[i]) { | 
984  | 422k  |       nz_index[num_nz_symbols] = i;  | 
985  | 422k  |       freq[num_nz_symbols] = freq[i];  | 
986  | 422k  |       num_nz_symbols++;  | 
987  | 422k  |     }  | 
988  | 10.6M  |   }  | 
989  |  |  | 
990  |  |   /* Huffman's basic algorithm to assign optimal code lengths to symbols */  | 
991  |  |  | 
992  | 422k  |   for (;;) { | 
993  |  |     /* Find the two smallest nonzero frequencies; set c1, c2 = their symbols */  | 
994  |  |     /* In case of ties, take the larger symbol number.  Since we have grouped  | 
995  |  |      * the nonzero symbols together, checking for zero symbols is not  | 
996  |  |      * necessary.  | 
997  |  |      */  | 
998  | 422k  |     c1 = -1;  | 
999  | 422k  |     c2 = -1;  | 
1000  | 422k  |     v = 1000000000L;  | 
1001  | 422k  |     v2 = 1000000000L;  | 
1002  | 13.1M  |     for (i = 0; i < num_nz_symbols; i++) { | 
1003  | 12.7M  |       if (freq[i] <= v2) { | 
1004  | 2.51M  |         if (freq[i] <= v) { | 
1005  | 1.73M  |           c2 = c1;  | 
1006  | 1.73M  |           v2 = v;  | 
1007  | 1.73M  |           v = freq[i];  | 
1008  | 1.73M  |           c1 = i;  | 
1009  | 1.73M  |         } else { | 
1010  | 782k  |           v2 = freq[i];  | 
1011  | 782k  |           c2 = i;  | 
1012  | 782k  |         }  | 
1013  | 2.51M  |       }  | 
1014  | 12.7M  |     }  | 
1015  |  |  | 
1016  |  |     /* Done if we've merged everything into one frequency */  | 
1017  | 422k  |     if (c2 < 0)  | 
1018  | 41.4k  |       break;  | 
1019  |  |  | 
1020  |  |     /* Else merge the two counts/trees */  | 
1021  | 380k  |     freq[c1] += freq[c2];  | 
1022  |  |     /* Set the frequency to a very high value instead of zero, so we don't have  | 
1023  |  |      * to check for zero values.  | 
1024  |  |      */  | 
1025  | 380k  |     freq[c2] = 1000000001L;  | 
1026  |  |  | 
1027  |  |     /* Increment the codesize of everything in c1's tree branch */  | 
1028  | 380k  |     codesize[c1]++;  | 
1029  | 1.16M  |     while (others[c1] >= 0) { | 
1030  | 782k  |       c1 = others[c1];  | 
1031  | 782k  |       codesize[c1]++;  | 
1032  | 782k  |     }  | 
1033  |  |  | 
1034  | 380k  |     others[c1] = c2;            /* chain c2 onto c1's tree branch */  | 
1035  |  |  | 
1036  |  |     /* Increment the codesize of everything in c2's tree branch */  | 
1037  | 380k  |     codesize[c2]++;  | 
1038  | 1.21M  |     while (others[c2] >= 0) { | 
1039  | 830k  |       c2 = others[c2];  | 
1040  | 830k  |       codesize[c2]++;  | 
1041  | 830k  |     }  | 
1042  | 380k  |   }  | 
1043  |  |  | 
1044  |  |   /* Now count the number of symbols of each code length */  | 
1045  | 463k  |   for (i = 0; i < num_nz_symbols; i++) { | 
1046  |  |     /* The JPEG standard seems to think that this can't happen, */  | 
1047  |  |     /* but I'm paranoid... */  | 
1048  | 422k  |     if (codesize[i] > MAX_CLEN)  | 
1049  | 0  |       ERREXIT(cinfo, JERR_HUFF_CLEN_OVERFLOW);  | 
1050  |  |  | 
1051  | 422k  |     bits[codesize[i]]++;  | 
1052  | 422k  |   }  | 
1053  |  |  | 
1054  |  |   /* Count the number of symbols with a length smaller than i bits, so we can  | 
1055  |  |    * construct the symbol table more efficiently.  Note that this includes the  | 
1056  |  |    * pseudo-symbol 256, but since it is the last symbol, it will not affect the  | 
1057  |  |    * table.  | 
1058  |  |    */  | 
1059  | 41.4k  |   p = 0;  | 
1060  | 1.36M  |   for (i = 1; i <= MAX_CLEN; i++) { | 
1061  | 1.32M  |     bit_pos[i] = p;  | 
1062  | 1.32M  |     p += bits[i];  | 
1063  | 1.32M  |   }  | 
1064  |  |  | 
1065  |  |   /* JPEG doesn't allow symbols with code lengths over 16 bits, so if the pure  | 
1066  |  |    * Huffman procedure assigned any such lengths, we must adjust the coding.  | 
1067  |  |    * Here is what Rec. ITU-T T.81 | ISO/IEC 10918-1 says about how this next  | 
1068  |  |    * bit works: Since symbols are paired for the longest Huffman code, the  | 
1069  |  |    * symbols are removed from this length category two at a time.  The prefix  | 
1070  |  |    * for the pair (which is one bit shorter) is allocated to one of the pair;  | 
1071  |  |    * then, skipping the BITS entry for that prefix length, a code word from the  | 
1072  |  |    * next shortest nonzero BITS entry is converted into a prefix for two code  | 
1073  |  |    * words one bit longer.  | 
1074  |  |    */  | 
1075  |  |  | 
1076  | 705k  |   for (i = MAX_CLEN; i > 16; i--) { | 
1077  | 664k  |     while (bits[i] > 0) { | 
1078  | 401  |       j = i - 2;                /* find length of new prefix to be used */  | 
1079  | 536  |       while (bits[j] == 0)  | 
1080  | 135  |         j--;  | 
1081  |  |  | 
1082  | 401  |       bits[i] -= 2;             /* remove two symbols */  | 
1083  | 401  |       bits[i - 1]++;            /* one goes in this length */  | 
1084  | 401  |       bits[j + 1] += 2;         /* two new symbols in this length */  | 
1085  | 401  |       bits[j]--;                /* symbol of this length is now a prefix */  | 
1086  | 401  |     }  | 
1087  | 663k  |   }  | 
1088  |  |  | 
1089  |  |   /* Remove the count for the pseudo-symbol 256 from the largest codelength */  | 
1090  | 537k  |   while (bits[i] == 0)          /* find largest codelength still in use */  | 
1091  | 495k  |     i--;  | 
1092  | 41.4k  |   bits[i]--;  | 
1093  |  |  | 
1094  |  |   /* Return final symbol counts (only for lengths 0..16) */  | 
1095  | 41.4k  |   memcpy(htbl->bits, bits, sizeof(htbl->bits));  | 
1096  |  |  | 
1097  |  |   /* Return a list of the symbols sorted by code length */  | 
1098  |  |   /* It's not real clear to me why we don't need to consider the codelength  | 
1099  |  |    * changes made above, but Rec. ITU-T T.81 | ISO/IEC 10918-1 seems to think  | 
1100  |  |    * this works.  | 
1101  |  |    */  | 
1102  | 422k  |   for (i = 0; i < num_nz_symbols - 1; i++) { | 
1103  | 380k  |     htbl->huffval[bit_pos[codesize[i]]] = (UINT8)nz_index[i];  | 
1104  | 380k  |     bit_pos[codesize[i]]++;  | 
1105  | 380k  |   }  | 
1106  |  |  | 
1107  |  |   /* Set sent_table FALSE so updated table will be written to JPEG file. */  | 
1108  | 41.4k  |   htbl->sent_table = FALSE;  | 
1109  | 41.4k  | }  | 
1110  |  |  | 
1111  |  |  | 
1112  |  | /*  | 
1113  |  |  * Finish up a statistics-gathering pass and create the new Huffman tables.  | 
1114  |  |  */  | 
1115  |  |  | 
1116  |  | METHODDEF(void)  | 
1117  |  | finish_pass_gather(j_compress_ptr cinfo)  | 
1118  | 7.33k  | { | 
1119  | 7.33k  |   huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;  | 
1120  | 7.33k  |   int ci, dctbl, actbl;  | 
1121  | 7.33k  |   jpeg_component_info *compptr;  | 
1122  | 7.33k  |   JHUFF_TBL **htblptr;  | 
1123  | 7.33k  |   boolean did_dc[NUM_HUFF_TBLS];  | 
1124  | 7.33k  |   boolean did_ac[NUM_HUFF_TBLS];  | 
1125  |  |  | 
1126  |  |   /* It's important not to apply jpeg_gen_optimal_table more than once  | 
1127  |  |    * per table, because it clobbers the input frequency counts!  | 
1128  |  |    */  | 
1129  | 7.33k  |   memset(did_dc, 0, sizeof(did_dc));  | 
1130  | 7.33k  |   memset(did_ac, 0, sizeof(did_ac));  | 
1131  |  |  | 
1132  | 24.2k  |   for (ci = 0; ci < cinfo->comps_in_scan; ci++) { | 
1133  | 16.9k  |     compptr = cinfo->cur_comp_info[ci];  | 
1134  | 16.9k  |     dctbl = compptr->dc_tbl_no;  | 
1135  | 16.9k  |     actbl = compptr->ac_tbl_no;  | 
1136  | 16.9k  |     if (!did_dc[dctbl]) { | 
1137  | 11.1k  |       htblptr = &cinfo->dc_huff_tbl_ptrs[dctbl];  | 
1138  | 11.1k  |       if (*htblptr == NULL)  | 
1139  | 0  |         *htblptr = jpeg_alloc_huff_table((j_common_ptr)cinfo);  | 
1140  | 11.1k  |       jpeg_gen_optimal_table(cinfo, *htblptr, entropy->dc_count_ptrs[dctbl]);  | 
1141  | 11.1k  |       did_dc[dctbl] = TRUE;  | 
1142  | 11.1k  |     }  | 
1143  | 16.9k  |     if (!did_ac[actbl]) { | 
1144  | 11.1k  |       htblptr = &cinfo->ac_huff_tbl_ptrs[actbl];  | 
1145  | 11.1k  |       if (*htblptr == NULL)  | 
1146  | 0  |         *htblptr = jpeg_alloc_huff_table((j_common_ptr)cinfo);  | 
1147  | 11.1k  |       jpeg_gen_optimal_table(cinfo, *htblptr, entropy->ac_count_ptrs[actbl]);  | 
1148  | 11.1k  |       did_ac[actbl] = TRUE;  | 
1149  | 11.1k  |     }  | 
1150  | 16.9k  |   }  | 
1151  | 7.33k  | }  | 
1152  |  |  | 
1153  |  |  | 
1154  |  | #endif /* ENTROPY_OPT_SUPPORTED */  | 
1155  |  |  | 
1156  |  |  | 
1157  |  | /*  | 
1158  |  |  * Module initialization routine for Huffman entropy encoding.  | 
1159  |  |  */  | 
1160  |  |  | 
1161  |  | GLOBAL(void)  | 
1162  |  | jinit_huff_encoder(j_compress_ptr cinfo)  | 
1163  | 7.33k  | { | 
1164  | 7.33k  |   huff_entropy_ptr entropy;  | 
1165  | 7.33k  |   int i;  | 
1166  |  |  | 
1167  | 7.33k  |   entropy = (huff_entropy_ptr)  | 
1168  | 7.33k  |     (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,  | 
1169  | 7.33k  |                                 sizeof(huff_entropy_encoder));  | 
1170  | 7.33k  |   cinfo->entropy = (struct jpeg_entropy_encoder *)entropy;  | 
1171  | 7.33k  |   entropy->pub.start_pass = start_pass_huff;  | 
1172  |  |  | 
1173  |  |   /* Mark tables unallocated */  | 
1174  | 36.6k  |   for (i = 0; i < NUM_HUFF_TBLS; i++) { | 
1175  | 29.3k  |     entropy->dc_derived_tbls[i] = entropy->ac_derived_tbls[i] = NULL;  | 
1176  | 29.3k  | #ifdef ENTROPY_OPT_SUPPORTED  | 
1177  | 29.3k  |     entropy->dc_count_ptrs[i] = entropy->ac_count_ptrs[i] = NULL;  | 
1178  | 29.3k  | #endif  | 
1179  | 29.3k  |   }  | 
1180  | 7.33k  | }  |