/src/freeimage-svn/FreeImage/trunk/Source/LibOpenJPEG/dwt.c
Line  | Count  | Source  | 
1  |  | /*  | 
2  |  |  * Copyright (c) 2002-2007, Communications and Remote Sensing Laboratory, Universite catholique de Louvain (UCL), Belgium  | 
3  |  |  * Copyright (c) 2002-2007, Professor Benoit Macq  | 
4  |  |  * Copyright (c) 2001-2003, David Janssens  | 
5  |  |  * Copyright (c) 2002-2003, Yannick Verschueren  | 
6  |  |  * Copyright (c) 2003-2007, Francois-Olivier Devaux and Antonin Descampe  | 
7  |  |  * Copyright (c) 2005, Herve Drolon, FreeImage Team  | 
8  |  |  * Copyright (c) 2007, Jonathan Ballard <dzonatas@dzonux.net>  | 
9  |  |  * Copyright (c) 2007, Callum Lerwick <seg@haxxed.com>  | 
10  |  |  * All rights reserved.  | 
11  |  |  *  | 
12  |  |  * Redistribution and use in source and binary forms, with or without  | 
13  |  |  * modification, are permitted provided that the following conditions  | 
14  |  |  * are met:  | 
15  |  |  * 1. Redistributions of source code must retain the above copyright  | 
16  |  |  *    notice, this list of conditions and the following disclaimer.  | 
17  |  |  * 2. Redistributions in binary form must reproduce the above copyright  | 
18  |  |  *    notice, this list of conditions and the following disclaimer in the  | 
19  |  |  *    documentation and/or other materials provided with the distribution.  | 
20  |  |  *  | 
21  |  |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS'  | 
22  |  |  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE  | 
23  |  |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE  | 
24  |  |  * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE  | 
25  |  |  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR  | 
26  |  |  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF  | 
27  |  |  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS  | 
28  |  |  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN  | 
29  |  |  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)  | 
30  |  |  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE  | 
31  |  |  * POSSIBILITY OF SUCH DAMAGE.  | 
32  |  |  */  | 
33  |  |  | 
34  |  | #ifdef __SSE__  | 
35  |  | #include <xmmintrin.h>  | 
36  |  | #endif  | 
37  |  |  | 
38  |  | #include "opj_includes.h"  | 
39  |  |  | 
40  |  | /** @defgroup DWT DWT - Implementation of a discrete wavelet transform */  | 
41  |  | /*@{*/ | 
42  |  |  | 
43  |  | #define OPJ_WS(i) v->mem[(i)*2]  | 
44  |  | #define OPJ_WD(i) v->mem[(1+(i)*2)]  | 
45  |  |  | 
46  |  | /** @name Local data structures */  | 
47  |  | /*@{*/ | 
48  |  |  | 
49  |  | typedef struct dwt_local { | 
50  |  |   OPJ_INT32* mem;  | 
51  |  |   OPJ_INT32 dn;  | 
52  |  |   OPJ_INT32 sn;  | 
53  |  |   OPJ_INT32 cas;  | 
54  |  | } opj_dwt_t;  | 
55  |  |  | 
56  |  | typedef union { | 
57  |  |   OPJ_FLOAT32 f[4];  | 
58  |  | } opj_v4_t;  | 
59  |  |  | 
60  |  | typedef struct v4dwt_local { | 
61  |  |   opj_v4_t* wavelet ;  | 
62  |  |   OPJ_INT32   dn ;  | 
63  |  |   OPJ_INT32   sn ;  | 
64  |  |   OPJ_INT32   cas ;  | 
65  |  | } opj_v4dwt_t ;  | 
66  |  |  | 
67  |  | static const OPJ_FLOAT32 opj_dwt_alpha =  1.586134342f; /*  12994 */  | 
68  |  | static const OPJ_FLOAT32 opj_dwt_beta  =  0.052980118f; /*    434 */  | 
69  |  | static const OPJ_FLOAT32 opj_dwt_gamma = -0.882911075f; /*  -7233 */  | 
70  |  | static const OPJ_FLOAT32 opj_dwt_delta = -0.443506852f; /*  -3633 */  | 
71  |  |  | 
72  |  | static const OPJ_FLOAT32 opj_K      = 1.230174105f; /*  10078 */  | 
73  |  | static const OPJ_FLOAT32 opj_c13318 = 1.625732422f;  | 
74  |  |  | 
75  |  | /*@}*/  | 
76  |  |  | 
77  |  | /**  | 
78  |  | Virtual function type for wavelet transform in 1-D   | 
79  |  | */  | 
80  |  | typedef void (*DWT1DFN)(opj_dwt_t* v);  | 
81  |  |  | 
82  |  | /** @name Local static functions */  | 
83  |  | /*@{*/ | 
84  |  |  | 
85  |  | /**  | 
86  |  | Forward lazy transform (horizontal)  | 
87  |  | */  | 
88  |  | static void opj_dwt_deinterleave_h(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32 cas);  | 
89  |  | /**  | 
90  |  | Forward lazy transform (vertical)  | 
91  |  | */  | 
92  |  | static void opj_dwt_deinterleave_v(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32 x, OPJ_INT32 cas);  | 
93  |  | /**  | 
94  |  | Inverse lazy transform (horizontal)  | 
95  |  | */  | 
96  |  | static void opj_dwt_interleave_h(opj_dwt_t* h, OPJ_INT32 *a);  | 
97  |  | /**  | 
98  |  | Inverse lazy transform (vertical)  | 
99  |  | */  | 
100  |  | static void opj_dwt_interleave_v(opj_dwt_t* v, OPJ_INT32 *a, OPJ_INT32 x);  | 
101  |  | /**  | 
102  |  | Forward 5-3 wavelet transform in 1-D  | 
103  |  | */  | 
104  |  | static void opj_dwt_encode_1(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32 cas);  | 
105  |  | /**  | 
106  |  | Inverse 5-3 wavelet transform in 1-D  | 
107  |  | */  | 
108  |  | static void opj_dwt_decode_1(opj_dwt_t *v);  | 
109  |  | static void opj_dwt_decode_1_(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32 cas);  | 
110  |  | /**  | 
111  |  | Forward 9-7 wavelet transform in 1-D  | 
112  |  | */  | 
113  |  | static void opj_dwt_encode_1_real(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32 cas);  | 
114  |  | /**  | 
115  |  | Explicit calculation of the Quantization Stepsizes   | 
116  |  | */  | 
117  |  | static void opj_dwt_encode_stepsize(OPJ_INT32 stepsize, OPJ_INT32 numbps, opj_stepsize_t *bandno_stepsize);  | 
118  |  | /**  | 
119  |  | Inverse wavelet transform in 2-D.  | 
120  |  | */  | 
121  |  | static OPJ_BOOL opj_dwt_decode_tile(opj_tcd_tilecomp_t* tilec, OPJ_UINT32 i, DWT1DFN fn);  | 
122  |  |  | 
123  |  | static OPJ_BOOL opj_dwt_encode_procedure( opj_tcd_tilecomp_t * tilec,  | 
124  |  |                         void (*p_function)(OPJ_INT32 *, OPJ_INT32,OPJ_INT32,OPJ_INT32) );  | 
125  |  |  | 
126  |  | static OPJ_UINT32 opj_dwt_max_resolution(opj_tcd_resolution_t* restrict r, OPJ_UINT32 i);  | 
127  |  |  | 
128  |  | /* <summary>                             */  | 
129  |  | /* Inverse 9-7 wavelet transform in 1-D. */  | 
130  |  | /* </summary>                            */  | 
131  |  | static void opj_v4dwt_decode(opj_v4dwt_t* restrict dwt);  | 
132  |  |  | 
133  |  | static void opj_v4dwt_interleave_h(opj_v4dwt_t* restrict w, OPJ_FLOAT32* restrict a, OPJ_INT32 x, OPJ_INT32 size);  | 
134  |  |  | 
135  |  | static void opj_v4dwt_interleave_v(opj_v4dwt_t* restrict v , OPJ_FLOAT32* restrict a , OPJ_INT32 x, OPJ_INT32 nb_elts_read);  | 
136  |  |  | 
137  |  | #ifdef __SSE__  | 
138  |  | static void opj_v4dwt_decode_step1_sse(opj_v4_t* w, OPJ_INT32 count, const __m128 c);  | 
139  |  |  | 
140  |  | static void opj_v4dwt_decode_step2_sse(opj_v4_t* l, opj_v4_t* w, OPJ_INT32 k, OPJ_INT32 m, __m128 c);  | 
141  |  |  | 
142  |  | #else  | 
143  |  | static void opj_v4dwt_decode_step1(opj_v4_t* w, OPJ_INT32 count, const OPJ_FLOAT32 c);  | 
144  |  |  | 
145  |  | static void opj_v4dwt_decode_step2(opj_v4_t* l, opj_v4_t* w, OPJ_INT32 k, OPJ_INT32 m, OPJ_FLOAT32 c);  | 
146  |  |  | 
147  |  | #endif  | 
148  |  |  | 
149  |  | /*@}*/  | 
150  |  |  | 
151  |  | /*@}*/  | 
152  |  |  | 
153  | 0  | #define OPJ_S(i) a[(i)*2]  | 
154  | 0  | #define OPJ_D(i) a[(1+(i)*2)]  | 
155  | 0  | #define OPJ_S_(i) ((i)<0?OPJ_S(0):((i)>=sn?OPJ_S(sn-1):OPJ_S(i)))  | 
156  | 0  | #define OPJ_D_(i) ((i)<0?OPJ_D(0):((i)>=dn?OPJ_D(dn-1):OPJ_D(i)))  | 
157  |  | /* new */  | 
158  | 0  | #define OPJ_SS_(i) ((i)<0?OPJ_S(0):((i)>=dn?OPJ_S(dn-1):OPJ_S(i)))  | 
159  | 0  | #define OPJ_DD_(i) ((i)<0?OPJ_D(0):((i)>=sn?OPJ_D(sn-1):OPJ_D(i)))  | 
160  |  |  | 
161  |  | /* <summary>                                                              */  | 
162  |  | /* This table contains the norms of the 5-3 wavelets for different bands. */  | 
163  |  | /* </summary>                                                             */  | 
164  |  | static const OPJ_FLOAT64 opj_dwt_norms[4][10] = { | 
165  |  |   {1.000, 1.500, 2.750, 5.375, 10.68, 21.34, 42.67, 85.33, 170.7, 341.3}, | 
166  |  |   {1.038, 1.592, 2.919, 5.703, 11.33, 22.64, 45.25, 90.48, 180.9}, | 
167  |  |   {1.038, 1.592, 2.919, 5.703, 11.33, 22.64, 45.25, 90.48, 180.9}, | 
168  |  |   {.7186, .9218, 1.586, 3.043, 6.019, 12.01, 24.00, 47.97, 95.93} | 
169  |  | };  | 
170  |  |  | 
171  |  | /* <summary>                                                              */  | 
172  |  | /* This table contains the norms of the 9-7 wavelets for different bands. */  | 
173  |  | /* </summary>                                                             */  | 
174  |  | static const OPJ_FLOAT64 opj_dwt_norms_real[4][10] = { | 
175  |  |   {1.000, 1.965, 4.177, 8.403, 16.90, 33.84, 67.69, 135.3, 270.6, 540.9}, | 
176  |  |   {2.022, 3.989, 8.355, 17.04, 34.27, 68.63, 137.3, 274.6, 549.0}, | 
177  |  |   {2.022, 3.989, 8.355, 17.04, 34.27, 68.63, 137.3, 274.6, 549.0}, | 
178  |  |   {2.080, 3.865, 8.307, 17.18, 34.71, 69.59, 139.3, 278.6, 557.2} | 
179  |  | };  | 
180  |  |  | 
181  |  | /*   | 
182  |  | ==========================================================  | 
183  |  |    local functions  | 
184  |  | ==========================================================  | 
185  |  | */  | 
186  |  |  | 
187  |  | /* <summary>                       */  | 
188  |  | /* Forward lazy transform (horizontal).  */  | 
189  |  | /* </summary>                            */   | 
190  | 0  | void opj_dwt_deinterleave_h(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32 cas) { | 
191  | 0  |   OPJ_INT32 i;  | 
192  | 0  |   OPJ_INT32 * l_dest = b;  | 
193  | 0  |   OPJ_INT32 * l_src = a+cas;  | 
194  |  | 
  | 
195  | 0  |     for (i=0; i<sn; ++i) { | 
196  | 0  |     *l_dest++ = *l_src;  | 
197  | 0  |     l_src += 2;  | 
198  | 0  |   }  | 
199  |  |     | 
200  | 0  |     l_dest = b + sn;  | 
201  | 0  |   l_src = a + 1 - cas;  | 
202  |  | 
  | 
203  | 0  |     for (i=0; i<dn; ++i)  { | 
204  | 0  |     *l_dest++=*l_src;  | 
205  | 0  |     l_src += 2;  | 
206  | 0  |   }  | 
207  | 0  | }  | 
208  |  |  | 
209  |  | /* <summary>                             */    | 
210  |  | /* Forward lazy transform (vertical).    */  | 
211  |  | /* </summary>                            */   | 
212  | 0  | void opj_dwt_deinterleave_v(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32 x, OPJ_INT32 cas) { | 
213  | 0  |     OPJ_INT32 i = sn;  | 
214  | 0  |   OPJ_INT32 * l_dest = b;  | 
215  | 0  |   OPJ_INT32 * l_src = a+cas;  | 
216  |  | 
  | 
217  | 0  |     while (i--) { | 
218  | 0  |     *l_dest = *l_src;  | 
219  | 0  |     l_dest += x;  | 
220  | 0  |     l_src += 2;  | 
221  | 0  |     } /* b[i*x]=a[2*i+cas]; */  | 
222  |  | 
  | 
223  | 0  |   l_dest = b + sn * x;  | 
224  | 0  |   l_src = a + 1 - cas;  | 
225  |  |     | 
226  | 0  |   i = dn;  | 
227  | 0  |     while (i--) { | 
228  | 0  |     *l_dest = *l_src;  | 
229  | 0  |     l_dest += x;  | 
230  | 0  |     l_src += 2;  | 
231  | 0  |         } /*b[(sn+i)*x]=a[(2*i+1-cas)];*/  | 
232  | 0  | }  | 
233  |  |  | 
234  |  | /* <summary>                             */  | 
235  |  | /* Inverse lazy transform (horizontal).  */  | 
236  |  | /* </summary>                            */  | 
237  | 0  | void opj_dwt_interleave_h(opj_dwt_t* h, OPJ_INT32 *a) { | 
238  | 0  |     OPJ_INT32 *ai = a;  | 
239  | 0  |     OPJ_INT32 *bi = h->mem + h->cas;  | 
240  | 0  |     OPJ_INT32  i  = h->sn;  | 
241  | 0  |     while( i-- ) { | 
242  | 0  |       *bi = *(ai++);  | 
243  | 0  |     bi += 2;  | 
244  | 0  |     }  | 
245  | 0  |     ai  = a + h->sn;  | 
246  | 0  |     bi  = h->mem + 1 - h->cas;  | 
247  | 0  |     i = h->dn ;  | 
248  | 0  |     while( i-- ) { | 
249  | 0  |       *bi = *(ai++);  | 
250  | 0  |     bi += 2;  | 
251  | 0  |     }  | 
252  | 0  | }  | 
253  |  |  | 
254  |  | /* <summary>                             */    | 
255  |  | /* Inverse lazy transform (vertical).    */  | 
256  |  | /* </summary>                            */   | 
257  | 0  | void opj_dwt_interleave_v(opj_dwt_t* v, OPJ_INT32 *a, OPJ_INT32 x) { | 
258  | 0  |     OPJ_INT32 *ai = a;  | 
259  | 0  |     OPJ_INT32 *bi = v->mem + v->cas;  | 
260  | 0  |     OPJ_INT32  i = v->sn;  | 
261  | 0  |     while( i-- ) { | 
262  | 0  |       *bi = *ai;  | 
263  | 0  |     bi += 2;  | 
264  | 0  |     ai += x;  | 
265  | 0  |     }  | 
266  | 0  |     ai = a + (v->sn * x);  | 
267  | 0  |     bi = v->mem + 1 - v->cas;  | 
268  | 0  |     i = v->dn ;  | 
269  | 0  |     while( i-- ) { | 
270  | 0  |       *bi = *ai;  | 
271  | 0  |     bi += 2;    | 
272  | 0  |     ai += x;  | 
273  | 0  |     }  | 
274  | 0  | }  | 
275  |  |  | 
276  |  |  | 
277  |  | /* <summary>                            */  | 
278  |  | /* Forward 5-3 wavelet transform in 1-D. */  | 
279  |  | /* </summary>                           */  | 
280  | 0  | void opj_dwt_encode_1(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32 cas) { | 
281  | 0  |   OPJ_INT32 i;  | 
282  |  |     | 
283  | 0  |   if (!cas) { | 
284  | 0  |     if ((dn > 0) || (sn > 1)) { /* NEW :  CASE ONE ELEMENT */ | 
285  | 0  |       for (i = 0; i < dn; i++) OPJ_D(i) -= (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1;  | 
286  | 0  |       for (i = 0; i < sn; i++) OPJ_S(i) += (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2;  | 
287  | 0  |     }  | 
288  | 0  |   } else { | 
289  | 0  |     if (!sn && dn == 1)       /* NEW :  CASE ONE ELEMENT */  | 
290  | 0  |       OPJ_S(0) *= 2;  | 
291  | 0  |     else { | 
292  | 0  |       for (i = 0; i < dn; i++) OPJ_S(i) -= (OPJ_DD_(i) + OPJ_DD_(i - 1)) >> 1;  | 
293  | 0  |       for (i = 0; i < sn; i++) OPJ_D(i) += (OPJ_SS_(i) + OPJ_SS_(i + 1) + 2) >> 2;  | 
294  | 0  |     }  | 
295  | 0  |   }  | 
296  | 0  | }  | 
297  |  |  | 
298  |  | /* <summary>                            */  | 
299  |  | /* Inverse 5-3 wavelet transform in 1-D. */  | 
300  |  | /* </summary>                           */   | 
301  | 0  | void opj_dwt_decode_1_(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32 cas) { | 
302  | 0  |   OPJ_INT32 i;  | 
303  |  |     | 
304  | 0  |   if (!cas) { | 
305  | 0  |     if ((dn > 0) || (sn > 1)) { /* NEW :  CASE ONE ELEMENT */ | 
306  | 0  |       for (i = 0; i < sn; i++) OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2;  | 
307  | 0  |       for (i = 0; i < dn; i++) OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1;  | 
308  | 0  |     }  | 
309  | 0  |   } else { | 
310  | 0  |     if (!sn  && dn == 1)          /* NEW :  CASE ONE ELEMENT */  | 
311  | 0  |       OPJ_S(0) /= 2;  | 
312  | 0  |     else { | 
313  | 0  |       for (i = 0; i < sn; i++) OPJ_D(i) -= (OPJ_SS_(i) + OPJ_SS_(i + 1) + 2) >> 2;  | 
314  | 0  |       for (i = 0; i < dn; i++) OPJ_S(i) += (OPJ_DD_(i) + OPJ_DD_(i - 1)) >> 1;  | 
315  | 0  |     }  | 
316  | 0  |   }  | 
317  | 0  | }  | 
318  |  |  | 
319  |  | /* <summary>                            */  | 
320  |  | /* Inverse 5-3 wavelet transform in 1-D. */  | 
321  |  | /* </summary>                           */   | 
322  | 0  | void opj_dwt_decode_1(opj_dwt_t *v) { | 
323  | 0  |   opj_dwt_decode_1_(v->mem, v->dn, v->sn, v->cas);  | 
324  | 0  | }  | 
325  |  |  | 
326  |  | /* <summary>                             */  | 
327  |  | /* Forward 9-7 wavelet transform in 1-D. */  | 
328  |  | /* </summary>                            */  | 
329  | 0  | void opj_dwt_encode_1_real(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32 cas) { | 
330  | 0  |   OPJ_INT32 i;  | 
331  | 0  |   if (!cas) { | 
332  | 0  |     if ((dn > 0) || (sn > 1)) { /* NEW :  CASE ONE ELEMENT */ | 
333  | 0  |       for (i = 0; i < dn; i++)  | 
334  | 0  |         OPJ_D(i) -= opj_int_fix_mul(OPJ_S_(i) + OPJ_S_(i + 1), 12993);  | 
335  | 0  |       for (i = 0; i < sn; i++)  | 
336  | 0  |         OPJ_S(i) -= opj_int_fix_mul(OPJ_D_(i - 1) + OPJ_D_(i), 434);  | 
337  | 0  |       for (i = 0; i < dn; i++)  | 
338  | 0  |         OPJ_D(i) += opj_int_fix_mul(OPJ_S_(i) + OPJ_S_(i + 1), 7233);  | 
339  | 0  |       for (i = 0; i < sn; i++)  | 
340  | 0  |         OPJ_S(i) += opj_int_fix_mul(OPJ_D_(i - 1) + OPJ_D_(i), 3633);  | 
341  | 0  |       for (i = 0; i < dn; i++)  | 
342  | 0  |         OPJ_D(i) = opj_int_fix_mul(OPJ_D(i), 5038); /*5038 */  | 
343  | 0  |       for (i = 0; i < sn; i++)  | 
344  | 0  |         OPJ_S(i) = opj_int_fix_mul(OPJ_S(i), 6659); /*6660 */  | 
345  | 0  |     }  | 
346  | 0  |   } else { | 
347  | 0  |     if ((sn > 0) || (dn > 1)) { /* NEW :  CASE ONE ELEMENT */ | 
348  | 0  |       for (i = 0; i < dn; i++)  | 
349  | 0  |         OPJ_S(i) -= opj_int_fix_mul(OPJ_DD_(i) + OPJ_DD_(i - 1), 12993);  | 
350  | 0  |       for (i = 0; i < sn; i++)  | 
351  | 0  |         OPJ_D(i) -= opj_int_fix_mul(OPJ_SS_(i) + OPJ_SS_(i + 1), 434);  | 
352  | 0  |       for (i = 0; i < dn; i++)  | 
353  | 0  |         OPJ_S(i) += opj_int_fix_mul(OPJ_DD_(i) + OPJ_DD_(i - 1), 7233);  | 
354  | 0  |       for (i = 0; i < sn; i++)  | 
355  | 0  |         OPJ_D(i) += opj_int_fix_mul(OPJ_SS_(i) + OPJ_SS_(i + 1), 3633);  | 
356  | 0  |       for (i = 0; i < dn; i++)  | 
357  | 0  |         OPJ_S(i) = opj_int_fix_mul(OPJ_S(i), 5038); /*5038 */  | 
358  | 0  |       for (i = 0; i < sn; i++)  | 
359  | 0  |         OPJ_D(i) = opj_int_fix_mul(OPJ_D(i), 6659); /*6660 */  | 
360  | 0  |     }  | 
361  | 0  |   }  | 
362  | 0  | }  | 
363  |  |  | 
364  | 0  | void opj_dwt_encode_stepsize(OPJ_INT32 stepsize, OPJ_INT32 numbps, opj_stepsize_t *bandno_stepsize) { | 
365  | 0  |   OPJ_INT32 p, n;  | 
366  | 0  |   p = opj_int_floorlog2(stepsize) - 13;  | 
367  | 0  |   n = 11 - opj_int_floorlog2(stepsize);  | 
368  | 0  |   bandno_stepsize->mant = (n < 0 ? stepsize >> -n : stepsize << n) & 0x7ff;  | 
369  | 0  |   bandno_stepsize->expn = numbps - p;  | 
370  | 0  | }  | 
371  |  |  | 
372  |  | /*   | 
373  |  | ==========================================================  | 
374  |  |    DWT interface  | 
375  |  | ==========================================================  | 
376  |  | */  | 
377  |  |  | 
378  |  |  | 
379  |  | /* <summary>                            */  | 
380  |  | /* Forward 5-3 wavelet transform in 2-D. */  | 
381  |  | /* </summary>                           */  | 
382  |  | INLINE OPJ_BOOL opj_dwt_encode_procedure(opj_tcd_tilecomp_t * tilec,void (*p_function)(OPJ_INT32 *, OPJ_INT32,OPJ_INT32,OPJ_INT32) )  | 
383  | 0  | { | 
384  | 0  |   OPJ_INT32 i, j, k;  | 
385  | 0  |   OPJ_INT32 *a = 00;  | 
386  | 0  |   OPJ_INT32 *aj = 00;  | 
387  | 0  |   OPJ_INT32 *bj = 00;  | 
388  | 0  |   OPJ_INT32 w, l;  | 
389  |  | 
  | 
390  | 0  |   OPJ_INT32 rw;     /* width of the resolution level computed   */  | 
391  | 0  |   OPJ_INT32 rh;     /* height of the resolution level computed  */  | 
392  | 0  |   OPJ_UINT32 l_data_size;  | 
393  |  | 
  | 
394  | 0  |   opj_tcd_resolution_t * l_cur_res = 0;  | 
395  | 0  |   opj_tcd_resolution_t * l_last_res = 0;  | 
396  |  | 
  | 
397  | 0  |   w = tilec->x1-tilec->x0;  | 
398  | 0  |   l = (OPJ_INT32)tilec->numresolutions-1;  | 
399  | 0  |   a = tilec->data;  | 
400  |  | 
  | 
401  | 0  |   l_cur_res = tilec->resolutions + l;  | 
402  | 0  |   l_last_res = l_cur_res - 1;  | 
403  |  | 
  | 
404  | 0  |   l_data_size = opj_dwt_max_resolution( tilec->resolutions,tilec->numresolutions) * (OPJ_UINT32)sizeof(OPJ_INT32);  | 
405  | 0  |   bj = (OPJ_INT32*)opj_malloc((size_t)l_data_size);  | 
406  | 0  |   if (! bj) { | 
407  | 0  |     return OPJ_FALSE;  | 
408  | 0  |   }  | 
409  | 0  |   i = l;  | 
410  |  | 
  | 
411  | 0  |   while (i--) { | 
412  | 0  |     OPJ_INT32 rw1;    /* width of the resolution level once lower than computed one                                       */  | 
413  | 0  |     OPJ_INT32 rh1;    /* height of the resolution level once lower than computed one                                      */  | 
414  | 0  |     OPJ_INT32 cas_col;  /* 0 = non inversion on horizontal filtering 1 = inversion between low-pass and high-pass filtering */  | 
415  | 0  |     OPJ_INT32 cas_row;  /* 0 = non inversion on vertical filtering 1 = inversion between low-pass and high-pass filtering   */  | 
416  | 0  |     OPJ_INT32 dn, sn;  | 
417  |  | 
  | 
418  | 0  |     rw  = l_cur_res->x1 - l_cur_res->x0;  | 
419  | 0  |     rh  = l_cur_res->y1 - l_cur_res->y0;  | 
420  | 0  |     rw1 = l_last_res->x1 - l_last_res->x0;  | 
421  | 0  |     rh1 = l_last_res->y1 - l_last_res->y0;  | 
422  |  | 
  | 
423  | 0  |     cas_row = l_cur_res->x0 & 1;  | 
424  | 0  |     cas_col = l_cur_res->y0 & 1;  | 
425  |  | 
  | 
426  | 0  |     sn = rh1;  | 
427  | 0  |     dn = rh - rh1;  | 
428  | 0  |     for (j = 0; j < rw; ++j) { | 
429  | 0  |       aj = a + j;  | 
430  | 0  |       for (k = 0; k < rh; ++k) { | 
431  | 0  |         bj[k] = aj[k*w];  | 
432  | 0  |       }  | 
433  |  | 
  | 
434  | 0  |       (*p_function) (bj, dn, sn, cas_col);  | 
435  |  | 
  | 
436  | 0  |       opj_dwt_deinterleave_v(bj, aj, dn, sn, w, cas_col);  | 
437  | 0  |     }  | 
438  |  | 
  | 
439  | 0  |     sn = rw1;  | 
440  | 0  |     dn = rw - rw1;  | 
441  |  | 
  | 
442  | 0  |     for (j = 0; j < rh; j++) { | 
443  | 0  |       aj = a + j * w;  | 
444  | 0  |       for (k = 0; k < rw; k++)  bj[k] = aj[k];  | 
445  | 0  |       (*p_function) (bj, dn, sn, cas_row);  | 
446  | 0  |       opj_dwt_deinterleave_h(bj, aj, dn, sn, cas_row);  | 
447  | 0  |     }  | 
448  |  | 
  | 
449  | 0  |     l_cur_res = l_last_res;  | 
450  |  | 
  | 
451  | 0  |     --l_last_res;  | 
452  | 0  |   }  | 
453  |  | 
  | 
454  | 0  |   opj_free(bj);  | 
455  | 0  |   return OPJ_TRUE;  | 
456  | 0  | }  | 
457  |  |  | 
458  |  | /* Forward 5-3 wavelet transform in 2-D. */  | 
459  |  | /* </summary>                           */  | 
460  |  | OPJ_BOOL opj_dwt_encode(opj_tcd_tilecomp_t * tilec)  | 
461  | 0  | { | 
462  | 0  |   return opj_dwt_encode_procedure(tilec,opj_dwt_encode_1);  | 
463  | 0  | }  | 
464  |  |  | 
465  |  | /* <summary>                            */  | 
466  |  | /* Inverse 5-3 wavelet transform in 2-D. */  | 
467  |  | /* </summary>                           */  | 
468  | 0  | OPJ_BOOL opj_dwt_decode(opj_tcd_tilecomp_t* tilec, OPJ_UINT32 numres) { | 
469  | 0  |   return opj_dwt_decode_tile(tilec, numres, &opj_dwt_decode_1);  | 
470  | 0  | }  | 
471  |  |  | 
472  |  |  | 
473  |  | /* <summary>                          */  | 
474  |  | /* Get gain of 5-3 wavelet transform. */  | 
475  |  | /* </summary>                         */  | 
476  | 0  | OPJ_UINT32 opj_dwt_getgain(OPJ_UINT32 orient) { | 
477  | 0  |   if (orient == 0)  | 
478  | 0  |     return 0;  | 
479  | 0  |   if (orient == 1 || orient == 2)  | 
480  | 0  |     return 1;  | 
481  | 0  |   return 2;  | 
482  | 0  | }  | 
483  |  |  | 
484  |  | /* <summary>                */  | 
485  |  | /* Get norm of 5-3 wavelet. */  | 
486  |  | /* </summary>               */  | 
487  | 0  | OPJ_FLOAT64 opj_dwt_getnorm(OPJ_UINT32 level, OPJ_UINT32 orient) { | 
488  | 0  |   return opj_dwt_norms[orient][level];  | 
489  | 0  | }  | 
490  |  |  | 
491  |  | /* <summary>                             */  | 
492  |  | /* Forward 9-7 wavelet transform in 2-D. */  | 
493  |  | /* </summary>                            */  | 
494  |  | OPJ_BOOL opj_dwt_encode_real(opj_tcd_tilecomp_t * tilec)  | 
495  | 0  | { | 
496  | 0  |   return opj_dwt_encode_procedure(tilec,opj_dwt_encode_1_real);  | 
497  | 0  | }  | 
498  |  |  | 
499  |  | /* <summary>                          */  | 
500  |  | /* Get gain of 9-7 wavelet transform. */  | 
501  |  | /* </summary>                         */  | 
502  | 0  | OPJ_UINT32 opj_dwt_getgain_real(OPJ_UINT32 orient) { | 
503  | 0  |   (void)orient;  | 
504  | 0  |   return 0;  | 
505  | 0  | }  | 
506  |  |  | 
507  |  | /* <summary>                */  | 
508  |  | /* Get norm of 9-7 wavelet. */  | 
509  |  | /* </summary>               */  | 
510  | 0  | OPJ_FLOAT64 opj_dwt_getnorm_real(OPJ_UINT32 level, OPJ_UINT32 orient) { | 
511  | 0  |   return opj_dwt_norms_real[orient][level];  | 
512  | 0  | }  | 
513  |  |  | 
514  | 0  | void opj_dwt_calc_explicit_stepsizes(opj_tccp_t * tccp, OPJ_UINT32 prec) { | 
515  | 0  |   OPJ_UINT32 numbands, bandno;  | 
516  | 0  |   numbands = 3 * tccp->numresolutions - 2;  | 
517  | 0  |   for (bandno = 0; bandno < numbands; bandno++) { | 
518  | 0  |     OPJ_FLOAT64 stepsize;  | 
519  | 0  |     OPJ_UINT32 resno, level, orient, gain;  | 
520  |  | 
  | 
521  | 0  |     resno = (bandno == 0) ? 0 : ((bandno - 1) / 3 + 1);  | 
522  | 0  |     orient = (bandno == 0) ? 0 : ((bandno - 1) % 3 + 1);  | 
523  | 0  |     level = tccp->numresolutions - 1 - resno;  | 
524  | 0  |     gain = (tccp->qmfbid == 0) ? 0 : ((orient == 0) ? 0 : (((orient == 1) || (orient == 2)) ? 1 : 2));  | 
525  | 0  |     if (tccp->qntsty == J2K_CCP_QNTSTY_NOQNT) { | 
526  | 0  |       stepsize = 1.0;  | 
527  | 0  |     } else { | 
528  | 0  |       OPJ_FLOAT64 norm = opj_dwt_norms_real[orient][level];  | 
529  | 0  |       stepsize = (1 << (gain)) / norm;  | 
530  | 0  |     }  | 
531  | 0  |     opj_dwt_encode_stepsize((OPJ_INT32) floor(stepsize * 8192.0), (OPJ_INT32)(prec + gain), &tccp->stepsizes[bandno]);  | 
532  | 0  |   }  | 
533  | 0  | }  | 
534  |  |  | 
535  |  | /* <summary>                             */  | 
536  |  | /* Determine maximum computed resolution level for inverse wavelet transform */  | 
537  |  | /* </summary>                            */  | 
538  | 0  | OPJ_UINT32 opj_dwt_max_resolution(opj_tcd_resolution_t* restrict r, OPJ_UINT32 i) { | 
539  | 0  |   OPJ_UINT32 mr = 0;  | 
540  | 0  |   OPJ_UINT32 w;  | 
541  | 0  |   while( --i ) { | 
542  | 0  |     ++r;  | 
543  | 0  |     if( mr < ( w = (OPJ_UINT32)(r->x1 - r->x0) ) )  | 
544  | 0  |       mr = w ;  | 
545  | 0  |     if( mr < ( w = (OPJ_UINT32)(r->y1 - r->y0) ) )  | 
546  | 0  |       mr = w ;  | 
547  | 0  |   }  | 
548  | 0  |   return mr ;  | 
549  | 0  | }  | 
550  |  |  | 
551  |  | /* <summary>                            */  | 
552  |  | /* Inverse wavelet transform in 2-D.     */  | 
553  |  | /* </summary>                           */  | 
554  | 0  | OPJ_BOOL opj_dwt_decode_tile(opj_tcd_tilecomp_t* tilec, OPJ_UINT32 numres, DWT1DFN dwt_1D) { | 
555  | 0  |   opj_dwt_t h;  | 
556  | 0  |   opj_dwt_t v;  | 
557  |  | 
  | 
558  | 0  |   opj_tcd_resolution_t* tr = tilec->resolutions;  | 
559  |  | 
  | 
560  | 0  |   OPJ_UINT32 rw = (OPJ_UINT32)(tr->x1 - tr->x0);  /* width of the resolution level computed */  | 
561  | 0  |   OPJ_UINT32 rh = (OPJ_UINT32)(tr->y1 - tr->y0);  /* height of the resolution level computed */  | 
562  |  | 
  | 
563  | 0  |   OPJ_UINT32 w = (OPJ_UINT32)(tilec->x1 - tilec->x0);  | 
564  |  | 
  | 
565  | 0  |   h.mem = (OPJ_INT32*)  | 
566  | 0  |   opj_aligned_malloc(opj_dwt_max_resolution(tr, numres) * sizeof(OPJ_INT32));  | 
567  | 0  |   if (! h.mem){ | 
568  | 0  |     return OPJ_FALSE;  | 
569  | 0  |   }  | 
570  |  |  | 
571  | 0  |   v.mem = h.mem;  | 
572  |  | 
  | 
573  | 0  |   while( --numres) { | 
574  | 0  |     OPJ_INT32 * restrict tiledp = tilec->data;  | 
575  | 0  |     OPJ_UINT32 j;  | 
576  |  | 
  | 
577  | 0  |     ++tr;  | 
578  | 0  |     h.sn = (OPJ_INT32)rw;  | 
579  | 0  |     v.sn = (OPJ_INT32)rh;  | 
580  |  | 
  | 
581  | 0  |     rw = (OPJ_UINT32)(tr->x1 - tr->x0);  | 
582  | 0  |     rh = (OPJ_UINT32)(tr->y1 - tr->y0);  | 
583  |  | 
  | 
584  | 0  |     h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn);  | 
585  | 0  |     h.cas = tr->x0 % 2;  | 
586  |  | 
  | 
587  | 0  |     for(j = 0; j < rh; ++j) { | 
588  | 0  |       opj_dwt_interleave_h(&h, &tiledp[j*w]);  | 
589  | 0  |       (dwt_1D)(&h);  | 
590  | 0  |       memcpy(&tiledp[j*w], h.mem, rw * sizeof(OPJ_INT32));  | 
591  | 0  |     }  | 
592  |  | 
  | 
593  | 0  |     v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn);  | 
594  | 0  |     v.cas = tr->y0 % 2;  | 
595  |  | 
  | 
596  | 0  |     for(j = 0; j < rw; ++j){ | 
597  | 0  |       OPJ_UINT32 k;  | 
598  | 0  |       opj_dwt_interleave_v(&v, &tiledp[j], (OPJ_INT32)w);  | 
599  | 0  |       (dwt_1D)(&v);  | 
600  | 0  |       for(k = 0; k < rh; ++k) { | 
601  | 0  |         tiledp[k * w + j] = v.mem[k];  | 
602  | 0  |       }  | 
603  | 0  |     }  | 
604  | 0  |   }  | 
605  | 0  |   opj_aligned_free(h.mem);  | 
606  | 0  |   return OPJ_TRUE;  | 
607  | 0  | }  | 
608  |  |  | 
609  | 0  | void opj_v4dwt_interleave_h(opj_v4dwt_t* restrict w, OPJ_FLOAT32* restrict a, OPJ_INT32 x, OPJ_INT32 size){ | 
610  | 0  |   OPJ_FLOAT32* restrict bi = (OPJ_FLOAT32*) (w->wavelet + w->cas);  | 
611  | 0  |   OPJ_INT32 count = w->sn;  | 
612  | 0  |   OPJ_INT32 i, k;  | 
613  |  | 
  | 
614  | 0  |   for(k = 0; k < 2; ++k){ | 
615  | 0  |     if ( count + 3 * x < size && ((size_t) a & 0x0f) == 0 && ((size_t) bi & 0x0f) == 0 && (x & 0x0f) == 0 ) { | 
616  |  |       /* Fast code path */  | 
617  | 0  |       for(i = 0; i < count; ++i){ | 
618  | 0  |         OPJ_INT32 j = i;  | 
619  | 0  |         bi[i*8    ] = a[j];  | 
620  | 0  |         j += x;  | 
621  | 0  |         bi[i*8 + 1] = a[j];  | 
622  | 0  |         j += x;  | 
623  | 0  |         bi[i*8 + 2] = a[j];  | 
624  | 0  |         j += x;  | 
625  | 0  |         bi[i*8 + 3] = a[j];  | 
626  | 0  |       }  | 
627  | 0  |     }  | 
628  | 0  |     else { | 
629  |  |       /* Slow code path */  | 
630  | 0  |       for(i = 0; i < count; ++i){ | 
631  | 0  |         OPJ_INT32 j = i;  | 
632  | 0  |         bi[i*8    ] = a[j];  | 
633  | 0  |         j += x;  | 
634  | 0  |         if(j >= size) continue;  | 
635  | 0  |         bi[i*8 + 1] = a[j];  | 
636  | 0  |         j += x;  | 
637  | 0  |         if(j >= size) continue;  | 
638  | 0  |         bi[i*8 + 2] = a[j];  | 
639  | 0  |         j += x;  | 
640  | 0  |         if(j >= size) continue;  | 
641  | 0  |         bi[i*8 + 3] = a[j]; /* This one*/  | 
642  | 0  |       }  | 
643  | 0  |     }  | 
644  |  | 
  | 
645  | 0  |     bi = (OPJ_FLOAT32*) (w->wavelet + 1 - w->cas);  | 
646  | 0  |     a += w->sn;  | 
647  | 0  |     size -= w->sn;  | 
648  | 0  |     count = w->dn;  | 
649  | 0  |   }  | 
650  | 0  | }  | 
651  |  |  | 
652  | 0  | void opj_v4dwt_interleave_v(opj_v4dwt_t* restrict v , OPJ_FLOAT32* restrict a , OPJ_INT32 x, OPJ_INT32 nb_elts_read){ | 
653  | 0  |   opj_v4_t* restrict bi = v->wavelet + v->cas;  | 
654  | 0  |   OPJ_INT32 i;  | 
655  |  | 
  | 
656  | 0  |   for(i = 0; i < v->sn; ++i){ | 
657  | 0  |     memcpy(&bi[i*2], &a[i*x], (size_t)nb_elts_read * sizeof(OPJ_FLOAT32));  | 
658  | 0  |   }  | 
659  |  | 
  | 
660  | 0  |   a += v->sn * x;  | 
661  | 0  |   bi = v->wavelet + 1 - v->cas;  | 
662  |  | 
  | 
663  | 0  |   for(i = 0; i < v->dn; ++i){ | 
664  | 0  |     memcpy(&bi[i*2], &a[i*x], (size_t)nb_elts_read * sizeof(OPJ_FLOAT32));  | 
665  | 0  |   }  | 
666  | 0  | }  | 
667  |  |  | 
668  |  | #ifdef __SSE__  | 
669  |  |  | 
670  | 0  | void opj_v4dwt_decode_step1_sse(opj_v4_t* w, OPJ_INT32 count, const __m128 c){ | 
671  | 0  |   __m128* restrict vw = (__m128*) w;  | 
672  | 0  |   OPJ_INT32 i;  | 
673  |  |   /* 4x unrolled loop */  | 
674  | 0  |   for(i = 0; i < count >> 2; ++i){ | 
675  | 0  |     *vw = _mm_mul_ps(*vw, c);  | 
676  | 0  |     vw += 2;  | 
677  | 0  |     *vw = _mm_mul_ps(*vw, c);  | 
678  | 0  |     vw += 2;  | 
679  | 0  |     *vw = _mm_mul_ps(*vw, c);  | 
680  | 0  |     vw += 2;  | 
681  | 0  |     *vw = _mm_mul_ps(*vw, c);  | 
682  | 0  |     vw += 2;  | 
683  | 0  |   }  | 
684  | 0  |   count &= 3;  | 
685  | 0  |   for(i = 0; i < count; ++i){ | 
686  | 0  |     *vw = _mm_mul_ps(*vw, c);  | 
687  | 0  |     vw += 2;  | 
688  | 0  |   }  | 
689  | 0  | }  | 
690  |  |  | 
691  | 0  | void opj_v4dwt_decode_step2_sse(opj_v4_t* l, opj_v4_t* w, OPJ_INT32 k, OPJ_INT32 m, __m128 c){ | 
692  | 0  |   __m128* restrict vl = (__m128*) l;  | 
693  | 0  |   __m128* restrict vw = (__m128*) w;  | 
694  | 0  |   OPJ_INT32 i;  | 
695  | 0  |   __m128 tmp1, tmp2, tmp3;  | 
696  | 0  |   tmp1 = vl[0];  | 
697  | 0  |   for(i = 0; i < m; ++i){ | 
698  | 0  |     tmp2 = vw[-1];  | 
699  | 0  |     tmp3 = vw[ 0];  | 
700  | 0  |     vw[-1] = _mm_add_ps(tmp2, _mm_mul_ps(_mm_add_ps(tmp1, tmp3), c));  | 
701  | 0  |     tmp1 = tmp3;  | 
702  | 0  |     vw += 2;  | 
703  | 0  |   }  | 
704  | 0  |   vl = vw - 2;  | 
705  | 0  |   if(m >= k){ | 
706  | 0  |     return;  | 
707  | 0  |   }  | 
708  | 0  |   c = _mm_add_ps(c, c);  | 
709  | 0  |   c = _mm_mul_ps(c, vl[0]);  | 
710  | 0  |   for(; m < k; ++m){ | 
711  | 0  |     __m128 tmp = vw[-1];  | 
712  | 0  |     vw[-1] = _mm_add_ps(tmp, c);  | 
713  | 0  |     vw += 2;  | 
714  | 0  |   }  | 
715  | 0  | }  | 
716  |  |  | 
717  |  | #else  | 
718  |  |  | 
719  |  | void opj_v4dwt_decode_step1(opj_v4_t* w, OPJ_INT32 count, const OPJ_FLOAT32 c)  | 
720  |  | { | 
721  |  |   OPJ_FLOAT32* restrict fw = (OPJ_FLOAT32*) w;  | 
722  |  |   OPJ_INT32 i;  | 
723  |  |   for(i = 0; i < count; ++i){ | 
724  |  |     OPJ_FLOAT32 tmp1 = fw[i*8    ];  | 
725  |  |     OPJ_FLOAT32 tmp2 = fw[i*8 + 1];  | 
726  |  |     OPJ_FLOAT32 tmp3 = fw[i*8 + 2];  | 
727  |  |     OPJ_FLOAT32 tmp4 = fw[i*8 + 3];  | 
728  |  |     fw[i*8    ] = tmp1 * c;  | 
729  |  |     fw[i*8 + 1] = tmp2 * c;  | 
730  |  |     fw[i*8 + 2] = tmp3 * c;  | 
731  |  |     fw[i*8 + 3] = tmp4 * c;  | 
732  |  |   }  | 
733  |  | }  | 
734  |  |  | 
735  |  | void opj_v4dwt_decode_step2(opj_v4_t* l, opj_v4_t* w, OPJ_INT32 k, OPJ_INT32 m, OPJ_FLOAT32 c)  | 
736  |  | { | 
737  |  |   OPJ_FLOAT32* restrict fl = (OPJ_FLOAT32*) l;  | 
738  |  |   OPJ_FLOAT32* restrict fw = (OPJ_FLOAT32*) w;  | 
739  |  |   OPJ_INT32 i;  | 
740  |  |   for(i = 0; i < m; ++i){ | 
741  |  |     OPJ_FLOAT32 tmp1_1 = fl[0];  | 
742  |  |     OPJ_FLOAT32 tmp1_2 = fl[1];  | 
743  |  |     OPJ_FLOAT32 tmp1_3 = fl[2];  | 
744  |  |     OPJ_FLOAT32 tmp1_4 = fl[3];  | 
745  |  |     OPJ_FLOAT32 tmp2_1 = fw[-4];  | 
746  |  |     OPJ_FLOAT32 tmp2_2 = fw[-3];  | 
747  |  |     OPJ_FLOAT32 tmp2_3 = fw[-2];  | 
748  |  |     OPJ_FLOAT32 tmp2_4 = fw[-1];  | 
749  |  |     OPJ_FLOAT32 tmp3_1 = fw[0];  | 
750  |  |     OPJ_FLOAT32 tmp3_2 = fw[1];  | 
751  |  |     OPJ_FLOAT32 tmp3_3 = fw[2];  | 
752  |  |     OPJ_FLOAT32 tmp3_4 = fw[3];  | 
753  |  |     fw[-4] = tmp2_1 + ((tmp1_1 + tmp3_1) * c);  | 
754  |  |     fw[-3] = tmp2_2 + ((tmp1_2 + tmp3_2) * c);  | 
755  |  |     fw[-2] = tmp2_3 + ((tmp1_3 + tmp3_3) * c);  | 
756  |  |     fw[-1] = tmp2_4 + ((tmp1_4 + tmp3_4) * c);  | 
757  |  |     fl = fw;  | 
758  |  |     fw += 8;  | 
759  |  |   }  | 
760  |  |   if(m < k){ | 
761  |  |     OPJ_FLOAT32 c1;  | 
762  |  |     OPJ_FLOAT32 c2;  | 
763  |  |     OPJ_FLOAT32 c3;  | 
764  |  |     OPJ_FLOAT32 c4;  | 
765  |  |     c += c;  | 
766  |  |     c1 = fl[0] * c;  | 
767  |  |     c2 = fl[1] * c;  | 
768  |  |     c3 = fl[2] * c;  | 
769  |  |     c4 = fl[3] * c;  | 
770  |  |     for(; m < k; ++m){ | 
771  |  |       OPJ_FLOAT32 tmp1 = fw[-4];  | 
772  |  |       OPJ_FLOAT32 tmp2 = fw[-3];  | 
773  |  |       OPJ_FLOAT32 tmp3 = fw[-2];  | 
774  |  |       OPJ_FLOAT32 tmp4 = fw[-1];  | 
775  |  |       fw[-4] = tmp1 + c1;  | 
776  |  |       fw[-3] = tmp2 + c2;  | 
777  |  |       fw[-2] = tmp3 + c3;  | 
778  |  |       fw[-1] = tmp4 + c4;  | 
779  |  |       fw += 8;  | 
780  |  |     }  | 
781  |  |   }  | 
782  |  | }  | 
783  |  |  | 
784  |  | #endif  | 
785  |  |  | 
786  |  | /* <summary>                             */  | 
787  |  | /* Inverse 9-7 wavelet transform in 1-D. */  | 
788  |  | /* </summary>                            */  | 
789  |  | void opj_v4dwt_decode(opj_v4dwt_t* restrict dwt)  | 
790  | 0  | { | 
791  | 0  |   OPJ_INT32 a, b;  | 
792  | 0  |   if(dwt->cas == 0) { | 
793  | 0  |     if(!((dwt->dn > 0) || (dwt->sn > 1))){ | 
794  | 0  |       return;  | 
795  | 0  |     }  | 
796  | 0  |     a = 0;  | 
797  | 0  |     b = 1;  | 
798  | 0  |   }else{ | 
799  | 0  |     if(!((dwt->sn > 0) || (dwt->dn > 1))) { | 
800  | 0  |       return;  | 
801  | 0  |     }  | 
802  | 0  |     a = 1;  | 
803  | 0  |     b = 0;  | 
804  | 0  |   }  | 
805  | 0  | #ifdef __SSE__  | 
806  | 0  |   opj_v4dwt_decode_step1_sse(dwt->wavelet+a, dwt->sn, _mm_set1_ps(opj_K));  | 
807  | 0  |   opj_v4dwt_decode_step1_sse(dwt->wavelet+b, dwt->dn, _mm_set1_ps(opj_c13318));  | 
808  | 0  |   opj_v4dwt_decode_step2_sse(dwt->wavelet+b, dwt->wavelet+a+1, dwt->sn, opj_int_min(dwt->sn, dwt->dn-a), _mm_set1_ps(opj_dwt_delta));  | 
809  | 0  |   opj_v4dwt_decode_step2_sse(dwt->wavelet+a, dwt->wavelet+b+1, dwt->dn, opj_int_min(dwt->dn, dwt->sn-b), _mm_set1_ps(opj_dwt_gamma));  | 
810  | 0  |   opj_v4dwt_decode_step2_sse(dwt->wavelet+b, dwt->wavelet+a+1, dwt->sn, opj_int_min(dwt->sn, dwt->dn-a), _mm_set1_ps(opj_dwt_beta));  | 
811  | 0  |   opj_v4dwt_decode_step2_sse(dwt->wavelet+a, dwt->wavelet+b+1, dwt->dn, opj_int_min(dwt->dn, dwt->sn-b), _mm_set1_ps(opj_dwt_alpha));  | 
812  |  | #else  | 
813  |  |   opj_v4dwt_decode_step1(dwt->wavelet+a, dwt->sn, opj_K);  | 
814  |  |   opj_v4dwt_decode_step1(dwt->wavelet+b, dwt->dn, opj_c13318);  | 
815  |  |   opj_v4dwt_decode_step2(dwt->wavelet+b, dwt->wavelet+a+1, dwt->sn, opj_int_min(dwt->sn, dwt->dn-a), opj_dwt_delta);  | 
816  |  |   opj_v4dwt_decode_step2(dwt->wavelet+a, dwt->wavelet+b+1, dwt->dn, opj_int_min(dwt->dn, dwt->sn-b), opj_dwt_gamma);  | 
817  |  |   opj_v4dwt_decode_step2(dwt->wavelet+b, dwt->wavelet+a+1, dwt->sn, opj_int_min(dwt->sn, dwt->dn-a), opj_dwt_beta);  | 
818  |  |   opj_v4dwt_decode_step2(dwt->wavelet+a, dwt->wavelet+b+1, dwt->dn, opj_int_min(dwt->dn, dwt->sn-b), opj_dwt_alpha);  | 
819  |  | #endif  | 
820  | 0  | }  | 
821  |  |  | 
822  |  |  | 
823  |  | /* <summary>                             */  | 
824  |  | /* Inverse 9-7 wavelet transform in 2-D. */  | 
825  |  | /* </summary>                            */  | 
826  |  | OPJ_BOOL opj_dwt_decode_real(opj_tcd_tilecomp_t* restrict tilec, OPJ_UINT32 numres)  | 
827  | 0  | { | 
828  | 0  |   opj_v4dwt_t h;  | 
829  | 0  |   opj_v4dwt_t v;  | 
830  |  | 
  | 
831  | 0  |   opj_tcd_resolution_t* res = tilec->resolutions;  | 
832  |  | 
  | 
833  | 0  |   OPJ_UINT32 rw = (OPJ_UINT32)(res->x1 - res->x0);  /* width of the resolution level computed */  | 
834  | 0  |   OPJ_UINT32 rh = (OPJ_UINT32)(res->y1 - res->y0);  /* height of the resolution level computed */  | 
835  |  | 
  | 
836  | 0  |   OPJ_UINT32 w = (OPJ_UINT32)(tilec->x1 - tilec->x0);  | 
837  |  | 
  | 
838  | 0  |   h.wavelet = (opj_v4_t*) opj_aligned_malloc((opj_dwt_max_resolution(res, numres)+5) * sizeof(opj_v4_t));  | 
839  | 0  |   v.wavelet = h.wavelet;  | 
840  |  | 
  | 
841  | 0  |   while( --numres) { | 
842  | 0  |     OPJ_FLOAT32 * restrict aj = (OPJ_FLOAT32*) tilec->data;  | 
843  | 0  |     OPJ_UINT32 bufsize = (OPJ_UINT32)((tilec->x1 - tilec->x0) * (tilec->y1 - tilec->y0));  | 
844  | 0  |     OPJ_INT32 j;  | 
845  |  | 
  | 
846  | 0  |     h.sn = (OPJ_INT32)rw;  | 
847  | 0  |     v.sn = (OPJ_INT32)rh;  | 
848  |  | 
  | 
849  | 0  |     ++res;  | 
850  |  | 
  | 
851  | 0  |     rw = (OPJ_UINT32)(res->x1 - res->x0); /* width of the resolution level computed */  | 
852  | 0  |     rh = (OPJ_UINT32)(res->y1 - res->y0); /* height of the resolution level computed */  | 
853  |  | 
  | 
854  | 0  |     h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn);  | 
855  | 0  |     h.cas = res->x0 % 2;  | 
856  |  | 
  | 
857  | 0  |     for(j = (OPJ_INT32)rh; j > 3; j -= 4) { | 
858  | 0  |       OPJ_INT32 k;  | 
859  | 0  |       opj_v4dwt_interleave_h(&h, aj, (OPJ_INT32)w, (OPJ_INT32)bufsize);  | 
860  | 0  |       opj_v4dwt_decode(&h);  | 
861  |  | 
  | 
862  | 0  |       for(k = (OPJ_INT32)rw; --k >= 0;){ | 
863  | 0  |         aj[k               ] = h.wavelet[k].f[0];  | 
864  | 0  |         aj[k+(OPJ_INT32)w  ] = h.wavelet[k].f[1];  | 
865  | 0  |         aj[k+(OPJ_INT32)w*2] = h.wavelet[k].f[2];  | 
866  | 0  |         aj[k+(OPJ_INT32)w*3] = h.wavelet[k].f[3];  | 
867  | 0  |       }  | 
868  |  | 
  | 
869  | 0  |       aj += w*4;  | 
870  | 0  |       bufsize -= w*4;  | 
871  | 0  |     }  | 
872  |  | 
  | 
873  | 0  |     if (rh & 0x03) { | 
874  | 0  |       OPJ_INT32 k;  | 
875  | 0  |       j = rh & 0x03;  | 
876  | 0  |       opj_v4dwt_interleave_h(&h, aj, (OPJ_INT32)w, (OPJ_INT32)bufsize);  | 
877  | 0  |       opj_v4dwt_decode(&h);  | 
878  | 0  |       for(k = (OPJ_INT32)rw; --k >= 0;){ | 
879  | 0  |         switch(j) { | 
880  | 0  |           case 3: aj[k+(OPJ_INT32)w*2] = h.wavelet[k].f[2];  | 
881  | 0  |           case 2: aj[k+(OPJ_INT32)w  ] = h.wavelet[k].f[1];  | 
882  | 0  |           case 1: aj[k               ] = h.wavelet[k].f[0];  | 
883  | 0  |         }  | 
884  | 0  |       }  | 
885  | 0  |     }  | 
886  |  | 
  | 
887  | 0  |     v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn);  | 
888  | 0  |     v.cas = res->y0 % 2;  | 
889  |  | 
  | 
890  | 0  |     aj = (OPJ_FLOAT32*) tilec->data;  | 
891  | 0  |     for(j = (OPJ_INT32)rw; j > 3; j -= 4){ | 
892  | 0  |       OPJ_UINT32 k;  | 
893  |  | 
  | 
894  | 0  |       opj_v4dwt_interleave_v(&v, aj, (OPJ_INT32)w, 4);  | 
895  | 0  |       opj_v4dwt_decode(&v);  | 
896  |  | 
  | 
897  | 0  |       for(k = 0; k < rh; ++k){ | 
898  | 0  |         memcpy(&aj[k*w], &v.wavelet[k], 4 * sizeof(OPJ_FLOAT32));  | 
899  | 0  |       }  | 
900  | 0  |       aj += 4;  | 
901  | 0  |     }  | 
902  |  | 
  | 
903  | 0  |     if (rw & 0x03){ | 
904  | 0  |       OPJ_UINT32 k;  | 
905  |  | 
  | 
906  | 0  |       j = rw & 0x03;  | 
907  |  | 
  | 
908  | 0  |       opj_v4dwt_interleave_v(&v, aj, (OPJ_INT32)w, j);  | 
909  | 0  |       opj_v4dwt_decode(&v);  | 
910  |  | 
  | 
911  | 0  |       for(k = 0; k < rh; ++k){ | 
912  | 0  |         memcpy(&aj[k*w], &v.wavelet[k], (size_t)j * sizeof(OPJ_FLOAT32));  | 
913  | 0  |       }  | 
914  | 0  |     }  | 
915  | 0  |   }  | 
916  |  | 
  | 
917  | 0  |   opj_aligned_free(h.wavelet);  | 
918  | 0  |   return OPJ_TRUE;  | 
919  | 0  | }  |