/src/mozilla-central/gfx/qcms/transform-sse2.c
Line | Count | Source (jump to first uncovered line) |
1 | | #include <emmintrin.h> |
2 | | |
3 | | #include "qcmsint.h" |
4 | | |
5 | | /* pre-shuffled: just load these into XMM reg instead of load-scalar/shufps sequence */ |
6 | | #define FLOATSCALE (float)(PRECACHE_OUTPUT_SIZE) |
7 | | #define CLAMPMAXVAL ( ((float) (PRECACHE_OUTPUT_SIZE - 1)) / PRECACHE_OUTPUT_SIZE ) |
8 | | static const ALIGN float floatScaleX4[4] = |
9 | | { FLOATSCALE, FLOATSCALE, FLOATSCALE, FLOATSCALE}; |
10 | | static const ALIGN float clampMaxValueX4[4] = |
11 | | { CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL}; |
12 | | |
13 | | void qcms_transform_data_rgb_out_lut_sse2(qcms_transform *transform, |
14 | | unsigned char *src, |
15 | | unsigned char *dest, |
16 | | size_t length) |
17 | 0 | { |
18 | 0 | unsigned int i; |
19 | 0 | float (*mat)[4] = transform->matrix; |
20 | 0 | char input_back[32]; |
21 | 0 | /* Ensure we have a buffer that's 16 byte aligned regardless of the original |
22 | 0 | * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32)) |
23 | 0 | * because they don't work on stack variables. gcc 4.4 does do the right thing |
24 | 0 | * on x86 but that's too new for us right now. For more info: gcc bug #16660 */ |
25 | 0 | float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf); |
26 | 0 | /* share input and output locations to save having to keep the |
27 | 0 | * locations in separate registers */ |
28 | 0 | uint32_t const * output = (uint32_t*)input; |
29 | 0 |
|
30 | 0 | /* deref *transform now to avoid it in loop */ |
31 | 0 | const float *igtbl_r = transform->input_gamma_table_r; |
32 | 0 | const float *igtbl_g = transform->input_gamma_table_g; |
33 | 0 | const float *igtbl_b = transform->input_gamma_table_b; |
34 | 0 |
|
35 | 0 | /* deref *transform now to avoid it in loop */ |
36 | 0 | const uint8_t *otdata_r = &transform->output_table_r->data[0]; |
37 | 0 | const uint8_t *otdata_g = &transform->output_table_g->data[0]; |
38 | 0 | const uint8_t *otdata_b = &transform->output_table_b->data[0]; |
39 | 0 |
|
40 | 0 | /* input matrix values never change */ |
41 | 0 | const __m128 mat0 = _mm_load_ps(mat[0]); |
42 | 0 | const __m128 mat1 = _mm_load_ps(mat[1]); |
43 | 0 | const __m128 mat2 = _mm_load_ps(mat[2]); |
44 | 0 |
|
45 | 0 | /* these values don't change, either */ |
46 | 0 | const __m128 max = _mm_load_ps(clampMaxValueX4); |
47 | 0 | const __m128 min = _mm_setzero_ps(); |
48 | 0 | const __m128 scale = _mm_load_ps(floatScaleX4); |
49 | 0 |
|
50 | 0 | /* working variables */ |
51 | 0 | __m128 vec_r, vec_g, vec_b, result; |
52 | 0 |
|
53 | 0 | /* CYA */ |
54 | 0 | if (!length) |
55 | 0 | return; |
56 | 0 | |
57 | 0 | /* one pixel is handled outside of the loop */ |
58 | 0 | length--; |
59 | 0 |
|
60 | 0 | /* setup for transforming 1st pixel */ |
61 | 0 | vec_r = _mm_load_ss(&igtbl_r[src[0]]); |
62 | 0 | vec_g = _mm_load_ss(&igtbl_g[src[1]]); |
63 | 0 | vec_b = _mm_load_ss(&igtbl_b[src[2]]); |
64 | 0 | src += 3; |
65 | 0 |
|
66 | 0 | /* transform all but final pixel */ |
67 | 0 |
|
68 | 0 | for (i=0; i<length; i++) |
69 | 0 | { |
70 | 0 | /* position values from gamma tables */ |
71 | 0 | vec_r = _mm_shuffle_ps(vec_r, vec_r, 0); |
72 | 0 | vec_g = _mm_shuffle_ps(vec_g, vec_g, 0); |
73 | 0 | vec_b = _mm_shuffle_ps(vec_b, vec_b, 0); |
74 | 0 |
|
75 | 0 | /* gamma * matrix */ |
76 | 0 | vec_r = _mm_mul_ps(vec_r, mat0); |
77 | 0 | vec_g = _mm_mul_ps(vec_g, mat1); |
78 | 0 | vec_b = _mm_mul_ps(vec_b, mat2); |
79 | 0 |
|
80 | 0 | /* crunch, crunch, crunch */ |
81 | 0 | vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b)); |
82 | 0 | vec_r = _mm_max_ps(min, vec_r); |
83 | 0 | vec_r = _mm_min_ps(max, vec_r); |
84 | 0 | result = _mm_mul_ps(vec_r, scale); |
85 | 0 |
|
86 | 0 | /* store calc'd output tables indices */ |
87 | 0 | _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result)); |
88 | 0 |
|
89 | 0 | /* load for next loop while store completes */ |
90 | 0 | vec_r = _mm_load_ss(&igtbl_r[src[0]]); |
91 | 0 | vec_g = _mm_load_ss(&igtbl_g[src[1]]); |
92 | 0 | vec_b = _mm_load_ss(&igtbl_b[src[2]]); |
93 | 0 | src += 3; |
94 | 0 |
|
95 | 0 | /* use calc'd indices to output RGB values */ |
96 | 0 | dest[OUTPUT_R_INDEX] = otdata_r[output[0]]; |
97 | 0 | dest[OUTPUT_G_INDEX] = otdata_g[output[1]]; |
98 | 0 | dest[OUTPUT_B_INDEX] = otdata_b[output[2]]; |
99 | 0 | dest += RGB_OUTPUT_COMPONENTS; |
100 | 0 | } |
101 | 0 |
|
102 | 0 | /* handle final (maybe only) pixel */ |
103 | 0 |
|
104 | 0 | vec_r = _mm_shuffle_ps(vec_r, vec_r, 0); |
105 | 0 | vec_g = _mm_shuffle_ps(vec_g, vec_g, 0); |
106 | 0 | vec_b = _mm_shuffle_ps(vec_b, vec_b, 0); |
107 | 0 |
|
108 | 0 | vec_r = _mm_mul_ps(vec_r, mat0); |
109 | 0 | vec_g = _mm_mul_ps(vec_g, mat1); |
110 | 0 | vec_b = _mm_mul_ps(vec_b, mat2); |
111 | 0 |
|
112 | 0 | vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b)); |
113 | 0 | vec_r = _mm_max_ps(min, vec_r); |
114 | 0 | vec_r = _mm_min_ps(max, vec_r); |
115 | 0 | result = _mm_mul_ps(vec_r, scale); |
116 | 0 |
|
117 | 0 | _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result)); |
118 | 0 |
|
119 | 0 | dest[OUTPUT_R_INDEX] = otdata_r[output[0]]; |
120 | 0 | dest[OUTPUT_G_INDEX] = otdata_g[output[1]]; |
121 | 0 | dest[OUTPUT_B_INDEX] = otdata_b[output[2]]; |
122 | 0 | } |
123 | | |
124 | | void qcms_transform_data_rgba_out_lut_sse2(qcms_transform *transform, |
125 | | unsigned char *src, |
126 | | unsigned char *dest, |
127 | | size_t length) |
128 | 0 | { |
129 | 0 | unsigned int i; |
130 | 0 | float (*mat)[4] = transform->matrix; |
131 | 0 | char input_back[32]; |
132 | 0 | /* Ensure we have a buffer that's 16 byte aligned regardless of the original |
133 | 0 | * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32)) |
134 | 0 | * because they don't work on stack variables. gcc 4.4 does do the right thing |
135 | 0 | * on x86 but that's too new for us right now. For more info: gcc bug #16660 */ |
136 | 0 | float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf); |
137 | 0 | /* share input and output locations to save having to keep the |
138 | 0 | * locations in separate registers */ |
139 | 0 | uint32_t const * output = (uint32_t*)input; |
140 | 0 |
|
141 | 0 | /* deref *transform now to avoid it in loop */ |
142 | 0 | const float *igtbl_r = transform->input_gamma_table_r; |
143 | 0 | const float *igtbl_g = transform->input_gamma_table_g; |
144 | 0 | const float *igtbl_b = transform->input_gamma_table_b; |
145 | 0 |
|
146 | 0 | /* deref *transform now to avoid it in loop */ |
147 | 0 | const uint8_t *otdata_r = &transform->output_table_r->data[0]; |
148 | 0 | const uint8_t *otdata_g = &transform->output_table_g->data[0]; |
149 | 0 | const uint8_t *otdata_b = &transform->output_table_b->data[0]; |
150 | 0 |
|
151 | 0 | /* input matrix values never change */ |
152 | 0 | const __m128 mat0 = _mm_load_ps(mat[0]); |
153 | 0 | const __m128 mat1 = _mm_load_ps(mat[1]); |
154 | 0 | const __m128 mat2 = _mm_load_ps(mat[2]); |
155 | 0 |
|
156 | 0 | /* these values don't change, either */ |
157 | 0 | const __m128 max = _mm_load_ps(clampMaxValueX4); |
158 | 0 | const __m128 min = _mm_setzero_ps(); |
159 | 0 | const __m128 scale = _mm_load_ps(floatScaleX4); |
160 | 0 |
|
161 | 0 | /* working variables */ |
162 | 0 | __m128 vec_r, vec_g, vec_b, result; |
163 | 0 | unsigned char alpha; |
164 | 0 |
|
165 | 0 | /* CYA */ |
166 | 0 | if (!length) |
167 | 0 | return; |
168 | 0 | |
169 | 0 | /* one pixel is handled outside of the loop */ |
170 | 0 | length--; |
171 | 0 |
|
172 | 0 | /* setup for transforming 1st pixel */ |
173 | 0 | vec_r = _mm_load_ss(&igtbl_r[src[0]]); |
174 | 0 | vec_g = _mm_load_ss(&igtbl_g[src[1]]); |
175 | 0 | vec_b = _mm_load_ss(&igtbl_b[src[2]]); |
176 | 0 | alpha = src[3]; |
177 | 0 | src += 4; |
178 | 0 |
|
179 | 0 | /* transform all but final pixel */ |
180 | 0 |
|
181 | 0 | for (i=0; i<length; i++) |
182 | 0 | { |
183 | 0 | /* position values from gamma tables */ |
184 | 0 | vec_r = _mm_shuffle_ps(vec_r, vec_r, 0); |
185 | 0 | vec_g = _mm_shuffle_ps(vec_g, vec_g, 0); |
186 | 0 | vec_b = _mm_shuffle_ps(vec_b, vec_b, 0); |
187 | 0 |
|
188 | 0 | /* gamma * matrix */ |
189 | 0 | vec_r = _mm_mul_ps(vec_r, mat0); |
190 | 0 | vec_g = _mm_mul_ps(vec_g, mat1); |
191 | 0 | vec_b = _mm_mul_ps(vec_b, mat2); |
192 | 0 |
|
193 | 0 | /* store alpha for this pixel; load alpha for next */ |
194 | 0 | dest[OUTPUT_A_INDEX] = alpha; |
195 | 0 | alpha = src[3]; |
196 | 0 |
|
197 | 0 | /* crunch, crunch, crunch */ |
198 | 0 | vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b)); |
199 | 0 | vec_r = _mm_max_ps(min, vec_r); |
200 | 0 | vec_r = _mm_min_ps(max, vec_r); |
201 | 0 | result = _mm_mul_ps(vec_r, scale); |
202 | 0 |
|
203 | 0 | /* store calc'd output tables indices */ |
204 | 0 | _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result)); |
205 | 0 |
|
206 | 0 | /* load gamma values for next loop while store completes */ |
207 | 0 | vec_r = _mm_load_ss(&igtbl_r[src[0]]); |
208 | 0 | vec_g = _mm_load_ss(&igtbl_g[src[1]]); |
209 | 0 | vec_b = _mm_load_ss(&igtbl_b[src[2]]); |
210 | 0 | src += 4; |
211 | 0 |
|
212 | 0 | /* use calc'd indices to output RGB values */ |
213 | 0 | dest[OUTPUT_R_INDEX] = otdata_r[output[0]]; |
214 | 0 | dest[OUTPUT_G_INDEX] = otdata_g[output[1]]; |
215 | 0 | dest[OUTPUT_B_INDEX] = otdata_b[output[2]]; |
216 | 0 | dest += RGBA_OUTPUT_COMPONENTS; |
217 | 0 | } |
218 | 0 |
|
219 | 0 | /* handle final (maybe only) pixel */ |
220 | 0 |
|
221 | 0 | vec_r = _mm_shuffle_ps(vec_r, vec_r, 0); |
222 | 0 | vec_g = _mm_shuffle_ps(vec_g, vec_g, 0); |
223 | 0 | vec_b = _mm_shuffle_ps(vec_b, vec_b, 0); |
224 | 0 |
|
225 | 0 | vec_r = _mm_mul_ps(vec_r, mat0); |
226 | 0 | vec_g = _mm_mul_ps(vec_g, mat1); |
227 | 0 | vec_b = _mm_mul_ps(vec_b, mat2); |
228 | 0 |
|
229 | 0 | dest[OUTPUT_A_INDEX] = alpha; |
230 | 0 |
|
231 | 0 | vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b)); |
232 | 0 | vec_r = _mm_max_ps(min, vec_r); |
233 | 0 | vec_r = _mm_min_ps(max, vec_r); |
234 | 0 | result = _mm_mul_ps(vec_r, scale); |
235 | 0 |
|
236 | 0 | _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result)); |
237 | 0 |
|
238 | 0 | dest[OUTPUT_R_INDEX] = otdata_r[output[0]]; |
239 | 0 | dest[OUTPUT_G_INDEX] = otdata_g[output[1]]; |
240 | 0 | dest[OUTPUT_B_INDEX] = otdata_b[output[2]]; |
241 | 0 | } |
242 | | |
243 | | |