/src/mozilla-central/gfx/qcms/transform-sse1.c
Line | Count | Source (jump to first uncovered line) |
1 | | #include <xmmintrin.h> |
2 | | |
3 | | #include "qcmsint.h" |
4 | | |
5 | | /* pre-shuffled: just load these into XMM reg instead of load-scalar/shufps sequence */ |
6 | | #define FLOATSCALE (float)(PRECACHE_OUTPUT_SIZE) |
7 | | #define CLAMPMAXVAL ( ((float) (PRECACHE_OUTPUT_SIZE - 1)) / PRECACHE_OUTPUT_SIZE ) |
8 | | static const ALIGN float floatScaleX4[4] = |
9 | | { FLOATSCALE, FLOATSCALE, FLOATSCALE, FLOATSCALE}; |
10 | | static const ALIGN float clampMaxValueX4[4] = |
11 | | { CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL}; |
12 | | |
13 | | void qcms_transform_data_rgb_out_lut_sse1(qcms_transform *transform, |
14 | | unsigned char *src, |
15 | | unsigned char *dest, |
16 | | size_t length) |
17 | 0 | { |
18 | 0 | unsigned int i; |
19 | 0 | float (*mat)[4] = transform->matrix; |
20 | 0 | char input_back[32]; |
21 | 0 | /* Ensure we have a buffer that's 16 byte aligned regardless of the original |
22 | 0 | * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32)) |
23 | 0 | * because they don't work on stack variables. gcc 4.4 does do the right thing |
24 | 0 | * on x86 but that's too new for us right now. For more info: gcc bug #16660 */ |
25 | 0 | float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf); |
26 | 0 | /* share input and output locations to save having to keep the |
27 | 0 | * locations in separate registers */ |
28 | 0 | uint32_t const * output = (uint32_t*)input; |
29 | 0 |
|
30 | 0 | /* deref *transform now to avoid it in loop */ |
31 | 0 | const float *igtbl_r = transform->input_gamma_table_r; |
32 | 0 | const float *igtbl_g = transform->input_gamma_table_g; |
33 | 0 | const float *igtbl_b = transform->input_gamma_table_b; |
34 | 0 |
|
35 | 0 | /* deref *transform now to avoid it in loop */ |
36 | 0 | const uint8_t *otdata_r = &transform->output_table_r->data[0]; |
37 | 0 | const uint8_t *otdata_g = &transform->output_table_g->data[0]; |
38 | 0 | const uint8_t *otdata_b = &transform->output_table_b->data[0]; |
39 | 0 |
|
40 | 0 | /* input matrix values never change */ |
41 | 0 | const __m128 mat0 = _mm_load_ps(mat[0]); |
42 | 0 | const __m128 mat1 = _mm_load_ps(mat[1]); |
43 | 0 | const __m128 mat2 = _mm_load_ps(mat[2]); |
44 | 0 |
|
45 | 0 | /* these values don't change, either */ |
46 | 0 | const __m128 max = _mm_load_ps(clampMaxValueX4); |
47 | 0 | const __m128 min = _mm_setzero_ps(); |
48 | 0 | const __m128 scale = _mm_load_ps(floatScaleX4); |
49 | 0 |
|
50 | 0 | /* working variables */ |
51 | 0 | __m128 vec_r, vec_g, vec_b, result; |
52 | 0 |
|
53 | 0 | /* CYA */ |
54 | 0 | if (!length) |
55 | 0 | return; |
56 | 0 | |
57 | 0 | /* one pixel is handled outside of the loop */ |
58 | 0 | length--; |
59 | 0 |
|
60 | 0 | /* setup for transforming 1st pixel */ |
61 | 0 | vec_r = _mm_load_ss(&igtbl_r[src[0]]); |
62 | 0 | vec_g = _mm_load_ss(&igtbl_g[src[1]]); |
63 | 0 | vec_b = _mm_load_ss(&igtbl_b[src[2]]); |
64 | 0 | src += 3; |
65 | 0 |
|
66 | 0 | /* transform all but final pixel */ |
67 | 0 |
|
68 | 0 | for (i=0; i<length; i++) |
69 | 0 | { |
70 | 0 | /* position values from gamma tables */ |
71 | 0 | vec_r = _mm_shuffle_ps(vec_r, vec_r, 0); |
72 | 0 | vec_g = _mm_shuffle_ps(vec_g, vec_g, 0); |
73 | 0 | vec_b = _mm_shuffle_ps(vec_b, vec_b, 0); |
74 | 0 |
|
75 | 0 | /* gamma * matrix */ |
76 | 0 | vec_r = _mm_mul_ps(vec_r, mat0); |
77 | 0 | vec_g = _mm_mul_ps(vec_g, mat1); |
78 | 0 | vec_b = _mm_mul_ps(vec_b, mat2); |
79 | 0 |
|
80 | 0 | /* crunch, crunch, crunch */ |
81 | 0 | vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b)); |
82 | 0 | vec_r = _mm_max_ps(min, vec_r); |
83 | 0 | vec_r = _mm_min_ps(max, vec_r); |
84 | 0 | result = _mm_mul_ps(vec_r, scale); |
85 | 0 |
|
86 | 0 | /* store calc'd output tables indices */ |
87 | 0 | *((__m64 *)&output[0]) = _mm_cvtps_pi32(result); |
88 | 0 | result = _mm_movehl_ps(result, result); |
89 | 0 | *((__m64 *)&output[2]) = _mm_cvtps_pi32(result) ; |
90 | 0 |
|
91 | 0 | /* load for next loop while store completes */ |
92 | 0 | vec_r = _mm_load_ss(&igtbl_r[src[0]]); |
93 | 0 | vec_g = _mm_load_ss(&igtbl_g[src[1]]); |
94 | 0 | vec_b = _mm_load_ss(&igtbl_b[src[2]]); |
95 | 0 | src += 3; |
96 | 0 |
|
97 | 0 | /* use calc'd indices to output RGB values */ |
98 | 0 | dest[OUTPUT_R_INDEX] = otdata_r[output[0]]; |
99 | 0 | dest[OUTPUT_G_INDEX] = otdata_g[output[1]]; |
100 | 0 | dest[OUTPUT_B_INDEX] = otdata_b[output[2]]; |
101 | 0 | dest += RGB_OUTPUT_COMPONENTS; |
102 | 0 | } |
103 | 0 |
|
104 | 0 | /* handle final (maybe only) pixel */ |
105 | 0 |
|
106 | 0 | vec_r = _mm_shuffle_ps(vec_r, vec_r, 0); |
107 | 0 | vec_g = _mm_shuffle_ps(vec_g, vec_g, 0); |
108 | 0 | vec_b = _mm_shuffle_ps(vec_b, vec_b, 0); |
109 | 0 |
|
110 | 0 | vec_r = _mm_mul_ps(vec_r, mat0); |
111 | 0 | vec_g = _mm_mul_ps(vec_g, mat1); |
112 | 0 | vec_b = _mm_mul_ps(vec_b, mat2); |
113 | 0 |
|
114 | 0 | vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b)); |
115 | 0 | vec_r = _mm_max_ps(min, vec_r); |
116 | 0 | vec_r = _mm_min_ps(max, vec_r); |
117 | 0 | result = _mm_mul_ps(vec_r, scale); |
118 | 0 |
|
119 | 0 | *((__m64 *)&output[0]) = _mm_cvtps_pi32(result); |
120 | 0 | result = _mm_movehl_ps(result, result); |
121 | 0 | *((__m64 *)&output[2]) = _mm_cvtps_pi32(result); |
122 | 0 |
|
123 | 0 | dest[OUTPUT_R_INDEX] = otdata_r[output[0]]; |
124 | 0 | dest[OUTPUT_G_INDEX] = otdata_g[output[1]]; |
125 | 0 | dest[OUTPUT_B_INDEX] = otdata_b[output[2]]; |
126 | 0 |
|
127 | 0 | _mm_empty(); |
128 | 0 | } |
129 | | |
130 | | void qcms_transform_data_rgba_out_lut_sse1(qcms_transform *transform, |
131 | | unsigned char *src, |
132 | | unsigned char *dest, |
133 | | size_t length) |
134 | 0 | { |
135 | 0 | unsigned int i; |
136 | 0 | float (*mat)[4] = transform->matrix; |
137 | 0 | char input_back[32]; |
138 | 0 | /* Ensure we have a buffer that's 16 byte aligned regardless of the original |
139 | 0 | * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32)) |
140 | 0 | * because they don't work on stack variables. gcc 4.4 does do the right thing |
141 | 0 | * on x86 but that's too new for us right now. For more info: gcc bug #16660 */ |
142 | 0 | float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf); |
143 | 0 | /* share input and output locations to save having to keep the |
144 | 0 | * locations in separate registers */ |
145 | 0 | uint32_t const * output = (uint32_t*)input; |
146 | 0 |
|
147 | 0 | /* deref *transform now to avoid it in loop */ |
148 | 0 | const float *igtbl_r = transform->input_gamma_table_r; |
149 | 0 | const float *igtbl_g = transform->input_gamma_table_g; |
150 | 0 | const float *igtbl_b = transform->input_gamma_table_b; |
151 | 0 |
|
152 | 0 | /* deref *transform now to avoid it in loop */ |
153 | 0 | const uint8_t *otdata_r = &transform->output_table_r->data[0]; |
154 | 0 | const uint8_t *otdata_g = &transform->output_table_g->data[0]; |
155 | 0 | const uint8_t *otdata_b = &transform->output_table_b->data[0]; |
156 | 0 |
|
157 | 0 | /* input matrix values never change */ |
158 | 0 | const __m128 mat0 = _mm_load_ps(mat[0]); |
159 | 0 | const __m128 mat1 = _mm_load_ps(mat[1]); |
160 | 0 | const __m128 mat2 = _mm_load_ps(mat[2]); |
161 | 0 |
|
162 | 0 | /* these values don't change, either */ |
163 | 0 | const __m128 max = _mm_load_ps(clampMaxValueX4); |
164 | 0 | const __m128 min = _mm_setzero_ps(); |
165 | 0 | const __m128 scale = _mm_load_ps(floatScaleX4); |
166 | 0 |
|
167 | 0 | /* working variables */ |
168 | 0 | __m128 vec_r, vec_g, vec_b, result; |
169 | 0 | unsigned char alpha; |
170 | 0 |
|
171 | 0 | /* CYA */ |
172 | 0 | if (!length) |
173 | 0 | return; |
174 | 0 | |
175 | 0 | /* one pixel is handled outside of the loop */ |
176 | 0 | length--; |
177 | 0 |
|
178 | 0 | /* setup for transforming 1st pixel */ |
179 | 0 | vec_r = _mm_load_ss(&igtbl_r[src[0]]); |
180 | 0 | vec_g = _mm_load_ss(&igtbl_g[src[1]]); |
181 | 0 | vec_b = _mm_load_ss(&igtbl_b[src[2]]); |
182 | 0 | alpha = src[3]; |
183 | 0 | src += 4; |
184 | 0 |
|
185 | 0 | /* transform all but final pixel */ |
186 | 0 |
|
187 | 0 | for (i=0; i<length; i++) |
188 | 0 | { |
189 | 0 | /* position values from gamma tables */ |
190 | 0 | vec_r = _mm_shuffle_ps(vec_r, vec_r, 0); |
191 | 0 | vec_g = _mm_shuffle_ps(vec_g, vec_g, 0); |
192 | 0 | vec_b = _mm_shuffle_ps(vec_b, vec_b, 0); |
193 | 0 |
|
194 | 0 | /* gamma * matrix */ |
195 | 0 | vec_r = _mm_mul_ps(vec_r, mat0); |
196 | 0 | vec_g = _mm_mul_ps(vec_g, mat1); |
197 | 0 | vec_b = _mm_mul_ps(vec_b, mat2); |
198 | 0 |
|
199 | 0 | /* store alpha for this pixel; load alpha for next */ |
200 | 0 | dest[OUTPUT_A_INDEX] = alpha; |
201 | 0 | alpha = src[3]; |
202 | 0 |
|
203 | 0 | /* crunch, crunch, crunch */ |
204 | 0 | vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b)); |
205 | 0 | vec_r = _mm_max_ps(min, vec_r); |
206 | 0 | vec_r = _mm_min_ps(max, vec_r); |
207 | 0 | result = _mm_mul_ps(vec_r, scale); |
208 | 0 |
|
209 | 0 | /* store calc'd output tables indices */ |
210 | 0 | *((__m64 *)&output[0]) = _mm_cvtps_pi32(result); |
211 | 0 | result = _mm_movehl_ps(result, result); |
212 | 0 | *((__m64 *)&output[2]) = _mm_cvtps_pi32(result); |
213 | 0 |
|
214 | 0 | /* load gamma values for next loop while store completes */ |
215 | 0 | vec_r = _mm_load_ss(&igtbl_r[src[0]]); |
216 | 0 | vec_g = _mm_load_ss(&igtbl_g[src[1]]); |
217 | 0 | vec_b = _mm_load_ss(&igtbl_b[src[2]]); |
218 | 0 | src += 4; |
219 | 0 |
|
220 | 0 | /* use calc'd indices to output RGB values */ |
221 | 0 | dest[OUTPUT_R_INDEX] = otdata_r[output[0]]; |
222 | 0 | dest[OUTPUT_G_INDEX] = otdata_g[output[1]]; |
223 | 0 | dest[OUTPUT_B_INDEX] = otdata_b[output[2]]; |
224 | 0 | dest += 4; |
225 | 0 | } |
226 | 0 |
|
227 | 0 | /* handle final (maybe only) pixel */ |
228 | 0 |
|
229 | 0 | vec_r = _mm_shuffle_ps(vec_r, vec_r, 0); |
230 | 0 | vec_g = _mm_shuffle_ps(vec_g, vec_g, 0); |
231 | 0 | vec_b = _mm_shuffle_ps(vec_b, vec_b, 0); |
232 | 0 |
|
233 | 0 | vec_r = _mm_mul_ps(vec_r, mat0); |
234 | 0 | vec_g = _mm_mul_ps(vec_g, mat1); |
235 | 0 | vec_b = _mm_mul_ps(vec_b, mat2); |
236 | 0 |
|
237 | 0 | dest[OUTPUT_A_INDEX] = alpha; |
238 | 0 |
|
239 | 0 | vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b)); |
240 | 0 | vec_r = _mm_max_ps(min, vec_r); |
241 | 0 | vec_r = _mm_min_ps(max, vec_r); |
242 | 0 | result = _mm_mul_ps(vec_r, scale); |
243 | 0 |
|
244 | 0 | *((__m64 *)&output[0]) = _mm_cvtps_pi32(result); |
245 | 0 | result = _mm_movehl_ps(result, result); |
246 | 0 | *((__m64 *)&output[2]) = _mm_cvtps_pi32(result); |
247 | 0 |
|
248 | 0 | dest[OUTPUT_R_INDEX] = otdata_r[output[0]]; |
249 | 0 | dest[OUTPUT_G_INDEX] = otdata_g[output[1]]; |
250 | 0 | dest[OUTPUT_B_INDEX] = otdata_b[output[2]]; |
251 | 0 |
|
252 | 0 | _mm_empty(); |
253 | 0 | } |