/src/ghostpdl/base/gxht_thresh.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* Copyright (C) 2001-2023 Artifex Software, Inc. |
2 | | All Rights Reserved. |
3 | | |
4 | | This software is provided AS-IS with no warranty, either express or |
5 | | implied. |
6 | | |
7 | | This software is distributed under license and may not be copied, |
8 | | modified or distributed except as expressly authorized under the terms |
9 | | of the license contained in the file LICENSE in this distribution. |
10 | | |
11 | | Refer to licensing information at http://www.artifex.com or contact |
12 | | Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, |
13 | | CA 94129, USA, for further information. |
14 | | */ |
15 | | |
16 | | |
17 | | /*$Id: gxhts_thresh.c $ */ |
18 | | /* Halftone thresholding code */ |
19 | | |
20 | | #include <stdlib.h> /* abs() */ |
21 | | #include "memory_.h" |
22 | | #include "gx.h" |
23 | | #include "gxgstate.h" |
24 | | #include "gsiparam.h" |
25 | | #include "math_.h" |
26 | | #include "gxfixed.h" /* needed for gximage.h */ |
27 | | #include "gximage.h" |
28 | | #include "gxdevice.h" |
29 | | #include "gxdht.h" |
30 | | #include "gxht_thresh.h" |
31 | | #include "gzht.h" |
32 | | #include "gxdevsop.h" |
33 | | |
34 | | /* Enable the following define to perform a little extra work to stop |
35 | | * spurious valgrind errors. The code should perform perfectly even without |
36 | | * this enabled, but enabling it makes debugging much easier. |
37 | | */ |
38 | | /* #define PACIFY_VALGRIND */ |
39 | | |
40 | | #ifndef __WIN32__ |
41 | 1 | #define __align16 __attribute__((aligned(16))) |
42 | | #else |
43 | | #define __align16 __declspec(align(16)) |
44 | | #endif |
45 | 1.95M | #define fastfloor(x) (((int)(x)) - (((x)<0) && ((x) != (float)(int)(x)))) |
46 | | |
47 | | #ifdef HAVE_SSE2 |
48 | | |
49 | | #include <emmintrin.h> |
50 | | |
51 | | static const byte bitreverse[] = |
52 | | { 0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, |
53 | | 0x30, 0xB0, 0x70, 0xF0, 0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, |
54 | | 0x18, 0x98, 0x58, 0xD8, 0x38, 0xB8, 0x78, 0xF8, 0x04, 0x84, 0x44, 0xC4, |
55 | | 0x24, 0xA4, 0x64, 0xE4, 0x14, 0x94, 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4, |
56 | | 0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC, 0x1C, 0x9C, 0x5C, 0xDC, |
57 | | 0x3C, 0xBC, 0x7C, 0xFC, 0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2, 0x62, 0xE2, |
58 | | 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2, 0x0A, 0x8A, 0x4A, 0xCA, |
59 | | 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA, |
60 | | 0x06, 0x86, 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, 0x16, 0x96, 0x56, 0xD6, |
61 | | 0x36, 0xB6, 0x76, 0xF6, 0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE, |
62 | | 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, 0x7E, 0xFE, 0x01, 0x81, 0x41, 0xC1, |
63 | | 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1, 0x31, 0xB1, 0x71, 0xF1, |
64 | | 0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99, 0x59, 0xD9, |
65 | | 0x39, 0xB9, 0x79, 0xF9, 0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5, |
66 | | 0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5, 0x0D, 0x8D, 0x4D, 0xCD, |
67 | | 0x2D, 0xAD, 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD, |
68 | | 0x03, 0x83, 0x43, 0xC3, 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, |
69 | | 0x33, 0xB3, 0x73, 0xF3, 0x0B, 0x8B, 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, |
70 | | 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB, 0x07, 0x87, 0x47, 0xC7, |
71 | | 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7, 0x77, 0xF7, |
72 | | 0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF, 0x1F, 0x9F, 0x5F, 0xDF, |
73 | | 0x3F, 0xBF, 0x7F, 0xFF}; |
74 | | #endif |
75 | | |
76 | | #if RAW_HT_DUMP |
77 | | /* This is slow thresholding, byte output for debug only */ |
78 | | void |
79 | | gx_ht_threshold_row_byte(byte *contone, byte *threshold_strip, int contone_stride, |
80 | | byte *halftone, int dithered_stride, int width, |
81 | | int num_rows) |
82 | | { |
83 | | int k, j; |
84 | | byte *contone_ptr; |
85 | | byte *thresh_ptr; |
86 | | byte *halftone_ptr; |
87 | | |
88 | | /* For the moment just do a very slow compare until we get |
89 | | get this working */ |
90 | | for (j = 0; j < num_rows; j++) { |
91 | | contone_ptr = contone; |
92 | | thresh_ptr = threshold_strip + contone_stride * j; |
93 | | halftone_ptr = halftone + dithered_stride * j; |
94 | | for (k = 0; k < width; k++) { |
95 | | if (contone_ptr[k] < thresh_ptr[k]) { |
96 | | halftone_ptr[k] = 0; |
97 | | } else { |
98 | | halftone_ptr[k] = 255; |
99 | | } |
100 | | } |
101 | | } |
102 | | } |
103 | | #endif |
104 | | |
105 | | #ifndef HAVE_SSE2 |
106 | | /* A simple case for use in the landscape mode. Could probably be coded up |
107 | | faster */ |
108 | | static void |
109 | | threshold_16_bit(byte *contone_ptr, byte *thresh_ptr, byte *ht_data) |
110 | | { |
111 | | int j; |
112 | | |
113 | | for (j = 2; j > 0; j--) { |
114 | | byte h = 0; |
115 | | byte bit_init = 0x80; |
116 | | do { |
117 | | if (*contone_ptr++ < *thresh_ptr++) { |
118 | | h |= bit_init; |
119 | | } |
120 | | bit_init >>= 1; |
121 | | } while (bit_init != 0); |
122 | | *ht_data++ = h; |
123 | | } |
124 | | } |
125 | | #else |
126 | | /* Note this function has strict data alignment needs */ |
127 | | static void |
128 | | threshold_16_SSE(byte *contone_ptr, byte *thresh_ptr, byte *ht_data) |
129 | 279M | { |
130 | 279M | __m128i input1; |
131 | 279M | __m128i input2; |
132 | 279M | register int result_int; |
133 | 279M | const unsigned int mask1 = 0x80808080; |
134 | 279M | __m128i sign_fix = _mm_set_epi32(mask1, mask1, mask1, mask1); |
135 | | |
136 | | /* Load */ |
137 | 279M | input1 = _mm_load_si128((const __m128i *)contone_ptr); |
138 | 279M | input2 = _mm_load_si128((const __m128i *) thresh_ptr); |
139 | | /* Unsigned subtraction does Unsigned saturation so we |
140 | | have to use the signed operation */ |
141 | 279M | input1 = _mm_xor_si128(input1, sign_fix); |
142 | 279M | input2 = _mm_xor_si128(input2, sign_fix); |
143 | | /* Subtract the two */ |
144 | 279M | input2 = _mm_subs_epi8(input1, input2); |
145 | | /* Grab the sign mask */ |
146 | 279M | result_int = _mm_movemask_epi8(input2); |
147 | | /* bit wise reversal on 16 bit word */ |
148 | 279M | ht_data[0] = bitreverse[(result_int & 0xff)]; |
149 | 279M | ht_data[1] = bitreverse[((result_int >> 8) & 0xff)]; |
150 | 279M | } |
151 | | |
152 | | /* Not so fussy on its alignment */ |
153 | | static void |
154 | | threshold_16_SSE_unaligned(byte *contone_ptr, byte *thresh_ptr, byte *ht_data) |
155 | 1.65M | { |
156 | 1.65M | __m128i input1; |
157 | 1.65M | __m128i input2; |
158 | 1.65M | int result_int; |
159 | 1.65M | byte *sse_data; |
160 | 1.65M | const unsigned int mask1 = 0x80808080; |
161 | 1.65M | __m128i sign_fix = _mm_set_epi32(mask1, mask1, mask1, mask1); |
162 | | |
163 | 1.65M | sse_data = (byte*) &(result_int); |
164 | | /* Load */ |
165 | 1.65M | input1 = _mm_loadu_si128((const __m128i *)contone_ptr); |
166 | 1.65M | input2 = _mm_loadu_si128((const __m128i *) thresh_ptr); |
167 | | /* Unsigned subtraction does Unsigned saturation so we |
168 | | have to use the signed operation */ |
169 | 1.65M | input1 = _mm_xor_si128(input1, sign_fix); |
170 | 1.65M | input2 = _mm_xor_si128(input2, sign_fix); |
171 | | /* Subtract the two */ |
172 | 1.65M | input2 = _mm_subs_epi8(input1, input2); |
173 | | /* Grab the sign mask */ |
174 | 1.65M | result_int = _mm_movemask_epi8(input2); |
175 | | /* bit wise reversal on 16 bit word */ |
176 | 1.65M | ht_data[0] = bitreverse[sse_data[0]]; |
177 | 1.65M | ht_data[1] = bitreverse[sse_data[1]]; |
178 | 1.65M | } |
179 | | #endif |
180 | | |
181 | | /* SSE2 and non-SSE2 implememntation of thresholding a row. Subtractive case |
182 | | There is some code replication between the two of these (additive and subtractive) |
183 | | that I need to go back and determine how we can combine them without |
184 | | any performance loss. */ |
185 | | void |
186 | | gx_ht_threshold_row_bit_sub(byte *contone, byte *threshold_strip, int contone_stride, |
187 | | byte *halftone, int dithered_stride, int width, |
188 | | int num_rows, int offset_bits) |
189 | 83.1k | { |
190 | | #ifndef HAVE_SSE2 |
191 | | int k, j; |
192 | | byte *contone_ptr; |
193 | | byte *thresh_ptr; |
194 | | byte *halftone_ptr; |
195 | | byte bit_init; |
196 | | |
197 | | /* For the moment just do a very slow compare until we get |
198 | | get this working. This could use some serious optimization */ |
199 | | width -= offset_bits; |
200 | | for (j = 0; j < num_rows; j++) { |
201 | | byte h; |
202 | | contone_ptr = contone; |
203 | | thresh_ptr = threshold_strip + contone_stride * j; |
204 | | halftone_ptr = halftone + dithered_stride * j; |
205 | | /* First get the left remainder portion. Put into MSBs of first byte */ |
206 | | bit_init = 0x80; |
207 | | h = 0; |
208 | | k = offset_bits; |
209 | | if (k > 0) { |
210 | | do { |
211 | | if (*contone_ptr++ > *thresh_ptr++) { |
212 | | h |= bit_init; |
213 | | } |
214 | | bit_init >>= 1; |
215 | | if (bit_init == 0) { |
216 | | bit_init = 0x80; |
217 | | *halftone_ptr++ = h; |
218 | | h = 0; |
219 | | } |
220 | | k--; |
221 | | } while (k > 0); |
222 | | bit_init = 0x80; |
223 | | *halftone_ptr++ = h; |
224 | | h = 0; |
225 | | if (offset_bits < 8) |
226 | | *halftone_ptr++ = 0; |
227 | | } |
228 | | /* Now get the rest, which will be 16 bit aligned. */ |
229 | | k = width; |
230 | | if (k > 0) { |
231 | | do { |
232 | | if (*contone_ptr++ > *thresh_ptr++) { |
233 | | h |= bit_init; |
234 | | } |
235 | | bit_init >>= 1; |
236 | | if (bit_init == 0) { |
237 | | bit_init = 0x80; |
238 | | *halftone_ptr++ = h; |
239 | | h = 0; |
240 | | } |
241 | | k--; |
242 | | } while (k > 0); |
243 | | if (bit_init != 0x80) { |
244 | | *halftone_ptr++ = h; |
245 | | } |
246 | | if ((width & 15) < 8) |
247 | | *halftone_ptr++ = 0; |
248 | | } |
249 | | } |
250 | | #else |
251 | 83.1k | byte *contone_ptr; |
252 | 83.1k | byte *thresh_ptr; |
253 | 83.1k | byte *halftone_ptr; |
254 | 83.1k | int num_tiles = (width - offset_bits + 15)>>4; |
255 | 83.1k | int k, j; |
256 | | |
257 | 368k | for (j = 0; j < num_rows; j++) { |
258 | | /* contone and thresh_ptr are 128 bit aligned. We do need to do this in |
259 | | two steps to ensure that we pack the bits in an aligned fashion |
260 | | into halftone_ptr. */ |
261 | 285k | contone_ptr = contone; |
262 | 285k | thresh_ptr = threshold_strip + contone_stride * j; |
263 | 285k | halftone_ptr = halftone + dithered_stride * j; |
264 | 285k | if (offset_bits > 0) { |
265 | | /* Since we allowed for 16 bits in our left remainder |
266 | | we can go directly in to the destination. threshold_16_SSE |
267 | | requires 128 bit alignment. contone_ptr and thresh_ptr |
268 | | are set up so that after we move in by offset_bits elements |
269 | | then we are 128 bit aligned. */ |
270 | 89.8k | threshold_16_SSE_unaligned(thresh_ptr, contone_ptr, |
271 | 89.8k | halftone_ptr); |
272 | 89.8k | halftone_ptr += 2; |
273 | 89.8k | thresh_ptr += offset_bits; |
274 | 89.8k | contone_ptr += offset_bits; |
275 | 89.8k | } |
276 | | /* Now we should have 128 bit aligned with our input data. Iterate |
277 | | over sets of 16 going directly into our HT buffer. Sources and |
278 | | halftone_ptr buffers should be padded to allow 15 bit overrun */ |
279 | 20.0M | for (k = 0; k < num_tiles; k++) { |
280 | 19.7M | threshold_16_SSE(thresh_ptr, contone_ptr, halftone_ptr); |
281 | 19.7M | thresh_ptr += 16; |
282 | 19.7M | contone_ptr += 16; |
283 | 19.7M | halftone_ptr += 2; |
284 | 19.7M | } |
285 | 285k | } |
286 | 83.1k | #endif |
287 | 83.1k | } |
288 | | |
289 | | /* SSE2 and non-SSE2 implememntation of thresholding a row. additive case */ |
290 | | void |
291 | | gx_ht_threshold_row_bit(byte *contone, byte *threshold_strip, int contone_stride, |
292 | | byte *halftone, int dithered_stride, int width, |
293 | | int num_rows, int offset_bits) |
294 | 1.87M | { |
295 | | #ifndef HAVE_SSE2 |
296 | | int k, j; |
297 | | byte *contone_ptr; |
298 | | byte *thresh_ptr; |
299 | | byte *halftone_ptr; |
300 | | byte bit_init; |
301 | | |
302 | | /* For the moment just do a very slow compare until we get |
303 | | get this working. This could use some serious optimization */ |
304 | | width -= offset_bits; |
305 | | for (j = 0; j < num_rows; j++) { |
306 | | byte h; |
307 | | contone_ptr = contone; |
308 | | thresh_ptr = threshold_strip + contone_stride * j; |
309 | | halftone_ptr = halftone + dithered_stride * j; |
310 | | /* First get the left remainder portion. Put into MSBs of first byte */ |
311 | | bit_init = 0x80; |
312 | | h = 0; |
313 | | k = offset_bits; |
314 | | if (k > 0) { |
315 | | do { |
316 | | if (*contone_ptr++ < *thresh_ptr++) { |
317 | | h |= bit_init; |
318 | | } |
319 | | bit_init >>= 1; |
320 | | if (bit_init == 0) { |
321 | | bit_init = 0x80; |
322 | | *halftone_ptr++ = h; |
323 | | h = 0; |
324 | | } |
325 | | k--; |
326 | | } while (k > 0); |
327 | | bit_init = 0x80; |
328 | | *halftone_ptr++ = h; |
329 | | h = 0; |
330 | | if (offset_bits < 8) |
331 | | *halftone_ptr++ = 0; |
332 | | } |
333 | | /* Now get the rest, which will be 16 bit aligned. */ |
334 | | k = width; |
335 | | if (k > 0) { |
336 | | do { |
337 | | if (*contone_ptr++ < *thresh_ptr++) { |
338 | | h |= bit_init; |
339 | | } |
340 | | bit_init >>= 1; |
341 | | if (bit_init == 0) { |
342 | | bit_init = 0x80; |
343 | | *halftone_ptr++ = h; |
344 | | h = 0; |
345 | | } |
346 | | k--; |
347 | | } while (k > 0); |
348 | | if (bit_init != 0x80) { |
349 | | *halftone_ptr++ = h; |
350 | | } |
351 | | if ((width & 15) < 8) |
352 | | *halftone_ptr++ = 0; |
353 | | } |
354 | | } |
355 | | #else |
356 | 1.87M | byte *contone_ptr; |
357 | 1.87M | byte *thresh_ptr; |
358 | 1.87M | byte *halftone_ptr; |
359 | 1.87M | int num_tiles = (width - offset_bits + 15)>>4; |
360 | 1.87M | int k, j; |
361 | | |
362 | 3.99M | for (j = 0; j < num_rows; j++) { |
363 | | /* contone and thresh_ptr are 128 bit aligned. We do need to do this in |
364 | | two steps to ensure that we pack the bits in an aligned fashion |
365 | | into halftone_ptr. */ |
366 | 2.12M | contone_ptr = contone; |
367 | 2.12M | thresh_ptr = threshold_strip + contone_stride * j; |
368 | 2.12M | halftone_ptr = halftone + dithered_stride * j; |
369 | 2.12M | if (offset_bits > 0) { |
370 | | /* Since we allowed for 16 bits in our left remainder |
371 | | we can go directly in to the destination. threshold_16_SSE |
372 | | requires 128 bit alignment. contone_ptr and thresh_ptr |
373 | | are set up so that after we move in by offset_bits elements |
374 | | then we are 128 bit aligned. */ |
375 | 1.56M | threshold_16_SSE_unaligned(contone_ptr, thresh_ptr, |
376 | 1.56M | halftone_ptr); |
377 | 1.56M | halftone_ptr += 2; |
378 | 1.56M | thresh_ptr += offset_bits; |
379 | 1.56M | contone_ptr += offset_bits; |
380 | 1.56M | } |
381 | | /* Now we should have 128 bit aligned with our input data. Iterate |
382 | | over sets of 16 going directly into our HT buffer. Sources and |
383 | | halftone_ptr buffers should be padded to allow 15 bit overrun */ |
384 | 261M | for (k = 0; k < num_tiles; k++) { |
385 | 259M | threshold_16_SSE(contone_ptr, thresh_ptr, halftone_ptr); |
386 | 259M | thresh_ptr += 16; |
387 | 259M | contone_ptr += 16; |
388 | 259M | halftone_ptr += 2; |
389 | 259M | } |
390 | 2.12M | } |
391 | 1.87M | #endif |
392 | 1.87M | } |
393 | | |
394 | | /* This thresholds a buffer that is LAND_BITS wide by data_length tall. |
395 | | Subtractive case */ |
396 | | void |
397 | | gx_ht_threshold_landscape_sub(byte *contone_align, byte *thresh_align, |
398 | | ht_landscape_info_t *ht_landscape, byte *halftone, |
399 | | int data_length) |
400 | 0 | { |
401 | 0 | __align16 byte contone[LAND_BITS]; |
402 | 0 | int position_start, position, curr_position; |
403 | 0 | int *widths = &(ht_landscape->widths[0]); |
404 | 0 | int local_widths[LAND_BITS]; |
405 | 0 | int num_contone = ht_landscape->num_contones; |
406 | 0 | int k, j, w, contone_out_posit; |
407 | 0 | byte *contone_ptr, *thresh_ptr, *halftone_ptr; |
408 | 0 | #ifdef PACIFY_VALGRIND |
409 | 0 | int extra = 0; |
410 | 0 | #endif |
411 | | |
412 | | /* Work through chunks of 16. */ |
413 | | /* Data may have come in left to right or right to left. */ |
414 | 0 | if (ht_landscape->index > 0) { |
415 | 0 | position = position_start = 0; |
416 | 0 | } else { |
417 | 0 | position = position_start = ht_landscape->curr_pos + 1; |
418 | 0 | } |
419 | 0 | thresh_ptr = thresh_align; |
420 | 0 | halftone_ptr = halftone; |
421 | | /* Copy the widths to a local array, and truncate the last one (which may |
422 | | * be the first one!) if required. */ |
423 | 0 | k = 0; |
424 | 0 | for (j = 0; j < num_contone; j++) |
425 | 0 | k += (local_widths[j] = widths[position_start+j]); |
426 | 0 | if (k > LAND_BITS) { |
427 | 0 | if (ht_landscape->index > 0) { |
428 | 0 | local_widths[num_contone-1] -= k-LAND_BITS; |
429 | 0 | } else { |
430 | 0 | local_widths[0] -= k-LAND_BITS; |
431 | 0 | } |
432 | 0 | } |
433 | 0 | #ifdef PACIFY_VALGRIND |
434 | 0 | if (k < LAND_BITS) { |
435 | 0 | extra = LAND_BITS - k; |
436 | 0 | } |
437 | 0 | #endif |
438 | 0 | for (k = data_length; k > 0; k--) { /* Loop on rows */ |
439 | 0 | contone_ptr = &(contone_align[position]); /* Point us to our row start */ |
440 | 0 | curr_position = 0; /* We use this in keeping track of widths */ |
441 | 0 | contone_out_posit = 0; /* Our index out */ |
442 | 0 | for (j = num_contone; j > 0; j--) { |
443 | 0 | byte c = *contone_ptr; |
444 | | /* The microsoft compiler, cleverly spots that the following loop |
445 | | * can be replaced by a memset. Unfortunately, it can't spot that |
446 | | * the typical length values of the memset are so small that we'd |
447 | | * be better off doing it the slow way. We therefore introduce a |
448 | | * sneaky 'volatile' cast below that stops this optimisation. */ |
449 | 0 | w = local_widths[curr_position]; |
450 | 0 | do { |
451 | 0 | ((volatile byte *)contone)[contone_out_posit] = c; |
452 | 0 | contone_out_posit++; |
453 | 0 | } while (--w); |
454 | 0 | #ifdef PACIFY_VALGRIND |
455 | 0 | if (extra) |
456 | 0 | memset(contone+contone_out_posit, 0, extra); |
457 | 0 | #endif |
458 | 0 | curr_position++; /* Move us to the next position in our width array */ |
459 | 0 | contone_ptr++; /* Move us to a new location in our contone buffer */ |
460 | 0 | } |
461 | | /* Now we have our left justified and expanded contone data for |
462 | | LAND_BITS/16 sets of 16 bits. Go ahead and threshold these. */ |
463 | 0 | contone_ptr = &contone[0]; |
464 | 0 | #if LAND_BITS > 16 |
465 | 0 | j = LAND_BITS; |
466 | 0 | do { |
467 | 0 | #endif |
468 | 0 | #ifdef HAVE_SSE2 |
469 | 0 | threshold_16_SSE(thresh_ptr, contone_ptr, halftone_ptr); |
470 | | #else |
471 | | threshold_16_bit(thresh_ptr, contone_ptr, halftone_ptr); |
472 | | #endif |
473 | 0 | thresh_ptr += 16; |
474 | 0 | position += 16; |
475 | 0 | halftone_ptr += 2; |
476 | 0 | contone_ptr += 16; |
477 | 0 | #if LAND_BITS > 16 |
478 | 0 | j -= 16; |
479 | 0 | } while (j > 0); |
480 | 0 | #endif |
481 | 0 | } |
482 | 0 | } |
483 | | |
484 | | /* This thresholds a buffer that is LAND_BITS wide by data_length tall. |
485 | | Additive case. Note I could likely do some code reduction between |
486 | | the additive and subtractive cases */ |
487 | | void |
488 | | gx_ht_threshold_landscape(byte *contone_align, byte *thresh_align, |
489 | | ht_landscape_info_t *ht_landscape, byte *halftone, |
490 | | int data_length) |
491 | 1 | { |
492 | 1 | __align16 byte contone[LAND_BITS]; |
493 | 1 | int position_start, position, curr_position; |
494 | 1 | int *widths = &(ht_landscape->widths[0]); |
495 | 1 | int local_widths[LAND_BITS]; |
496 | 1 | int num_contone = ht_landscape->num_contones; |
497 | 1 | int k, j, w, contone_out_posit; |
498 | 1 | byte *contone_ptr, *thresh_ptr, *halftone_ptr; |
499 | 1 | #ifdef PACIFY_VALGRIND |
500 | 1 | int extra = 0; |
501 | 1 | #endif |
502 | | |
503 | | /* Work through chunks of 16. */ |
504 | | /* Data may have come in left to right or right to left. */ |
505 | 1 | if (ht_landscape->index > 0) { |
506 | 1 | position = position_start = 0; |
507 | 1 | } else { |
508 | 0 | position = position_start = ht_landscape->curr_pos + 1; |
509 | 0 | } |
510 | 1 | thresh_ptr = thresh_align; |
511 | 1 | halftone_ptr = halftone; |
512 | | /* Copy the widths to a local array, and truncate the last one (which may |
513 | | * be the first one!) if required. */ |
514 | 1 | k = 0; |
515 | 2 | for (j = 0; j < num_contone; j++) |
516 | 1 | k += (local_widths[j] = widths[position_start+j]); |
517 | 1 | if (k > LAND_BITS) { |
518 | 0 | if (ht_landscape->index > 0) { |
519 | 0 | local_widths[num_contone-1] -= k-LAND_BITS; |
520 | 0 | } else { |
521 | 0 | local_widths[0] -= k-LAND_BITS; |
522 | 0 | } |
523 | 0 | } |
524 | 1 | #ifdef PACIFY_VALGRIND |
525 | 1 | if (k < LAND_BITS) { |
526 | 1 | extra = LAND_BITS - k; |
527 | 1 | } |
528 | 1 | #endif |
529 | 4 | for (k = data_length; k > 0; k--) { /* Loop on rows */ |
530 | 3 | contone_ptr = &(contone_align[position]); /* Point us to our row start */ |
531 | 3 | curr_position = 0; /* We use this in keeping track of widths */ |
532 | 3 | contone_out_posit = 0; /* Our index out */ |
533 | 6 | for (j = num_contone; j > 0; j--) { |
534 | 3 | byte c = *contone_ptr; |
535 | | /* The microsoft compiler, cleverly spots that the following loop |
536 | | * can be replaced by a memset. Unfortunately, it can't spot that |
537 | | * the typical length values of the memset are so small that we'd |
538 | | * be better off doing it the slow way. We therefore introduce a |
539 | | * sneaky 'volatile' cast below that stops this optimisation. */ |
540 | 3 | w = local_widths[curr_position]; |
541 | 9 | do { |
542 | 9 | ((volatile byte *)contone)[contone_out_posit] = c; |
543 | 9 | contone_out_posit++; |
544 | 9 | } while (--w); |
545 | 3 | #ifdef PACIFY_VALGRIND |
546 | 3 | if (extra) |
547 | 3 | memset(contone+contone_out_posit, 0, extra); |
548 | 3 | #endif |
549 | 3 | curr_position++; /* Move us to the next position in our width array */ |
550 | 3 | contone_ptr++; /* Move us to a new location in our contone buffer */ |
551 | 3 | } |
552 | | /* Now we have our left justified and expanded contone data for |
553 | | LAND_BITS/16 sets of 16 bits. Go ahead and threshold these. */ |
554 | 3 | contone_ptr = &contone[0]; |
555 | 3 | #if LAND_BITS > 16 |
556 | 3 | j = LAND_BITS; |
557 | 12 | do { |
558 | 12 | #endif |
559 | 12 | #ifdef HAVE_SSE2 |
560 | 12 | threshold_16_SSE(contone_ptr, thresh_ptr, halftone_ptr); |
561 | | #else |
562 | | threshold_16_bit(contone_ptr, thresh_ptr, halftone_ptr); |
563 | | #endif |
564 | 12 | thresh_ptr += 16; |
565 | 12 | position += 16; |
566 | 12 | halftone_ptr += 2; |
567 | 12 | contone_ptr += 16; |
568 | 12 | #if LAND_BITS > 16 |
569 | 12 | j -= 16; |
570 | 12 | } while (j > 0); |
571 | 3 | #endif |
572 | 3 | } |
573 | 1 | } |
574 | | |
575 | | int |
576 | | gxht_thresh_image_init(gx_image_enum *penum) |
577 | 149k | { |
578 | 149k | int code = 0; |
579 | 149k | fixed ox; |
580 | 149k | int temp; |
581 | 149k | int dev_width, max_height; |
582 | 149k | int spp_out; |
583 | 149k | int k; |
584 | 149k | gx_ht_order *d_order; |
585 | 149k | gx_dda_fixed dda_ht; |
586 | | |
587 | 149k | if (gx_device_must_halftone(penum->dev)) { |
588 | 149k | if (penum->pgs != NULL && penum->pgs->dev_ht[HT_OBJTYPE_DEFAULT] != NULL) { |
589 | 149k | gx_device_halftone *pdht = gx_select_dev_ht(penum->pgs); |
590 | | |
591 | 300k | for (k = 0; k < pdht->num_comp; k++) { |
592 | 150k | d_order = &(pdht->components[k].corder); |
593 | 150k | code = gx_ht_construct_threshold(d_order, penum->dev, |
594 | 150k | penum->pgs, k); |
595 | 150k | if (code < 0 ) { |
596 | 0 | return gs_rethrow(code, "threshold creation failed"); |
597 | 0 | } |
598 | 150k | } |
599 | 149k | } else { |
600 | 0 | return -1; |
601 | 0 | } |
602 | 149k | } |
603 | 149k | spp_out = penum->dev->color_info.num_components; |
604 | | /* Precompute values needed for rasterizing. */ |
605 | 149k | penum->dxx = float2fixed(penum->matrix.xx + fixed2float(fixed_epsilon) / 2); |
606 | | /* If the image is landscaped then we want to maintain a buffer |
607 | | that is sufficiently large so that we can hold a byte |
608 | | of halftoned data along the column. This way we avoid doing |
609 | | multiple writes into the same position over and over. |
610 | | The size of the buffer we need depends upon the bitdepth of |
611 | | the output device, the number of device coloranants and the |
612 | | number of colorants in the source space. Note we will |
613 | | need to eventually consider multi-level halftone case |
614 | | here too. For now, to make use of the SSE2 stuff, we would |
615 | | like to have a multiple of 16 bytes of data to process at a time. |
616 | | So we will collect the columns of data in a buffer that is LAND_BITS |
617 | | wide. We will also keep track of the widths of each column. When |
618 | | the total width count reaches LAND_BITS, we will create our |
619 | | threshold array and apply it. We may have one column that is |
620 | | buffered between calls in this case. Also if a call is made |
621 | | with h=0 we will flush the buffer as we are at the end of the |
622 | | data. */ |
623 | 149k | if (penum->posture == image_landscape) { |
624 | 3 | int col_length = fixed2int_var_rounded(any_abs(penum->x_extent.y)); |
625 | 3 | dda_ht = penum->dda.pixel0.y; |
626 | 3 | if (penum->dxx > 0) |
627 | 3 | dda_translate(dda_ht, -fixed_epsilon); /* to match rounding in non-fast code */ |
628 | | |
629 | 3 | ox = dda_current(penum->dda.pixel0.x); |
630 | 3 | temp = gxht_dda_length(&dda_ht, penum->rect.w); |
631 | 3 | if (col_length < temp) |
632 | 0 | col_length = temp; /* choose max to make sure line_size is large enough */ |
633 | 3 | temp = (col_length + LAND_BITS)/LAND_BITS; /* round up to allow for offset bits */ |
634 | | /* bitmap_raster() expects the width in bits, hence "* 8" */ |
635 | 3 | penum->line_size = bitmap_raster((temp * LAND_BITS) * 8); /* The stride */ |
636 | | /* Now we need at most LAND_BITS of these */ |
637 | 3 | penum->line = gs_alloc_bytes(penum->memory, |
638 | 3 | LAND_BITS * penum->line_size * spp_out + 16, |
639 | 3 | "gxht_thresh"); |
640 | | /* Same with this. However, we only need one plane here */ |
641 | 3 | penum->thresh_buffer = gs_alloc_bytes(penum->memory, |
642 | 3 | penum->line_size * LAND_BITS + 16, |
643 | 3 | "gxht_thresh"); |
644 | | /* That maps into (LAND_BITS/8) bytes of Halftone data */ |
645 | 3 | penum->ht_buffer = |
646 | 3 | gs_alloc_bytes(penum->memory, |
647 | 3 | penum->line_size * (LAND_BITS>>3) * spp_out, |
648 | 3 | "gxht_thresh"); |
649 | 3 | penum->ht_plane_height = penum->line_size; |
650 | 3 | penum->ht_stride = penum->line_size; |
651 | 3 | if (penum->line == NULL || penum->thresh_buffer == NULL |
652 | 3 | || penum->ht_buffer == NULL) |
653 | 0 | return -1; |
654 | 3 | penum->ht_landscape.count = 0; |
655 | 3 | penum->ht_landscape.num_contones = 0; |
656 | 3 | if (penum->y_extent.x < 0) { |
657 | | /* Going right to left */ |
658 | 1 | penum->ht_landscape.curr_pos = LAND_BITS-1; |
659 | 1 | penum->ht_landscape.index = -1; |
660 | 2 | } else { |
661 | | /* Going left to right */ |
662 | 2 | penum->ht_landscape.curr_pos = 0; |
663 | 2 | penum->ht_landscape.index = 1; |
664 | 2 | } |
665 | 3 | if (penum->x_extent.y < 0) { |
666 | 0 | penum->ht_landscape.flipy = true; |
667 | 0 | penum->ht_landscape.y_pos = |
668 | 0 | fixed2int_pixround_perfect(dda_current(penum->dda.pixel0.y) + penum->x_extent.y); |
669 | 3 | } else { |
670 | 3 | penum->ht_landscape.flipy = false; |
671 | 3 | penum->ht_landscape.y_pos = |
672 | 3 | fixed2int_pixround_perfect(dda_current(penum->dda.pixel0.y)); |
673 | 3 | } |
674 | 3 | memset(&(penum->ht_landscape.widths[0]), 0, sizeof(int)*LAND_BITS); |
675 | 3 | penum->ht_landscape.offset_set = false; |
676 | 3 | penum->ht_offset_bits = 0; /* Will get set in call to render */ |
677 | 3 | if (code >= 0) { |
678 | 3 | #if defined(DEBUG) || defined(PACIFY_VALGRIND) |
679 | 3 | memset(penum->line, 0, LAND_BITS * penum->line_size * spp_out + 16); |
680 | 3 | memset(penum->ht_buffer, 0, penum->line_size * (LAND_BITS>>3) * spp_out); |
681 | 3 | memset(penum->thresh_buffer, 0, LAND_BITS * penum->line_size + 16); |
682 | 3 | #endif |
683 | 3 | } |
684 | 149k | } else { |
685 | | /* In the portrait case we allocate a single line buffer |
686 | | in device width, a threshold buffer of the same size |
687 | | and possibly wider and the buffer for the halftoned |
688 | | bits. We have to do a bit of work to enable 16 byte |
689 | | boundary after an offset to ensure that we can make use |
690 | | of the SSE2 operations for thresholding. We do the |
691 | | allocations now to avoid doing them with every line */ |
692 | 149k | dda_ht = penum->dda.pixel0.x; |
693 | 149k | if (penum->dxx > 0) |
694 | 149k | dda_translate(dda_ht, -fixed_epsilon); /* to match rounding in non-fast code */ |
695 | | /* Initialize the ht_landscape stuff to zero */ |
696 | 149k | memset(&(penum->ht_landscape), 0, sizeof(ht_landscape_info_t)); |
697 | 149k | ox = dda_current(dda_ht); |
698 | 149k | dev_width = gxht_dda_length(&dda_ht, penum->rect.w); |
699 | | /* Get the bit position so that we can do a copy_mono for |
700 | | the left remainder and then 16 bit aligned copies for the |
701 | | rest. The right remainder will be OK as it will land in |
702 | | the MSBit positions. Note the #define chunk bits16 in |
703 | | gdevm1.c. Allow also for a 15 sample over run. |
704 | | */ |
705 | 149k | penum->ht_offset_bits = (-fixed2int_var_rounded(ox)) & (bitmap_raster(1) - 1); |
706 | 149k | if (penum->ht_offset_bits > 0) { |
707 | 107k | penum->ht_stride = bitmap_raster((7 + (dev_width + 4)) + (ARCH_SIZEOF_LONG * 8)); |
708 | 107k | } else { |
709 | 42.4k | penum->ht_stride = bitmap_raster((7 + (dev_width + 2)) + (ARCH_SIZEOF_LONG * 8)); |
710 | 42.4k | } |
711 | | /* We want to figure out the maximum height that we may |
712 | | have in taking a single source row and going to device |
713 | | space */ |
714 | 149k | max_height = (int) ceil(fixed2float(any_abs(penum->dst_height)) / |
715 | 149k | (float) penum->Height); |
716 | 149k | if (max_height <= 0) |
717 | 6 | return -1; /* shouldn't happen, but check so we don't div by zero */ |
718 | 149k | if (penum->ht_stride * spp_out > max_int / max_height) |
719 | 0 | return -1; /* overflow */ |
720 | | |
721 | 149k | penum->ht_buffer = |
722 | 149k | gs_alloc_bytes(penum->memory, |
723 | 149k | (size_t)penum->ht_stride * max_height * spp_out, |
724 | 149k | "gxht_thresh"); |
725 | 149k | penum->ht_plane_height = penum->ht_stride * max_height; |
726 | | /* We want to have 128 bit alignement for our contone and |
727 | | threshold strips so that we can use SSE operations |
728 | | in the threshold operation. Add in a minor buffer and offset |
729 | | to ensure this. If gs_alloc_bytes provides at least 16 |
730 | | bit alignment so we may need to move 14 bytes. However, the |
731 | | HT process is split in two operations. One that involves |
732 | | the HT of a left remainder and the rest which ensures that |
733 | | we pack in the HT data in the bits with no skew for a fast |
734 | | copy into the gdevm1 device (16 bit copies). So, we |
735 | | need to account for those pixels which occur first and which |
736 | | are NOT aligned for the contone buffer. After we offset |
737 | | by this remainder portion we should be 128 bit aligned. |
738 | | Also allow a 15 sample over run during the execution. */ |
739 | 149k | temp = (int) ceil((float) ((dev_width + 15.0) + 15.0)/16.0); |
740 | 149k | penum->line_size = bitmap_raster(temp * 16 * 8); /* The stride */ |
741 | 149k | if (penum->line_size > max_int / max_height) { |
742 | 0 | gs_free_object(penum->memory, penum->ht_buffer, "gxht_thresh"); |
743 | 0 | penum->ht_buffer = NULL; |
744 | 0 | return -1; /* thresh_buffer size overflow */ |
745 | 0 | } |
746 | 149k | penum->line = gs_alloc_bytes(penum->memory, penum->line_size * spp_out, |
747 | 149k | "gxht_thresh"); |
748 | 149k | penum->thresh_buffer = gs_alloc_bytes(penum->memory, |
749 | 149k | (size_t)penum->line_size * max_height, |
750 | 149k | "gxht_thresh"); |
751 | 149k | if (penum->line == NULL || penum->thresh_buffer == NULL || |
752 | 149k | penum->ht_buffer == NULL) { |
753 | 0 | return -1; |
754 | 149k | } else { |
755 | 149k | #if defined(DEBUG) || defined(PACIFY_VALGRIND) |
756 | 149k | memset(penum->line, 0, penum->line_size * spp_out); |
757 | 149k | memset(penum->ht_buffer, 0, penum->ht_stride * max_height * spp_out); |
758 | 149k | memset(penum->thresh_buffer, 0, penum->line_size * max_height); |
759 | 149k | #endif |
760 | 149k | } |
761 | 149k | } |
762 | 149k | return code; |
763 | 149k | } |
764 | | |
765 | | static void |
766 | | fill_threshold_buffer(byte *dest_strip, byte *src, byte *src_strip, int src_width, |
767 | | int left_offset, int left_width, int num_tiles, |
768 | | int right_width) |
769 | 2.41M | { |
770 | 2.41M | byte *ptr_out_temp = dest_strip; |
771 | 2.41M | int ii; |
772 | | |
773 | | /* Make sure we don't try and read before the start of the threshold array. This can happen |
774 | | * if we drop to the beginning of the array, AND we have a negative left_offset. If we do |
775 | | * have a negative left_offset this represents an area we won't actually be using, but we need |
776 | | * to move along the threshold array until we get to the point where we copy data we will use. |
777 | | * So lets simply avoid reading before the start of the data. We can leave the destination |
778 | | * buffer uninitialised because we won't be reading from that area. Bug #706795 but the ASAN |
779 | | * error occurs on a number of input files in the test suite. |
780 | | */ |
781 | 2.41M | if (src_strip + left_offset < src) { |
782 | 2.07k | int under = src - (src_strip + left_offset); |
783 | 2.07k | left_offset += under; |
784 | 2.07k | ptr_out_temp += under; |
785 | 2.07k | left_width -= under; |
786 | 2.07k | if (left_width < 0) |
787 | 0 | left_width = 0; |
788 | 2.07k | } |
789 | | /* Left part */ |
790 | 2.41M | memcpy(ptr_out_temp, src_strip + left_offset, left_width); |
791 | 2.41M | ptr_out_temp += left_width; |
792 | | /* Now the full parts */ |
793 | 732M | for (ii = 0; ii < num_tiles; ii++){ |
794 | 729M | memcpy(ptr_out_temp, src_strip, src_width); |
795 | 729M | ptr_out_temp += src_width; |
796 | 729M | } |
797 | | /* Now the remainder */ |
798 | 2.41M | memcpy(ptr_out_temp, src_strip, right_width); |
799 | 2.41M | #ifdef PACIFY_VALGRIND |
800 | 2.41M | ptr_out_temp += right_width; |
801 | 2.41M | ii = (dest_strip-ptr_out_temp) % (LAND_BITS-1); |
802 | 2.41M | if (ii > 0) |
803 | 0 | memset(ptr_out_temp, 0, ii); |
804 | 2.41M | #endif |
805 | 2.41M | } |
806 | | /* This only moves the data but does not do a reset of the variables. Used |
807 | | for case where we have multiple bands of data (e.g. CMYK output) */ |
808 | | static void |
809 | | move_landscape_buffer(ht_landscape_info_t *ht_landscape, byte *contone_align, |
810 | | int data_length) |
811 | 0 | { |
812 | 0 | int k; |
813 | 0 | int position_curr, position_new; |
814 | |
|
815 | 0 | if (ht_landscape->index < 0) { |
816 | | /* Moving right to left, move column to far right */ |
817 | 0 | position_curr = ht_landscape->curr_pos + 1; |
818 | 0 | position_new = LAND_BITS-1; |
819 | 0 | } else { |
820 | | /* Moving left to right, move column to far left */ |
821 | 0 | position_curr = ht_landscape->curr_pos - 1; |
822 | 0 | position_new = 0; |
823 | 0 | } |
824 | 0 | if (position_curr != position_new) { |
825 | 0 | for (k = 0; k < data_length; k++) { |
826 | 0 | contone_align[position_new] = contone_align[position_curr]; |
827 | 0 | position_curr += LAND_BITS; |
828 | 0 | position_new += LAND_BITS; |
829 | 0 | } |
830 | 0 | } |
831 | 0 | } |
832 | | |
833 | | |
834 | | /* If we are in here, we had data left over. Move it to the proper position |
835 | | and get ht_landscape_info_t set properly */ |
836 | | static void |
837 | | reset_landscape_buffer(ht_landscape_info_t *ht_landscape, byte *contone_align, |
838 | | int data_length, int num_used) |
839 | 0 | { |
840 | 0 | int delta; |
841 | 0 | int curr_x_pos = ht_landscape->xstart; |
842 | |
|
843 | 0 | if (ht_landscape->index < 0) { |
844 | | /* Moving right to left, move column to far right */ |
845 | 0 | delta = ht_landscape->count - num_used; |
846 | 0 | memset(&(ht_landscape->widths[0]), 0, sizeof(int)*LAND_BITS); |
847 | 0 | ht_landscape->widths[LAND_BITS-1] = delta; |
848 | 0 | ht_landscape->curr_pos = LAND_BITS-2; |
849 | 0 | ht_landscape->xstart = curr_x_pos - num_used; |
850 | 0 | } else { |
851 | | /* Moving left to right, move column to far left */ |
852 | 0 | delta = ht_landscape->count - num_used; |
853 | 0 | memset(&(ht_landscape->widths[0]), 0, sizeof(int)*LAND_BITS); |
854 | 0 | ht_landscape->widths[0] = delta; |
855 | 0 | ht_landscape->curr_pos = 1; |
856 | 0 | ht_landscape->xstart = curr_x_pos + num_used; |
857 | 0 | } |
858 | 0 | ht_landscape->count = delta; |
859 | 0 | ht_landscape->num_contones = 1; |
860 | 0 | } |
861 | | |
862 | | /* This performs a thresholding operation on multiple planes of data and |
863 | | stores the bits into a planar buffer which can then be used for |
864 | | copy_planes */ |
865 | | int |
866 | | gxht_thresh_planes(gx_image_enum *penum, fixed xrun, |
867 | | int dest_width, int dest_height, |
868 | | byte *thresh_align, gx_device * dev, int offset_contone[], |
869 | | int contone_stride) |
870 | 1.89M | { |
871 | 1.89M | int thresh_width, thresh_height, dx; |
872 | 1.89M | int left_rem_end, left_width, vdi; |
873 | 1.89M | int num_full_tiles, right_tile_width; |
874 | 1.89M | int k, jj, dy, j; |
875 | 1.89M | byte *thresh_tile; |
876 | 1.89M | int position; |
877 | 1.89M | bool replicate_tile; |
878 | 1.89M | image_posture posture = penum->posture; |
879 | 1.89M | const int y_pos = penum->yci; |
880 | 1.89M | int width = 0; /* Init to silence compiler warnings */ |
881 | 1.89M | byte *ptr_out, *row_ptr, *ptr_out_temp; |
882 | 1.89M | byte *threshold; |
883 | 1.89M | int init_tile, in_row_offset, ii, num_tiles, tile_remainder; |
884 | 1.89M | int offset_bits = penum->ht_offset_bits; |
885 | 1.89M | byte *halftone; |
886 | 1.89M | int dithered_stride = penum->ht_stride; |
887 | 1.89M | bool is_planar_dev = dev->num_planar_planes; |
888 | 1.89M | gx_color_index dev_white = gx_device_white(dev); |
889 | 1.89M | gx_color_index dev_black = gx_device_black(dev); |
890 | 1.89M | int spp_out = dev->color_info.num_components; |
891 | 1.89M | byte *contone_align = NULL; /* Init to silence compiler warnings */ |
892 | 1.89M | gx_device_halftone *pdht = gx_select_dev_ht(penum->pgs); |
893 | | |
894 | | /* Go ahead and fill the threshold line buffer with tiled threshold values. |
895 | | First just grab the row or column that we are going to tile with and |
896 | | then do memcpy into the buffer */ |
897 | | |
898 | | /* Figure out the tile steps. Left offset, Number of tiles, Right offset. */ |
899 | 1.89M | switch (posture) { |
900 | 1.89M | case image_portrait: |
901 | 1.89M | vdi = penum->hci; |
902 | | /* Iterate over the vdi and fill up our threshold buffer. We |
903 | | also need to loop across the planes of data */ |
904 | 3.84M | for (j = 0; j < spp_out; j++) { |
905 | 1.95M | bool threshold_inverted = pdht->components[j].corder.threshold_inverted; |
906 | | |
907 | 1.95M | thresh_width = pdht->components[j].corder.width; |
908 | 1.95M | thresh_height = pdht->components[j].corder.full_height; |
909 | 1.95M | halftone = penum->ht_buffer + j * vdi * dithered_stride; |
910 | | /* Compute the tiling positions with dest_width */ |
911 | 1.95M | dx = (fixed2int_var_rounded(xrun) + penum->pgs->screen_phase[0].x) % thresh_width; |
912 | | /* Left remainder part */ |
913 | 1.95M | left_rem_end = min(dx + dest_width, thresh_width); |
914 | | /* The left width of our tile part */ |
915 | 1.95M | left_width = left_rem_end - dx; |
916 | | /* Now the middle part */ |
917 | 1.95M | num_full_tiles = |
918 | 1.95M | (int)fastfloor((dest_width - left_width)/ (float) thresh_width); |
919 | | /* Now the right part */ |
920 | 1.95M | right_tile_width = dest_width - num_full_tiles * thresh_width - |
921 | 1.95M | left_width; |
922 | | /* Get the proper threshold for the colorant count */ |
923 | 1.95M | threshold = pdht->components[j].corder.threshold; |
924 | 1.95M | if (threshold == NULL) |
925 | 0 | return_error(gs_error_unregistered); |
926 | | /* Point to the proper contone data */ |
927 | 1.95M | contone_align = penum->line + contone_stride * j + |
928 | 1.95M | offset_contone[j]; |
929 | 4.36M | for (k = 0; k < vdi; k++) { |
930 | | /* Get a pointer to our tile row */ |
931 | 2.41M | dy = (penum->yci + k - |
932 | 2.41M | penum->pgs->screen_phase[0].y) % thresh_height; |
933 | 2.41M | if (dy < 0) |
934 | 29 | dy += thresh_height; |
935 | 2.41M | thresh_tile = threshold + thresh_width * dy; |
936 | | /* Fill the buffer, can be multiple rows. Make sure |
937 | | to update with stride */ |
938 | 2.41M | position = contone_stride * k; |
939 | | /* Tile into the 128 bit aligned threshold strip */ |
940 | 2.41M | fill_threshold_buffer(&(thresh_align[position]), threshold, |
941 | 2.41M | thresh_tile, thresh_width, dx, left_width, |
942 | 2.41M | num_full_tiles, right_tile_width); |
943 | 2.41M | } |
944 | | /* Apply the threshold operation */ |
945 | 1.95M | if (offset_bits > dest_width) |
946 | 10.4k | offset_bits = dest_width; |
947 | | |
948 | 1.95M | if (threshold_inverted || |
949 | 1.95M | (dev->color_info.polarity == GX_CINFO_POLARITY_SUBTRACTIVE && is_planar_dev)) { |
950 | 83.1k | gx_ht_threshold_row_bit_sub(contone_align, thresh_align, contone_stride, |
951 | 83.1k | halftone, dithered_stride, dest_width, vdi, |
952 | 83.1k | offset_bits); |
953 | 1.87M | } else { |
954 | 1.87M | gx_ht_threshold_row_bit(contone_align, thresh_align, contone_stride, |
955 | 1.87M | halftone, dithered_stride, dest_width, vdi, |
956 | 1.87M | offset_bits); |
957 | 1.87M | } |
958 | 1.95M | } |
959 | | /* FIXME: An improvement here would be to generate the initial |
960 | | * offset_bits at the correct offset within the byte so that they |
961 | | * align with the remainder of the line. This would mean not |
962 | | * always packing them into the first offset_bits (in MSB order) |
963 | | * of our 16 bit word, but rather into the last offset_bits |
964 | | * (in MSB order) (except when the entire run is small!). |
965 | | * |
966 | | * This would enable us to do just one aligned copy_mono call for |
967 | | * the entire scanline. */ |
968 | | /* Now do the copy mono or copy plane operation */ |
969 | | /* First the left remainder bits */ |
970 | 1.89M | if (offset_bits > 0) { |
971 | 1.38M | int x_pos = fixed2int_var_rounded(xrun); |
972 | 1.38M | if (!is_planar_dev) { |
973 | 1.37M | (*dev_proc(dev, copy_mono)) (dev, penum->ht_buffer, 0, dithered_stride, |
974 | 1.37M | gx_no_bitmap_id, x_pos, y_pos, |
975 | 1.37M | offset_bits, vdi, dev_white, |
976 | 1.37M | dev_black); |
977 | 1.37M | } else { |
978 | 11.7k | (*dev_proc(dev, copy_planes)) (dev, penum->ht_buffer, 0, dithered_stride, |
979 | 11.7k | gx_no_bitmap_id, x_pos, y_pos, |
980 | 11.7k | offset_bits, vdi, vdi); |
981 | 11.7k | } |
982 | 1.38M | } |
983 | 1.89M | if ((dest_width - offset_bits) > 0 ) { |
984 | | /* Now the primary aligned bytes */ |
985 | 1.88M | int curr_width = dest_width - offset_bits; |
986 | 1.88M | int x_pos = fixed2int_var_rounded(xrun) + offset_bits; |
987 | | /* FIXME: This assumes the allowed offset_bits will always be <= 16 */ |
988 | 1.88M | int xoffs = offset_bits > 0 ? 16 : 0; |
989 | | |
990 | 1.88M | if (!is_planar_dev) { |
991 | 1.86M | (*dev_proc(dev, copy_mono)) (dev, penum->ht_buffer, xoffs, dithered_stride, |
992 | 1.86M | gx_no_bitmap_id, x_pos, y_pos, |
993 | 1.86M | curr_width, vdi, dev_white, |
994 | 1.86M | dev_black); |
995 | 1.86M | } else { |
996 | 20.6k | (*dev_proc(dev, copy_planes)) (dev, penum->ht_buffer, xoffs, dithered_stride, |
997 | 20.6k | gx_no_bitmap_id, x_pos, y_pos, |
998 | 20.6k | curr_width, vdi, vdi); |
999 | 20.6k | } |
1000 | 1.88M | } |
1001 | | |
1002 | 1.89M | break; |
1003 | 2 | case image_landscape: |
1004 | | /* Go ahead and paint the chunk if we have LAND_BITS values or a |
1005 | | * partial to get us in sync with the 1 bit devices 16 bit |
1006 | | * positions. */ |
1007 | 2 | vdi = penum->wci; |
1008 | | /* Now do the haftoning into our buffer. We basically check |
1009 | | first if we have enough data or are all done */ |
1010 | 3 | while ( (penum->ht_landscape.count >= LAND_BITS || |
1011 | 3 | ((penum->ht_landscape.count >= offset_bits) && |
1012 | 3 | penum->ht_landscape.offset_set))) { |
1013 | | /* Go ahead and 2D tile in the threshold buffer at this time */ |
1014 | | /* Always work the tiling from the upper left corner of our |
1015 | | LAND_BITS columns */ |
1016 | 2 | for (j = 0; j < spp_out; j++) { |
1017 | 1 | halftone = penum->ht_buffer + |
1018 | 1 | j * penum->ht_plane_height * (LAND_BITS>>3); |
1019 | 1 | thresh_width = pdht->components[j].corder.width; |
1020 | 1 | thresh_height = |
1021 | 1 | pdht->components[j].corder.full_height; |
1022 | | /* Get the proper threshold for the colorant count */ |
1023 | 1 | threshold = pdht->components[j].corder.threshold; |
1024 | 1 | if (threshold == NULL) |
1025 | 0 | return_error(gs_error_unregistered); |
1026 | | /* Point to the proper contone data */ |
1027 | 1 | contone_align = penum->line + offset_contone[j] + |
1028 | 1 | LAND_BITS * j * contone_stride; |
1029 | 1 | if (penum->ht_landscape.offset_set) { |
1030 | 1 | width = offset_bits; |
1031 | 1 | } else { |
1032 | 0 | width = LAND_BITS; |
1033 | 0 | } |
1034 | 1 | if (penum->y_extent.x < 0) { |
1035 | 0 | dx = penum->ht_landscape.xstart - width + 1; |
1036 | 1 | } else { |
1037 | 1 | dx = penum->ht_landscape.xstart; |
1038 | 1 | } |
1039 | 1 | dx = (dx + penum->pgs->screen_phase[0].x) % thresh_width; |
1040 | 1 | if (dx < 0) |
1041 | 0 | dx += thresh_width; |
1042 | 1 | dy = (penum->ht_landscape.y_pos - |
1043 | 1 | penum->pgs->screen_phase[0].y) % thresh_height; |
1044 | 1 | if (dy < 0) |
1045 | 0 | dy += thresh_height; |
1046 | | /* Left remainder part */ |
1047 | 1 | left_rem_end = min(dx + LAND_BITS, thresh_width); |
1048 | 1 | left_width = left_rem_end - dx; |
1049 | | /* Now the middle part */ |
1050 | 1 | num_full_tiles = (LAND_BITS - left_width) / thresh_width; |
1051 | | /* Now the right part */ |
1052 | 1 | right_tile_width = |
1053 | 1 | LAND_BITS - num_full_tiles * thresh_width - left_width; |
1054 | | /* Now loop over the y stuff */ |
1055 | 1 | ptr_out = thresh_align; |
1056 | | /* Do this in three parts. We do a top part, followed by |
1057 | | larger mem copies followed by a bottom partial. After |
1058 | | a slower initial fill we are able to do larger faster |
1059 | | expansions */ |
1060 | 1 | if (dest_height <= 2 * thresh_height) { |
1061 | 1 | init_tile = dest_height; |
1062 | 1 | replicate_tile = false; |
1063 | 1 | } else { |
1064 | 0 | init_tile = thresh_height; |
1065 | 0 | replicate_tile = true; |
1066 | 0 | } |
1067 | 4 | for (jj = 0; jj < init_tile; jj++) { |
1068 | 3 | in_row_offset = (jj + dy) % thresh_height; |
1069 | 3 | row_ptr = threshold + in_row_offset * thresh_width; |
1070 | 3 | ptr_out_temp = ptr_out; |
1071 | | /* Left part */ |
1072 | 3 | memcpy(ptr_out_temp, row_ptr + dx, left_width); |
1073 | 3 | ptr_out_temp += left_width; |
1074 | | /* Now the full tiles */ |
1075 | 30 | for (ii = 0; ii < num_full_tiles; ii++) { |
1076 | 27 | memcpy(ptr_out_temp, row_ptr, thresh_width); |
1077 | 27 | ptr_out_temp += thresh_width; |
1078 | 27 | } |
1079 | | /* Now the remainder */ |
1080 | 3 | memcpy(ptr_out_temp, row_ptr, right_tile_width); |
1081 | 3 | ptr_out += LAND_BITS; |
1082 | 3 | } |
1083 | 1 | if (replicate_tile) { |
1084 | | /* Find out how many we need to copy */ |
1085 | 0 | num_tiles = |
1086 | 0 | (int)fastfloor((float) (dest_height - thresh_height)/ (float) thresh_height); |
1087 | 0 | tile_remainder = dest_height - (num_tiles + 1) * thresh_height; |
1088 | 0 | for (jj = 0; jj < num_tiles; jj ++) { |
1089 | 0 | memcpy(ptr_out, thresh_align, LAND_BITS * thresh_height); |
1090 | 0 | ptr_out += LAND_BITS * thresh_height; |
1091 | 0 | } |
1092 | | /* Now fill in the remainder */ |
1093 | 0 | memcpy(ptr_out, thresh_align, LAND_BITS * tile_remainder); |
1094 | 0 | } |
1095 | | /* Apply the threshold operation */ |
1096 | 1 | if (dev->color_info.polarity == GX_CINFO_POLARITY_SUBTRACTIVE |
1097 | 1 | && is_planar_dev) { |
1098 | 0 | gx_ht_threshold_landscape_sub(contone_align, thresh_align, |
1099 | 0 | &(penum->ht_landscape), halftone, dest_height); |
1100 | 1 | } else { |
1101 | 1 | gx_ht_threshold_landscape(contone_align, thresh_align, |
1102 | 1 | &(penum->ht_landscape), halftone, dest_height); |
1103 | 1 | } |
1104 | | /* We may have a line left over that has to be maintained |
1105 | | due to line replication in the resolution conversion. */ |
1106 | 1 | if (width != penum->ht_landscape.count) { |
1107 | | /* move the line do not reset the stuff */ |
1108 | 0 | move_landscape_buffer(&(penum->ht_landscape), |
1109 | 0 | contone_align, dest_height); |
1110 | 0 | } |
1111 | 1 | } |
1112 | | /* Perform the copy mono */ |
1113 | 1 | if (penum->ht_landscape.index < 0) { |
1114 | 0 | if (!is_planar_dev) { |
1115 | 0 | (*dev_proc(dev, copy_mono)) |
1116 | 0 | (dev, penum->ht_buffer, 0, LAND_BITS>>3, |
1117 | 0 | gx_no_bitmap_id, |
1118 | 0 | penum->ht_landscape.xstart - width + 1, |
1119 | 0 | penum->ht_landscape.y_pos, |
1120 | 0 | width, dest_height, |
1121 | 0 | dev_white, dev_black); |
1122 | 0 | } else { |
1123 | 0 | (*dev_proc(dev, copy_planes)) |
1124 | 0 | (dev, penum->ht_buffer, 0, LAND_BITS>>3, |
1125 | 0 | gx_no_bitmap_id, |
1126 | 0 | penum->ht_landscape.xstart - width + 1, |
1127 | 0 | penum->ht_landscape.y_pos, |
1128 | 0 | width, dest_height, |
1129 | 0 | penum->ht_plane_height); |
1130 | 0 | } |
1131 | 1 | } else { |
1132 | 1 | if (!is_planar_dev) { |
1133 | 1 | (*dev_proc(dev, copy_mono)) (dev, penum->ht_buffer, 0, LAND_BITS>>3, |
1134 | 1 | gx_no_bitmap_id, |
1135 | 1 | penum->ht_landscape.xstart, |
1136 | 1 | penum->ht_landscape.y_pos, |
1137 | 1 | width, dest_height, |
1138 | 1 | dev_white, dev_black); |
1139 | 1 | } else { |
1140 | 0 | (*dev_proc(dev, copy_planes)) (dev, penum->ht_buffer, 0, LAND_BITS>>3, |
1141 | 0 | gx_no_bitmap_id, |
1142 | 0 | penum->ht_landscape.xstart, |
1143 | 0 | penum->ht_landscape.y_pos, |
1144 | 0 | width, dest_height, |
1145 | 0 | penum->ht_plane_height); |
1146 | 0 | } |
1147 | 1 | } |
1148 | 1 | penum->ht_landscape.offset_set = false; |
1149 | 1 | if (width != penum->ht_landscape.count) { |
1150 | 0 | reset_landscape_buffer(&(penum->ht_landscape), |
1151 | 0 | contone_align, dest_height, |
1152 | 0 | width); |
1153 | 1 | } else { |
1154 | | /* Reset the whole buffer */ |
1155 | 1 | penum->ht_landscape.count = 0; |
1156 | 1 | if (penum->ht_landscape.index < 0) { |
1157 | | /* Going right to left */ |
1158 | 0 | penum->ht_landscape.curr_pos = LAND_BITS-1; |
1159 | 1 | } else { |
1160 | | /* Going left to right */ |
1161 | 1 | penum->ht_landscape.curr_pos = 0; |
1162 | 1 | } |
1163 | 1 | penum->ht_landscape.num_contones = 0; |
1164 | 1 | memset(&(penum->ht_landscape.widths[0]), 0, sizeof(int)*LAND_BITS); |
1165 | 1 | } |
1166 | 1 | } |
1167 | 2 | break; |
1168 | 2 | default: |
1169 | 0 | return gs_rethrow(-1, "Invalid orientation for thresholding"); |
1170 | 1.89M | } |
1171 | 1.89M | return 0; |
1172 | 1.89M | } |
1173 | | |
1174 | | int gxht_dda_length(gx_dda_fixed *dda, int src_size) |
1175 | 2.04M | { |
1176 | 2.04M | gx_dda_fixed d = (*dda); |
1177 | 2.04M | dda_advance(d, src_size); |
1178 | 2.04M | return abs(fixed2int_var_rounded(dda_current(d)) - fixed2int_var_rounded(dda_current(*dda))); |
1179 | 2.04M | } |