/src/mpv/video/out/gpu/error_diffusion.c
Line | Count | Source |
1 | | /* |
2 | | * This file is part of mpv. |
3 | | * |
4 | | * mpv is free software; you can redistribute it and/or |
5 | | * modify it under the terms of the GNU Lesser General Public |
6 | | * License as published by the Free Software Foundation; either |
7 | | * version 2.1 of the License, or (at your option) any later version. |
8 | | * |
9 | | * mpv is distributed in the hope that it will be useful, |
10 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
12 | | * GNU Lesser General Public License for more details. |
13 | | * |
14 | | * You should have received a copy of the GNU Lesser General Public |
15 | | * License along with mpv. If not, see <http://www.gnu.org/licenses/>. |
16 | | */ |
17 | | |
18 | | #include <stdlib.h> |
19 | | |
20 | | #include "error_diffusion.h" |
21 | | |
22 | | #include "common/common.h" |
23 | | |
// Convenience wrappers around the shader cache: GLSL() appends formatted text
// via gl_sc_addf(), GLSLH() via gl_sc_haddf() (presumably body vs. header of
// the generated shader — confirm against gl_shader_cache). Both expect a
// `struct gl_shader_cache *sc` to be in scope at the expansion site.
#define GLSL(...) gl_sc_addf(sc, __VA_ARGS__)
#define GLSLH(...) gl_sc_haddf(sc, __VA_ARGS__)

27 | | // After a (y, x) -> (y, x + y * shift) mapping, find the right most column that |
28 | | // will be affected by the current column. |
29 | | static int compute_rightmost_shifted_column(const struct error_diffusion_kernel *k) |
30 | 0 | { |
31 | 0 | int ret = 0; |
32 | 0 | for (int y = 0; y <= EF_MAX_DELTA_Y; y++) { |
33 | 0 | for (int x = EF_MIN_DELTA_X; x <= EF_MAX_DELTA_X; x++) { |
34 | 0 | if (k->pattern[y][x - EF_MIN_DELTA_X] != 0) { |
35 | 0 | int shifted_x = x + y * k->shift; |
36 | | |
37 | | // The shift mapping guarantees current column (or left of it) |
38 | | // won't be affected by error diffusion. |
39 | 0 | mp_assert(shifted_x > 0); |
40 | | |
41 | 0 | ret = MPMAX(ret, shifted_x); |
42 | 0 | } |
43 | 0 | } |
44 | 0 | } |
45 | 0 | return ret; |
46 | 0 | } |
47 | | |
48 | | const struct error_diffusion_kernel *mp_find_error_diffusion_kernel(const char *name) |
49 | 1.54k | { |
50 | 1.54k | if (!name) |
51 | 0 | return NULL; |
52 | 1.54k | for (const struct error_diffusion_kernel *k = mp_error_diffusion_kernels; |
53 | 8.50k | k->name; |
54 | 8.08k | k++) { |
55 | 8.08k | if (strcmp(k->name, name) == 0) |
56 | 1.12k | return k; |
57 | 8.08k | } |
58 | 421 | return NULL; |
59 | 1.54k | } |
60 | | |
61 | | int mp_ef_compute_shared_memory_size(const struct error_diffusion_kernel *k, |
62 | | int height) |
63 | 0 | { |
64 | | // We add EF_MAX_DELTA_Y empty lines on the bottom to handle errors |
65 | | // propagated out from bottom side. |
66 | 0 | int rows = height + EF_MAX_DELTA_Y; |
67 | 0 | int shifted_columns = compute_rightmost_shifted_column(k) + 1; |
68 | | |
69 | | // The shared memory is an array of size rows*shifted_columns. Each element |
70 | | // is a single uint for three RGB component. |
71 | 0 | return rows * shifted_columns * 4; |
72 | 0 | } |
73 | | |
// Emit the GLSL for a parallel error-diffusion dithering compute pass into the
// shader cache |sc|, using kernel |k|. |tex| is the index of the input texture
// (sampled as texture%d), |width|/|height| are the image dimensions, |depth|
// is the target bit depth, and |block_size| is the workgroup size (must not
// exceed |height|). The generated shader writes the dithered result to
// `out_image` via imageStore.
void pass_error_diffusion(struct gl_shader_cache *sc,
                          const struct error_diffusion_kernel *k,
                          int tex, int width, int height, int depth, int block_size)
{
    mp_assert(block_size <= height);

    // The parallel error diffusion works by applying the shift mapping first.
    // Taking the Floyd and Steinberg algorithm for example. After applying
    // the (y, x) -> (y, x + y * shift) mapping (with shift=2), all errors are
    // propagated into the next few columns, which makes parallel processing on
    // the same column possible.
    //
    //           X    7/16                X    7/16
    //    3/16  5/16  1/16   ==>  0  0  3/16  5/16  1/16

    // Figuring out the size of rectangle containing all shifted pixels.
    // The rectangle height is not changed.
    int shifted_width = width + (height - 1) * k->shift;

    // We process all pixels from the shifted rectangles column by column, with
    // a single global work group of size |block_size|.
    // Figuring out how many block are required to process all pixels. We need
    // this explicitly to make the number of barrier() calls match.
    int blocks = (height * shifted_width + block_size - 1) / block_size;

    // If we figure out how many of the next columns will be affected while the
    // current columns is being processed. We can store errors of only a few
    // columns in the shared memory. Using a ring buffer will further save the
    // cost while iterating to next column.
    int ring_buffer_rows = height + EF_MAX_DELTA_Y;
    int ring_buffer_columns = compute_rightmost_shifted_column(k) + 1;
    int ring_buffer_size = ring_buffer_rows * ring_buffer_columns;

    // Defines the ring buffer in shared memory.
    GLSLH("shared uint err_rgb8[%d];\n", ring_buffer_size);

    // Initialize the ring buffer (all threads cooperate, striding by the
    // workgroup size).
    GLSL("for (int i = int(gl_LocalInvocationIndex); i < %d; i += %d) ",
         ring_buffer_size, block_size);
    GLSL("err_rgb8[i] = 0u;\n");

    GLSL("for (int block_id = 0; block_id < %d; ++block_id) {\n", blocks);

    // Add barrier here to have previous block all processed before starting
    // the processing of the next.
    GLSL("groupMemoryBarrier();\n");
    GLSL("barrier();\n");

    // Compute the coordinate of the pixel we are currently processing, both
    // before and after the shift mapping.
    GLSL("int id = int(gl_LocalInvocationIndex) + block_id * %d;\n", block_size);
    GLSL("int y = id %% %d, x_shifted = id / %d;\n", height, height);
    GLSL("int x = x_shifted - y * %d;\n", k->shift);

    // Proceed only if we are processing a valid pixel.
    GLSL("if (0 <= x && x < %d) {\n", width);

    // The index that the current pixel have on the ring buffer.
    GLSL("int idx = (x_shifted * %d + y) %% %d;\n", ring_buffer_rows, ring_buffer_size);

    // Fetch the current pixel.
    GLSL("vec3 pix = texelFetch(texture%d, ivec2(x, y), 0).rgb;\n", tex);

    // The dithering will quantize pixel value into multiples of 1/dither_quant.
    int dither_quant = (1 << depth) - 1;

    // We encode errors in RGB components into a single 32-bit unsigned integer.
    // The error we propagate from the current pixel is in range of
    // [-0.5 / dither_quant, 0.5 / dither_quant]. While not quite obvious, the
    // sum of all errors been propagated into a pixel is also in the same range.
    // It's possible to map errors in this range into [-127, 127], and use an
    // unsigned 8-bit integer to store it (using standard two's complement).
    // The three 8-bit unsigned integers can then be encoded into a single
    // 32-bit unsigned integer, with two 4-bit padding to prevent addition
    // operation overflows affecting other component. There are at most 12
    // addition operations on each pixel, so 4-bit padding should be enough.
    // The overflow from R component will be discarded.
    //
    // The following figure is how the encoding looks like.
    //
    //     +------------------------------------+
    //     |RRRRRRRR|0000|GGGGGGGG|0000|BBBBBBBB|
    //     +------------------------------------+
    //

    // The bitshift position for R and G component.
    int bitshift_r = 24, bitshift_g = 12;
    // The multiplier we use to map [-0.5, 0.5] to [-127, 127].
    int uint8_mul = 127 * 2;

    // Adding the error previously propagated into current pixel, and clear it
    // in the buffer. The %uu constant adds the +128 bias to each packed
    // component so the stored two's-complement bytes decode correctly below.
    GLSL("uint err_u32 = err_rgb8[idx] + %uu;\n",
         (128u << bitshift_r) | (128u << bitshift_g) | 128u);
    GLSL("pix = pix * %d.0 + vec3("
         "int((err_u32 >> %d) & 255u) - 128,"
         "int((err_u32 >> %d) & 255u) - 128,"
         "int( err_u32 & 255u) - 128"
         ") / %d.0;\n", dither_quant, bitshift_r, bitshift_g, uint8_mul);
    GLSL("err_rgb8[idx] = 0u;\n");

    // Write the dithered pixel.
    GLSL("vec3 dithered = round(pix);\n");
    GLSL("imageStore(out_image, ivec2(x, y), vec4(dithered / %d.0, 0.0));\n",
         dither_quant);

    GLSL("vec3 err_divided = (pix - dithered) * %d.0 / %d.0;\n",
         uint8_mul, k->divisor);
    GLSL("ivec3 tmp;\n");

    // Group error propagation with same weight factor together, in order to
    // reduce the number of annoying error encoding.
    for (int dividend = 1; dividend <= k->divisor; dividend++) {
        bool err_assigned = false;

        for (int y = 0; y <= EF_MAX_DELTA_Y; y++) {
            for (int x = EF_MIN_DELTA_X; x <= EF_MAX_DELTA_X; x++) {
                if (k->pattern[y][x - EF_MIN_DELTA_X] != dividend)
                    continue;

                if (!err_assigned) {
                    err_assigned = true;

                    // Encode the error scaled by this weight once; every tap
                    // with the same weight reuses the packed err_u32.
                    GLSL("tmp = ivec3(round(err_divided * %d.0));\n", dividend);

                    GLSL("err_u32 = "
                         "(uint(tmp.r & 255) << %d)|"
                         "(uint(tmp.g & 255) << %d)|"
                         " uint(tmp.b & 255);\n",
                         bitshift_r, bitshift_g);
                }

                int shifted_x = x + y * k->shift;

                // Unlike the right border, errors propagated out from left
                // border will remain in the ring buffer. This will produce
                // visible artifacts near the left border, especially for
                // shift=3 kernels.
                if (x < 0)
                    GLSL("if (x >= %d) ", -x);

                // Calculate the new position in the ring buffer to propagate
                // the error into.
                int ring_buffer_delta = shifted_x * ring_buffer_rows + y;
                GLSL("atomicAdd(err_rgb8[(idx + %d) %% %d], err_u32);\n",
                     ring_buffer_delta, ring_buffer_size);
            }
        }
    }

    GLSL("}\n"); // if (0 <= x && x < width)

    GLSL("}\n"); // block_id
}
228 | | |
// Different kernels for error diffusion.
// Each .pattern row y holds the weights for x offsets EF_MIN_DELTA_X..
// EF_MAX_DELTA_X; the actual weight is pattern value / .divisor. .shift is
// the per-row column shift used by the parallel mapping (see
// pass_error_diffusion).
// Patterns are from <https://web.archive.org/web/20181031005427/
// http://www.efg2.com/Lab/Library/ImageProcessing/DHALF.TXT>
const struct error_diffusion_kernel mp_error_diffusion_kernels[] = {
    {
        .name = "simple",
        .shift = 1,
        .pattern = {{0, 0, 0, 1, 0},
                    {0, 0, 1, 0, 0},
                    {0, 0, 0, 0, 0}},
        .divisor = 2
    },
    {
        // The "false" Floyd-Steinberg kernel
        .name = "false-fs",
        .shift = 1,
        .pattern = {{0, 0, 0, 3, 0},
                    {0, 0, 3, 2, 0},
                    {0, 0, 0, 0, 0}},
        .divisor = 8
    },
    {
        .name = "sierra-lite",
        .shift = 2,
        .pattern = {{0, 0, 0, 2, 0},
                    {0, 1, 1, 0, 0},
                    {0, 0, 0, 0, 0}},
        .divisor = 4
    },
    {
        .name = "floyd-steinberg",
        .shift = 2,
        .pattern = {{0, 0, 0, 7, 0},
                    {0, 3, 5, 1, 0},
                    {0, 0, 0, 0, 0}},
        .divisor = 16
    },
    {
        .name = "atkinson",
        .shift = 2,
        .pattern = {{0, 0, 0, 1, 1},
                    {0, 1, 1, 1, 0},
                    {0, 0, 1, 0, 0}},
        .divisor = 8
    },
    // All kernels below have shift value of 3, and probably are too heavy for
    // low end GPU.
    {
        .name = "jarvis-judice-ninke",
        .shift = 3,
        .pattern = {{0, 0, 0, 7, 5},
                    {3, 5, 7, 5, 3},
                    {1, 3, 5, 3, 1}},
        .divisor = 48
    },
    {
        .name = "stucki",
        .shift = 3,
        .pattern = {{0, 0, 0, 8, 4},
                    {2, 4, 8, 4, 2},
                    {1, 2, 4, 2, 1}},
        .divisor = 42
    },
    {
        .name = "burkes",
        .shift = 3,
        .pattern = {{0, 0, 0, 8, 4},
                    {2, 4, 8, 4, 2},
                    {0, 0, 0, 0, 0}},
        .divisor = 32
    },
    {
        .name = "sierra-3",
        .shift = 3,
        .pattern = {{0, 0, 0, 5, 3},
                    {2, 4, 5, 4, 2},
                    {0, 2, 3, 2, 0}},
        .divisor = 32
    },
    {
        .name = "sierra-2",
        .shift = 3,
        .pattern = {{0, 0, 0, 4, 3},
                    {1, 2, 3, 2, 1},
                    {0, 0, 0, 0, 0}},
        .divisor = 16
    },
    // Terminator: mp_find_error_diffusion_kernel stops at the NULL .name.
    {0}
};