/src/aom/av1/encoder/cnn.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2019, Alliance for Open Media. All rights reserved |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
10 | | */ |
11 | | |
12 | | #include <assert.h> |
13 | | #include <math.h> |
14 | | |
15 | | #include "aom_dsp/aom_dsp_common.h" |
16 | | #include "av1/common/av1_common_int.h" |
17 | | #include "av1/encoder/cnn.h" |
18 | | |
// Clamp index 'a' into the valid range [0, hi - 1] (replicate-padding
// semantics). NOTE: macro arguments may be evaluated more than once; do not
// pass expressions with side effects.
#define CLAMPINDEX(a, hi) ((a) < 0 ? 0 : ((a) >= (hi) ? ((hi)-1) : (a)))
20 | | |
// Work unit handed to each worker thread when convolving a layer in
// parallel; each worker computes a strided subset of output channels.
typedef struct {
  const float **input;                   // Per-channel input planes.
  int in_width;                          // Input width in samples.
  int in_height;                         // Input height in samples.
  int in_stride;                         // Row stride of each input plane.
  const CNN_LAYER_CONFIG *layer_config;  // Layer weights and geometry.
  float **output;                        // Per-channel output planes.
  int out_stride;                        // Row stride of each output plane.
  int start_idx;                         // First output channel for this worker.
  int th_step;                           // Channel stride between workers.
} CONVOLVE_OPS;
32 | | |
// Pointwise activation applied to each output sample.
typedef float (*activation_fn)(float);
34 | | |
// Softsign activation: x / (1 + |x|), mapping the reals onto (-1, 1).
// Use the float literal 1.0f so the sum stays in single precision instead
// of promoting to double and truncating back with a cast.
static float softsign(float x) { return x / (fabsf(x) + 1.0f); }
36 | | |
// Rectified linear unit: negative inputs are clamped to zero.
static float relu(float x) {
  if (x < 0) return 0;
  return x;
}
38 | | |
// Pass-through activation used when a layer has no nonlinearity.
static float identity(float x) {
  return x;
}
40 | | |
// A multi-channel 2D float buffer. When 'allocsize' is nonzero the tensor
// owns one contiguous allocation pointed to by buf[0], with buf[c] aliasing
// channel c inside it (see realloc_tensor). When 'allocsize' is zero the
// buf[] pointers are borrowed (see assign_tensor) and must not be freed.
typedef struct {
  int allocsize;              // Floats allocated; 0 => non-owning view.
  int channels;               // Number of channel planes in use.
  int width, height, stride;  // Plane geometry; stride is in floats.
  float *buf[CNN_MAX_CHANNELS];  // Per-channel plane pointers.
} TENSOR;
47 | | |
48 | 0 | static void init_tensor(TENSOR *tensor) { memset(tensor, 0, sizeof(*tensor)); } |
49 | | |
50 | 0 | static void free_tensor(TENSOR *tensor) { |
51 | 0 | if (tensor->allocsize) { |
52 | 0 | aom_free(tensor->buf[0]); |
53 | 0 | tensor->buf[0] = NULL; |
54 | 0 | tensor->allocsize = 0; |
55 | 0 | } |
56 | 0 | } |
57 | | |
58 | | static void realloc_tensor(TENSOR *tensor, int channels, int width, |
59 | 0 | int height) { |
60 | 0 | const int newallocsize = channels * width * height; |
61 | 0 | if (tensor->allocsize < newallocsize) { |
62 | 0 | free_tensor(tensor); |
63 | 0 | tensor->buf[0] = |
64 | 0 | (float *)aom_malloc(sizeof(*tensor->buf[0]) * newallocsize); |
65 | 0 | tensor->allocsize = newallocsize; |
66 | 0 | } |
67 | 0 | tensor->width = width; |
68 | 0 | tensor->height = height; |
69 | 0 | tensor->stride = width; |
70 | 0 | tensor->channels = channels; |
71 | 0 | for (int c = 1; c < channels; ++c) |
72 | 0 | tensor->buf[c] = &tensor->buf[0][c * width * height]; |
73 | 0 | } |
74 | | |
75 | | static void copy_tensor(const TENSOR *src, int copy_channels, int dst_offset, |
76 | 0 | TENSOR *dst) { |
77 | 0 | assert(src->width == dst->width); |
78 | 0 | assert(src->height == dst->height); |
79 | 0 | assert(copy_channels <= src->channels); |
80 | 0 | if (src->stride == dst->width && dst->stride == dst->width) { |
81 | 0 | for (int c = 0; c < copy_channels; ++c) { |
82 | 0 | memcpy(dst->buf[dst_offset + c], src->buf[c], |
83 | 0 | sizeof(*dst->buf[0]) * src->width * src->height); |
84 | 0 | } |
85 | 0 | } else { |
86 | 0 | for (int c = 0; c < copy_channels; ++c) { |
87 | 0 | for (int r = 0; r < dst->height; ++r) { |
88 | 0 | memcpy(&dst->buf[dst_offset + c][r * dst->stride], |
89 | 0 | &src->buf[c][r * src->stride], |
90 | 0 | dst->width * sizeof(*dst->buf[c])); |
91 | 0 | } |
92 | 0 | } |
93 | 0 | } |
94 | 0 | } |
95 | | |
96 | | static void assign_tensor(TENSOR *tensor, float *buf[CNN_MAX_CHANNELS], |
97 | 0 | int channels, int width, int height, int stride) { |
98 | 0 | tensor->allocsize = 0; |
99 | 0 | tensor->channels = channels; |
100 | 0 | tensor->width = width; |
101 | 0 | tensor->height = height; |
102 | 0 | tensor->stride = stride; |
103 | 0 | if (buf) { |
104 | 0 | for (int c = 0; c < channels; ++c) tensor->buf[c] = buf[c]; |
105 | 0 | } else { |
106 | 0 | for (int c = 0; c < channels; ++c) tensor->buf[c] = NULL; |
107 | 0 | } |
108 | 0 | } |
109 | | |
110 | 0 | static void swap_tensor(TENSOR *t1, TENSOR *t2) { |
111 | 0 | TENSOR t = *t1; |
112 | 0 | *t1 = *t2; |
113 | 0 | *t2 = t; |
114 | 0 | } |
115 | | |
116 | | // The concatenated tensor goes into dst with first the channels in |
117 | | // original dst followed by the channels in the src |
118 | 0 | static void concat_tensor(const TENSOR *src, TENSOR *dst) { |
119 | 0 | assert(src->width == dst->width); |
120 | 0 | assert(src->height == dst->height); |
121 | |
|
122 | 0 | const int dst_channels = dst->channels; |
123 | 0 | const int channels = dst->channels + src->channels; |
124 | 0 | const int newallocsize = channels * dst->width * dst->height; |
125 | 0 | if (dst->allocsize < newallocsize) { |
126 | 0 | TENSOR t; |
127 | 0 | init_tensor(&t); |
128 | | // allocate new buffers and copy first the dst channels |
129 | 0 | realloc_tensor(&t, channels, dst->width, dst->height); |
130 | 0 | copy_tensor(dst, dst->channels, 0, &t); |
131 | | // Swap the tensors and free the old buffers |
132 | 0 | swap_tensor(dst, &t); |
133 | 0 | free_tensor(&t); |
134 | 0 | } |
135 | 0 | for (int c = 1; c < channels; ++c) |
136 | 0 | dst->buf[c] = &dst->buf[0][c * dst->width * dst->height]; |
137 | | // Copy the channels in src after the first dst_channels channels. |
138 | 0 | copy_tensor(src, src->channels, dst_channels, dst); |
139 | 0 | } |
140 | | |
141 | 0 | int check_tensor_equal_dims(TENSOR *t1, TENSOR *t2) { |
142 | 0 | return (t1->width == t2->width && t1->height == t2->height); |
143 | 0 | } |
144 | | |
145 | 0 | int check_tensor_equal_size(TENSOR *t1, TENSOR *t2) { |
146 | 0 | return (t1->channels == t2->channels && t1->width == t2->width && |
147 | 0 | t1->height == t2->height); |
148 | 0 | } |
149 | | |
150 | | void av1_find_cnn_layer_output_size(int in_width, int in_height, |
151 | | const CNN_LAYER_CONFIG *layer_config, |
152 | 0 | int *out_width, int *out_height) { |
153 | 0 | if (!layer_config->deconvolve) { |
154 | 0 | switch (layer_config->pad) { |
155 | 0 | case PADDING_SAME_ZERO: |
156 | 0 | case PADDING_SAME_REPLICATE: |
157 | 0 | *out_width = (in_width + layer_config->skip_width - 1) / |
158 | 0 | layer_config->skip_width; |
159 | 0 | *out_height = (in_height + layer_config->skip_height - 1) / |
160 | 0 | layer_config->skip_height; |
161 | 0 | break; |
162 | 0 | case PADDING_VALID: |
163 | 0 | *out_width = |
164 | 0 | (in_width - layer_config->filter_width + layer_config->skip_width) / |
165 | 0 | layer_config->skip_width; |
166 | 0 | *out_height = (in_height - layer_config->filter_height + |
167 | 0 | layer_config->skip_height) / |
168 | 0 | layer_config->skip_height; |
169 | 0 | break; |
170 | 0 | default: assert(0 && "Unknown padding type"); |
171 | 0 | } |
172 | 0 | } else { |
173 | 0 | switch (layer_config->pad) { |
174 | 0 | case PADDING_SAME_ZERO: |
175 | 0 | case PADDING_SAME_REPLICATE: |
176 | 0 | *out_width = in_width * layer_config->skip_width; |
177 | 0 | *out_height = in_height * layer_config->skip_height; |
178 | 0 | break; |
179 | 0 | case PADDING_VALID: |
180 | 0 | *out_width = (in_width - 1) * layer_config->skip_width + |
181 | 0 | layer_config->filter_width; |
182 | 0 | *out_height = (in_height - 1) * layer_config->skip_height + |
183 | 0 | layer_config->filter_height; |
184 | 0 | break; |
185 | 0 | default: assert(0 && "Unknown padding type"); |
186 | 0 | } |
187 | 0 | } |
188 | 0 | } |
189 | | |
// Compute, for each branch, the number of channels its tensor holds after
// this layer runs, accounting for channel copies into other branches
// (branch_copy_type / input_to_branches) and channel concatenation
// (branches_to_combine). channels_per_branch[] is updated in place.
void find_cnn_out_channels(const CNN_LAYER_CONFIG *layer_config,
                           int channels_per_branch[]) {
  int branch = layer_config->branch;
  const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;
  for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
    if ((branch_config->input_to_branches & (1 << b)) && b != branch) {
      // Branch b receives a copy from this layer; the channel count depends
      // on where in the layer's pipeline the copy is taken.
      if (layer_config->branch_copy_type == BRANCH_INPUT) {
        channels_per_branch[b] = layer_config->in_channels;
      } else if (layer_config->branch_copy_type == BRANCH_OUTPUT) {
        channels_per_branch[b] = layer_config->out_channels;
      } else if (layer_config->branch_copy_type == BRANCH_COMBINED) {
        // Copy is taken after combining, so include the channels
        // concatenated in from the other branches as well.
        channels_per_branch[b] = layer_config->out_channels;
        for (int c = 0; c < CNN_MAX_BRANCHES; ++c) {
          if ((branch_config->branches_to_combine & (1 << c)) && c != branch) {
            assert(channels_per_branch[c] > 0);
            channels_per_branch[b] += channels_per_branch[c];
          }
        }
      }
    }
  }
  // The active branch itself ends with its own output channels plus any
  // channels combined in from other branches.
  channels_per_branch[branch] = layer_config->out_channels;
  for (int c = 0; c < CNN_MAX_BRANCHES; ++c) {
    if ((branch_config->branches_to_combine & (1 << c)) && c != branch) {
      assert(channels_per_branch[c] > 0);
      channels_per_branch[branch] += channels_per_branch[c];
    }
  }
}
219 | | |
#if CONFIG_DEBUG
// Returns 1 iff at least one layer is flagged as producing an output
// (output_num != -1). Used only to validate configs in debug builds.
static INLINE int cnn_has_at_least_one_output(const CNN_CONFIG *cnn_config) {
  const CNN_LAYER_CONFIG *layer_configs = cnn_config->layer_config;
  for (int idx = 0; idx < cnn_config->num_layers; idx++) {
    if (layer_configs[idx].output_num != -1) return 1;
  }
  return 0;
}
#endif
233 | | |
// Walk the layer list and compute, for every declared output layer
// (output_num != -1), the width/height/channel count it will produce for a
// given input frame size. The out_* arrays are indexed by output_num.
void av1_find_cnn_output_size(int in_width, int in_height,
                              const CNN_CONFIG *cnn_config, int *out_width,
                              int *out_height, int *out_channels) {
  int channels_per_branch[CNN_MAX_BRANCHES] = { 0 };
  int i_width[CNN_MAX_BRANCHES] = { 0 };
  int i_height[CNN_MAX_BRANCHES] = { 0 };
  // Branch 0 starts from the (optionally border-extended) input frame.
  i_width[0] = in_width + cnn_config->ext_width * 2;
  i_height[0] = in_height + cnn_config->ext_height * 2;

#if CONFIG_DEBUG
  assert(cnn_has_at_least_one_output(cnn_config));
#endif

  for (int i = 0; i < cnn_config->num_layers; ++i) {
    const CNN_LAYER_CONFIG *layer_config = &cnn_config->layer_config[i];
    const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;
    const int branch = layer_config->branch;
    int o_width = 0, o_height = 0;

    // A BRANCH_INPUT copy propagates the pre-layer dimensions to the
    // destination branches before this layer's own resize is applied.
    if (layer_config->branch_copy_type == BRANCH_INPUT) {
      for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
        if ((branch_config->input_to_branches & (1 << b)) && b != branch) {
          assert(i_width[branch] > 0 && i_height[branch] > 0);
          i_width[b] = i_width[branch];
          i_height[b] = i_height[branch];
        }
      }
    }

    av1_find_cnn_layer_output_size(i_width[branch], i_height[branch],
                                   layer_config, &o_width, &o_height);
    i_width[branch] = o_width;
    i_height[branch] = o_height;

    // A BRANCH_OUTPUT copy propagates the post-layer dimensions instead.
    if (layer_config->branch_copy_type == BRANCH_OUTPUT) {
      for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
        if ((branch_config->input_to_branches & (1 << b)) && b != branch) {
          i_width[b] = o_width;
          i_height[b] = o_height;
        }
      }
    }

    find_cnn_out_channels(layer_config, channels_per_branch);

    const int output_num = layer_config->output_num;
    if (output_num != -1) {  // Current layer is an output layer
      out_width[output_num] = o_width;
      out_height[output_num] = o_height;
      out_channels[output_num] = channels_per_branch[layer_config->branch];
    }
  }
}
287 | | |
288 | 0 | activation_fn get_activation(ACTIVATION layer_activation) { |
289 | 0 | switch (layer_activation) { |
290 | 0 | case NONE: return identity; |
291 | 0 | case RELU: return relu; |
292 | 0 | case SOFTSIGN: return softsign; |
293 | 0 | case SIGMOID: |
294 | 0 | assert(0 && "Sigmoid has not been supported in CNN."); // TO DO |
295 | 0 | return NULL; |
296 | 0 | default: assert(0 && "Unknown activation type"); return NULL; |
297 | 0 | } |
298 | 0 | } |
299 | | |
// Compute the offset of the first filter application along one dimension so
// that the SAME-padded, strided convolution output is centered on the input.
// 'mod' accounts for how the input size divides the stride; the result is
// capped at the half filter width so the filter center stays in range.
static INLINE int get_start_shift_convolve(int width, int filt_width,
                                           int stride) {
  const int mod = (width % stride);
  const int filt_off = (filt_width - 1) / 2;
  const int dif = (mod ? mod - 1 : stride - 1);
  return AOMMIN((dif + (filt_width % 2)) / 2, filt_off);
}
307 | | |
// Element-wise accumulate: output[c] += add[c] for every channel plane.
// Both tensors share the same geometry and stride.
void av1_cnn_add_c(float **output, int channels, int width, int height,
                   int stride, const float **add) {
  for (int c = 0; c < channels; ++c) {
    float *const out_plane = output[c];
    const float *const add_plane = add[c];
    for (int i = 0; i < height; ++i) {
      for (int j = 0; j < width; ++j) {
        out_plane[i * stride + j] += add_plane[i * stride + j];
      }
    }
  }
}
316 | | |
317 | | void av1_cnn_activate_c(float **output, int channels, int width, int height, |
318 | 0 | int stride, ACTIVATION layer_activation) { |
319 | 0 | activation_fn activation = get_activation(layer_activation); |
320 | 0 | for (int c = 0; c < channels; ++c) { |
321 | 0 | for (int i = 0; i < height; ++i) |
322 | 0 | for (int j = 0; j < width; ++j) |
323 | 0 | output[c][i * stride + j] = activation(output[c][i * stride + j]); |
324 | 0 | } |
325 | 0 | } |
326 | | |
// Copy the layer's active tensor into the output tensor of every branch
// selected in input_to_branches (except the layer's own branch), resizing
// each destination as needed, so it can serve as that branch's first input.
static void copy_active_tensor_to_branches(const TENSOR *layer_active_tensor,
                                           const CNN_LAYER_CONFIG *layer_config,
                                           int branch, TENSOR branch_output[]) {
  const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;
  for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
    if ((branch_config->input_to_branches & (1 << b)) && b != branch) {
      // Copy layer's active tensor to output tensor of branch b if set in
      // mask. The output becomes the input of the first layer of the branch
      // because the layer of the branch is not the first layer.
      // channels_to_copy == 0 (or negative) means "copy all channels".
      int copy_channels = branch_config->channels_to_copy > 0
                              ? branch_config->channels_to_copy
                              : layer_active_tensor->channels;
      realloc_tensor(&branch_output[b], copy_channels,
                     layer_active_tensor->width, layer_active_tensor->height);
      copy_tensor(layer_active_tensor, copy_channels, 0, &branch_output[b]);
    }
  }
}
345 | | |
// CNNConvolve specific to maxpool set as 1, either skip_width or skip_height
// greater than 1 and padding equal to PADDING_SAME_ZERO.
static void convolve_maxpool_padding_zero(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
    const int cstep, const int filter_width_half,
    const int filter_height_half) {
  for (int i = 0; i < layer_config->out_channels; ++i) {
    for (int h = 0, u = 0; h < in_height; h += layer_config->skip_height, ++u) {
      for (int w = 0, v = 0; w < in_width; w += layer_config->skip_width, ++v) {
        // Run the convolution at every position (hh, ww) inside the pooling
        // window and max-pool the results into output[i][u, v].
        for (int hh = h; hh < AOMMIN(in_height, h + layer_config->skip_height);
             ++hh) {
          for (int ww = w; ww < AOMMIN(in_width, w + layer_config->skip_width);
               ++ww) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                const int ii = hh + l - filter_height_half;
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int jj = ww + m - filter_width_half;
                  // Zero padding: out-of-image taps contribute nothing.
                  if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width)
                    continue;
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            const float a = sum;
            // First window position initializes the output sample; later
            // positions max against it.
            if (h == hh && w == ww)
              output[i][u * out_stride + v] = a;
            else
              output[i][u * out_stride + v] =
                  AOMMAX(output[i][u * out_stride + v], a);
          }
        }
      }
    }
  }
}
387 | | |
// CNNConvolve specific to maxpool set as 1, either skip_width or skip_height
// greater than 1 and padding equal to PADDING_SAME_REPLICATE.
static void convolve_maxpool_padding_replicate(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
    const int cstep, const int filter_width_half,
    const int filter_height_half) {
  for (int i = 0; i < layer_config->out_channels; ++i) {
    for (int h = 0, u = 0; h < in_height; h += layer_config->skip_height, ++u) {
      for (int w = 0, v = 0; w < in_width; w += layer_config->skip_width, ++v) {
        // Convolve at every sample inside the pooling window and max-pool
        // the results into output[i][u, v].
        for (int hh = h; hh < AOMMIN(in_height, h + layer_config->skip_height);
             ++hh) {
          for (int ww = w; ww < AOMMIN(in_width, w + layer_config->skip_width);
               ++ww) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                // Replicate padding: clamp source coordinates to the image
                // border instead of skipping out-of-range taps.
                const int ii =
                    CLAMPINDEX(hh + l - filter_height_half, in_height);
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int jj =
                      CLAMPINDEX(ww + m - filter_width_half, in_width);
                  assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            const float a = sum;
            // First window position initializes; later ones max against it.
            if (h == hh && w == ww)
              output[i][u * out_stride + v] = a;
            else
              output[i][u * out_stride + v] =
                  AOMMAX(output[i][u * out_stride + v], a);
          }
        }
      }
    }
  }
}
430 | | |
// CNNConvolve specific to maxpool set as 1, either skip_width or skip_height
// greater than 1 and padding equal to PADDING_VALID.
static void convolve_maxpool_padding_valid(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
    const int cstep) {
  for (int i = 0; i < layer_config->out_channels; ++i) {
    // VALID padding: only positions where the whole filter fits inside the
    // image are evaluated.
    for (int h = 0, u = 0; h < in_height - layer_config->filter_height + 1;
         h += layer_config->skip_height, ++u) {
      for (int w = 0, v = 0; w < in_width - layer_config->filter_width + 1;
           w += layer_config->skip_width, ++v) {
        // Convolve at each sample inside the pooling window; max-pool the
        // results into output[i][u, v].
        for (int hh = h; hh < AOMMIN(in_height, h + layer_config->skip_height);
             ++hh) {
          for (int ww = w; ww < AOMMIN(in_width, w + layer_config->skip_width);
               ++ww) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                const int ii = hh + l;
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int jj = ww + m;
                  assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            const float a = sum;
            // First window position initializes; later ones max against it.
            if (h == hh && w == ww)
              output[i][u * out_stride + v] = a;
            else
              output[i][u * out_stride + v] =
                  AOMMAX(output[i][u * out_stride + v], a);
          }
        }
      }
    }
  }
}
472 | | |
// CNNConvolve specific to maxpool set as 0 with filter_height and filter_width
// equal to 1.
static void convolve_element_wise(const float **input, int in_width,
                                  int in_height, int in_stride,
                                  const CNN_LAYER_CONFIG *const layer_config,
                                  float **output, int out_stride, int start_idx,
                                  int step) {
  const int start_h = get_start_shift_convolve(
      in_height, layer_config->filter_height, layer_config->skip_height);
  const int start_w =
      get_start_shift_convolve(in_width, layer_config->filter_width,
                               layer_config->skip_width) +
      start_idx * layer_config->skip_width;
  // Multi-threading: this call handles every 'step'-th output column,
  // starting at column start_idx (step <= 1 means all columns).
  const int out_w_step = AOMMAX(step, 1);
  const int in_w_step = layer_config->skip_width * out_w_step;
  for (int i = 0; i < layer_config->out_channels; ++i) {
    for (int h = start_h, u = 0; h < in_height;
         h += layer_config->skip_height, ++u) {
      const int in_h = h * in_stride;
      const int out_h = u * out_stride + start_idx;
      for (int w = start_w, out_index = out_h; w < in_width;
           w += in_w_step, out_index += out_w_step) {
        float sum = layer_config->bias[i];
        // 1x1 filter: each output sample is a bias plus a weighted sum over
        // the input channels at the same spatial position.
        for (int k = 0; k < layer_config->in_channels; ++k) {
          sum += layer_config->weights[k * layer_config->out_channels + i] *
                 input[k][in_h + w];
        }
        output[i][out_index] = sum;
      }
    }
  }
}
505 | | |
// CNNConvolve specific to maxpool set as 0 and padding equal to
// PADDING_SAME_ZERO.
static void convolve_no_maxpool_padding_zero(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
    int start_idx, const int cstep, const int filter_width_half,
    const int filter_height_half, const int ii_shift, const int jj_shift,
    const int channel_step) {
  const int start_h = get_start_shift_convolve(
      in_height, layer_config->filter_height, layer_config->skip_height);
  const int start_w = get_start_shift_convolve(
      in_width, layer_config->filter_width, layer_config->skip_width);
  const int end_ii_shift = filter_height_half + 1;
  const int end_jj_shift = filter_width_half + 1;
  // *_filter_margin stores the number of pixels along a dimension in the
  // intersection of the complement of the image in the extended image
  // and the filter.
  const int top_filter_margin = layer_config->filter_width * ii_shift;
  const int right_filter_margin = end_jj_shift - in_width;
  // Output channels are interleaved across workers: this call handles
  // channels start_idx, start_idx + channel_step, ...
  for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
    for (int h = start_h, u = 0; h < in_height;
         h += layer_config->skip_height, ++u) {
      const int out_h = u * out_stride;
      // top_cstep skips the weights of filter rows lying entirely above the
      // image; zero-padded taps contribute nothing to the sum.
      const int top_cstep =
          AOMMAX(0, top_filter_margin - h * layer_config->filter_width) *
              cstep +
          i;
      const int start_ii = AOMMAX(0, h - ii_shift);
      const int end_ii = AOMMIN(in_height, h + end_ii_shift);
      for (int w = start_w, out_index = out_h; w < in_width;
           w += layer_config->skip_width, ++out_index) {
        // left/right_cstep skip the weights of filter columns that fall in
        // the zero-padded margins left/right of the image.
        const int left_cstep = AOMMAX(0, jj_shift - w) * cstep;
        const int right_cstep = AOMMAX(0, right_filter_margin + w) * cstep;
        const int start_jj = AOMMAX(0, w - jj_shift);
        const int end_jj = AOMMIN(in_width, w + end_jj_shift);
        float sum = layer_config->bias[i];
        for (int k = 0; k < layer_config->in_channels; ++k) {
          int off = k * layer_config->out_channels + top_cstep;
          for (int ii = start_ii; ii < end_ii; ++ii) {
            off += left_cstep;
            for (int jj = start_jj; jj < end_jj; ++jj, off += cstep) {
              sum += layer_config->weights[off] * input[k][ii * in_stride + jj];
            }
            off += right_cstep;
          }
        }
        output[i][out_index] = sum;
      }
    }
  }
}
557 | | |
// CNNConvolve specific to maxpool set as 0 and padding equal to
// PADDING_SAME_REPLICATE.
static void convolve_no_maxpool_padding_replicate(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
    int start_idx, const int cstep, const int ii_shift, const int jj_shift,
    const int channel_step) {
  // h and w are shifted to an offset coordinate system to reduce in-loop
  // computation.
  const int start_h =
      get_start_shift_convolve(in_height, layer_config->filter_height,
                               layer_config->skip_height) -
      ii_shift;
  const int start_w =
      get_start_shift_convolve(in_width, layer_config->filter_width,
                               layer_config->skip_width) -
      jj_shift;
  const int end_h = in_height - ii_shift;
  const int end_w = in_width - jj_shift;
  // This call handles channels start_idx, start_idx + channel_step, ...
  for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
    for (int h = start_h, u = 0; h < end_h;
         h += layer_config->skip_height, ++u) {
      const int out_h = u * out_stride;
      const int upper_ii_index = layer_config->filter_height + h;
      for (int w = start_w, out_index = out_h; w < end_w;
           w += layer_config->skip_width, ++out_index) {
        const int upper_jj_index = layer_config->filter_width + w;
        float sum = layer_config->bias[i];
        for (int k = 0; k < layer_config->in_channels; ++k) {
          int off = k * layer_config->out_channels + i;
          for (int ii = h; ii < upper_ii_index; ++ii) {
            // Replicate padding: clamp coordinates to the image border.
            const int clamped_ii = CLAMPINDEX(ii, in_height);
            for (int jj = w; jj < upper_jj_index; ++jj) {
              const int clamped_jj = CLAMPINDEX(jj, in_width);
              assert(clamped_ii >= 0 && clamped_ii < in_height &&
                     clamped_jj >= 0 && clamped_jj < in_width);
              sum += layer_config->weights[off] *
                     input[k][clamped_ii * in_stride + clamped_jj];
              off += cstep;
            }
          }
        }
        output[i][out_index] = sum;
      }
    }
  }
}
605 | | |
// CNNConvolve specific to maxpool set as 0 and padding equal to
// PADDING_VALID.
void av1_cnn_convolve_no_maxpool_padding_valid_c(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride,
    int start_idx, int cstep, int channel_step) {
  assert((layer_config->skip_height == 1 && layer_config->skip_width == 1) ||
         !layer_config->maxpool);
  assert(layer_config->filter_height > 1 || layer_config->filter_width > 1);
  assert(layer_config->pad == PADDING_VALID);
  // This call handles channels start_idx, start_idx + channel_step, ...
  for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
    // VALID padding: only positions where the whole filter fits inside the
    // image are evaluated.
    for (int h = 0, u = 0; h < in_height - layer_config->filter_height + 1;
         h += layer_config->skip_height, ++u) {
      const int out_h = u * out_stride;
      const int upper_ii_index = layer_config->filter_height + h;
      for (int w = 0, out_index = out_h;
           w < in_width - layer_config->filter_width + 1;
           w += layer_config->skip_width, ++out_index) {
        const int upper_jj_index = layer_config->filter_width + w;
        float sum = layer_config->bias[i];
        for (int k = 0; k < layer_config->in_channels; ++k) {
          int off = k * layer_config->out_channels + i;
          for (int ii = h; ii < upper_ii_index; ++ii) {
            for (int jj = w; jj < upper_jj_index; ++jj) {
              assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
              sum += layer_config->weights[off] * input[k][ii * in_stride + jj];
              off += cstep;
            }
          }
        }
        output[i][out_index] = sum;
      }
    }
  }
}
641 | | |
// Dispatch a (non-deconvolve) convolution to the specialized implementation
// matching the layer's maxpool flag, filter size and padding mode.
// start_idx/step select the subset of output channels (or output columns,
// for the 1x1 element-wise case) this call computes, used by the
// multi-threaded path; step <= 1 means "compute everything".
static void av1_cnn_convolve(const float **input, int in_width, int in_height,
                             int in_stride,
                             const CNN_LAYER_CONFIG *layer_config,
                             float **output, int out_stride, int start_idx,
                             int step) {
  assert(!layer_config->deconvolve);
  // cstep is the distance, in weight-array elements, between consecutive
  // taps of one filter (the weight layout interleaves channel pairs).
  const int cstep = layer_config->in_channels * layer_config->out_channels;
  const int filter_height_half = layer_config->filter_height >> 1;
  const int filter_width_half = layer_config->filter_width >> 1;
  const int channel_step = AOMMAX(step, 1);

  if (layer_config->maxpool &&
      (layer_config->skip_height > 1 || layer_config->skip_width > 1)) {
    switch (layer_config->pad) {
      case PADDING_SAME_ZERO:
        convolve_maxpool_padding_zero(input, in_width, in_height, in_stride,
                                      layer_config, output, out_stride, cstep,
                                      filter_width_half, filter_height_half);
        break;
      case PADDING_SAME_REPLICATE:
        convolve_maxpool_padding_replicate(
            input, in_width, in_height, in_stride, layer_config, output,
            out_stride, cstep, filter_width_half, filter_height_half);
        break;
      case PADDING_VALID:
        convolve_maxpool_padding_valid(input, in_width, in_height, in_stride,
                                       layer_config, output, out_stride, cstep);
        break;
      default: assert(0 && "Unknown padding type");
    }
  } else {
    // Results in element-wise matrix multiplication.
    if (layer_config->filter_height == 1 && layer_config->filter_width == 1) {
      convolve_element_wise(input, in_width, in_height, in_stride, layer_config,
                            output, out_stride, start_idx, step);
      return;
    }
    const int ii_shift =
        filter_height_half - (layer_config->filter_height - 1) % 2;
    const int jj_shift =
        filter_width_half - (layer_config->filter_width - 1) % 2;
    switch (layer_config->pad) {
      case PADDING_SAME_ZERO:
        convolve_no_maxpool_padding_zero(
            input, in_width, in_height, in_stride, layer_config, output,
            out_stride, start_idx, cstep, filter_width_half, filter_height_half,
            ii_shift, jj_shift, channel_step);
        break;
      case PADDING_SAME_REPLICATE:
        convolve_no_maxpool_padding_replicate(
            input, in_width, in_height, in_stride, layer_config, output,
            out_stride, start_idx, cstep, ii_shift, jj_shift, channel_step);
        break;
      case PADDING_VALID:
        av1_cnn_convolve_no_maxpool_padding_valid(
            input, in_width, in_height, in_stride, layer_config, output,
            out_stride, start_idx, cstep, channel_step);
        break;
      default: assert(0 && "Unknown padding type");
    }
  }
}
704 | | |
705 | 0 | static int convolve_layer(void *arg1, void *arg2) { |
706 | 0 | const CONVOLVE_OPS *convolve_ops = arg1; |
707 | 0 | (void)arg2; |
708 | 0 | av1_cnn_convolve( |
709 | 0 | convolve_ops->input, convolve_ops->in_width, convolve_ops->in_height, |
710 | 0 | convolve_ops->in_stride, convolve_ops->layer_config, convolve_ops->output, |
711 | 0 | convolve_ops->out_stride, convolve_ops->start_idx, convolve_ops->th_step); |
712 | 0 | return 1; |
713 | 0 | } |
714 | | |
715 | | static void convolve_layer_mt(const float **input, int in_width, int in_height, |
716 | | int in_stride, |
717 | | const CNN_LAYER_CONFIG *layer_config, |
718 | | const CNN_THREAD_DATA *thread_data, |
719 | 0 | float **output, int out_stride) { |
720 | 0 | const AVxWorkerInterface *const winterface = aom_get_worker_interface(); |
721 | 0 | const int num_workers = thread_data->num_workers; |
722 | |
|
723 | 0 | CONVOLVE_OPS convolve_ops[CNN_MAX_THREADS]; |
724 | 0 | for (int th = 0; th < AOMMIN(num_workers, CNN_MAX_THREADS); ++th) { |
725 | 0 | AVxWorker *const worker = &thread_data->workers[th]; |
726 | 0 | winterface->reset(worker); |
727 | |
|
728 | 0 | CONVOLVE_OPS convolve_op = { input, in_width, in_height, |
729 | 0 | in_stride, layer_config, output, |
730 | 0 | out_stride, th, num_workers }; |
731 | 0 | convolve_ops[th] = convolve_op; |
732 | 0 | worker->hook = convolve_layer; |
733 | 0 | worker->data1 = &(convolve_ops[th]); |
734 | 0 | worker->data2 = NULL; |
735 | | |
736 | | // Start convolving. |
737 | 0 | if (th == num_workers - 1) { |
738 | 0 | winterface->execute(worker); |
739 | 0 | } else { |
740 | 0 | winterface->launch(worker); |
741 | 0 | } |
742 | 0 | } |
743 | | |
744 | | // Wait until all workers have finished. |
745 | 0 | for (int th = 0; th < AOMMIN(num_workers, CNN_MAX_THREADS); ++th) { |
746 | 0 | winterface->sync(&thread_data->workers[th]); |
747 | 0 | } |
748 | 0 | } |
749 | | |
750 | 0 | static INLINE int get_start_shift_deconvolve(int filt_width, int stride) { |
751 | 0 | const int dif = AOMMAX(filt_width - stride, 0); |
752 | 0 | return dif / 2; |
753 | 0 | } |
754 | | |
// Applies batch normalization with pre-computed per-channel parameters,
// in place:
//   image[ch][..] = gamma[ch] * (image[ch][..] - mean[ch]) / std[ch] + beta[ch]
// gamma/beta/mean/std each hold one value per channel; `stride` is the row
// stride of each channel plane.
void av1_cnn_batchnorm_c(float **image, int channels, int width, int height,
                         int stride, const float *gamma, const float *beta,
                         const float *mean, const float *std) {
  // All four parameter tables are required. (The previous assert tested
  // `beta` twice and never checked `mean`.)
  assert(gamma && beta && mean && std && "batchnorm has null parameter!");
  for (int ch = 0; ch < channels; ch++) {
    const float ch_gamma = gamma[ch];
    const float ch_beta = beta[ch];
    const float ch_mean = mean[ch];
    const float ch_std = std[ch];
    float *image_row = image[ch];

    for (int row = 0; row < height; row++) {
      for (int col = 0; col < width; col++) {
        image_row[col] =
            ch_gamma * (image_row[col] - ch_mean) / ch_std + ch_beta;
      }
      image_row += stride;
    }
  }
}
775 | | |
// Transposed convolution ("deconvolution") layer. For each output position
// (u, v) and output channel i, accumulates the contributions of every input
// sample whose upsampled footprint covers (u, v). skip_width/skip_height act
// as the upsampling factors; the three padding modes differ only in how
// out-of-range input coordinates are treated.
// Weight layout: taps for a given (in_channel k, out_channel i) pair start at
// offset k * out_channels + i and successive taps are `cstep` apart.
void av1_cnn_deconvolve_c(const float **input, int in_width, int in_height,
                          int in_stride, const CNN_LAYER_CONFIG *layer_config,
                          float **output, int out_stride) {
  assert(layer_config->deconvolve);

  // Distance between successive taps of one filter in the interleaved
  // weight array.
  const int cstep = layer_config->in_channels * layer_config->out_channels;

  int out_width = 0;
  int out_height = 0;
  av1_find_cnn_layer_output_size(in_width, in_height, layer_config, &out_width,
                                 &out_height);
  switch (layer_config->pad) {
    case PADDING_SAME_ZERO:
      // Out-of-range input positions contribute zero (skipped below).
      for (int i = 0; i < layer_config->out_channels; ++i) {
        for (int u = 0; u < out_height; ++u) {
          for (int v = 0; v < out_width; ++v) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                // Centered position of this tap on the upsampled grid.
                const int h =
                    u - l +
                    get_start_shift_deconvolve(layer_config->filter_height,
                                               layer_config->skip_height);
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int w =
                      v - m +
                      get_start_shift_deconvolve(layer_config->filter_width,
                                                 layer_config->skip_width);
                  // Only positions aligned with the upsampling grid map back
                  // to an actual input sample.
                  if ((h % layer_config->skip_height) != 0 ||
                      (w % layer_config->skip_width) != 0)
                    continue;
                  const int ii = h / layer_config->skip_height;
                  const int jj = w / layer_config->skip_width;
                  // Zero padding: skip samples outside the input.
                  if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width)
                    continue;
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            output[i][u * out_stride + v] = sum;
          }
        }
      }
      break;
    case PADDING_SAME_REPLICATE:
      // Out-of-range input positions are clamped to the nearest edge sample.
      for (int i = 0; i < layer_config->out_channels; ++i) {
        for (int u = 0; u < out_height; ++u) {
          for (int v = 0; v < out_width; ++v) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                const int h =
                    u - l +
                    get_start_shift_deconvolve(layer_config->filter_height,
                                               layer_config->skip_height);
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int w =
                      v - m +
                      get_start_shift_deconvolve(layer_config->filter_width,
                                                 layer_config->skip_width);
                  if ((h % layer_config->skip_height) != 0 ||
                      (w % layer_config->skip_width) != 0)
                    continue;
                  // Replicate padding: clamp to the valid input range.
                  const int ii =
                      CLAMPINDEX(h / layer_config->skip_height, in_height);
                  const int jj =
                      CLAMPINDEX(w / layer_config->skip_width, in_width);
                  assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            output[i][u * out_stride + v] = sum;
          }
        }
      }
      break;
    case PADDING_VALID:
      // No centering shift is applied; only in-range samples contribute.
      for (int i = 0; i < layer_config->out_channels; ++i) {
        for (int u = 0; u < out_height; ++u) {
          for (int v = 0; v < out_width; ++v) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                const int h = u - l;
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int w = v - m;
                  if ((h % layer_config->skip_height) != 0 ||
                      (w % layer_config->skip_width) != 0)
                    continue;
                  const int ii = h / layer_config->skip_height;
                  const int jj = w / layer_config->skip_width;
                  if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width)
                    continue;
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            output[i][u * out_stride + v] = sum;
          }
        }
      }
      break;
    default: assert(0 && "Unknown padding type");
  }
}
891 | | |
// Runs the full CNN described by cnn_config on the input planes and writes
// each layer flagged with an output_num into output_struct's buffers.
// Per branch, tensor1[] holds the current layer's input and tensor2[] its
// output; the two are swapped between layers so each output feeds the next
// layer.
void av1_cnn_predict_c(const float **input, int in_width, int in_height,
                       int in_stride, const CNN_CONFIG *cnn_config,
                       const CNN_THREAD_DATA *thread_data,
                       CNN_MULTI_OUT *output_struct) {
  TENSOR tensor1[CNN_MAX_BRANCHES] = { { 0 } };
  TENSOR tensor2[CNN_MAX_BRANCHES] = { { 0 } };

  // The per-output channel buffers are packed back-to-back in
  // output_struct->output_buffer; build one pointer per declared output.
  float **output[CNN_MAX_BRANCHES];
  const int *out_chs = output_struct->output_channels;
  output[0] = output_struct->output_buffer;
  for (int out_idx = 1; out_idx < output_struct->num_outputs; out_idx++) {
    output[out_idx] = output[out_idx - 1] + out_chs[out_idx - 1];
  }

  int i_width = in_width;
  int i_height = in_height;
  int o_width = 0, o_height = 0;
  for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
    init_tensor(&tensor1[b]);
    init_tensor(&tensor2[b]);
  }

  const int *out_stride = output_struct->output_strides;
  for (int layer = 0; layer < cnn_config->num_layers; ++layer) {
    const CNN_LAYER_CONFIG *layer_config = &cnn_config->layer_config[layer];
    const int branch = layer_config->branch;
    const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;

    // Allocate input tensor
    if (layer == 0) {       // First layer
      assert(branch == 0);  // First layer must be primary branch
      // The caller's input planes are used directly (not copied).
      assign_tensor(&tensor1[branch], (float **)input,
                    layer_config->in_channels, in_width, in_height, in_stride);
    } else {  // Non-first layer
      // Swap tensor1 and tensor2 so the previous output becomes this
      // layer's input.
      swap_tensor(&tensor1[branch], &tensor2[branch]);

      i_width = tensor1[branch].width;
      i_height = tensor1[branch].height;
    }

    // Allocate output tensor
    av1_find_cnn_layer_output_size(i_width, i_height, layer_config, &o_width,
                                   &o_height);
    const int output_num = layer_config->output_num;
    if (output_num == -1) {  // Non-output layer
      realloc_tensor(&tensor2[branch], layer_config->out_channels, o_width,
                     o_height);
    } else {  // Output layer
      // Output layers write straight into the caller-provided buffer instead
      // of scratch storage.
      free_tensor(&tensor2[branch]);
      assign_tensor(&tensor2[branch], output[output_num],
                    layer_config->out_channels, o_width, o_height,
                    out_stride[output_num]);
    }

    // If we are combining branches make sure that the branch to combine
    // is different from the current branch.
    assert(IMPLIES(layer_config->branch_combine_type != BRANCH_NOC,
                   !(branch_config->branches_to_combine & (1 << branch))));

    // Optionally fan the (pre-convolution) input out to other branches.
    if (layer_config->branch_copy_type == BRANCH_INPUT) {
      copy_active_tensor_to_branches(&tensor1[branch], layer_config, branch,
                                     tensor2);
    }
    // Check consistency of input and output channels
    assert(tensor1[branch].channels == layer_config->in_channels);
    assert(tensor2[branch].channels == layer_config->out_channels);

    // Convolve/Deconvolve
    if (!cnn_config->layer_config[layer].deconvolve) {
      if (thread_data->num_workers > 1) {
        convolve_layer_mt((const float **)tensor1[branch].buf,
                          tensor1[branch].width, tensor1[branch].height,
                          tensor1[branch].stride, layer_config, thread_data,
                          tensor2[branch].buf, tensor2[branch].stride);
      } else {
        av1_cnn_convolve((const float **)tensor1[branch].buf,
                         tensor1[branch].width, tensor1[branch].height,
                         tensor1[branch].stride, layer_config,
                         tensor2[branch].buf, tensor2[branch].stride, 0, 1);
      }
    } else {
      av1_cnn_deconvolve((const float **)tensor1[branch].buf,
                         tensor1[branch].width, tensor1[branch].height,
                         tensor1[branch].stride, layer_config,
                         tensor2[branch].buf, tensor2[branch].stride);
    }

    // Optionally fan the raw convolution output out to other branches.
    if (layer_config->branch_copy_type == BRANCH_OUTPUT) {
      copy_active_tensor_to_branches(&tensor2[branch], layer_config, branch,
                                     tensor2);
    }

    // Add tensors from other branches if needed
    if (layer_config->branch_combine_type == BRANCH_ADD) {
      for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
        if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
          assert(check_tensor_equal_size(&tensor2[b], &tensor2[branch]));
          av1_cnn_add(tensor2[branch].buf, tensor2[branch].channels,
                      tensor2[branch].width, tensor2[branch].height,
                      tensor2[branch].stride, (const float **)tensor2[b].buf);
        }
      }
    }

    // Non-linearity
    if (layer_config->activation != IDENTITY)
      av1_cnn_activate(tensor2[branch].buf, tensor2[branch].channels,
                       tensor2[branch].width, tensor2[branch].height,
                       tensor2[branch].stride, layer_config->activation);

    // Batch normalization only when the layer supplies parameters.
    if (layer_config->bn_params.bn_gamma) {
      av1_cnn_batchnorm(
          tensor2[branch].buf, tensor2[branch].channels, tensor2[branch].width,
          tensor2[branch].height, tensor2[branch].stride,
          layer_config->bn_params.bn_gamma, layer_config->bn_params.bn_beta,
          layer_config->bn_params.bn_mean, layer_config->bn_params.bn_std);
    }

    // Concatenate tensors
    if (layer_config->branch_combine_type == BRANCH_CAT) {
      if (output_num == -1) {  // Non-output layer
        for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
          if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
            assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch]));
            assert(tensor2[b].channels > 0);
            concat_tensor(&tensor2[b], &tensor2[branch]);
          }
        }
      } else {  // Output layer
        // The caller's output buffer must cover the concatenated channel
        // count, so re-assign the tensor before copying the other branches
        // in after the existing channels.
        const int existing_channels = tensor2[branch].channels;
        int num_chs = existing_channels;
        for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
          if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
            assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch]));
            // Needed only to assign the new channel buffers
            num_chs += tensor2[b].channels;
          }
        }
        assign_tensor(&tensor2[branch], output[output_num], num_chs, o_width,
                      o_height, out_stride[output_num]);

        num_chs = existing_channels;
        for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
          if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
            assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch]));
            // Needed only to assign the new channel buffers
            copy_tensor(&tensor2[b], tensor2[b].channels, num_chs,
                        &tensor2[branch]);
            num_chs += tensor2[b].channels;
          }
        }
      }
    }

    // Optionally fan the fully combined result out to other branches.
    if (layer_config->branch_copy_type == BRANCH_COMBINED) {
      copy_active_tensor_to_branches(&tensor2[branch], layer_config, branch,
                                     tensor2);
    }
  }

  // Release scratch tensors; tensors assigned to caller buffers have
  // allocsize == 0 and are not freed by free_tensor.
  for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
    free_tensor(&tensor1[b]);
    free_tensor(&tensor2[b]);
  }
}
1058 | | |
1059 | | // Assume output already has proper allocation |
1060 | | // Assume input image buffers all have same resolution and strides |
1061 | | void av1_cnn_predict_img_multi_out(uint8_t **dgd, int width, int height, |
1062 | | int stride, const CNN_CONFIG *cnn_config, |
1063 | | const CNN_THREAD_DATA *thread_data, |
1064 | 0 | CNN_MULTI_OUT *output) { |
1065 | 0 | const float max_val = 255.0; |
1066 | |
|
1067 | 0 | const int in_width = width + 2 * cnn_config->ext_width; |
1068 | 0 | const int in_height = height + 2 * cnn_config->ext_height; |
1069 | 0 | const int in_channels = cnn_config->layer_config[0].in_channels; |
1070 | 0 | float *inputs[CNN_MAX_CHANNELS]; |
1071 | 0 | float *input_ = |
1072 | 0 | (float *)aom_malloc(in_width * in_height * in_channels * sizeof(*input_)); |
1073 | 0 | const int in_stride = in_width; |
1074 | |
|
1075 | 0 | for (int c = 0; c < in_channels; ++c) { |
1076 | 0 | inputs[c] = input_ + c * in_stride * in_height; |
1077 | 0 | float *input = |
1078 | 0 | inputs[c] + cnn_config->ext_height * in_stride + cnn_config->ext_width; |
1079 | |
|
1080 | 0 | if (cnn_config->strict_bounds) { |
1081 | 0 | for (int i = 0; i < height; ++i) |
1082 | 0 | for (int j = 0; j < width; ++j) |
1083 | 0 | input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val; |
1084 | | // extend left and right |
1085 | 0 | for (int i = 0; i < height; ++i) { |
1086 | 0 | for (int j = -cnn_config->ext_width; j < 0; ++j) |
1087 | 0 | input[i * in_stride + j] = input[i * in_stride]; |
1088 | 0 | for (int j = width; j < width + cnn_config->ext_width; ++j) |
1089 | 0 | input[i * in_stride + j] = input[i * in_stride + width - 1]; |
1090 | 0 | } |
1091 | | // extend top and bottom |
1092 | 0 | for (int i = -cnn_config->ext_height; i < 0; ++i) |
1093 | 0 | memcpy(&input[i * in_stride - cnn_config->ext_width], |
1094 | 0 | &input[-cnn_config->ext_width], in_width * sizeof(*input)); |
1095 | 0 | for (int i = height; i < height + cnn_config->ext_height; ++i) |
1096 | 0 | memcpy(&input[i * in_stride - cnn_config->ext_width], |
1097 | 0 | &input[(height - 1) * in_stride - cnn_config->ext_width], |
1098 | 0 | in_width * sizeof(*input)); |
1099 | 0 | } else { |
1100 | 0 | for (int i = -cnn_config->ext_height; i < height + cnn_config->ext_height; |
1101 | 0 | ++i) |
1102 | 0 | for (int j = -cnn_config->ext_width; j < width + cnn_config->ext_width; |
1103 | 0 | ++j) |
1104 | 0 | input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val; |
1105 | 0 | } |
1106 | 0 | } |
1107 | 0 | av1_cnn_predict((const float **)inputs, in_width, in_height, in_stride, |
1108 | 0 | cnn_config, thread_data, output); |
1109 | |
|
1110 | 0 | aom_free(input_); |
1111 | 0 | } |
1112 | | |
1113 | | // Assume output already has proper allocation |
1114 | | // Assume input image buffers all have same resolution and strides |
1115 | | void av1_cnn_predict_img_multi_out_highbd(uint16_t **dgd, int width, int height, |
1116 | | int stride, |
1117 | | const CNN_CONFIG *cnn_config, |
1118 | | const CNN_THREAD_DATA *thread_data, |
1119 | | int bit_depth, |
1120 | 0 | CNN_MULTI_OUT *output) { |
1121 | 0 | const float max_val = (float)((1 << bit_depth) - 1); |
1122 | |
|
1123 | 0 | const int in_width = width + 2 * cnn_config->ext_width; |
1124 | 0 | const int in_height = height + 2 * cnn_config->ext_height; |
1125 | 0 | const int in_channels = cnn_config->layer_config[0].in_channels; |
1126 | 0 | float *inputs[CNN_MAX_CHANNELS]; |
1127 | 0 | float *input_ = |
1128 | 0 | (float *)aom_malloc(in_width * in_height * in_channels * sizeof(*input_)); |
1129 | 0 | const int in_stride = in_width; |
1130 | |
|
1131 | 0 | for (int c = 0; c < in_channels; ++c) { |
1132 | 0 | inputs[c] = input_ + c * in_stride * in_height; |
1133 | 0 | float *input = |
1134 | 0 | inputs[c] + cnn_config->ext_height * in_stride + cnn_config->ext_width; |
1135 | |
|
1136 | 0 | if (cnn_config->strict_bounds) { |
1137 | 0 | for (int i = 0; i < height; ++i) |
1138 | 0 | for (int j = 0; j < width; ++j) |
1139 | 0 | input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val; |
1140 | | // extend left and right |
1141 | 0 | for (int i = 0; i < height; ++i) { |
1142 | 0 | for (int j = -cnn_config->ext_width; j < 0; ++j) |
1143 | 0 | input[i * in_stride + j] = input[i * in_stride]; |
1144 | 0 | for (int j = width; j < width + cnn_config->ext_width; ++j) |
1145 | 0 | input[i * in_stride + j] = input[i * in_stride + width - 1]; |
1146 | 0 | } |
1147 | | // extend top and bottom |
1148 | 0 | for (int i = -cnn_config->ext_height; i < 0; ++i) |
1149 | 0 | memcpy(&input[i * in_stride - cnn_config->ext_width], |
1150 | 0 | &input[-cnn_config->ext_width], in_width * sizeof(*input)); |
1151 | 0 | for (int i = height; i < height + cnn_config->ext_height; ++i) |
1152 | 0 | memcpy(&input[i * in_stride - cnn_config->ext_width], |
1153 | 0 | &input[(height - 1) * in_stride - cnn_config->ext_width], |
1154 | 0 | in_width * sizeof(*input)); |
1155 | 0 | } else { |
1156 | 0 | for (int i = -cnn_config->ext_height; i < height + cnn_config->ext_height; |
1157 | 0 | ++i) |
1158 | 0 | for (int j = -cnn_config->ext_width; j < width + cnn_config->ext_width; |
1159 | 0 | ++j) |
1160 | 0 | input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val; |
1161 | 0 | } |
1162 | 0 | } |
1163 | |
|
1164 | 0 | av1_cnn_predict((const float **)inputs, in_width, in_height, in_stride, |
1165 | 0 | cnn_config, thread_data, output); |
1166 | |
|
1167 | 0 | aom_free(input_); |
1168 | 0 | } |
1169 | | |
1170 | | // Assume output already has proper allocation |
1171 | | // Assume input image buffers all have same resolution and strides |
1172 | | void av1_cnn_predict_img(uint8_t **dgd, int width, int height, int stride, |
1173 | | const CNN_CONFIG *cnn_config, |
1174 | | const CNN_THREAD_DATA *thread_data, float **output, |
1175 | 0 | int out_stride) { |
1176 | 0 | int out_width = 0, out_height = 0, out_channels = 0; |
1177 | 0 | av1_find_cnn_output_size(width, height, cnn_config, &out_width, &out_height, |
1178 | 0 | &out_channels); |
1179 | 0 | const int output_chs[1] = { out_channels }; |
1180 | 0 | const int output_strides[1] = { out_stride }; |
1181 | 0 | CNN_MULTI_OUT output_struct = { .output_channels = output_chs, |
1182 | 0 | .output_strides = output_strides, |
1183 | 0 | .output_buffer = output }; |
1184 | 0 | av1_cnn_predict_img_multi_out(dgd, width, height, stride, cnn_config, |
1185 | 0 | thread_data, &output_struct); |
1186 | 0 | } |
1187 | | |
1188 | | // Assume output already has proper allocation |
1189 | | // Assume input image buffers all have same resolution and strides |
1190 | | void av1_cnn_predict_img_highbd(uint16_t **dgd, int width, int height, |
1191 | | int stride, const CNN_CONFIG *cnn_config, |
1192 | | const CNN_THREAD_DATA *thread_data, |
1193 | 0 | int bit_depth, float **output, int out_stride) { |
1194 | 0 | int out_width = 0, out_height = 0, out_channels = 0; |
1195 | 0 | av1_find_cnn_output_size(width, height, cnn_config, &out_width, &out_height, |
1196 | 0 | &out_channels); |
1197 | 0 | const int output_chs[1] = { out_channels }; |
1198 | 0 | const int output_strides[1] = { out_stride }; |
1199 | 0 | CNN_MULTI_OUT output_struct = { .output_channels = output_chs, |
1200 | 0 | .output_strides = output_strides, |
1201 | 0 | .output_buffer = output }; |
1202 | 0 | av1_cnn_predict_img_multi_out_highbd(dgd, width, height, stride, cnn_config, |
1203 | 0 | thread_data, bit_depth, &output_struct); |
1204 | 0 | } |