Coverage Report

Created: 2025-06-22 08:04

/src/aom/av1/encoder/cnn.c

Every instrumented line in this file was reported with an execution count of 0, i.e. the file is entirely uncovered. The source is listed below.
/*
 * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <math.h>
#include <stdbool.h>
#include <string.h>  // memset()/memcpy() used below

#include "aom_dsp/aom_dsp_common.h"
#include "av1/common/av1_common_int.h"
#include "av1/encoder/cnn.h"

#define CLAMPINDEX(a, hi) ((a) < 0 ? 0 : ((a) >= (hi) ? ((hi)-1) : (a)))
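/* Example: with hi = 8, CLAMPINDEX(-3, 8) == 0 and CLAMPINDEX(9, 8) == 7;
 * out-of-range indices are clamped to the nearest edge sample, which is the
 * behavior PADDING_SAME_REPLICATE relies on below. */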

typedef struct {
  const float **input;
  int in_width;
  int in_height;
  int in_stride;
  const CNN_LAYER_CONFIG *layer_config;
  float **output;
  int out_stride;
  int start_idx;
  int th_step;
} CONVOLVE_OPS;

static inline float softsign(float x) { return x / (fabsf(x) + 1.0f); }

static inline float relu(float x) { return (x < 0) ? 0 : x; }

typedef struct {
  int allocsize;
  int channels;
  int width, height, stride;
  float *buf[CNN_MAX_CHANNELS];
} TENSOR;

static void init_tensor(TENSOR *tensor) { memset(tensor, 0, sizeof(*tensor)); }

static void free_tensor(TENSOR *tensor) {
  if (tensor->allocsize) {
    aom_free(tensor->buf[0]);
    tensor->buf[0] = NULL;
    tensor->allocsize = 0;
  }
}

static bool realloc_tensor(TENSOR *tensor, int channels, int width,
                           int height) {
  const int newallocsize = channels * width * height;
  if (tensor->allocsize < newallocsize) {
    free_tensor(tensor);
    tensor->buf[0] =
        (float *)aom_malloc(sizeof(*tensor->buf[0]) * newallocsize);
    if (!tensor->buf[0]) return false;
    tensor->allocsize = newallocsize;
  }
  tensor->width = width;
  tensor->height = height;
  tensor->stride = width;
  tensor->channels = channels;
  for (int c = 1; c < channels; ++c)
    tensor->buf[c] = &tensor->buf[0][c * width * height];
  return true;
}

static void copy_tensor(const TENSOR *src, int copy_channels, int dst_offset,
                        TENSOR *dst) {
  assert(src->width == dst->width);
  assert(src->height == dst->height);
  assert(copy_channels <= src->channels);
  if (src->stride == dst->width && dst->stride == dst->width) {
    for (int c = 0; c < copy_channels; ++c) {
      memcpy(dst->buf[dst_offset + c], src->buf[c],
             sizeof(*dst->buf[0]) * src->width * src->height);
    }
  } else {
    for (int c = 0; c < copy_channels; ++c) {
      for (int r = 0; r < dst->height; ++r) {
        memcpy(&dst->buf[dst_offset + c][r * dst->stride],
               &src->buf[c][r * src->stride],
               dst->width * sizeof(*dst->buf[c]));
      }
    }
  }
}

static void assign_tensor(TENSOR *tensor, float *buf[CNN_MAX_CHANNELS],
                          int channels, int width, int height, int stride) {
  tensor->allocsize = 0;
  tensor->channels = channels;
  tensor->width = width;
  tensor->height = height;
  tensor->stride = stride;
  if (buf) {
    for (int c = 0; c < channels; ++c) tensor->buf[c] = buf[c];
  } else {
    for (int c = 0; c < channels; ++c) tensor->buf[c] = NULL;
  }
}

static void swap_tensor(TENSOR *t1, TENSOR *t2) {
  TENSOR t = *t1;
  *t1 = *t2;
  *t2 = t;
}

// Concatenate src into dst along the channel axis: the original dst channels
// come first, followed by the channels from src.
static bool concat_tensor(const TENSOR *src, TENSOR *dst) {
  assert(src->width == dst->width);
  assert(src->height == dst->height);

  const int dst_channels = dst->channels;
  const int channels = dst->channels + src->channels;
  const int newallocsize = channels * dst->width * dst->height;
  if (dst->allocsize < newallocsize) {
    TENSOR t;
    init_tensor(&t);
    // Allocate new buffers and copy the dst channels in first.
    if (!realloc_tensor(&t, channels, dst->width, dst->height)) return false;
    copy_tensor(dst, dst->channels, 0, &t);
    // Swap the tensors and free the old buffers
    swap_tensor(dst, &t);
    free_tensor(&t);
  }
  for (int c = 1; c < channels; ++c)
    dst->buf[c] = &dst->buf[0][c * dst->width * dst->height];
  // Copy the channels in src after the first dst_channels channels.
  copy_tensor(src, src->channels, dst_channels, dst);
  return true;
}
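/* Example: if dst holds channels {A, B} and src holds {C, D, E} with matching
 * width and height, concat_tensor(src, dst) leaves dst with channels
 * {A, B, C, D, E}; dst->buf[] is re-pointed into the (possibly reallocated)
 * contiguous buffer before the src channels are copied in. */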

#ifndef NDEBUG
static int check_tensor_equal_dims(TENSOR *t1, TENSOR *t2) {
  return (t1->width == t2->width && t1->height == t2->height);
}

static int check_tensor_equal_size(TENSOR *t1, TENSOR *t2) {
  return (t1->channels == t2->channels && t1->width == t2->width &&
          t1->height == t2->height);
}
#endif  // NDEBUG

void av1_find_cnn_layer_output_size(int in_width, int in_height,
                                    const CNN_LAYER_CONFIG *layer_config,
                                    int *out_width, int *out_height) {
  assert(layer_config->skip_width > 0);
  assert(layer_config->skip_height > 0);
  if (!layer_config->deconvolve) {
    switch (layer_config->pad) {
      case PADDING_SAME_ZERO:
      case PADDING_SAME_REPLICATE:
        *out_width = (in_width + layer_config->skip_width - 1) /
                     layer_config->skip_width;
        *out_height = (in_height + layer_config->skip_height - 1) /
                      layer_config->skip_height;
        break;
      case PADDING_VALID:
        *out_width =
            (in_width - layer_config->filter_width + layer_config->skip_width) /
            layer_config->skip_width;
        *out_height = (in_height - layer_config->filter_height +
                       layer_config->skip_height) /
                      layer_config->skip_height;
        break;
      default: assert(0 && "Unknown padding type");
    }
  } else {
    switch (layer_config->pad) {
      case PADDING_SAME_ZERO:
      case PADDING_SAME_REPLICATE:
        *out_width = in_width * layer_config->skip_width;
        *out_height = in_height * layer_config->skip_height;
        break;
      case PADDING_VALID:
        *out_width = (in_width - 1) * layer_config->skip_width +
                     layer_config->filter_width;
        *out_height = (in_height - 1) * layer_config->skip_height +
                      layer_config->filter_height;
        break;
      default: assert(0 && "Unknown padding type");
    }
  }
}
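/* Worked example: for a convolution (deconvolve == 0) with a 3x3 filter and
 * skip (stride) 2x2 on a 16x16 input:
 *   SAME padding:  *out_width = (16 + 2 - 1) / 2 = 8, so 8x8;
 *   VALID padding: *out_width = (16 - 3 + 2) / 2 = 7, so 7x7.
 * For deconvolution the sizes grow instead, e.g. SAME gives 16 * 2 = 32 and
 * VALID gives (16 - 1) * 2 + 3 = 33 per dimension. */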

static void find_cnn_out_channels(const CNN_LAYER_CONFIG *layer_config,
                                  int channels_per_branch[]) {
  int branch = layer_config->branch;
  const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;
  for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
    if ((branch_config->input_to_branches & (1 << b)) && b != branch) {
      if (layer_config->branch_copy_type == BRANCH_INPUT) {
        channels_per_branch[b] = layer_config->in_channels;
      } else if (layer_config->branch_copy_type == BRANCH_OUTPUT) {
        channels_per_branch[b] = layer_config->out_channels;
      } else if (layer_config->branch_copy_type == BRANCH_COMBINED) {
        channels_per_branch[b] = layer_config->out_channels;
        for (int c = 0; c < CNN_MAX_BRANCHES; ++c) {
          if ((branch_config->branches_to_combine & (1 << c)) && c != branch) {
            assert(channels_per_branch[c] > 0);
            channels_per_branch[b] += channels_per_branch[c];
          }
        }
      }
    }
  }
  channels_per_branch[branch] = layer_config->out_channels;
  for (int c = 0; c < CNN_MAX_BRANCHES; ++c) {
    if ((branch_config->branches_to_combine & (1 << c)) && c != branch) {
      assert(channels_per_branch[c] > 0);
      channels_per_branch[branch] += channels_per_branch[c];
    }
  }
}
224
#if CONFIG_DEBUG
225
static inline int cnn_has_at_least_one_output(const CNN_CONFIG *cnn_config) {
226
  const int num_layers = cnn_config->num_layers;
227
  const CNN_LAYER_CONFIG *layer_configs = cnn_config->layer_config;
228
229
  for (int idx = 0; idx < num_layers; idx++) {
230
    if (layer_configs[idx].output_num != -1) {
231
      return 1;
232
    }
233
  }
234
  return 0;
235
}
236
#endif
237
238
void av1_find_cnn_output_size(int in_width, int in_height,
239
                              const CNN_CONFIG *cnn_config, int *out_width,
240
0
                              int *out_height, int *out_channels) {
241
0
  int channels_per_branch[CNN_MAX_BRANCHES] = { 0 };
242
0
  int i_width[CNN_MAX_BRANCHES] = { 0 };
243
0
  int i_height[CNN_MAX_BRANCHES] = { 0 };
244
0
  i_width[0] = in_width + cnn_config->ext_width * 2;
245
0
  i_height[0] = in_height + cnn_config->ext_height * 2;
246
247
#if CONFIG_DEBUG
248
  assert(cnn_has_at_least_one_output(cnn_config));
249
#endif
250
251
0
  for (int i = 0; i < cnn_config->num_layers; ++i) {
252
0
    const CNN_LAYER_CONFIG *layer_config = &cnn_config->layer_config[i];
253
0
    const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;
254
0
    const int branch = layer_config->branch;
255
0
    int o_width = 0, o_height = 0;
256
257
0
    if (layer_config->branch_copy_type == BRANCH_INPUT) {
258
0
      for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
259
0
        if ((branch_config->input_to_branches & (1 << b)) && b != branch) {
260
0
          assert(i_width[branch] > 0 && i_height[branch] > 0);
261
0
          i_width[b] = i_width[branch];
262
0
          i_height[b] = i_height[branch];
263
0
        }
264
0
      }
265
0
    }
266
267
0
    av1_find_cnn_layer_output_size(i_width[branch], i_height[branch],
268
0
                                   layer_config, &o_width, &o_height);
269
0
    i_width[branch] = o_width;
270
0
    i_height[branch] = o_height;
271
272
0
    if (layer_config->branch_copy_type == BRANCH_OUTPUT) {
273
0
      for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
274
0
        if ((branch_config->input_to_branches & (1 << b)) && b != branch) {
275
0
          i_width[b] = o_width;
276
0
          i_height[b] = o_height;
277
0
        }
278
0
      }
279
0
    }
280
281
0
    find_cnn_out_channels(layer_config, channels_per_branch);
282
283
0
    const int output_num = layer_config->output_num;
284
0
    if (output_num != -1) {  // Current layer is an output layer
285
0
      out_width[output_num] = o_width;
286
0
      out_height[output_num] = o_height;
287
0
      out_channels[output_num] = channels_per_branch[layer_config->branch];
288
0
    }
289
0
  }
290
0
}
291
292
static inline int get_start_shift_convolve(int width, int filt_width,
293
0
                                           int stride) {
294
0
  const int mod = (width % stride);
295
0
  const int filt_off = (filt_width - 1) / 2;
296
0
  const int dif = (mod ? mod - 1 : stride - 1);
297
0
  return AOMMIN((dif + (filt_width % 2)) / 2, filt_off);
298
0
}
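/* Worked example: get_start_shift_convolve(16, 3, 2) computes mod = 0,
 * filt_off = 1, dif = 1, and returns AOMMIN((1 + 1) / 2, 1) = 1, so the
 * first SAME-padded output is taken with the filter centered one sample
 * into the row, keeping the ceil(width / stride) outputs balanced. */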

void av1_cnn_add_c(float **output, int channels, int width, int height,
                   int stride, const float **add) {
  for (int c = 0; c < channels; ++c) {
    for (int i = 0; i < height; ++i)
      for (int j = 0; j < width; ++j)
        output[c][i * stride + j] += add[c][i * stride + j];
  }
}

void av1_cnn_activate_c(float **output, int channels, int width, int height,
                        int stride, ACTIVATION layer_activation) {
  if (layer_activation == RELU) {
    for (int c = 0; c < channels; ++c) {
      for (int i = 0; i < height; ++i)
        for (int j = 0; j < width; ++j)
          output[c][i * stride + j] = relu(output[c][i * stride + j]);
    }
  } else if (layer_activation == SOFTSIGN) {
    for (int c = 0; c < channels; ++c) {
      for (int i = 0; i < height; ++i)
        for (int j = 0; j < width; ++j)
          output[c][i * stride + j] = softsign(output[c][i * stride + j]);
    }
  } else if (layer_activation == SIGMOID) {
    assert(0 && "Sigmoid has not been supported in CNN.");  // TODO
  } else if (layer_activation != NONE) {
    assert(0 && "Unknown activation type");
  }
}

static bool copy_active_tensor_to_branches(const TENSOR *layer_active_tensor,
                                           const CNN_LAYER_CONFIG *layer_config,
                                           int branch, TENSOR branch_output[]) {
  const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;
  for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
    if ((branch_config->input_to_branches & (1 << b)) && b != branch) {
      // Copy the layer's active tensor to the output tensor of branch b if
      // b is set in the mask. That output then becomes the input of the
      // first layer of branch b.
      int copy_channels = branch_config->channels_to_copy > 0
                              ? branch_config->channels_to_copy
                              : layer_active_tensor->channels;
      if (!realloc_tensor(&branch_output[b], copy_channels,
                          layer_active_tensor->width,
                          layer_active_tensor->height)) {
        return false;
      }
      copy_tensor(layer_active_tensor, copy_channels, 0, &branch_output[b]);
    }
  }
  return true;
}

// CNNConvolve variant for maxpool == 1, with skip_width or skip_height
// greater than 1, and padding equal to PADDING_SAME_ZERO.
static void convolve_maxpool_padding_zero(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
    const int cstep, const int filter_width_half,
    const int filter_height_half) {
  for (int i = 0; i < layer_config->out_channels; ++i) {
    for (int h = 0, u = 0; h < in_height; h += layer_config->skip_height, ++u) {
      for (int w = 0, v = 0; w < in_width; w += layer_config->skip_width, ++v) {
        for (int hh = h; hh < AOMMIN(in_height, h + layer_config->skip_height);
             ++hh) {
          for (int ww = w; ww < AOMMIN(in_width, w + layer_config->skip_width);
               ++ww) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                const int ii = hh + l - filter_height_half;
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int jj = ww + m - filter_width_half;
                  if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width)
                    continue;
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            const float a = sum;
            if (h == hh && w == ww)
              output[i][u * out_stride + v] = a;
            else
              output[i][u * out_stride + v] =
                  AOMMAX(output[i][u * out_stride + v], a);
          }
        }
      }
    }
  }
}

// CNNConvolve variant for maxpool == 1, with skip_width or skip_height
// greater than 1, and padding equal to PADDING_SAME_REPLICATE.
static void convolve_maxpool_padding_replicate(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
    const int cstep, const int filter_width_half,
    const int filter_height_half) {
  for (int i = 0; i < layer_config->out_channels; ++i) {
    for (int h = 0, u = 0; h < in_height; h += layer_config->skip_height, ++u) {
      for (int w = 0, v = 0; w < in_width; w += layer_config->skip_width, ++v) {
        for (int hh = h; hh < AOMMIN(in_height, h + layer_config->skip_height);
             ++hh) {
          for (int ww = w; ww < AOMMIN(in_width, w + layer_config->skip_width);
               ++ww) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                const int ii =
                    CLAMPINDEX(hh + l - filter_height_half, in_height);
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int jj =
                      CLAMPINDEX(ww + m - filter_width_half, in_width);
                  assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            const float a = sum;
            if (h == hh && w == ww)
              output[i][u * out_stride + v] = a;
            else
              output[i][u * out_stride + v] =
                  AOMMAX(output[i][u * out_stride + v], a);
          }
        }
      }
    }
  }
}

// CNNConvolve variant for maxpool == 1, with skip_width or skip_height
// greater than 1, and padding equal to PADDING_VALID.
static void convolve_maxpool_padding_valid(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
    const int cstep) {
  for (int i = 0; i < layer_config->out_channels; ++i) {
    for (int h = 0, u = 0; h < in_height - layer_config->filter_height + 1;
         h += layer_config->skip_height, ++u) {
      for (int w = 0, v = 0; w < in_width - layer_config->filter_width + 1;
           w += layer_config->skip_width, ++v) {
        for (int hh = h; hh < AOMMIN(in_height, h + layer_config->skip_height);
             ++hh) {
          for (int ww = w; ww < AOMMIN(in_width, w + layer_config->skip_width);
               ++ww) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                const int ii = hh + l;
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int jj = ww + m;
                  assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            const float a = sum;
            if (h == hh && w == ww)
              output[i][u * out_stride + v] = a;
            else
              output[i][u * out_stride + v] =
                  AOMMAX(output[i][u * out_stride + v], a);
          }
        }
      }
    }
  }
}

// CNNConvolve variant for maxpool == 0 with filter_height and filter_width
// both equal to 1.
static void convolve_element_wise(const float **input, int in_width,
                                  int in_height, int in_stride,
                                  const CNN_LAYER_CONFIG *const layer_config,
                                  float **output, int out_stride, int start_idx,
                                  int step) {
  const int start_h = get_start_shift_convolve(
      in_height, layer_config->filter_height, layer_config->skip_height);
  const int start_w =
      get_start_shift_convolve(in_width, layer_config->filter_width,
                               layer_config->skip_width) +
      start_idx * layer_config->skip_width;
  const int out_w_step = AOMMAX(step, 1);
  const int in_w_step = layer_config->skip_width * out_w_step;
  for (int i = 0; i < layer_config->out_channels; ++i) {
    for (int h = start_h, u = 0; h < in_height;
         h += layer_config->skip_height, ++u) {
      const int in_h = h * in_stride;
      const int out_h = u * out_stride + start_idx;
      for (int w = start_w, out_index = out_h; w < in_width;
           w += in_w_step, out_index += out_w_step) {
        float sum = layer_config->bias[i];
        for (int k = 0; k < layer_config->in_channels; ++k) {
          sum += layer_config->weights[k * layer_config->out_channels + i] *
                 input[k][in_h + w];
        }
        output[i][out_index] = sum;
      }
    }
  }
}

// CNNConvolve variant for maxpool == 0 and padding equal to
// PADDING_SAME_ZERO.
static void convolve_no_maxpool_padding_zero(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
    int start_idx, const int cstep, const int filter_width_half,
    const int filter_height_half, const int ii_shift, const int jj_shift,
    const int channel_step) {
  const int start_h = get_start_shift_convolve(
      in_height, layer_config->filter_height, layer_config->skip_height);
  const int start_w = get_start_shift_convolve(
      in_width, layer_config->filter_width, layer_config->skip_width);
  const int end_ii_shift = filter_height_half + 1;
  const int end_jj_shift = filter_width_half + 1;
  // *_filter_margin stores the number of pixels along a dimension in the
  // intersection of the complement of the image in the extended image
  // and the filter.
  const int top_filter_margin = layer_config->filter_width * ii_shift;
  const int right_filter_margin = end_jj_shift - in_width;
  for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
    for (int h = start_h, u = 0; h < in_height;
         h += layer_config->skip_height, ++u) {
      const int out_h = u * out_stride;
      const int top_cstep =
          AOMMAX(0, top_filter_margin - h * layer_config->filter_width) *
              cstep +
          i;
      const int start_ii = AOMMAX(0, h - ii_shift);
      const int end_ii = AOMMIN(in_height, h + end_ii_shift);
      for (int w = start_w, out_index = out_h; w < in_width;
           w += layer_config->skip_width, ++out_index) {
        const int left_cstep = AOMMAX(0, jj_shift - w) * cstep;
        const int right_cstep = AOMMAX(0, right_filter_margin + w) * cstep;
        const int start_jj = AOMMAX(0, w - jj_shift);
        const int end_jj = AOMMIN(in_width, w + end_jj_shift);
        float sum = layer_config->bias[i];
        for (int k = 0; k < layer_config->in_channels; ++k) {
          int off = k * layer_config->out_channels + top_cstep;
          for (int ii = start_ii; ii < end_ii; ++ii) {
            off += left_cstep;
            for (int jj = start_jj; jj < end_jj; ++jj, off += cstep) {
              sum += layer_config->weights[off] * input[k][ii * in_stride + jj];
            }
            off += right_cstep;
          }
        }
        output[i][out_index] = sum;
      }
    }
  }
}

// CNNConvolve variant for maxpool == 0 and padding equal to
// PADDING_SAME_REPLICATE.
static void convolve_no_maxpool_padding_replicate(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
    int start_idx, const int cstep, const int ii_shift, const int jj_shift,
    const int channel_step) {
  // h and w are shifted to an offset coordinate system to reduce in-loop
  // computation.
  const int start_h =
      get_start_shift_convolve(in_height, layer_config->filter_height,
                               layer_config->skip_height) -
      ii_shift;
  const int start_w =
      get_start_shift_convolve(in_width, layer_config->filter_width,
                               layer_config->skip_width) -
      jj_shift;
  const int end_h = in_height - ii_shift;
  const int end_w = in_width - jj_shift;
  for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
    for (int h = start_h, u = 0; h < end_h;
         h += layer_config->skip_height, ++u) {
      const int out_h = u * out_stride;
      const int upper_ii_index = layer_config->filter_height + h;
      for (int w = start_w, out_index = out_h; w < end_w;
           w += layer_config->skip_width, ++out_index) {
        const int upper_jj_index = layer_config->filter_width + w;
        float sum = layer_config->bias[i];
        for (int k = 0; k < layer_config->in_channels; ++k) {
          int off = k * layer_config->out_channels + i;
          for (int ii = h; ii < upper_ii_index; ++ii) {
            const int clamped_ii = CLAMPINDEX(ii, in_height);
            for (int jj = w; jj < upper_jj_index; ++jj) {
              const int clamped_jj = CLAMPINDEX(jj, in_width);
              assert(clamped_ii >= 0 && clamped_ii < in_height &&
                     clamped_jj >= 0 && clamped_jj < in_width);
              sum += layer_config->weights[off] *
                     input[k][clamped_ii * in_stride + clamped_jj];
              off += cstep;
            }
          }
        }
        output[i][out_index] = sum;
      }
    }
  }
}

// CNNConvolve variant for maxpool == 0 and padding equal to
// PADDING_VALID.
void av1_cnn_convolve_no_maxpool_padding_valid_c(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride,
    int start_idx, int cstep, int channel_step) {
  assert((layer_config->skip_height == 1 && layer_config->skip_width == 1) ||
         !layer_config->maxpool);
  assert(layer_config->filter_height > 1 || layer_config->filter_width > 1);
  assert(layer_config->pad == PADDING_VALID);
  for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
    for (int h = 0, u = 0; h < in_height - layer_config->filter_height + 1;
         h += layer_config->skip_height, ++u) {
      const int out_h = u * out_stride;
      const int upper_ii_index = layer_config->filter_height + h;
      for (int w = 0, out_index = out_h;
           w < in_width - layer_config->filter_width + 1;
           w += layer_config->skip_width, ++out_index) {
        const int upper_jj_index = layer_config->filter_width + w;
        float sum = layer_config->bias[i];
        for (int k = 0; k < layer_config->in_channels; ++k) {
          int off = k * layer_config->out_channels + i;
          for (int ii = h; ii < upper_ii_index; ++ii) {
            for (int jj = w; jj < upper_jj_index; ++jj) {
              assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
              sum += layer_config->weights[off] * input[k][ii * in_stride + jj];
              off += cstep;
            }
          }
        }
        output[i][out_index] = sum;
      }
    }
  }
}

static void av1_cnn_convolve(const float **input, int in_width, int in_height,
                             int in_stride,
                             const CNN_LAYER_CONFIG *layer_config,
                             float **output, int out_stride, int start_idx,
                             int step) {
  assert(!layer_config->deconvolve);
  const int cstep = layer_config->in_channels * layer_config->out_channels;
  const int filter_height_half = layer_config->filter_height >> 1;
  const int filter_width_half = layer_config->filter_width >> 1;
  const int channel_step = AOMMAX(step, 1);

  if (layer_config->maxpool &&
      (layer_config->skip_height > 1 || layer_config->skip_width > 1)) {
    switch (layer_config->pad) {
      case PADDING_SAME_ZERO:
        convolve_maxpool_padding_zero(input, in_width, in_height, in_stride,
                                      layer_config, output, out_stride, cstep,
                                      filter_width_half, filter_height_half);
        break;
      case PADDING_SAME_REPLICATE:
        convolve_maxpool_padding_replicate(
            input, in_width, in_height, in_stride, layer_config, output,
            out_stride, cstep, filter_width_half, filter_height_half);
        break;
      case PADDING_VALID:
        convolve_maxpool_padding_valid(input, in_width, in_height, in_stride,
                                       layer_config, output, out_stride, cstep);
        break;
      default: assert(0 && "Unknown padding type");
    }
  } else {
    // A 1x1 filter reduces to an element-wise matrix multiplication.
    if (layer_config->filter_height == 1 && layer_config->filter_width == 1) {
      convolve_element_wise(input, in_width, in_height, in_stride, layer_config,
                            output, out_stride, start_idx, step);
      return;
    }
    const int ii_shift =
        filter_height_half - (layer_config->filter_height - 1) % 2;
    const int jj_shift =
        filter_width_half - (layer_config->filter_width - 1) % 2;
    switch (layer_config->pad) {
      case PADDING_SAME_ZERO:
        convolve_no_maxpool_padding_zero(
            input, in_width, in_height, in_stride, layer_config, output,
            out_stride, start_idx, cstep, filter_width_half, filter_height_half,
            ii_shift, jj_shift, channel_step);
        break;
      case PADDING_SAME_REPLICATE:
        convolve_no_maxpool_padding_replicate(
            input, in_width, in_height, in_stride, layer_config, output,
            out_stride, start_idx, cstep, ii_shift, jj_shift, channel_step);
        break;
      case PADDING_VALID:
        av1_cnn_convolve_no_maxpool_padding_valid(
            input, in_width, in_height, in_stride, layer_config, output,
            out_stride, start_idx, cstep, channel_step);
        break;
      default: assert(0 && "Unknown padding type");
    }
  }
}
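/* Dispatch summary for av1_cnn_convolve: maxpool with skip_width or
 * skip_height > 1 selects one of the convolve_maxpool_padding_* kernels by
 * padding type; otherwise a 1x1 filter goes to convolve_element_wise, and
 * larger filters go to the matching convolve_no_maxpool_padding_* kernel.
 * Only the PADDING_VALID case routes through the
 * av1_cnn_convolve_no_maxpool_padding_valid dispatch, which may select an
 * optimized implementation over the _c reference above. */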

static int convolve_layer(void *arg1, void *arg2) {
  const CONVOLVE_OPS *convolve_ops = arg1;
  (void)arg2;
  av1_cnn_convolve(
      convolve_ops->input, convolve_ops->in_width, convolve_ops->in_height,
      convolve_ops->in_stride, convolve_ops->layer_config, convolve_ops->output,
      convolve_ops->out_stride, convolve_ops->start_idx, convolve_ops->th_step);
  return 1;
}

static void convolve_layer_mt(const float **input, int in_width, int in_height,
                              int in_stride,
                              const CNN_LAYER_CONFIG *layer_config,
                              const CNN_THREAD_DATA *thread_data,
                              float **output, int out_stride) {
  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
  const int num_workers = thread_data->num_workers;
  assert(thread_data->workers);

  CONVOLVE_OPS convolve_ops[CNN_MAX_THREADS];
  for (int th = 0; th < AOMMIN(num_workers, CNN_MAX_THREADS); ++th) {
    AVxWorker *const worker = &thread_data->workers[th];
    winterface->reset(worker);

    CONVOLVE_OPS convolve_op = { input,      in_width,     in_height,
                                 in_stride,  layer_config, output,
                                 out_stride, th,           num_workers };
    convolve_ops[th] = convolve_op;
    worker->hook = convolve_layer;
    worker->data1 = &(convolve_ops[th]);
    worker->data2 = NULL;

    // Start convolving.
    if (th == num_workers - 1) {
      winterface->execute(worker);
    } else {
      winterface->launch(worker);
    }
  }

  // Wait until all workers have finished.
  for (int th = 0; th < AOMMIN(num_workers, CNN_MAX_THREADS); ++th) {
    winterface->sync(&thread_data->workers[th]);
  }
}

static inline int get_start_shift_deconvolve(int filt_width, int stride) {
  const int dif = AOMMAX(filt_width - stride, 0);
  return dif / 2;
}
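/* Worked example: get_start_shift_deconvolve(5, 2) returns
 * AOMMAX(5 - 2, 0) / 2 = 1, i.e. the kernel overhangs the upsampled grid by
 * one sample on each side in the SAME-padded deconvolution cases below. */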

void av1_cnn_batchnorm_c(float **image, int channels, int width, int height,
                         int stride, const float *gamma, const float *beta,
                         const float *mean, const float *std) {
  assert(gamma && beta && mean && std && "batchnorm has null parameter!");
  for (int ch = 0; ch < channels; ch++) {
    const float ch_gamma = gamma[ch];
    const float ch_beta = beta[ch];
    const float ch_mean = mean[ch];
    const float ch_std = std[ch];
    float *image_row = image[ch];

    for (int row = 0; row < height; row++) {
      for (int col = 0; col < width; col++) {
        image_row[col] =
            ch_gamma * (image_row[col] - ch_mean) / ch_std + ch_beta;
      }
      image_row += stride;
    }
  }
}
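/* Per-channel inference-time batchnorm: each sample x in channel ch maps to
 * gamma[ch] * (x - mean[ch]) / std[ch] + beta[ch]. For example, gamma = 2,
 * beta = 1, mean = 0.5, std = 0.25 map x = 0.75 to 2 * (0.25 / 0.25) + 1 = 3. */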

void av1_cnn_deconvolve_c(const float **input, int in_width, int in_height,
                          int in_stride, const CNN_LAYER_CONFIG *layer_config,
                          float **output, int out_stride) {
  assert(layer_config->deconvolve);

  const int cstep = layer_config->in_channels * layer_config->out_channels;

  int out_width = 0;
  int out_height = 0;
  av1_find_cnn_layer_output_size(in_width, in_height, layer_config, &out_width,
                                 &out_height);
  switch (layer_config->pad) {
    case PADDING_SAME_ZERO:
      for (int i = 0; i < layer_config->out_channels; ++i) {
        for (int u = 0; u < out_height; ++u) {
          for (int v = 0; v < out_width; ++v) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                const int h =
                    u - l +
                    get_start_shift_deconvolve(layer_config->filter_height,
                                               layer_config->skip_height);
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int w =
                      v - m +
                      get_start_shift_deconvolve(layer_config->filter_width,
                                                 layer_config->skip_width);
                  if ((h % layer_config->skip_height) != 0 ||
                      (w % layer_config->skip_width) != 0)
                    continue;
                  const int ii = h / layer_config->skip_height;
                  const int jj = w / layer_config->skip_width;
                  if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width)
                    continue;
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            output[i][u * out_stride + v] = sum;
          }
        }
      }
      break;
    case PADDING_SAME_REPLICATE:
      for (int i = 0; i < layer_config->out_channels; ++i) {
        for (int u = 0; u < out_height; ++u) {
          for (int v = 0; v < out_width; ++v) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                const int h =
                    u - l +
                    get_start_shift_deconvolve(layer_config->filter_height,
                                               layer_config->skip_height);
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int w =
                      v - m +
                      get_start_shift_deconvolve(layer_config->filter_width,
                                                 layer_config->skip_width);
                  if ((h % layer_config->skip_height) != 0 ||
                      (w % layer_config->skip_width) != 0)
                    continue;
                  const int ii =
                      CLAMPINDEX(h / layer_config->skip_height, in_height);
                  const int jj =
                      CLAMPINDEX(w / layer_config->skip_width, in_width);
                  assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            output[i][u * out_stride + v] = sum;
          }
        }
      }
      break;
    case PADDING_VALID:
      for (int i = 0; i < layer_config->out_channels; ++i) {
        for (int u = 0; u < out_height; ++u) {
          for (int v = 0; v < out_width; ++v) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                const int h = u - l;
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int w = v - m;
                  if ((h % layer_config->skip_height) != 0 ||
                      (w % layer_config->skip_width) != 0)
                    continue;
                  const int ii = h / layer_config->skip_height;
                  const int jj = w / layer_config->skip_width;
                  if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width)
                    continue;
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            output[i][u * out_stride + v] = sum;
          }
        }
      }
      break;
    default: assert(0 && "Unknown padding type");
  }
}

bool av1_cnn_predict_c(const float **input, int in_width, int in_height,
                       int in_stride, const CNN_CONFIG *cnn_config,
                       const CNN_THREAD_DATA *thread_data,
                       CNN_MULTI_OUT *output_struct) {
  bool success = false;
  TENSOR tensor1[CNN_MAX_BRANCHES] = { { 0 } };
  TENSOR tensor2[CNN_MAX_BRANCHES] = { { 0 } };

  float **output[CNN_MAX_BRANCHES];
  const int *out_chs = output_struct->output_channels;
  output[0] = output_struct->output_buffer;
  for (int out_idx = 1; out_idx < output_struct->num_outputs; out_idx++) {
    output[out_idx] = output[out_idx - 1] + out_chs[out_idx - 1];
  }

  int i_width = in_width;
  int i_height = in_height;
  int o_width = 0, o_height = 0;
  for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
    init_tensor(&tensor1[b]);
    init_tensor(&tensor2[b]);
  }

  const int *out_stride = output_struct->output_strides;
  for (int layer = 0; layer < cnn_config->num_layers; ++layer) {
    const CNN_LAYER_CONFIG *layer_config = &cnn_config->layer_config[layer];
    const int branch = layer_config->branch;
    const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;

    // Allocate input tensor
    if (layer == 0) {       // First layer
      assert(branch == 0);  // First layer must be primary branch
      assign_tensor(&tensor1[branch], (float **)input,
                    layer_config->in_channels, in_width, in_height, in_stride);
    } else {  // Non-first layer
      // Swap tensor1 and tensor2
      swap_tensor(&tensor1[branch], &tensor2[branch]);

      i_width = tensor1[branch].width;
      i_height = tensor1[branch].height;
    }

    // Allocate output tensor
    av1_find_cnn_layer_output_size(i_width, i_height, layer_config, &o_width,
                                   &o_height);
    const int output_num = layer_config->output_num;
    if (output_num == -1) {  // Non-output layer
      if (!realloc_tensor(&tensor2[branch], layer_config->out_channels, o_width,
                          o_height)) {
        goto Error;
      }
    } else {  // Output layer
      free_tensor(&tensor2[branch]);
      assign_tensor(&tensor2[branch], output[output_num],
                    layer_config->out_channels, o_width, o_height,
                    out_stride[output_num]);
    }

    // If we are combining branches make sure that the branch to combine
    // is different from the current branch.
    assert(IMPLIES(layer_config->branch_combine_type != BRANCH_NOC,
                   !(branch_config->branches_to_combine & (1 << branch))));

    if (layer_config->branch_copy_type == BRANCH_INPUT) {
      if (!copy_active_tensor_to_branches(&tensor1[branch], layer_config,
                                          branch, tensor2)) {
        goto Error;
      }
    }
    // Check consistency of input and output channels
    assert(tensor1[branch].channels == layer_config->in_channels);
    assert(tensor2[branch].channels == layer_config->out_channels);

    // Convolve/Deconvolve
    if (!cnn_config->layer_config[layer].deconvolve) {
      if (thread_data->num_workers > 1) {
        convolve_layer_mt((const float **)tensor1[branch].buf,
                          tensor1[branch].width, tensor1[branch].height,
                          tensor1[branch].stride, layer_config, thread_data,
                          tensor2[branch].buf, tensor2[branch].stride);
      } else {
        av1_cnn_convolve((const float **)tensor1[branch].buf,
                         tensor1[branch].width, tensor1[branch].height,
                         tensor1[branch].stride, layer_config,
                         tensor2[branch].buf, tensor2[branch].stride, 0, 1);
      }
    } else {
      av1_cnn_deconvolve((const float **)tensor1[branch].buf,
                         tensor1[branch].width, tensor1[branch].height,
                         tensor1[branch].stride, layer_config,
                         tensor2[branch].buf, tensor2[branch].stride);
    }

    if (layer_config->branch_copy_type == BRANCH_OUTPUT) {
      if (!copy_active_tensor_to_branches(&tensor2[branch], layer_config,
                                          branch, tensor2)) {
        goto Error;
      }
    }

    // Add tensors from other branches if needed
    if (layer_config->branch_combine_type == BRANCH_ADD) {
      for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
        if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
          assert(check_tensor_equal_size(&tensor2[b], &tensor2[branch]));
          av1_cnn_add(tensor2[branch].buf, tensor2[branch].channels,
                      tensor2[branch].width, tensor2[branch].height,
                      tensor2[branch].stride, (const float **)tensor2[b].buf);
        }
      }
    }

    // Non-linearity
    av1_cnn_activate(tensor2[branch].buf, tensor2[branch].channels,
                     tensor2[branch].width, tensor2[branch].height,
                     tensor2[branch].stride, layer_config->activation);

    if (layer_config->bn_params.bn_gamma) {
      av1_cnn_batchnorm(
          tensor2[branch].buf, tensor2[branch].channels, tensor2[branch].width,
          tensor2[branch].height, tensor2[branch].stride,
          layer_config->bn_params.bn_gamma, layer_config->bn_params.bn_beta,
          layer_config->bn_params.bn_mean, layer_config->bn_params.bn_std);
    }

    // Concatenate tensors
    if (layer_config->branch_combine_type == BRANCH_CAT) {
      if (output_num == -1) {  // Non-output layer
        for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
          if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
            assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch]));
            assert(tensor2[b].channels > 0);
            if (!concat_tensor(&tensor2[b], &tensor2[branch])) goto Error;
          }
        }
      } else {  // Output layer
        const int existing_channels = tensor2[branch].channels;
        int num_chs = existing_channels;
        for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
          if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
            assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch]));
            // Needed only to assign the new channel buffers
            num_chs += tensor2[b].channels;
          }
        }
        assign_tensor(&tensor2[branch], output[output_num], num_chs, o_width,
                      o_height, out_stride[output_num]);

        num_chs = existing_channels;
        for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
          if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
            assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch]));
            // Copy the channels from branch b after those already in place.
            copy_tensor(&tensor2[b], tensor2[b].channels, num_chs,
                        &tensor2[branch]);
            num_chs += tensor2[b].channels;
          }
        }
      }
    }

    if (layer_config->branch_copy_type == BRANCH_COMBINED) {
      if (!copy_active_tensor_to_branches(&tensor2[branch], layer_config,
                                          branch, tensor2)) {
        goto Error;
      }
    }
  }

  success = true;
Error:
  for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
    free_tensor(&tensor1[b]);
    free_tensor(&tensor2[b]);
  }
  return success;
}

// Assumes the output is already properly allocated.
// Assumes all input image buffers have the same resolution and stride.
bool av1_cnn_predict_img_multi_out(uint8_t **dgd, int width, int height,
                                   int stride, const CNN_CONFIG *cnn_config,
                                   const CNN_THREAD_DATA *thread_data,
                                   CNN_MULTI_OUT *output) {
  const float max_val = 255.0;

  const int in_width = width + 2 * cnn_config->ext_width;
  const int in_height = height + 2 * cnn_config->ext_height;
  const int in_channels = cnn_config->layer_config[0].in_channels;
  float *inputs[CNN_MAX_CHANNELS];
  float *input_ =
      (float *)aom_malloc(in_width * in_height * in_channels * sizeof(*input_));
  if (!input_) return false;
  const int in_stride = in_width;

  for (int c = 0; c < in_channels; ++c) {
    inputs[c] = input_ + c * in_stride * in_height;
    float *input =
        inputs[c] + cnn_config->ext_height * in_stride + cnn_config->ext_width;

    if (cnn_config->strict_bounds) {
      for (int i = 0; i < height; ++i)
        for (int j = 0; j < width; ++j)
          input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
      // extend left and right
      for (int i = 0; i < height; ++i) {
        for (int j = -cnn_config->ext_width; j < 0; ++j)
          input[i * in_stride + j] = input[i * in_stride];
        for (int j = width; j < width + cnn_config->ext_width; ++j)
          input[i * in_stride + j] = input[i * in_stride + width - 1];
      }
      // extend top and bottom
      for (int i = -cnn_config->ext_height; i < 0; ++i)
        memcpy(&input[i * in_stride - cnn_config->ext_width],
               &input[-cnn_config->ext_width], in_width * sizeof(*input));
      for (int i = height; i < height + cnn_config->ext_height; ++i)
        memcpy(&input[i * in_stride - cnn_config->ext_width],
               &input[(height - 1) * in_stride - cnn_config->ext_width],
               in_width * sizeof(*input));
    } else {
      for (int i = -cnn_config->ext_height; i < height + cnn_config->ext_height;
           ++i)
        for (int j = -cnn_config->ext_width; j < width + cnn_config->ext_width;
             ++j)
          input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
    }
  }
  bool success = av1_cnn_predict((const float **)inputs, in_width, in_height,
                                 in_stride, cnn_config, thread_data, output);

  aom_free(input_);
  return success;
}

// Assumes the output is already properly allocated.
// Assumes all input image buffers have the same resolution and stride.
bool av1_cnn_predict_img_multi_out_highbd(uint16_t **dgd, int width, int height,
                                          int stride,
                                          const CNN_CONFIG *cnn_config,
                                          const CNN_THREAD_DATA *thread_data,
                                          int bit_depth,
                                          CNN_MULTI_OUT *output) {
  const float max_val = (float)((1 << bit_depth) - 1);

  const int in_width = width + 2 * cnn_config->ext_width;
  const int in_height = height + 2 * cnn_config->ext_height;
  const int in_channels = cnn_config->layer_config[0].in_channels;
  float *inputs[CNN_MAX_CHANNELS];
  float *input_ =
      (float *)aom_malloc(in_width * in_height * in_channels * sizeof(*input_));
  if (!input_) return false;
  const int in_stride = in_width;

  for (int c = 0; c < in_channels; ++c) {
    inputs[c] = input_ + c * in_stride * in_height;
    float *input =
        inputs[c] + cnn_config->ext_height * in_stride + cnn_config->ext_width;

    if (cnn_config->strict_bounds) {
      for (int i = 0; i < height; ++i)
        for (int j = 0; j < width; ++j)
          input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
      // extend left and right
      for (int i = 0; i < height; ++i) {
        for (int j = -cnn_config->ext_width; j < 0; ++j)
          input[i * in_stride + j] = input[i * in_stride];
        for (int j = width; j < width + cnn_config->ext_width; ++j)
          input[i * in_stride + j] = input[i * in_stride + width - 1];
      }
      // extend top and bottom
      for (int i = -cnn_config->ext_height; i < 0; ++i)
        memcpy(&input[i * in_stride - cnn_config->ext_width],
               &input[-cnn_config->ext_width], in_width * sizeof(*input));
      for (int i = height; i < height + cnn_config->ext_height; ++i)
        memcpy(&input[i * in_stride - cnn_config->ext_width],
               &input[(height - 1) * in_stride - cnn_config->ext_width],
               in_width * sizeof(*input));
    } else {
      for (int i = -cnn_config->ext_height; i < height + cnn_config->ext_height;
           ++i)
        for (int j = -cnn_config->ext_width; j < width + cnn_config->ext_width;
             ++j)
          input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
    }
  }

  bool success = av1_cnn_predict((const float **)inputs, in_width, in_height,
                                 in_stride, cnn_config, thread_data, output);

  aom_free(input_);
  return success;
}
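/* A minimal usage sketch (illustrative only, not part of the library):
 * running a single 3x3, 1-in/1-out channel convolution over an 8-bit plane.
 * It assumes the CNN_CONFIG / CNN_LAYER_CONFIG / CNN_MULTI_OUT layouts
 * declared in av1/encoder/cnn.h, and that zero-initialized branch fields
 * mean "no branch copy/combine"; any field spelling beyond those used in
 * this file is an unverified assumption.
 *
 *   float weights[9] = { ... 3x3 kernel ... };   // hypothetical values
 *   float bias[1] = { 0.0f };
 *   CNN_CONFIG cfg = { 0 };
 *   cfg.num_layers = 1;
 *   cfg.layer_config[0].in_channels = 1;
 *   cfg.layer_config[0].out_channels = 1;
 *   cfg.layer_config[0].filter_width = 3;
 *   cfg.layer_config[0].filter_height = 3;
 *   cfg.layer_config[0].skip_width = 1;
 *   cfg.layer_config[0].skip_height = 1;
 *   cfg.layer_config[0].pad = PADDING_SAME_ZERO;
 *   cfg.layer_config[0].activation = NONE;
 *   cfg.layer_config[0].weights = weights;
 *   cfg.layer_config[0].bias = bias;
 *   cfg.layer_config[0].output_num = 0;  // layer 0 is the (only) output
 *
 *   int out_chs[1] = { 1 }, out_strides[1] = { width };
 *   float *out_bufs[1] = { out_plane };  // caller-allocated width * height
 *   CNN_MULTI_OUT out = { 0 };
 *   out.num_outputs = 1;
 *   out.output_channels = out_chs;
 *   out.output_strides = out_strides;
 *   out.output_buffer = out_bufs;
 *
 *   CNN_THREAD_DATA td = { 0 };
 *   td.num_workers = 1;  // single-threaded convolve path
 *   av1_cnn_predict_img_multi_out(&dgd_plane, width, height, dgd_stride,
 *                                 &cfg, &td, &out);
 */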