Coverage Report

Created: 2022-08-24 06:17

/src/aom/av1/encoder/cnn.c

/*
 * Copyright (c) 2019, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <math.h>
#include <string.h>

#include "aom_dsp/aom_dsp_common.h"
#include "av1/common/av1_common_int.h"
#include "av1/encoder/cnn.h"

#define CLAMPINDEX(a, hi) ((a) < 0 ? 0 : ((a) >= (hi) ? ((hi)-1) : (a)))
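
// Worked example (editor's note, not in the original source): with hi = 8,
// CLAMPINDEX(-2, 8) == 0, CLAMPINDEX(3, 8) == 3, and CLAMPINDEX(9, 8) == 7.
// This is the index clamping used below to implement replicate padding.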

typedef struct {
  const float **input;
  int in_width;
  int in_height;
  int in_stride;
  const CNN_LAYER_CONFIG *layer_config;
  float **output;
  int out_stride;
  int start_idx;
  int th_step;
} CONVOLVE_OPS;

typedef float (*activation_fn)(float);

// [coverage: 0]
static float softsign(float x) { return x / (float)(fabsf(x) + 1.0); }

// [coverage: 40.7M]
static float relu(float x) { return (x < 0) ? 0 : x; }

// [coverage: 0]
static float identity(float x) { return x; }

typedef struct {
  int allocsize;
  int channels;
  int width, height, stride;
  float *buf[CNN_MAX_CHANNELS];
} TENSOR;

// [coverage: 48.6k]
static void init_tensor(TENSOR *tensor) { memset(tensor, 0, sizeof(*tensor)); }

// [coverage: 79.0k]
static void free_tensor(TENSOR *tensor) {
  if (tensor->allocsize) {
    aom_free(tensor->buf[0]);
    tensor->buf[0] = NULL;
    tensor->allocsize = 0;
  }
}

// [coverage: 6.08k]
static void realloc_tensor(TENSOR *tensor, int channels, int width,
                           int height) {
  const int newallocsize = channels * width * height;
  if (tensor->allocsize < newallocsize) {
    free_tensor(tensor);
    tensor->buf[0] =
        (float *)aom_malloc(sizeof(*tensor->buf[0]) * newallocsize);
    tensor->allocsize = newallocsize;
  }
  tensor->width = width;
  tensor->height = height;
  tensor->stride = width;
  tensor->channels = channels;
  for (int c = 1; c < channels; ++c)
    tensor->buf[c] = &tensor->buf[0][c * width * height];
}
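
// Layout note (editor's illustration): all channels live in one contiguous
// allocation, so for channels = 3, width = 4, height = 2 the channel planes
// start at buf[0], buf[0] + 8 and buf[0] + 16 -- each channel is a
// width * height plane whose stride equals its width after reallocation.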

// [coverage: 0]
static void copy_tensor(const TENSOR *src, int copy_channels, int dst_offset,
                        TENSOR *dst) {
  assert(src->width == dst->width);
  assert(src->height == dst->height);
  assert(copy_channels <= src->channels);
  if (src->stride == dst->width && dst->stride == dst->width) {
    for (int c = 0; c < copy_channels; ++c) {
      memcpy(dst->buf[dst_offset + c], src->buf[c],
             sizeof(*dst->buf[0]) * src->width * src->height);
    }
  } else {
    for (int c = 0; c < copy_channels; ++c) {
      for (int r = 0; r < dst->height; ++r) {
        memcpy(&dst->buf[dst_offset + c][r * dst->stride],
               &src->buf[c][r * src->stride],
               dst->width * sizeof(*dst->buf[c]));
      }
    }
  }
}

// [coverage: 30.4k]
static void assign_tensor(TENSOR *tensor, float *buf[CNN_MAX_CHANNELS],
                          int channels, int width, int height, int stride) {
  tensor->allocsize = 0;
  tensor->channels = channels;
  tensor->width = width;
  tensor->height = height;
  tensor->stride = stride;
  if (buf) {
    for (int c = 0; c < channels; ++c) tensor->buf[c] = buf[c];
  } else {
    for (int c = 0; c < channels; ++c) tensor->buf[c] = NULL;
  }
}

// [coverage: 24.3k]
static void swap_tensor(TENSOR *t1, TENSOR *t2) {
  TENSOR t = *t1;
  *t1 = *t2;
  *t2 = t;
}

// The concatenated tensor goes into dst, with the channels of the original
// dst first, followed by the channels of src.
// [coverage: 0]
static void concat_tensor(const TENSOR *src, TENSOR *dst) {
  assert(src->width == dst->width);
  assert(src->height == dst->height);

  const int dst_channels = dst->channels;
  const int channels = dst->channels + src->channels;
  const int newallocsize = channels * dst->width * dst->height;
  if (dst->allocsize < newallocsize) {
    TENSOR t;
    init_tensor(&t);
    // Allocate new buffers and copy the dst channels first.
    realloc_tensor(&t, channels, dst->width, dst->height);
    copy_tensor(dst, dst->channels, 0, &t);
    // Swap the tensors and free the old buffers.
    swap_tensor(dst, &t);
    free_tensor(&t);
  }
  for (int c = 1; c < channels; ++c)
    dst->buf[c] = &dst->buf[0][c * dst->width * dst->height];
  // Copy the channels in src after the first dst_channels channels.
  copy_tensor(src, src->channels, dst_channels, dst);
}

// [coverage: 0]
int check_tensor_equal_dims(TENSOR *t1, TENSOR *t2) {
  return (t1->width == t2->width && t1->height == t2->height);
}

// [coverage: 0]
int check_tensor_equal_size(TENSOR *t1, TENSOR *t2) {
  return (t1->channels == t2->channels && t1->width == t2->width &&
          t1->height == t2->height);
}

// [coverage: 30.4k]
void av1_find_cnn_layer_output_size(int in_width, int in_height,
                                    const CNN_LAYER_CONFIG *layer_config,
                                    int *out_width, int *out_height) {
  if (!layer_config->deconvolve) {
    switch (layer_config->pad) {
      case PADDING_SAME_ZERO:
      case PADDING_SAME_REPLICATE:
        *out_width = (in_width + layer_config->skip_width - 1) /
                     layer_config->skip_width;
        *out_height = (in_height + layer_config->skip_height - 1) /
                      layer_config->skip_height;
        break;
      case PADDING_VALID:
        *out_width =
            (in_width - layer_config->filter_width + layer_config->skip_width) /
            layer_config->skip_width;
        *out_height = (in_height - layer_config->filter_height +
                       layer_config->skip_height) /
                      layer_config->skip_height;
        break;
      default: assert(0 && "Unknown padding type");
    }
  } else {
    switch (layer_config->pad) {
      case PADDING_SAME_ZERO:
      case PADDING_SAME_REPLICATE:
        *out_width = in_width * layer_config->skip_width;
        *out_height = in_height * layer_config->skip_height;
        break;
      case PADDING_VALID:
        *out_width = (in_width - 1) * layer_config->skip_width +
                     layer_config->filter_width;
        *out_height = (in_height - 1) * layer_config->skip_height +
                      layer_config->filter_height;
        break;
      default: assert(0 && "Unknown padding type");
    }
  }
}
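
// Worked example (editor's illustration): for a non-deconvolve layer with
// PADDING_VALID, in_width = 16, filter_width = 5, skip_width = 1 gives
// out_width = (16 - 5 + 1) / 1 = 12; with skip_width = 2 it gives
// (16 - 5 + 2) / 2 = 6. SAME padding ignores the filter size entirely:
// out_width = ceil(16 / skip_width).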

// [coverage: 0]
void find_cnn_out_channels(const CNN_LAYER_CONFIG *layer_config,
                           int channels_per_branch[]) {
  int branch = layer_config->branch;
  const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;
  for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
    if ((branch_config->input_to_branches & (1 << b)) && b != branch) {
      if (layer_config->branch_copy_type == BRANCH_INPUT) {
        channels_per_branch[b] = layer_config->in_channels;
      } else if (layer_config->branch_copy_type == BRANCH_OUTPUT) {
        channels_per_branch[b] = layer_config->out_channels;
      } else if (layer_config->branch_copy_type == BRANCH_COMBINED) {
        channels_per_branch[b] = layer_config->out_channels;
        for (int c = 0; c < CNN_MAX_BRANCHES; ++c) {
          if ((branch_config->branches_to_combine & (1 << c)) && c != branch) {
            assert(channels_per_branch[c] > 0);
            channels_per_branch[b] += channels_per_branch[c];
          }
        }
      }
    }
  }
  channels_per_branch[branch] = layer_config->out_channels;
  for (int c = 0; c < CNN_MAX_BRANCHES; ++c) {
    if ((branch_config->branches_to_combine & (1 << c)) && c != branch) {
      assert(channels_per_branch[c] > 0);
      channels_per_branch[branch] += channels_per_branch[c];
    }
  }
}

#if CONFIG_DEBUG
static INLINE int cnn_has_at_least_one_output(const CNN_CONFIG *cnn_config) {
  const int num_layers = cnn_config->num_layers;
  const CNN_LAYER_CONFIG *layer_configs = cnn_config->layer_config;

  for (int idx = 0; idx < num_layers; idx++) {
    if (layer_configs[idx].output_num != -1) {
      return 1;
    }
  }
  return 0;
}
#endif

// [coverage: 0]
void av1_find_cnn_output_size(int in_width, int in_height,
                              const CNN_CONFIG *cnn_config, int *out_width,
                              int *out_height, int *out_channels) {
  int channels_per_branch[CNN_MAX_BRANCHES] = { 0 };
  int i_width[CNN_MAX_BRANCHES] = { 0 };
  int i_height[CNN_MAX_BRANCHES] = { 0 };
  i_width[0] = in_width + cnn_config->ext_width * 2;
  i_height[0] = in_height + cnn_config->ext_height * 2;

#if CONFIG_DEBUG
  assert(cnn_has_at_least_one_output(cnn_config));
#endif

  for (int i = 0; i < cnn_config->num_layers; ++i) {
    const CNN_LAYER_CONFIG *layer_config = &cnn_config->layer_config[i];
    const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;
    const int branch = layer_config->branch;
    int o_width = 0, o_height = 0;

    if (layer_config->branch_copy_type == BRANCH_INPUT) {
      for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
        if ((branch_config->input_to_branches & (1 << b)) && b != branch) {
          assert(i_width[branch] > 0 && i_height[branch] > 0);
          i_width[b] = i_width[branch];
          i_height[b] = i_height[branch];
        }
      }
    }

    av1_find_cnn_layer_output_size(i_width[branch], i_height[branch],
                                   layer_config, &o_width, &o_height);
    i_width[branch] = o_width;
    i_height[branch] = o_height;

    if (layer_config->branch_copy_type == BRANCH_OUTPUT) {
      for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
        if ((branch_config->input_to_branches & (1 << b)) && b != branch) {
          i_width[b] = o_width;
          i_height[b] = o_height;
        }
      }
    }

    find_cnn_out_channels(layer_config, channels_per_branch);

    const int output_num = layer_config->output_num;
    if (output_num != -1) {  // Current layer is an output layer
      out_width[output_num] = o_width;
      out_height[output_num] = o_height;
      out_channels[output_num] = channels_per_branch[layer_config->branch];
    }
  }
}

// [coverage: 30.3k]
activation_fn get_activation(ACTIVATION layer_activation) {
  switch (layer_activation) {
    case NONE: return identity;
    case RELU: return relu;
    case SOFTSIGN: return softsign;
    case SIGMOID:
      assert(0 && "Sigmoid has not been supported in CNN.");  // TODO
      return NULL;
    default: assert(0 && "Unknown activation type"); return NULL;
  }
}

// [coverage: 0]
static INLINE int get_start_shift_convolve(int width, int filt_width,
                                           int stride) {
  const int mod = (width % stride);
  const int filt_off = (filt_width - 1) / 2;
  const int dif = (mod ? mod - 1 : stride - 1);
  return AOMMIN((dif + (filt_width % 2)) / 2, filt_off);
}
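
// Worked example (editor's illustration): width = 8, filt_width = 3,
// stride = 2 gives mod = 0, filt_off = 1, dif = 1, so the start shift is
// AOMMIN((1 + 1) / 2, 1) = 1 -- the first output sample is centered one
// pixel in from the left edge.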

// [coverage: 0]
void av1_cnn_add_c(float **output, int channels, int width, int height,
                   int stride, const float **add) {
  for (int c = 0; c < channels; ++c) {
    for (int i = 0; i < height; ++i)
      for (int j = 0; j < width; ++j)
        output[c][i * stride + j] += add[c][i * stride + j];
  }
}

// [coverage: 30.3k]
void av1_cnn_activate_c(float **output, int channels, int width, int height,
                        int stride, ACTIVATION layer_activation) {
  activation_fn activation = get_activation(layer_activation);
  for (int c = 0; c < channels; ++c) {
    for (int i = 0; i < height; ++i)
      for (int j = 0; j < width; ++j)
        output[c][i * stride + j] = activation(output[c][i * stride + j]);
  }
}

// [coverage: 0]
static void copy_active_tensor_to_branches(const TENSOR *layer_active_tensor,
                                           const CNN_LAYER_CONFIG *layer_config,
                                           int branch, TENSOR branch_output[]) {
  const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;
  for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
    if ((branch_config->input_to_branches & (1 << b)) && b != branch) {
      // If branch b is set in the mask, copy the layer's active tensor to the
      // output tensor of branch b. The copied tensor then acts as the input
      // to the first layer of that branch.
      int copy_channels = branch_config->channels_to_copy > 0
                              ? branch_config->channels_to_copy
                              : layer_active_tensor->channels;
      realloc_tensor(&branch_output[b], copy_channels,
                     layer_active_tensor->width, layer_active_tensor->height);
      copy_tensor(layer_active_tensor, copy_channels, 0, &branch_output[b]);
    }
  }
}

// CNNConvolve specific to maxpool set to 1, with either skip_width or
// skip_height greater than 1 and padding equal to PADDING_SAME_ZERO.
// [coverage: 0]
static void convolve_maxpool_padding_zero(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
    const int cstep, const int filter_width_half,
    const int filter_height_half) {
  for (int i = 0; i < layer_config->out_channels; ++i) {
    for (int h = 0, u = 0; h < in_height; h += layer_config->skip_height, ++u) {
      for (int w = 0, v = 0; w < in_width; w += layer_config->skip_width, ++v) {
        for (int hh = h; hh < AOMMIN(in_height, h + layer_config->skip_height);
             ++hh) {
          for (int ww = w; ww < AOMMIN(in_width, w + layer_config->skip_width);
               ++ww) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                const int ii = hh + l - filter_height_half;
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int jj = ww + m - filter_width_half;
                  if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width)
                    continue;
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            const float a = sum;
            if (h == hh && w == ww)
              output[i][u * out_stride + v] = a;
            else
              output[i][u * out_stride + v] =
                  AOMMAX(output[i][u * out_stride + v], a);
          }
        }
      }
    }
  }
}

// CNNConvolve specific to maxpool set to 1, with either skip_width or
// skip_height greater than 1 and padding equal to PADDING_SAME_REPLICATE.
// [coverage: 0]
static void convolve_maxpool_padding_replicate(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
    const int cstep, const int filter_width_half,
    const int filter_height_half) {
  for (int i = 0; i < layer_config->out_channels; ++i) {
    for (int h = 0, u = 0; h < in_height; h += layer_config->skip_height, ++u) {
      for (int w = 0, v = 0; w < in_width; w += layer_config->skip_width, ++v) {
        for (int hh = h; hh < AOMMIN(in_height, h + layer_config->skip_height);
             ++hh) {
          for (int ww = w; ww < AOMMIN(in_width, w + layer_config->skip_width);
               ++ww) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                const int ii =
                    CLAMPINDEX(hh + l - filter_height_half, in_height);
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int jj =
                      CLAMPINDEX(ww + m - filter_width_half, in_width);
                  assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            const float a = sum;
            if (h == hh && w == ww)
              output[i][u * out_stride + v] = a;
            else
              output[i][u * out_stride + v] =
                  AOMMAX(output[i][u * out_stride + v], a);
          }
        }
      }
    }
  }
}

// CNNConvolve specific to maxpool set to 1, with either skip_width or
// skip_height greater than 1 and padding equal to PADDING_VALID.
// [coverage: 0]
static void convolve_maxpool_padding_valid(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
    const int cstep) {
  for (int i = 0; i < layer_config->out_channels; ++i) {
    for (int h = 0, u = 0; h < in_height - layer_config->filter_height + 1;
         h += layer_config->skip_height, ++u) {
      for (int w = 0, v = 0; w < in_width - layer_config->filter_width + 1;
           w += layer_config->skip_width, ++v) {
        for (int hh = h; hh < AOMMIN(in_height, h + layer_config->skip_height);
             ++hh) {
          for (int ww = w; ww < AOMMIN(in_width, w + layer_config->skip_width);
               ++ww) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                const int ii = hh + l;
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int jj = ww + m;
                  assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            const float a = sum;
            if (h == hh && w == ww)
              output[i][u * out_stride + v] = a;
            else
              output[i][u * out_stride + v] =
                  AOMMAX(output[i][u * out_stride + v], a);
          }
        }
      }
    }
  }
}
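
// Editor's note on the three variants above: when maxpool is enabled with a
// skip greater than one, each output sample is the maximum of the convolution
// evaluated at every position (hh, ww) inside its skip_height x skip_width
// window -- convolution and pooling are fused in a single pass, and the
// variants differ only in how out-of-frame taps are handled (zero, edge
// replication, or none for VALID).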

// CNNConvolve specific to maxpool set to 0, with filter_height and
// filter_width equal to 1.
// [coverage: 0]
static void convolve_element_wise(const float **input, int in_width,
                                  int in_height, int in_stride,
                                  const CNN_LAYER_CONFIG *const layer_config,
                                  float **output, int out_stride, int start_idx,
                                  int step) {
  const int start_h = get_start_shift_convolve(
      in_height, layer_config->filter_height, layer_config->skip_height);
  const int start_w =
      get_start_shift_convolve(in_width, layer_config->filter_width,
                               layer_config->skip_width) +
      start_idx * layer_config->skip_width;
  const int out_w_step = AOMMAX(step, 1);
  const int in_w_step = layer_config->skip_width * out_w_step;
  for (int i = 0; i < layer_config->out_channels; ++i) {
    for (int h = start_h, u = 0; h < in_height;
         h += layer_config->skip_height, ++u) {
      const int in_h = h * in_stride;
      const int out_h = u * out_stride + start_idx;
      for (int w = start_w, out_index = out_h; w < in_width;
           w += in_w_step, out_index += out_w_step) {
        float sum = layer_config->bias[i];
        for (int k = 0; k < layer_config->in_channels; ++k) {
          sum += layer_config->weights[k * layer_config->out_channels + i] *
                 input[k][in_h + w];
        }
        output[i][out_index] = sum;
      }
    }
  }
}
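
// Editor's note: with a 1x1 filter the convolution above degenerates to a
// per-pixel channel mix,
//   out[i] = bias[i] + sum_k weights[k * out_channels + i] * in[k],
// i.e. an out_channels x in_channels matrix applied at every sampled
// position.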

// CNNConvolve specific to maxpool set to 0 and padding equal to
// PADDING_SAME_ZERO.
// [coverage: 0]
static void convolve_no_maxpool_padding_zero(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
    int start_idx, const int cstep, const int filter_width_half,
    const int filter_height_half, const int ii_shift, const int jj_shift,
    const int channel_step) {
  const int start_h = get_start_shift_convolve(
      in_height, layer_config->filter_height, layer_config->skip_height);
  const int start_w = get_start_shift_convolve(
      in_width, layer_config->filter_width, layer_config->skip_width);
  const int end_ii_shift = filter_height_half + 1;
  const int end_jj_shift = filter_width_half + 1;
  // *_filter_margin stores the number of pixels along a dimension in the
  // intersection of the complement of the image in the extended image
  // and the filter.
  const int top_filter_margin = layer_config->filter_width * ii_shift;
  const int right_filter_margin = end_jj_shift - in_width;
  for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
    for (int h = start_h, u = 0; h < in_height;
         h += layer_config->skip_height, ++u) {
      const int out_h = u * out_stride;
      const int top_cstep =
          AOMMAX(0, top_filter_margin - h * layer_config->filter_width) *
              cstep +
          i;
      const int start_ii = AOMMAX(0, h - ii_shift);
      const int end_ii = AOMMIN(in_height, h + end_ii_shift);
      for (int w = start_w, out_index = out_h; w < in_width;
           w += layer_config->skip_width, ++out_index) {
        const int left_cstep = AOMMAX(0, jj_shift - w) * cstep;
        const int right_cstep = AOMMAX(0, right_filter_margin + w) * cstep;
        const int start_jj = AOMMAX(0, w - jj_shift);
        const int end_jj = AOMMIN(in_width, w + end_jj_shift);
        float sum = layer_config->bias[i];
        for (int k = 0; k < layer_config->in_channels; ++k) {
          int off = k * layer_config->out_channels + top_cstep;
          for (int ii = start_ii; ii < end_ii; ++ii) {
            off += left_cstep;
            for (int jj = start_jj; jj < end_jj; ++jj, off += cstep) {
              sum += layer_config->weights[off] * input[k][ii * in_stride + jj];
            }
            off += right_cstep;
          }
        }
        output[i][out_index] = sum;
      }
    }
  }
}

// CNNConvolve specific to maxpool set to 0 and padding equal to
// PADDING_SAME_REPLICATE.
// [coverage: 0]
static void convolve_no_maxpool_padding_replicate(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
    int start_idx, const int cstep, const int ii_shift, const int jj_shift,
    const int channel_step) {
  // h and w are shifted to an offset coordinate system to reduce in-loop
  // computation.
  const int start_h =
      get_start_shift_convolve(in_height, layer_config->filter_height,
                               layer_config->skip_height) -
      ii_shift;
  const int start_w =
      get_start_shift_convolve(in_width, layer_config->filter_width,
                               layer_config->skip_width) -
      jj_shift;
  const int end_h = in_height - ii_shift;
  const int end_w = in_width - jj_shift;
  for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
    for (int h = start_h, u = 0; h < end_h;
         h += layer_config->skip_height, ++u) {
      const int out_h = u * out_stride;
      const int upper_ii_index = layer_config->filter_height + h;
      for (int w = start_w, out_index = out_h; w < end_w;
           w += layer_config->skip_width, ++out_index) {
        const int upper_jj_index = layer_config->filter_width + w;
        float sum = layer_config->bias[i];
        for (int k = 0; k < layer_config->in_channels; ++k) {
          int off = k * layer_config->out_channels + i;
          for (int ii = h; ii < upper_ii_index; ++ii) {
            const int clamped_ii = CLAMPINDEX(ii, in_height);
            for (int jj = w; jj < upper_jj_index; ++jj) {
              const int clamped_jj = CLAMPINDEX(jj, in_width);
              assert(clamped_ii >= 0 && clamped_ii < in_height &&
                     clamped_jj >= 0 && clamped_jj < in_width);
              sum += layer_config->weights[off] *
                     input[k][clamped_ii * in_stride + clamped_jj];
              off += cstep;
            }
          }
        }
        output[i][out_index] = sum;
      }
    }
  }
}

// CNNConvolve specific to maxpool set to 0 and padding equal to
// PADDING_VALID.
// [coverage: 30.4k calls; innermost multiply-accumulate executed 1.29G times]
void av1_cnn_convolve_no_maxpool_padding_valid_c(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride,
    int start_idx, int cstep, int channel_step) {
  assert((layer_config->skip_height == 1 && layer_config->skip_width == 1) ||
         !layer_config->maxpool);
  assert(layer_config->filter_height > 1 || layer_config->filter_width > 1);
  assert(layer_config->pad == PADDING_VALID);
  for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
    for (int h = 0, u = 0; h < in_height - layer_config->filter_height + 1;
         h += layer_config->skip_height, ++u) {
      const int out_h = u * out_stride;
      const int upper_ii_index = layer_config->filter_height + h;
      for (int w = 0, out_index = out_h;
           w < in_width - layer_config->filter_width + 1;
           w += layer_config->skip_width, ++out_index) {
        const int upper_jj_index = layer_config->filter_width + w;
        float sum = layer_config->bias[i];
        for (int k = 0; k < layer_config->in_channels; ++k) {
          int off = k * layer_config->out_channels + i;
          for (int ii = h; ii < upper_ii_index; ++ii) {
            for (int jj = w; jj < upper_jj_index; ++jj) {
              assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
              sum += layer_config->weights[off] * input[k][ii * in_stride + jj];
              off += cstep;
            }
          }
        }
        output[i][out_index] = sum;
      }
    }
  }
}
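
// Weight-layout note (editor's reading of the indexing above): off starts at
// k * out_channels + i and advances by cstep = in_channels * out_channels per
// filter tap, so weights[] is laid out as
// [filter_row][filter_col][in_channel][out_channel], with the channel pair
// innermost.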

// [coverage: 30.4k]
static void av1_cnn_convolve(const float **input, int in_width, int in_height,
                             int in_stride,
                             const CNN_LAYER_CONFIG *layer_config,
                             float **output, int out_stride, int start_idx,
                             int step) {
  assert(!layer_config->deconvolve);
  const int cstep = layer_config->in_channels * layer_config->out_channels;
  const int filter_height_half = layer_config->filter_height >> 1;
  const int filter_width_half = layer_config->filter_width >> 1;
  const int channel_step = AOMMAX(step, 1);

  if (layer_config->maxpool &&
      (layer_config->skip_height > 1 || layer_config->skip_width > 1)) {
    switch (layer_config->pad) {
      case PADDING_SAME_ZERO:
        convolve_maxpool_padding_zero(input, in_width, in_height, in_stride,
                                      layer_config, output, out_stride, cstep,
                                      filter_width_half, filter_height_half);
        break;
      case PADDING_SAME_REPLICATE:
        convolve_maxpool_padding_replicate(
            input, in_width, in_height, in_stride, layer_config, output,
            out_stride, cstep, filter_width_half, filter_height_half);
        break;
      case PADDING_VALID:
        convolve_maxpool_padding_valid(input, in_width, in_height, in_stride,
                                       layer_config, output, out_stride, cstep);
        break;
      default: assert(0 && "Unknown padding type");
    }
  } else {
    // Results in element-wise matrix multiplication.
    if (layer_config->filter_height == 1 && layer_config->filter_width == 1) {
      convolve_element_wise(input, in_width, in_height, in_stride, layer_config,
                            output, out_stride, start_idx, step);
      return;
    }
    const int ii_shift =
        filter_height_half - (layer_config->filter_height - 1) % 2;
    const int jj_shift =
        filter_width_half - (layer_config->filter_width - 1) % 2;
    switch (layer_config->pad) {
      case PADDING_SAME_ZERO:
        convolve_no_maxpool_padding_zero(
            input, in_width, in_height, in_stride, layer_config, output,
            out_stride, start_idx, cstep, filter_width_half, filter_height_half,
            ii_shift, jj_shift, channel_step);
        break;
      case PADDING_SAME_REPLICATE:
        convolve_no_maxpool_padding_replicate(
            input, in_width, in_height, in_stride, layer_config, output,
            out_stride, start_idx, cstep, ii_shift, jj_shift, channel_step);
        break;
      case PADDING_VALID:
        av1_cnn_convolve_no_maxpool_padding_valid(
            input, in_width, in_height, in_stride, layer_config, output,
            out_stride, start_idx, cstep, channel_step);
        break;
      default: assert(0 && "Unknown padding type");
    }
  }
}

// [coverage: 0]
static int convolve_layer(void *arg1, void *arg2) {
  const CONVOLVE_OPS *convolve_ops = arg1;
  (void)arg2;
  av1_cnn_convolve(
      convolve_ops->input, convolve_ops->in_width, convolve_ops->in_height,
      convolve_ops->in_stride, convolve_ops->layer_config, convolve_ops->output,
      convolve_ops->out_stride, convolve_ops->start_idx, convolve_ops->th_step);
  return 1;
}

// [coverage: 0]
static void convolve_layer_mt(const float **input, int in_width, int in_height,
                              int in_stride,
                              const CNN_LAYER_CONFIG *layer_config,
                              const CNN_THREAD_DATA *thread_data,
                              float **output, int out_stride) {
  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
  const int num_workers = thread_data->num_workers;

  CONVOLVE_OPS convolve_ops[CNN_MAX_THREADS];
  for (int th = 0; th < AOMMIN(num_workers, CNN_MAX_THREADS); ++th) {
    AVxWorker *const worker = &thread_data->workers[th];
    winterface->reset(worker);

    CONVOLVE_OPS convolve_op = { input,      in_width,     in_height,
                                 in_stride,  layer_config, output,
                                 out_stride, th,           num_workers };
    convolve_ops[th] = convolve_op;
    worker->hook = convolve_layer;
    worker->data1 = &(convolve_ops[th]);
    worker->data2 = NULL;

    // Start convolving.
    if (th == num_workers - 1) {
      winterface->execute(worker);
    } else {
      winterface->launch(worker);
    }
  }

  // Wait until all workers have finished.
  for (int th = 0; th < AOMMIN(num_workers, CNN_MAX_THREADS); ++th) {
    winterface->sync(&thread_data->workers[th]);
  }
}
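
// Threading note (editor's summary): worker th receives start_idx = th and
// th_step = num_workers, and the convolve kernels stripe their outer loop by
// channel_step -- so the work is divided across threads by output channel
// (or, in the 1x1 element-wise path, by output column), not by image region.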

// [coverage: 0]
static INLINE int get_start_shift_deconvolve(int filt_width, int stride) {
  const int dif = AOMMAX(filt_width - stride, 0);
  return dif / 2;
}

// [coverage: 0]
void av1_cnn_batchnorm_c(float **image, int channels, int width, int height,
                         int stride, const float *gamma, const float *beta,
                         const float *mean, const float *std) {
  assert(gamma && beta && mean && std && "batchnorm has null parameter!");
  for (int ch = 0; ch < channels; ch++) {
    const float ch_gamma = gamma[ch];
    const float ch_beta = beta[ch];
    const float ch_mean = mean[ch];
    const float ch_std = std[ch];
    float *image_row = image[ch];

    for (int row = 0; row < height; row++) {
      for (int col = 0; col < width; col++) {
        image_row[col] =
            ch_gamma * (image_row[col] - ch_mean) / ch_std + ch_beta;
      }
      image_row += stride;
    }
  }
}
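
// The transform above is the standard inference-time batch norm with
// precomputed per-channel statistics:
//   y = gamma * (x - mean) / std + beta.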

// [coverage: 0]
void av1_cnn_deconvolve_c(const float **input, int in_width, int in_height,
                          int in_stride, const CNN_LAYER_CONFIG *layer_config,
                          float **output, int out_stride) {
  assert(layer_config->deconvolve);

  const int cstep = layer_config->in_channels * layer_config->out_channels;

  int out_width = 0;
  int out_height = 0;
  av1_find_cnn_layer_output_size(in_width, in_height, layer_config, &out_width,
                                 &out_height);
  switch (layer_config->pad) {
    case PADDING_SAME_ZERO:
      for (int i = 0; i < layer_config->out_channels; ++i) {
        for (int u = 0; u < out_height; ++u) {
          for (int v = 0; v < out_width; ++v) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                const int h =
                    u - l +
                    get_start_shift_deconvolve(layer_config->filter_height,
                                               layer_config->skip_height);
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int w =
                      v - m +
                      get_start_shift_deconvolve(layer_config->filter_width,
                                                 layer_config->skip_width);
                  if ((h % layer_config->skip_height) != 0 ||
                      (w % layer_config->skip_width) != 0)
                    continue;
                  const int ii = h / layer_config->skip_height;
                  const int jj = w / layer_config->skip_width;
                  if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width)
                    continue;
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            output[i][u * out_stride + v] = sum;
          }
        }
      }
      break;
    case PADDING_SAME_REPLICATE:
      for (int i = 0; i < layer_config->out_channels; ++i) {
        for (int u = 0; u < out_height; ++u) {
          for (int v = 0; v < out_width; ++v) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                const int h =
                    u - l +
                    get_start_shift_deconvolve(layer_config->filter_height,
                                               layer_config->skip_height);
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int w =
                      v - m +
                      get_start_shift_deconvolve(layer_config->filter_width,
                                                 layer_config->skip_width);
                  if ((h % layer_config->skip_height) != 0 ||
                      (w % layer_config->skip_width) != 0)
                    continue;
                  const int ii =
                      CLAMPINDEX(h / layer_config->skip_height, in_height);
                  const int jj =
                      CLAMPINDEX(w / layer_config->skip_width, in_width);
                  assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            output[i][u * out_stride + v] = sum;
          }
        }
      }
      break;
    case PADDING_VALID:
      for (int i = 0; i < layer_config->out_channels; ++i) {
        for (int u = 0; u < out_height; ++u) {
          for (int v = 0; v < out_width; ++v) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                const int h = u - l;
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int w = v - m;
                  if ((h % layer_config->skip_height) != 0 ||
                      (w % layer_config->skip_width) != 0)
                    continue;
                  const int ii = h / layer_config->skip_height;
                  const int jj = w / layer_config->skip_width;
                  if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width)
                    continue;
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            output[i][u * out_stride + v] = sum;
          }
        }
      }
      break;
    default: assert(0 && "Unknown padding type");
  }
}

// [coverage: 6.08k]
void av1_cnn_predict_c(const float **input, int in_width, int in_height,
                       int in_stride, const CNN_CONFIG *cnn_config,
                       const CNN_THREAD_DATA *thread_data,
                       CNN_MULTI_OUT *output_struct) {
  TENSOR tensor1[CNN_MAX_BRANCHES] = { { 0 } };
  TENSOR tensor2[CNN_MAX_BRANCHES] = { { 0 } };

  float **output[CNN_MAX_BRANCHES];
  const int *out_chs = output_struct->output_channels;
  output[0] = output_struct->output_buffer;
  for (int out_idx = 1; out_idx < output_struct->num_outputs; out_idx++) {
    output[out_idx] = output[out_idx - 1] + out_chs[out_idx - 1];
  }

  int i_width = in_width;
  int i_height = in_height;
  int o_width = 0, o_height = 0;
  for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
    init_tensor(&tensor1[b]);
    init_tensor(&tensor2[b]);
  }

  const int *out_stride = output_struct->output_strides;
  for (int layer = 0; layer < cnn_config->num_layers; ++layer) {
    const CNN_LAYER_CONFIG *layer_config = &cnn_config->layer_config[layer];
    const int branch = layer_config->branch;
    const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;

    // Allocate input tensor
    if (layer == 0) {       // First layer
      assert(branch == 0);  // First layer must be primary branch
      assign_tensor(&tensor1[branch], (float **)input,
                    layer_config->in_channels, in_width, in_height, in_stride);
    } else {  // Non-first layer
      // Swap tensor1 and tensor2
      swap_tensor(&tensor1[branch], &tensor2[branch]);

      i_width = tensor1[branch].width;
      i_height = tensor1[branch].height;
    }

    // Allocate output tensor
    av1_find_cnn_layer_output_size(i_width, i_height, layer_config, &o_width,
                                   &o_height);
    const int output_num = layer_config->output_num;
    if (output_num == -1) {  // Non-output layer
      realloc_tensor(&tensor2[branch], layer_config->out_channels, o_width,
                     o_height);
    } else {  // Output layer
      free_tensor(&tensor2[branch]);
      assign_tensor(&tensor2[branch], output[output_num],
                    layer_config->out_channels, o_width, o_height,
                    out_stride[output_num]);
    }

    // If we are combining branches make sure that the branch to combine
    // is different from the current branch.
    assert(IMPLIES(layer_config->branch_combine_type != BRANCH_NOC,
                   !(branch_config->branches_to_combine & (1 << branch))));

    if (layer_config->branch_copy_type == BRANCH_INPUT) {
      copy_active_tensor_to_branches(&tensor1[branch], layer_config, branch,
                                     tensor2);
    }
    // Check consistency of input and output channels
    assert(tensor1[branch].channels == layer_config->in_channels);
    assert(tensor2[branch].channels == layer_config->out_channels);

    // Convolve/Deconvolve
    if (!cnn_config->layer_config[layer].deconvolve) {
      if (thread_data->num_workers > 1) {
        convolve_layer_mt((const float **)tensor1[branch].buf,
                          tensor1[branch].width, tensor1[branch].height,
                          tensor1[branch].stride, layer_config, thread_data,
                          tensor2[branch].buf, tensor2[branch].stride);
      } else {
        av1_cnn_convolve((const float **)tensor1[branch].buf,
                         tensor1[branch].width, tensor1[branch].height,
                         tensor1[branch].stride, layer_config,
                         tensor2[branch].buf, tensor2[branch].stride, 0, 1);
      }
    } else {
      av1_cnn_deconvolve((const float **)tensor1[branch].buf,
                         tensor1[branch].width, tensor1[branch].height,
                         tensor1[branch].stride, layer_config,
                         tensor2[branch].buf, tensor2[branch].stride);
    }

    if (layer_config->branch_copy_type == BRANCH_OUTPUT) {
      copy_active_tensor_to_branches(&tensor2[branch], layer_config, branch,
                                     tensor2);
    }

    // Add tensors from other branches if needed
    if (layer_config->branch_combine_type == BRANCH_ADD) {
      for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
        if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
          assert(check_tensor_equal_size(&tensor2[b], &tensor2[branch]));
          av1_cnn_add(tensor2[branch].buf, tensor2[branch].channels,
                      tensor2[branch].width, tensor2[branch].height,
                      tensor2[branch].stride, (const float **)tensor2[b].buf);
        }
      }
    }

    // Non-linearity
    if (layer_config->activation != IDENTITY)
      av1_cnn_activate(tensor2[branch].buf, tensor2[branch].channels,
                       tensor2[branch].width, tensor2[branch].height,
                       tensor2[branch].stride, layer_config->activation);

    if (layer_config->bn_params.bn_gamma) {
      av1_cnn_batchnorm(
          tensor2[branch].buf, tensor2[branch].channels, tensor2[branch].width,
          tensor2[branch].height, tensor2[branch].stride,
          layer_config->bn_params.bn_gamma, layer_config->bn_params.bn_beta,
          layer_config->bn_params.bn_mean, layer_config->bn_params.bn_std);
    }

    // Concatenate tensors
    if (layer_config->branch_combine_type == BRANCH_CAT) {
      if (output_num == -1) {  // Non-output layer
        for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
          if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
            assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch]));
            assert(tensor2[b].channels > 0);
            concat_tensor(&tensor2[b], &tensor2[branch]);
          }
        }
      } else {  // Output layer
        const int existing_channels = tensor2[branch].channels;
        int num_chs = existing_channels;
        for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
          if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
            assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch]));
            // Needed only to assign the new channel buffers
            num_chs += tensor2[b].channels;
          }
        }
        assign_tensor(&tensor2[branch], output[output_num], num_chs, o_width,
                      o_height, out_stride[output_num]);

        num_chs = existing_channels;
        for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
          if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
            assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch]));
            // Needed only to assign the new channel buffers
            copy_tensor(&tensor2[b], tensor2[b].channels, num_chs,
                        &tensor2[branch]);
            num_chs += tensor2[b].channels;
          }
        }
      }
    }

    if (layer_config->branch_copy_type == BRANCH_COMBINED) {
      copy_active_tensor_to_branches(&tensor2[branch], layer_config, branch,
                                     tensor2);
    }
  }

  for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
    free_tensor(&tensor1[b]);
    free_tensor(&tensor2[b]);
  }
}
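
// Flow note (editor's summary): within each branch, tensor1 holds the current
// layer's input and tensor2 its output; at the start of every layer after the
// first the two are swapped, so the previous output becomes the next input
// without any copying.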

// Assume output already has proper allocation
// Assume input image buffers all have same resolution and strides
// [coverage: 6.08k]
void av1_cnn_predict_img_multi_out(uint8_t **dgd, int width, int height,
                                   int stride, const CNN_CONFIG *cnn_config,
                                   const CNN_THREAD_DATA *thread_data,
                                   CNN_MULTI_OUT *output) {
  const float max_val = 255.0;

  const int in_width = width + 2 * cnn_config->ext_width;
  const int in_height = height + 2 * cnn_config->ext_height;
  const int in_channels = cnn_config->layer_config[0].in_channels;
  float *inputs[CNN_MAX_CHANNELS];
  float *input_ =
      (float *)aom_malloc(in_width * in_height * in_channels * sizeof(*input_));
  const int in_stride = in_width;

  for (int c = 0; c < in_channels; ++c) {
    inputs[c] = input_ + c * in_stride * in_height;
    float *input =
        inputs[c] + cnn_config->ext_height * in_stride + cnn_config->ext_width;

    if (cnn_config->strict_bounds) {
      for (int i = 0; i < height; ++i)
        for (int j = 0; j < width; ++j)
          input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
      // extend left and right
      for (int i = 0; i < height; ++i) {
        for (int j = -cnn_config->ext_width; j < 0; ++j)
          input[i * in_stride + j] = input[i * in_stride];
        for (int j = width; j < width + cnn_config->ext_width; ++j)
          input[i * in_stride + j] = input[i * in_stride + width - 1];
      }
      // extend top and bottom
      for (int i = -cnn_config->ext_height; i < 0; ++i)
        memcpy(&input[i * in_stride - cnn_config->ext_width],
               &input[-cnn_config->ext_width], in_width * sizeof(*input));
      for (int i = height; i < height + cnn_config->ext_height; ++i)
        memcpy(&input[i * in_stride - cnn_config->ext_width],
               &input[(height - 1) * in_stride - cnn_config->ext_width],
               in_width * sizeof(*input));
    } else {
      for (int i = -cnn_config->ext_height; i < height + cnn_config->ext_height;
           ++i)
        for (int j = -cnn_config->ext_width; j < width + cnn_config->ext_width;
             ++j)
          input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
    }
  }
  av1_cnn_predict((const float **)inputs, in_width, in_height, in_stride,
                  cnn_config, thread_data, output);

  aom_free(input_);
}
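
// Input-conditioning note (editor's summary): pixels are normalized to [0, 1]
// by dividing by max_val (255 for 8-bit input, (1 << bit_depth) - 1 in the
// high-bit-depth variant below), and the frame is padded by ext_width /
// ext_height on each side, either by replicating edge samples
// (strict_bounds) or by reading the caller's out-of-frame samples directly.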

// Assume output already has proper allocation
// Assume input image buffers all have same resolution and strides
// [coverage: 0]
void av1_cnn_predict_img_multi_out_highbd(uint16_t **dgd, int width,
                                          int height, int stride,
                                          const CNN_CONFIG *cnn_config,
                                          const CNN_THREAD_DATA *thread_data,
                                          int bit_depth,
                                          CNN_MULTI_OUT *output) {
  const float max_val = (float)((1 << bit_depth) - 1);

  const int in_width = width + 2 * cnn_config->ext_width;
  const int in_height = height + 2 * cnn_config->ext_height;
  const int in_channels = cnn_config->layer_config[0].in_channels;
  float *inputs[CNN_MAX_CHANNELS];
  float *input_ =
      (float *)aom_malloc(in_width * in_height * in_channels * sizeof(*input_));
  const int in_stride = in_width;

  for (int c = 0; c < in_channels; ++c) {
    inputs[c] = input_ + c * in_stride * in_height;
    float *input =
        inputs[c] + cnn_config->ext_height * in_stride + cnn_config->ext_width;

    if (cnn_config->strict_bounds) {
      for (int i = 0; i < height; ++i)
        for (int j = 0; j < width; ++j)
          input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
      // extend left and right
      for (int i = 0; i < height; ++i) {
        for (int j = -cnn_config->ext_width; j < 0; ++j)
          input[i * in_stride + j] = input[i * in_stride];
        for (int j = width; j < width + cnn_config->ext_width; ++j)
          input[i * in_stride + j] = input[i * in_stride + width - 1];
      }
      // extend top and bottom
      for (int i = -cnn_config->ext_height; i < 0; ++i)
        memcpy(&input[i * in_stride - cnn_config->ext_width],
               &input[-cnn_config->ext_width], in_width * sizeof(*input));
      for (int i = height; i < height + cnn_config->ext_height; ++i)
        memcpy(&input[i * in_stride - cnn_config->ext_width],
               &input[(height - 1) * in_stride - cnn_config->ext_width],
               in_width * sizeof(*input));
    } else {
      for (int i = -cnn_config->ext_height; i < height + cnn_config->ext_height;
           ++i)
        for (int j = -cnn_config->ext_width; j < width + cnn_config->ext_width;
             ++j)
          input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
    }
  }

  av1_cnn_predict((const float **)inputs, in_width, in_height, in_stride,
                  cnn_config, thread_data, output);

  aom_free(input_);
}

// Assume output already has proper allocation
// Assume input image buffers all have same resolution and strides
// [coverage: 0]
void av1_cnn_predict_img(uint8_t **dgd, int width, int height, int stride,
                         const CNN_CONFIG *cnn_config,
                         const CNN_THREAD_DATA *thread_data, float **output,
                         int out_stride) {
  int out_width = 0, out_height = 0, out_channels = 0;
  av1_find_cnn_output_size(width, height, cnn_config, &out_width, &out_height,
                           &out_channels);
  const int output_chs[1] = { out_channels };
  const int output_strides[1] = { out_stride };
  CNN_MULTI_OUT output_struct = { .output_channels = output_chs,
                                  .output_strides = output_strides,
                                  .output_buffer = output };
  av1_cnn_predict_img_multi_out(dgd, width, height, stride, cnn_config,
                                thread_data, &output_struct);
}

// Assume output already has proper allocation
// Assume input image buffers all have same resolution and strides
// [coverage: 0]
void av1_cnn_predict_img_highbd(uint16_t **dgd, int width, int height,
                                int stride, const CNN_CONFIG *cnn_config,
                                const CNN_THREAD_DATA *thread_data,
                                int bit_depth, float **output, int out_stride) {
  int out_width = 0, out_height = 0, out_channels = 0;
  av1_find_cnn_output_size(width, height, cnn_config, &out_width, &out_height,
                           &out_channels);
  const int output_chs[1] = { out_channels };
  const int output_strides[1] = { out_stride };
  CNN_MULTI_OUT output_struct = { .output_channels = output_chs,
                                  .output_strides = output_strides,
                                  .output_buffer = output };
  av1_cnn_predict_img_multi_out_highbd(dgd, width, height, stride, cnn_config,
                                       thread_data, bit_depth, &output_struct);
}
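
// Minimal usage sketch (editor's illustration; names other than the av1_*
// functions are hypothetical, and the CNN_THREAD_DATA initializer assumes its
// first field is num_workers):
//
//   int out_w = 0, out_h = 0, out_ch = 0;
//   av1_find_cnn_output_size(width, height, &cnn_config, &out_w, &out_h,
//                            &out_ch);
//   float *out_planes[CNN_MAX_CHANNELS];  // point these at out_w * out_h
//                                         // float buffers you allocate
//   CNN_THREAD_DATA td = { 1, NULL };     // single-threaded
//   av1_cnn_predict_img(dgd_planes, width, height, stride, &cnn_config, &td,
//                       out_planes, out_w);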