Coverage Report

Created: 2026-04-01 07:11

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/xnnpack/src/f32-dwconv/gen/f32-dwconv-3p16c-minmax-fma3.c
Line
Count
Source
1
// clang-format off
2
// Auto-generated file. Do not edit!
3
//   Template: src/f32-dwconv/unipass-avx.c.in
4
//   Generator: tools/xngen
5
//
6
// Copyright 2019 Google LLC
7
//
8
// This source code is licensed under the BSD-style license found in the
9
// LICENSE file in the root directory of this source tree.
10
11
#include <assert.h>
12
#include <stddef.h>
13
#include <stdint.h>
14
15
#include <immintrin.h>
16
17
#include "src/xnnpack/common.h"
18
#include "src/xnnpack/microparams.h"
19
#include "src/xnnpack/dwconv.h"
20
21
22
void xnn_f32_dwconv_minmax_ukernel_3p16c__fma3(
23
    size_t channels,
24
    size_t output_width,
25
    const float** input,
26
    const float* weights,
27
    float* output,
28
    intptr_t input_stride,
29
    size_t output_increment,
30
    size_t input_offset,
31
    size_t input_pixel_stride,
32
    const float* zero,
33
    const struct xnn_f32_minmax_params* restrict params)
34
0
{
35
0
  assert(channels != 0);
36
0
  assert(output_width != 0);
37
38
0
  static const int32_t mask_table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0};
39
40
0
  const __m256 vmin = _mm256_set1_ps(params->scalar.min);
41
0
  const __m256 vmax = _mm256_set1_ps(params->scalar.max);
42
0
  XNN_FORCE_REALIZATION(vmin);
43
0
  XNN_FORCE_REALIZATION(vmax);
44
0
  do {
45
0
    const float* i0 = input[0];
46
0
    assert(i0 != NULL);
47
0
    if XNN_UNPREDICTABLE(i0 != zero) {
48
0
      i0 = (const float*) ((uintptr_t) i0 + input_offset);
49
0
    }
50
0
    const float* i1 = input[1];
51
0
    assert(i1 != NULL);
52
0
    if XNN_UNPREDICTABLE(i1 != zero) {
53
0
      i1 = (const float*) ((uintptr_t) i1 + input_offset);
54
0
    }
55
0
    const float* i2 = input[2];
56
0
    assert(i2 != NULL);
57
0
    if XNN_UNPREDICTABLE(i2 != zero) {
58
0
      i2 = (const float*) ((uintptr_t) i2 + input_offset);
59
0
    }
60
0
    input = (const float**) ((uintptr_t) input + input_stride);
61
62
0
    size_t c = channels;
63
0
    const float* w = weights;
64
0
    for (; c >= 16; c -= 16) {
65
0
      __m256 vacc01234567p0 = _mm256_load_ps(w);
66
0
      __m256 vacc89ABCDEFp0 = _mm256_load_ps(w + 8);
67
68
69
0
      const __m256 vi0x01234567 = _mm256_loadu_ps(i0);
70
0
      const __m256 vi0x89ABCDEF = _mm256_loadu_ps(i0 + 8);
71
0
      i0 += 16;
72
73
0
      const __m256 vk0x01234567 = _mm256_load_ps(w + 16);
74
0
      const __m256 vk0x89ABCDEF = _mm256_load_ps(w + 24);
75
0
      vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0);
76
0
      vacc89ABCDEFp0 = _mm256_fmadd_ps(vi0x89ABCDEF, vk0x89ABCDEF, vacc89ABCDEFp0);
77
78
0
      const __m256 vi1x01234567 = _mm256_loadu_ps(i1);
79
0
      const __m256 vi1x89ABCDEF = _mm256_loadu_ps(i1 + 8);
80
0
      i1 += 16;
81
82
0
      const __m256 vk1x01234567 = _mm256_load_ps(w + 32);
83
0
      const __m256 vk1x89ABCDEF = _mm256_load_ps(w + 40);
84
0
      vacc01234567p0 = _mm256_fmadd_ps(vi1x01234567, vk1x01234567, vacc01234567p0);
85
0
      vacc89ABCDEFp0 = _mm256_fmadd_ps(vi1x89ABCDEF, vk1x89ABCDEF, vacc89ABCDEFp0);
86
87
0
      const __m256 vi2x01234567 = _mm256_loadu_ps(i2);
88
0
      const __m256 vi2x89ABCDEF = _mm256_loadu_ps(i2 + 8);
89
0
      i2 += 16;
90
91
0
      const __m256 vk2x01234567 = _mm256_load_ps(w + 48);
92
0
      const __m256 vk2x89ABCDEF = _mm256_load_ps(w + 56);
93
0
      vacc01234567p0 = _mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0);
94
0
      vacc89ABCDEFp0 = _mm256_fmadd_ps(vi2x89ABCDEF, vk2x89ABCDEF, vacc89ABCDEFp0);
95
96
0
      w += 64;
97
98
99
0
      __m256 vacc01234567 = _mm256_max_ps(vmin, vacc01234567p0);
100
0
      __m256 vacc89ABCDEF = _mm256_max_ps(vmin, vacc89ABCDEFp0);
101
0
      vacc01234567 = _mm256_min_ps(vmax, vacc01234567);
102
0
      vacc89ABCDEF = _mm256_min_ps(vmax, vacc89ABCDEF);
103
104
0
      _mm256_storeu_ps(output, vacc01234567);
105
0
      _mm256_storeu_ps(output + 8, vacc89ABCDEF);
106
0
      output += 16;
107
0
    }
108
0
    for (; c >= 8; c -= 8) {
109
0
      __m256 vacc01234567p0 = _mm256_load_ps(w);
110
111
0
      const __m256 vi0x01234567 = _mm256_loadu_ps(i0);
112
0
      i0 += 8;
113
114
0
      const __m256 vk0x01234567 = _mm256_load_ps(w + 16);
115
0
      vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0);
116
117
0
      const __m256 vi1x01234567 = _mm256_loadu_ps(i1);
118
0
      i1 += 8;
119
120
0
      const __m256 vk1x01234567 = _mm256_load_ps(w + 32);
121
0
      vacc01234567p0 = _mm256_fmadd_ps(vi1x01234567, vk1x01234567, vacc01234567p0);
122
123
0
      const __m256 vi2x01234567 = _mm256_loadu_ps(i2);
124
0
      i2 += 8;
125
126
0
      const __m256 vk2x01234567 = _mm256_load_ps(w + 48);
127
0
      vacc01234567p0 = _mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0);
128
129
0
      w += 8;
130
131
132
0
      __m256 vacc01234567 = _mm256_max_ps(vmin, vacc01234567p0);
133
0
      vacc01234567 = _mm256_min_ps(vmax, vacc01234567);
134
135
0
      _mm256_storeu_ps(output, vacc01234567);
136
0
      output += 8;
137
0
    }
138
0
    if XNN_UNLIKELY(c != 0) {
139
0
      assert(c >= 1);
140
0
      assert(c <= 7);
141
0
      const __m256i vmask = _mm256_loadu_si256((const __m256i*) &mask_table[8 - c]);
142
143
0
      __m256 vacc01234567p0 = _mm256_load_ps(w);
144
145
0
      const __m256 vi0x01234567 = _mm256_maskload_ps(i0, vmask);
146
0
      const __m256 vk0x01234567 = _mm256_load_ps(w + 16);
147
0
      vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0);
148
149
0
      const __m256 vi1x01234567 = _mm256_maskload_ps(i1, vmask);
150
0
      const __m256 vk1x01234567 = _mm256_load_ps(w + 32);
151
0
      vacc01234567p0 = _mm256_fmadd_ps(vi1x01234567, vk1x01234567, vacc01234567p0);
152
153
0
      const __m256 vi2x01234567 = _mm256_maskload_ps(i2, vmask);
154
0
      const __m256 vk2x01234567 = _mm256_load_ps(w + 48);
155
0
      vacc01234567p0 = _mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0);
156
157
158
0
      __m256 vacc01234567 = _mm256_max_ps(vmin, vacc01234567p0);
159
0
      vacc01234567 = _mm256_min_ps(vmax, vacc01234567);
160
161
0
      _mm256_maskstore_ps(output, vmask, vacc01234567);
162
0
      output += c;
163
0
    }
164
165
0
    input_offset += input_pixel_stride;
166
0
    output = (float*) ((uintptr_t) output + output_increment);
167
0
  } while (--output_width != 0);
168
0
}