Coverage Report

Created: 2025-06-13 07:07

/src/aom/aom_dsp/x86/intrapred_sse4.c
All executable lines below have an execution count of 0 (the file is entirely uncovered).
/*
 * Copyright (c) 2021, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <emmintrin.h>  // SSE2
#include <smmintrin.h>  // SSE4.1

#include "config/av1_rtcd.h"
#include "aom_dsp/x86/intrapred_x86.h"
#include "aom_dsp/x86/intrapred_utils.h"
#include "aom_dsp/x86/lpf_common_sse2.h"

// Low bit depth functions
static DECLARE_ALIGNED(16, uint8_t, Mask[2][33][16]) = {
  { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0,
      0 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0,
      0 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0,
      0, 0 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
      0, 0, 0 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0, 0, 0 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0, 0 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff } },
  {
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
        0 },
      { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0,
        0 },
      { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0,
        0, 0 },
      { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0,
        0, 0, 0 },
      { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0, 0, 0, 0 },
      { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0xff, 0, 0, 0 },
      { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0xff, 0xff, 0, 0 },
      { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0xff, 0xff, 0xff, 0 },
      { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0xff, 0xff, 0xff, 0xff },
  },
};
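
// Note: Mask[0][n] has 0xff in its first n bytes and zeros elsewhere;
// Mask[1][n] covers the upper 16 lanes of a 32-byte row (0xff in its first
// n - 16 bytes once n exceeds 16). The kernels below feed these rows to
// _mm_blendv_epi8 so that lanes past the last valid edge pixel fall back to
// the replicated above[max_base_x] value.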

/* clang-format on */
static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_sse4_1(
    int H, int W, __m128i *dst, const uint8_t *above, int upsample_above,
    int dx) {
  const int frac_bits = 6 - upsample_above;
  const int max_base_x = ((W + H) - 1) << upsample_above;

  assert(dx > 0);
  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
  __m128i a0, a1, a32, a16;
  __m128i diff, c3f;
  __m128i a_mbase_x;

  a16 = _mm_set1_epi16(16);
  a_mbase_x = _mm_set1_epi8((char)above[max_base_x]);
  c3f = _mm_set1_epi16(0x3f);

  int x = dx;
  for (int r = 0; r < W; r++) {
    __m128i b, res, res1, shift;
    __m128i a0_above, a1_above;

    int base = x >> frac_bits;
    int base_max_diff = (max_base_x - base) >> upsample_above;
    if (base_max_diff <= 0) {
      for (int i = r; i < W; ++i) {
        dst[i] = a_mbase_x;  // save 4 values
      }
      return;
    }
    if (base_max_diff > H) base_max_diff = H;
    a0_above = _mm_loadu_si128((__m128i *)(above + base));
    a1_above = _mm_loadu_si128((__m128i *)(above + base + 1));

    if (upsample_above) {
      a0_above = _mm_shuffle_epi8(a0_above, *(__m128i *)EvenOddMaskx[0]);
      a1_above = _mm_srli_si128(a0_above, 8);

      shift = _mm_srli_epi16(
          _mm_and_si128(_mm_slli_epi16(_mm_set1_epi16(x), upsample_above), c3f),
          1);
    } else {
      shift = _mm_srli_epi16(_mm_and_si128(_mm_set1_epi16(x), c3f), 1);
    }
    // lower half
    a0 = _mm_cvtepu8_epi16(a0_above);
    a1 = _mm_cvtepu8_epi16(a1_above);

    diff = _mm_sub_epi16(a1, a0);   // a[x+1] - a[x]
    a32 = _mm_slli_epi16(a0, 5);    // a[x] * 32
    a32 = _mm_add_epi16(a32, a16);  // a[x] * 32 + 16

    b = _mm_mullo_epi16(diff, shift);
    res = _mm_add_epi16(a32, b);
    res = _mm_srli_epi16(res, 5);

    // upper half
    a0 = _mm_cvtepu8_epi16(_mm_srli_si128(a0_above, 8));
    a1 = _mm_cvtepu8_epi16(_mm_srli_si128(a1_above, 8));

    diff = _mm_sub_epi16(a1, a0);   // a[x+1] - a[x]
    a32 = _mm_slli_epi16(a0, 5);    // a[x] * 32
    a32 = _mm_add_epi16(a32, a16);  // a[x] * 32 + 16

    b = _mm_mullo_epi16(diff, shift);
    res1 = _mm_add_epi16(a32, b);
    res1 = _mm_srli_epi16(res1, 5);

    res = _mm_packus_epi16(res, res1);

    dst[r] =
        _mm_blendv_epi8(a_mbase_x, res, *(__m128i *)Mask[0][base_max_diff]);
    x += dx;
  }
}
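
// A minimal scalar model of the interpolation the kernel above vectorizes,
// spelled out from its pre-filter comments. This is an illustrative sketch
// only (the helper name is hypothetical, not part of libaom), assuming
// upsample_above == 0 so that frac_bits == 6:
static inline uint8_t dr_z1_pixel_sketch(const uint8_t *above, int x,
                                         int max_base_x) {
  const int base = x >> 6;  // integer sample position on the top edge
  if (base >= max_base_x) return above[max_base_x];
  const int shift = (x & 0x3f) >> 1;  // 5-bit fractional weight, as with c3f
  return (uint8_t)((above[base] * 32 + 16 +
                    (above[base + 1] - above[base]) * shift) >>
                   5);
}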

static void dr_prediction_z1_4xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        int upsample_above, int dx) {
  __m128i dstvec[16];

  dr_prediction_z1_HxW_internal_sse4_1(4, N, dstvec, above, upsample_above, dx);
  for (int i = 0; i < N; i++) {
    *(int *)(dst + stride * i) = _mm_cvtsi128_si32(dstvec[i]);
  }
}

static void dr_prediction_z1_8xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        int upsample_above, int dx) {
  __m128i dstvec[32];

  dr_prediction_z1_HxW_internal_sse4_1(8, N, dstvec, above, upsample_above, dx);
  for (int i = 0; i < N; i++) {
    _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]);
  }
}

static void dr_prediction_z1_16xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
                                         const uint8_t *above,
                                         int upsample_above, int dx) {
  __m128i dstvec[64];

  dr_prediction_z1_HxW_internal_sse4_1(16, N, dstvec, above, upsample_above,
                                       dx);
  for (int i = 0; i < N; i++) {
    _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]);
  }
}

static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_sse4_1(
    int N, __m128i *dstvec, __m128i *dstvec_h, const uint8_t *above,
    int upsample_above, int dx) {
  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
  (void)upsample_above;
  const int frac_bits = 6;
  const int max_base_x = ((32 + N) - 1);

  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
  __m128i a0, a1, a32, a16;
  __m128i a_mbase_x, diff, c3f;

  a16 = _mm_set1_epi16(16);
  a_mbase_x = _mm_set1_epi8((char)above[max_base_x]);
  c3f = _mm_set1_epi16(0x3f);

  int x = dx;
  for (int r = 0; r < N; r++) {
    __m128i b, res, res1, res16[2];
    __m128i a0_above, a1_above;

    int base = x >> frac_bits;
    int base_max_diff = (max_base_x - base);
    if (base_max_diff <= 0) {
      for (int i = r; i < N; ++i) {
        dstvec[i] = a_mbase_x;  // save 32 values
        dstvec_h[i] = a_mbase_x;
      }
      return;
    }
    if (base_max_diff > 32) base_max_diff = 32;
    __m128i shift = _mm_srli_epi16(_mm_and_si128(_mm_set1_epi16(x), c3f), 1);

    for (int j = 0, jj = 0; j < 32; j += 16, jj++) {
      int mdiff = base_max_diff - j;
      if (mdiff <= 0) {
        res16[jj] = a_mbase_x;
      } else {
        a0_above = _mm_loadu_si128((__m128i *)(above + base + j));
        a1_above = _mm_loadu_si128((__m128i *)(above + base + j + 1));

        // lower half
        a0 = _mm_cvtepu8_epi16(a0_above);
        a1 = _mm_cvtepu8_epi16(a1_above);

        diff = _mm_sub_epi16(a1, a0);   // a[x+1] - a[x]
        a32 = _mm_slli_epi16(a0, 5);    // a[x] * 32
        a32 = _mm_add_epi16(a32, a16);  // a[x] * 32 + 16
        b = _mm_mullo_epi16(diff, shift);

        res = _mm_add_epi16(a32, b);
        res = _mm_srli_epi16(res, 5);

        // upper half
        a0 = _mm_cvtepu8_epi16(_mm_srli_si128(a0_above, 8));
        a1 = _mm_cvtepu8_epi16(_mm_srli_si128(a1_above, 8));

        diff = _mm_sub_epi16(a1, a0);   // a[x+1] - a[x]
        a32 = _mm_slli_epi16(a0, 5);    // a[x] * 32
        a32 = _mm_add_epi16(a32, a16);  // a[x] * 32 + 16

        b = _mm_mullo_epi16(diff, shift);
        res1 = _mm_add_epi16(a32, b);
        res1 = _mm_srli_epi16(res1, 5);

        res16[jj] = _mm_packus_epi16(res, res1);  // 16 8bit values
      }
    }

    dstvec[r] =
        _mm_blendv_epi8(a_mbase_x, res16[0],
                        *(__m128i *)Mask[0][base_max_diff]);  // 16 8bit values

    dstvec_h[r] =
        _mm_blendv_epi8(a_mbase_x, res16[1],
                        *(__m128i *)Mask[1][base_max_diff]);  // 16 8bit values
    x += dx;
  }
}

static void dr_prediction_z1_32xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
                                         const uint8_t *above,
                                         int upsample_above, int dx) {
  __m128i dstvec[64], dstvec_h[64];
  dr_prediction_z1_32xN_internal_sse4_1(N, dstvec, dstvec_h, above,
                                        upsample_above, dx);
  for (int i = 0; i < N; i++) {
    _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]);
    _mm_storeu_si128((__m128i *)(dst + stride * i + 16), dstvec_h[i]);
  }
}

static void dr_prediction_z1_64xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
                                         const uint8_t *above,
                                         int upsample_above, int dx) {
  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
  (void)upsample_above;
  const int frac_bits = 6;
  const int max_base_x = ((64 + N) - 1);

  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
  __m128i a0, a1, a32, a16;
  __m128i a_mbase_x, diff, c3f;
  __m128i max_base, base_inc, mask;

  a16 = _mm_set1_epi16(16);
  a_mbase_x = _mm_set1_epi8((char)above[max_base_x]);
  max_base = _mm_set1_epi8(max_base_x);
  c3f = _mm_set1_epi16(0x3f);

  int x = dx;
  for (int r = 0; r < N; r++, dst += stride) {
    __m128i b, res, res1;
    int base = x >> frac_bits;
    if (base >= max_base_x) {
      for (int i = r; i < N; ++i) {
        _mm_storeu_si128((__m128i *)dst, a_mbase_x);  // save 32 values
        _mm_storeu_si128((__m128i *)(dst + 16), a_mbase_x);
        _mm_storeu_si128((__m128i *)(dst + 32), a_mbase_x);
        _mm_storeu_si128((__m128i *)(dst + 48), a_mbase_x);
        dst += stride;
      }
      return;
    }

    __m128i shift =
        _mm_srli_epi16(_mm_and_si128(_mm_set1_epi16(x), c3f), 1);  // 8 element

    __m128i a0_above, a1_above, res_val;
    for (int j = 0; j < 64; j += 16) {
      int mdif = max_base_x - (base + j);
      if (mdif <= 0) {
        _mm_storeu_si128((__m128i *)(dst + j), a_mbase_x);
      } else {
        a0_above =
            _mm_loadu_si128((__m128i *)(above + base + j));  // load 16 element
        a1_above = _mm_loadu_si128((__m128i *)(above + base + 1 + j));

        // lower half
        a0 = _mm_cvtepu8_epi16(a0_above);
        a1 = _mm_cvtepu8_epi16(a1_above);

        diff = _mm_sub_epi16(a1, a0);   // a[x+1] - a[x]
        a32 = _mm_slli_epi16(a0, 5);    // a[x] * 32
        a32 = _mm_add_epi16(a32, a16);  // a[x] * 32 + 16
        b = _mm_mullo_epi16(diff, shift);

        res = _mm_add_epi16(a32, b);
        res = _mm_srli_epi16(res, 5);

        // upper half
        a0 = _mm_cvtepu8_epi16(_mm_srli_si128(a0_above, 8));
        a1 = _mm_cvtepu8_epi16(_mm_srli_si128(a1_above, 8));

        diff = _mm_sub_epi16(a1, a0);   // a[x+1] - a[x]
        a32 = _mm_slli_epi16(a0, 5);    // a[x] * 32
        a32 = _mm_add_epi16(a32, a16);  // a[x] * 32 + 16

        b = _mm_mullo_epi16(diff, shift);
        res1 = _mm_add_epi16(a32, b);
        res1 = _mm_srli_epi16(res1, 5);

        res = _mm_packus_epi16(res, res1);  // 16 8bit values

        base_inc =
            _mm_setr_epi8((int8_t)(base + j), (int8_t)(base + j + 1),
                          (int8_t)(base + j + 2), (int8_t)(base + j + 3),
                          (int8_t)(base + j + 4), (int8_t)(base + j + 5),
                          (int8_t)(base + j + 6), (int8_t)(base + j + 7),
                          (int8_t)(base + j + 8), (int8_t)(base + j + 9),
                          (int8_t)(base + j + 10), (int8_t)(base + j + 11),
                          (int8_t)(base + j + 12), (int8_t)(base + j + 13),
                          (int8_t)(base + j + 14), (int8_t)(base + j + 15));

        mask = _mm_cmpgt_epi8(_mm_subs_epu8(max_base, base_inc),
                              _mm_setzero_si128());
        res_val = _mm_blendv_epi8(a_mbase_x, res, mask);
        _mm_storeu_si128((__m128i *)(dst + j), res_val);
      }
    }
    x += dx;
  }
}
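
// The Mask lookup table only spans offsets 0..32, so the 64-wide loop above
// builds its blend mask arithmetically instead: base_inc holds the per-lane
// base indices, _mm_subs_epu8(max_base, base_inc) saturates lanes at or past
// max_base_x to zero, and _mm_cmpgt_epi8 against zero turns the remaining
// lanes into the 0xff bytes that _mm_blendv_epi8 expects.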

// Directional prediction, zone 1: 0 < angle < 90
void av1_dr_prediction_z1_sse4_1(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                                 const uint8_t *above, const uint8_t *left,
                                 int upsample_above, int dx, int dy) {
  (void)left;
  (void)dy;
  switch (bw) {
    case 4:
      dr_prediction_z1_4xN_sse4_1(bh, dst, stride, above, upsample_above, dx);
      break;
    case 8:
      dr_prediction_z1_8xN_sse4_1(bh, dst, stride, above, upsample_above, dx);
      break;
    case 16:
      dr_prediction_z1_16xN_sse4_1(bh, dst, stride, above, upsample_above, dx);
      break;
    case 32:
      dr_prediction_z1_32xN_sse4_1(bh, dst, stride, above, upsample_above, dx);
      break;
    case 64:
      dr_prediction_z1_64xN_sse4_1(bh, dst, stride, above, upsample_above, dx);
      break;
    default: assert(0 && "Invalid block size");
  }
  return;
}
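
// A hedged usage sketch (not part of the original file): predicting one
// 16x16 zone-1 block. dx == 64 advances the edge position by exactly one
// pixel per row, i.e. a 45-degree direction; `left` and `dy` are unused by
// z1. The `above` buffer is assumed to be padded as in libaom, since the
// kernels issue 16-byte unaligned loads near the end of the edge.
static void example_z1_16x16_sketch(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above) {
  av1_dr_prediction_z1_sse4_1(dst, stride, /*bw=*/16, /*bh=*/16, above,
                              /*left=*/NULL, /*upsample_above=*/0,
                              /*dx=*/64, /*dy=*/64);
}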

static void dr_prediction_z2_Nx4_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left, int upsample_above,
                                        int upsample_left, int dx, int dy) {
  const int min_base_x = -(1 << upsample_above);
  const int min_base_y = -(1 << upsample_left);
  const int frac_bits_x = 6 - upsample_above;
  const int frac_bits_y = 6 - upsample_left;

  assert(dx > 0);
  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
  __m128i a0_x, a1_x, a32, diff;

  const __m128i c3f = _mm_set1_epi16(0x3f);
  const __m128i min_y_base = _mm_set1_epi16(min_base_y);
  const __m128i c1234 = _mm_setr_epi16(0, 1, 2, 3, 4, 0, 0, 0);
  const __m128i dy_reg = _mm_set1_epi16(dy);
  const __m128i a16 = _mm_set1_epi16(16);

  for (int r = 0; r < N; r++) {
    __m128i b, res, shift, r6, ydx;
    __m128i resx, resy, resxy;
    __m128i a0_above, a1_above;
    int y = r + 1;
    int base_x = (-y * dx) >> frac_bits_x;
    int base_shift = 0;
    if (base_x < (min_base_x - 1)) {
      base_shift = (min_base_x - base_x - 1) >> upsample_above;
    }
    int base_min_diff =
        (min_base_x - base_x + upsample_above) >> upsample_above;
    if (base_min_diff > 4) {
      base_min_diff = 4;
    } else {
      if (base_min_diff < 0) base_min_diff = 0;
    }

    if (base_shift > 3) {
      a0_x = _mm_setzero_si128();
      a1_x = _mm_setzero_si128();
      shift = _mm_setzero_si128();
    } else {
      a0_above = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
      ydx = _mm_set1_epi16(y * dx);
      r6 = _mm_slli_epi16(c1234, 6);

      if (upsample_above) {
        a0_above =
            _mm_shuffle_epi8(a0_above, *(__m128i *)EvenOddMaskx[base_shift]);
        a1_above = _mm_srli_si128(a0_above, 8);

        shift = _mm_srli_epi16(
            _mm_and_si128(
                _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
            1);
      } else {
        a0_above =
            _mm_shuffle_epi8(a0_above, *(__m128i *)LoadMaskx[base_shift]);
        a1_above = _mm_srli_si128(a0_above, 1);

        shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);
      }
      a0_x = _mm_cvtepu8_epi16(a0_above);
      a1_x = _mm_cvtepu8_epi16(a1_above);
    }
    // y calc
    __m128i a0_y, a1_y, shifty;
    if (base_x < min_base_x) {
      DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
      __m128i y_c, base_y_c_reg, mask, c1234_;
      c1234_ = _mm_srli_si128(c1234, 2);
      r6 = _mm_set1_epi16(r << 6);
      y_c = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234_, dy_reg));
      base_y_c_reg = _mm_srai_epi16(y_c, frac_bits_y);
      mask = _mm_cmpgt_epi16(min_y_base, base_y_c_reg);
      base_y_c_reg = _mm_andnot_si128(mask, base_y_c_reg);
      _mm_store_si128((__m128i *)base_y_c, base_y_c_reg);

      a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
                            left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
      base_y_c_reg = _mm_add_epi16(base_y_c_reg, _mm_srli_epi16(a16, 4));
      _mm_store_si128((__m128i *)base_y_c, base_y_c_reg);
      a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
                            left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);

      if (upsample_left) {
        shifty = _mm_srli_epi16(
            _mm_and_si128(_mm_slli_epi16(y_c, upsample_left), c3f), 1);
      } else {
        shifty = _mm_srli_epi16(_mm_and_si128(y_c, c3f), 1);
      }
      a0_x = _mm_unpacklo_epi64(a0_x, a0_y);
      a1_x = _mm_unpacklo_epi64(a1_x, a1_y);
      shift = _mm_unpacklo_epi64(shift, shifty);
    }

    diff = _mm_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
    a32 = _mm_slli_epi16(a0_x, 5);     // a[x] * 32
    a32 = _mm_add_epi16(a32, a16);     // a[x] * 32 + 16

    b = _mm_mullo_epi16(diff, shift);
    res = _mm_add_epi16(a32, b);
    res = _mm_srli_epi16(res, 5);

    resx = _mm_packus_epi16(res, res);
    resy = _mm_srli_si128(resx, 4);

    resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)Mask[0][base_min_diff]);
    *(int *)(dst) = _mm_cvtsi128_si32(resxy);
    dst += stride;
  }
}
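
// In the Nx4 kernel above, a row's four pixels may come partly from `above`
// and partly from `left`. Both candidates are evaluated in one pass: the low
// 64-bit half of each register carries the x (above) samples and the high
// half the y (left) samples, and one _mm_blendv_epi8 against
// Mask[0][base_min_diff] substitutes the left-derived bytes for the first
// base_min_diff columns.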

static void dr_prediction_z2_Nx8_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left, int upsample_above,
                                        int upsample_left, int dx, int dy) {
  const int min_base_x = -(1 << upsample_above);
  const int min_base_y = -(1 << upsample_left);
  const int frac_bits_x = 6 - upsample_above;
  const int frac_bits_y = 6 - upsample_left;

  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
  __m128i diff, a32;
  __m128i a0_x, a1_x, a0_y, a1_y;
  __m128i a0_above, a1_above;

  const __m128i a16 = _mm_set1_epi16(16);
  const __m128i c3f = _mm_set1_epi16(0x3f);
  const __m128i min_y_base = _mm_set1_epi16(min_base_y);
  const __m128i dy_reg = _mm_set1_epi16(dy);
  const __m128i c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);

  for (int r = 0; r < N; r++) {
    __m128i b, res, res1, shift;
    __m128i resx, resy, resxy, r6, ydx;

    int y = r + 1;
    int base_x = (-y * dx) >> frac_bits_x;
    int base_shift = 0;
    if (base_x < (min_base_x - 1)) {
      base_shift = (min_base_x - base_x - 1) >> upsample_above;
    }
    int base_min_diff =
        (min_base_x - base_x + upsample_above) >> upsample_above;
    if (base_min_diff > 8) {
      base_min_diff = 8;
    } else {
      if (base_min_diff < 0) base_min_diff = 0;
    }

    if (base_shift > 7) {
      resx = _mm_setzero_si128();
    } else {
      a0_above = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
      ydx = _mm_set1_epi16(y * dx);
      r6 = _mm_slli_epi16(_mm_srli_si128(c1234, 2), 6);
      if (upsample_above) {
        a0_above =
            _mm_shuffle_epi8(a0_above, *(__m128i *)EvenOddMaskx[base_shift]);
        a1_above = _mm_srli_si128(a0_above, 8);

        shift = _mm_srli_epi16(
            _mm_and_si128(
                _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
            1);
      } else {
        a1_above = _mm_srli_si128(a0_above, 1);
        a0_above =
            _mm_shuffle_epi8(a0_above, *(__m128i *)LoadMaskx[base_shift]);
        a1_above =
            _mm_shuffle_epi8(a1_above, *(__m128i *)LoadMaskx[base_shift]);

        shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);
      }
      a0_x = _mm_cvtepu8_epi16(a0_above);
      a1_x = _mm_cvtepu8_epi16(a1_above);

      diff = _mm_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
      a32 = _mm_slli_epi16(a0_x, 5);     // a[x] * 32
      a32 = _mm_add_epi16(a32, a16);     // a[x] * 32 + 16

      b = _mm_mullo_epi16(diff, shift);
      res = _mm_add_epi16(a32, b);
      res = _mm_srli_epi16(res, 5);
      resx = _mm_packus_epi16(res, res);
    }

    // y calc
    if (base_x < min_base_x) {
      DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
      __m128i y_c, base_y_c_reg, mask;
      r6 = _mm_set1_epi16(r << 6);
      y_c = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy_reg));
      base_y_c_reg = _mm_srai_epi16(y_c, frac_bits_y);
      mask = _mm_cmpgt_epi16(min_y_base, base_y_c_reg);
      base_y_c_reg = _mm_andnot_si128(mask, base_y_c_reg);
      _mm_store_si128((__m128i *)base_y_c, base_y_c_reg);

      a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
                            left[base_y_c[2]], left[base_y_c[3]],
                            left[base_y_c[4]], left[base_y_c[5]],
                            left[base_y_c[6]], left[base_y_c[7]]);
      base_y_c_reg = _mm_add_epi16(base_y_c_reg, _mm_srli_epi16(a16, 4));
      _mm_store_si128((__m128i *)base_y_c, base_y_c_reg);

      a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
                            left[base_y_c[2]], left[base_y_c[3]],
                            left[base_y_c[4]], left[base_y_c[5]],
                            left[base_y_c[6]], left[base_y_c[7]]);

      if (upsample_left) {
        shift = _mm_srli_epi16(
            _mm_and_si128(_mm_slli_epi16(y_c, upsample_left), c3f), 1);
      } else {
        shift = _mm_srli_epi16(_mm_and_si128(y_c, c3f), 1);
      }

      diff = _mm_sub_epi16(a1_y, a0_y);  // a[x+1] - a[x]
      a32 = _mm_slli_epi16(a0_y, 5);     // a[x] * 32
      a32 = _mm_add_epi16(a32, a16);     // a[x] * 32 + 16

      b = _mm_mullo_epi16(diff, shift);
      res1 = _mm_add_epi16(a32, b);
      res1 = _mm_srli_epi16(res1, 5);

      resy = _mm_packus_epi16(res1, res1);
      resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)Mask[0][base_min_diff]);
      _mm_storel_epi64((__m128i *)dst, resxy);
    } else {
      _mm_storel_epi64((__m128i *)dst, resx);
    }

    dst += stride;
  }
}

static void dr_prediction_z2_HxW_sse4_1(int H, int W, uint8_t *dst,
                                        ptrdiff_t stride, const uint8_t *above,
                                        const uint8_t *left, int upsample_above,
                                        int upsample_left, int dx, int dy) {
  // here upsample_above and upsample_left are 0 by design of
  // av1_use_intra_edge_upsample
  const int min_base_x = -1;
  const int min_base_y = -1;
  (void)upsample_above;
  (void)upsample_left;
  const int frac_bits_x = 6;
  const int frac_bits_y = 6;

  __m128i a0_x, a1_x, a0_y, a1_y, a0_y_h, a1_y_h, a32;
  __m128i diff, shifty, shifty_h;
  __m128i a0_above, a1_above;

  DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
  const __m128i a16 = _mm_set1_epi16(16);
  const __m128i c1 = _mm_srli_epi16(a16, 4);
  const __m128i min_y_base = _mm_set1_epi16(min_base_y);
  const __m128i c3f = _mm_set1_epi16(0x3f);
  const __m128i dy256 = _mm_set1_epi16(dy);
  const __m128i c0123 = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
  const __m128i c0123_h = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
  const __m128i c1234 = _mm_add_epi16(c0123, c1);
  const __m128i c1234_h = _mm_add_epi16(c0123_h, c1);

  for (int r = 0; r < H; r++) {
    __m128i b, res, res1, shift, reg_j, r6, ydx;
    __m128i resx, resy;
    __m128i resxy;
    int y = r + 1;
    ydx = _mm_set1_epi16((int16_t)(y * dx));

    int base_x = (-y * dx) >> frac_bits_x;
    for (int j = 0; j < W; j += 16) {
      reg_j = _mm_set1_epi16(j);
      int base_shift = 0;
      if ((base_x + j) < (min_base_x - 1)) {
        base_shift = (min_base_x - (base_x + j) - 1);
      }
      int base_min_diff = (min_base_x - base_x - j);
      if (base_min_diff > 16) {
        base_min_diff = 16;
      } else {
        if (base_min_diff < 0) base_min_diff = 0;
      }

      if (base_shift < 16) {
        a0_above =
            _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j));
        a1_above =
            _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j));
        a0_above =
            _mm_shuffle_epi8(a0_above, *(__m128i *)LoadMaskx[base_shift]);
        a1_above =
            _mm_shuffle_epi8(a1_above, *(__m128i *)LoadMaskx[base_shift]);

        a0_x = _mm_cvtepu8_epi16(a0_above);
        a1_x = _mm_cvtepu8_epi16(a1_above);

        r6 = _mm_slli_epi16(_mm_add_epi16(c0123, reg_j), 6);
        shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);

        diff = _mm_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
        a32 = _mm_slli_epi16(a0_x, 5);     // a[x] * 32
        a32 = _mm_add_epi16(a32, a16);     // a[x] * 32 + 16

        b = _mm_mullo_epi16(diff, shift);
        res = _mm_add_epi16(a32, b);
        res = _mm_srli_epi16(res, 5);  // 16 16-bit values

        a0_x = _mm_cvtepu8_epi16(_mm_srli_si128(a0_above, 8));
        a1_x = _mm_cvtepu8_epi16(_mm_srli_si128(a1_above, 8));

        r6 = _mm_slli_epi16(_mm_add_epi16(c0123_h, reg_j), 6);
        shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);

        diff = _mm_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
        a32 = _mm_slli_epi16(a0_x, 5);     // a[x] * 32
        a32 = _mm_add_epi16(a32, a16);     // a[x] * 32 + 16

        b = _mm_mullo_epi16(diff, shift);
        res1 = _mm_add_epi16(a32, b);
        res1 = _mm_srli_epi16(res1, 5);  // 16 16-bit values

        resx = _mm_packus_epi16(res, res1);
      } else {
        resx = _mm_setzero_si128();
      }

      // y calc
      if (base_x < min_base_x) {
        __m128i c_reg, c_reg_h, y_reg, y_reg_h, base_y, base_y_h;
        __m128i mask, mask_h, mul16, mul16_h;
        r6 = _mm_set1_epi16(r << 6);
        c_reg = _mm_add_epi16(reg_j, c1234);
        c_reg_h = _mm_add_epi16(reg_j, c1234_h);
        mul16 = _mm_min_epu16(_mm_mullo_epi16(c_reg, dy256),
                              _mm_srli_epi16(min_y_base, 1));
        mul16_h = _mm_min_epu16(_mm_mullo_epi16(c_reg_h, dy256),
                                _mm_srli_epi16(min_y_base, 1));
        y_reg = _mm_sub_epi16(r6, mul16);
        y_reg_h = _mm_sub_epi16(r6, mul16_h);

        base_y = _mm_srai_epi16(y_reg, frac_bits_y);
        base_y_h = _mm_srai_epi16(y_reg_h, frac_bits_y);
        mask = _mm_cmpgt_epi16(min_y_base, base_y);
        mask_h = _mm_cmpgt_epi16(min_y_base, base_y_h);

        base_y = _mm_blendv_epi8(base_y, min_y_base, mask);
        base_y_h = _mm_blendv_epi8(base_y_h, min_y_base, mask_h);
        int16_t min_y = (int16_t)_mm_extract_epi16(base_y_h, 7);
        int16_t max_y = (int16_t)_mm_extract_epi16(base_y, 0);
        int16_t offset_diff = max_y - min_y;

        if (offset_diff < 16) {
          __m128i min_y_reg = _mm_set1_epi16(min_y);

          __m128i base_y_offset = _mm_sub_epi16(base_y, min_y_reg);
          __m128i base_y_offset_h = _mm_sub_epi16(base_y_h, min_y_reg);
          __m128i y_offset = _mm_packs_epi16(base_y_offset, base_y_offset_h);

          __m128i a0_mask = _mm_loadu_si128((__m128i *)(left + min_y));
          __m128i a1_mask = _mm_loadu_si128((__m128i *)(left + min_y + 1));
          __m128i LoadMask =
              _mm_loadu_si128((__m128i *)(LoadMaskz2[offset_diff / 4]));

          a0_mask = _mm_and_si128(a0_mask, LoadMask);
          a1_mask = _mm_and_si128(a1_mask, LoadMask);

          a0_mask = _mm_shuffle_epi8(a0_mask, y_offset);
          a1_mask = _mm_shuffle_epi8(a1_mask, y_offset);
          a0_y = _mm_cvtepu8_epi16(a0_mask);
          a1_y = _mm_cvtepu8_epi16(a1_mask);
          a0_y_h = _mm_cvtepu8_epi16(_mm_srli_si128(a0_mask, 8));
          a1_y_h = _mm_cvtepu8_epi16(_mm_srli_si128(a1_mask, 8));
        } else {
          base_y = _mm_andnot_si128(mask, base_y);
          base_y_h = _mm_andnot_si128(mask_h, base_y_h);
          _mm_store_si128((__m128i *)base_y_c, base_y);
          _mm_store_si128((__m128i *)&base_y_c[8], base_y_h);

          a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
                                left[base_y_c[2]], left[base_y_c[3]],
                                left[base_y_c[4]], left[base_y_c[5]],
                                left[base_y_c[6]], left[base_y_c[7]]);
          a0_y_h = _mm_setr_epi16(left[base_y_c[8]], left[base_y_c[9]],
                                  left[base_y_c[10]], left[base_y_c[11]],
                                  left[base_y_c[12]], left[base_y_c[13]],
                                  left[base_y_c[14]], left[base_y_c[15]]);
          base_y = _mm_add_epi16(base_y, c1);
          base_y_h = _mm_add_epi16(base_y_h, c1);
          _mm_store_si128((__m128i *)base_y_c, base_y);
          _mm_store_si128((__m128i *)&base_y_c[8], base_y_h);

          a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
                                left[base_y_c[2]], left[base_y_c[3]],
                                left[base_y_c[4]], left[base_y_c[5]],
                                left[base_y_c[6]], left[base_y_c[7]]);
          a1_y_h = _mm_setr_epi16(left[base_y_c[8]], left[base_y_c[9]],
                                  left[base_y_c[10]], left[base_y_c[11]],
                                  left[base_y_c[12]], left[base_y_c[13]],
                                  left[base_y_c[14]], left[base_y_c[15]]);
        }
        shifty = _mm_srli_epi16(_mm_and_si128(y_reg, c3f), 1);
        shifty_h = _mm_srli_epi16(_mm_and_si128(y_reg_h, c3f), 1);

        diff = _mm_sub_epi16(a1_y, a0_y);  // a[x+1] - a[x]
        a32 = _mm_slli_epi16(a0_y, 5);     // a[x] * 32
        a32 = _mm_add_epi16(a32, a16);     // a[x] * 32 + 16

        b = _mm_mullo_epi16(diff, shifty);
        res = _mm_add_epi16(a32, b);
        res = _mm_srli_epi16(res, 5);  // 16 16-bit values

        diff = _mm_sub_epi16(a1_y_h, a0_y_h);  // a[x+1] - a[x]
        a32 = _mm_slli_epi16(a0_y_h, 5);       // a[x] * 32
        a32 = _mm_add_epi16(a32, a16);         // a[x] * 32 + 16

        b = _mm_mullo_epi16(diff, shifty_h);
        res1 = _mm_add_epi16(a32, b);
        res1 = _mm_srli_epi16(res1, 5);  // 16 16-bit values
        resy = _mm_packus_epi16(res, res1);
      } else {
        resy = _mm_setzero_si128();
      }
      resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)Mask[0][base_min_diff]);
      _mm_storeu_si128((__m128i *)(dst + j), resxy);
    }  // for j
    dst += stride;
  }
}
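
// The offset_diff < 16 branch above replaces sixteen scalar loads from
// left[] with one 16-byte window load plus a byte shuffle, which works
// whenever all gathered indices fit in a 16-byte span. A standalone sketch
// of the trick (hypothetical helper, not part of libaom), assuming every
// rel_idx[i] lies in [0, 15]:
static inline __m128i gather16_pshufb_sketch(const uint8_t *src,
                                             const int8_t rel_idx[16]) {
  const __m128i window = _mm_loadu_si128((const __m128i *)src);
  const __m128i sel = _mm_loadu_si128((const __m128i *)rel_idx);
  return _mm_shuffle_epi8(window, sel);  // out[i] = src[rel_idx[i]]
}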

// Directional prediction, zone 2: 90 < angle < 180
void av1_dr_prediction_z2_sse4_1(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                                 const uint8_t *above, const uint8_t *left,
                                 int upsample_above, int upsample_left, int dx,
                                 int dy) {
  assert(dx > 0);
  assert(dy > 0);
  switch (bw) {
    case 4:
      dr_prediction_z2_Nx4_sse4_1(bh, dst, stride, above, left, upsample_above,
                                  upsample_left, dx, dy);
      break;
    case 8:
      dr_prediction_z2_Nx8_sse4_1(bh, dst, stride, above, left, upsample_above,
                                  upsample_left, dx, dy);
      break;
    default:
      dr_prediction_z2_HxW_sse4_1(bh, bw, dst, stride, above, left,
                                  upsample_above, upsample_left, dx, dy);
  }
  return;
}
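
// A scalar model of one zone-2 output pixel, mirroring how the kernels above
// pick between the two edges. An illustrative sketch only (hypothetical
// helper, not part of libaom), assuming upsample_above == upsample_left == 0
// so that min_base_x == -1 and both fractional precisions are 6 bits:
static inline uint8_t dr_z2_pixel_sketch(const uint8_t *above,
                                         const uint8_t *left, int r, int c,
                                         int dx, int dy) {
  const int x = (c << 6) - (r + 1) * dx;  // position along the top edge
  const int base_x = x >> 6;
  if (base_x >= -1) {  // reachable from the above edge
    const int shift = (x & 0x3f) >> 1;
    return (uint8_t)((above[base_x] * 32 + 16 +
                      (above[base_x + 1] - above[base_x]) * shift) >>
                     5);
  }
  const int y = (r << 6) - (c + 1) * dy;  // otherwise use the left edge
  const int base_y = y >> 6;
  const int shift = (y & 0x3f) >> 1;
  return (uint8_t)((left[base_y] * 32 + 16 +
                    (left[base_y + 1] - left[base_y]) * shift) >>
                   5);
}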

// z3 functions
static void dr_prediction_z3_4x4_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  __m128i dstvec[4], d[4];

  dr_prediction_z1_HxW_internal_sse4_1(4, 4, dstvec, left, upsample_left, dy);
  transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
                            &d[0], &d[1], &d[2], &d[3]);

  *(int *)(dst + stride * 0) = _mm_cvtsi128_si32(d[0]);
  *(int *)(dst + stride * 1) = _mm_cvtsi128_si32(d[1]);
  *(int *)(dst + stride * 2) = _mm_cvtsi128_si32(d[2]);
  *(int *)(dst + stride * 3) = _mm_cvtsi128_si32(d[3]);
  return;
}
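
// The z3 kernels in this file all share the shape visible above: zone 3
// (180 < angle < 270) reads from the left edge, so each block size runs a z1
// helper with `left` as its reference row and dy as its step, then
// transposes the result into dst.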

static void dr_prediction_z3_8x8_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  __m128i dstvec[8], d[8];

  dr_prediction_z1_HxW_internal_sse4_1(8, 8, dstvec, left, upsample_left, dy);
  transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4],
                    &dstvec[5], &dstvec[6], &dstvec[7], &d[0], &d[1], &d[2],
                    &d[3]);

  _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
  _mm_storel_epi64((__m128i *)(dst + 1 * stride), _mm_srli_si128(d[0], 8));
  _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[1]);
  _mm_storel_epi64((__m128i *)(dst + 3 * stride), _mm_srli_si128(d[1], 8));
  _mm_storel_epi64((__m128i *)(dst + 4 * stride), d[2]);
  _mm_storel_epi64((__m128i *)(dst + 5 * stride), _mm_srli_si128(d[2], 8));
  _mm_storel_epi64((__m128i *)(dst + 6 * stride), d[3]);
  _mm_storel_epi64((__m128i *)(dst + 7 * stride), _mm_srli_si128(d[3], 8));
}

static void dr_prediction_z3_4x8_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  __m128i dstvec[4], d[8];

  dr_prediction_z1_HxW_internal_sse4_1(8, 4, dstvec, left, upsample_left, dy);
  transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &d[0],
                        &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
  for (int i = 0; i < 8; i++) {
    *(int *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
  }
}

static void dr_prediction_z3_8x4_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  __m128i dstvec[8], d[4];

  dr_prediction_z1_HxW_internal_sse4_1(4, 8, dstvec, left, upsample_left, dy);
  transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
                        &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], &d[0],
                        &d[1], &d[2], &d[3]);
  _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
  _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]);
  _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]);
  _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]);
}

static void dr_prediction_z3_8x16_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                         const uint8_t *left, int upsample_left,
                                         int dy) {
  __m128i dstvec[8], d[8];

  dr_prediction_z1_HxW_internal_sse4_1(16, 8, dstvec, left, upsample_left, dy);
  transpose8x16_16x8_sse2(dstvec, dstvec + 1, dstvec + 2, dstvec + 3,
                          dstvec + 4, dstvec + 5, dstvec + 6, dstvec + 7, d,
                          d + 1, d + 2, d + 3, d + 4, d + 5, d + 6, d + 7);
  for (int i = 0; i < 8; i++) {
    _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]);
    _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride),
                     _mm_srli_si128(d[i], 8));
  }
}

static void dr_prediction_z3_16x8_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                         const uint8_t *left, int upsample_left,
                                         int dy) {
  __m128i dstvec[16], d[16];

  dr_prediction_z1_HxW_internal_sse4_1(8, 16, dstvec, left, upsample_left, dy);
  transpose16x8_8x16_sse2(
      &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
      &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
      &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
      &d[3], &d[4], &d[5], &d[6], &d[7]);

  for (int i = 0; i < 8; i++) {
    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
  }
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
static void dr_prediction_z3_4x16_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                         const uint8_t *left, int upsample_left,
                                         int dy) {
  __m128i dstvec[4], d[16];

  dr_prediction_z1_HxW_internal_sse4_1(16, 4, dstvec, left, upsample_left, dy);
  transpose4x16_sse2(dstvec, d);
  for (int i = 0; i < 16; i++) {
    *(int *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
  }
}

static void dr_prediction_z3_16x4_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                         const uint8_t *left, int upsample_left,
                                         int dy) {
  __m128i dstvec[16], d[8];

  dr_prediction_z1_HxW_internal_sse4_1(4, 16, dstvec, left, upsample_left, dy);
  for (int i = 4; i < 8; i++) {
    d[i] = _mm_setzero_si128();
  }
  transpose16x8_8x16_sse2(
      &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
      &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
      &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
      &d[3], &d[4], &d[5], &d[6], &d[7]);

  for (int i = 0; i < 4; i++) {
    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
  }
}

static void dr_prediction_z3_8x32_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                         const uint8_t *left, int upsample_left,
                                         int dy) {
  __m128i dstvec[16], d[16], dstvec_h[16], d_h[16];

  dr_prediction_z1_32xN_internal_sse4_1(8, dstvec, dstvec_h, left,
                                        upsample_left, dy);
  for (int i = 8; i < 16; i++) {
    dstvec[i] = _mm_setzero_si128();
    dstvec_h[i] = _mm_setzero_si128();
  }
  transpose16x16_sse2(dstvec, d);
  transpose16x16_sse2(dstvec_h, d_h);

  for (int i = 0; i < 16; i++) {
    _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]);
  }
  for (int i = 0; i < 16; i++) {
    _mm_storel_epi64((__m128i *)(dst + (i + 16) * stride), d_h[i]);
  }
}

static void dr_prediction_z3_32x8_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                         const uint8_t *left, int upsample_left,
                                         int dy) {
  __m128i dstvec[32], d[16];

  dr_prediction_z1_HxW_internal_sse4_1(8, 32, dstvec, left, upsample_left, dy);

  transpose16x8_8x16_sse2(
      &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
      &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
      &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
      &d[3], &d[4], &d[5], &d[6], &d[7]);
  transpose16x8_8x16_sse2(
      &dstvec[0 + 16], &dstvec[1 + 16], &dstvec[2 + 16], &dstvec[3 + 16],
      &dstvec[4 + 16], &dstvec[5 + 16], &dstvec[6 + 16], &dstvec[7 + 16],
      &dstvec[8 + 16], &dstvec[9 + 16], &dstvec[10 + 16], &dstvec[11 + 16],
      &dstvec[12 + 16], &dstvec[13 + 16], &dstvec[14 + 16], &dstvec[15 + 16],
      &d[0 + 8], &d[1 + 8], &d[2 + 8], &d[3 + 8], &d[4 + 8], &d[5 + 8],
      &d[6 + 8], &d[7 + 8]);

  for (int i = 0; i < 8; i++) {
    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
    _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 8]);
  }
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

static void dr_prediction_z3_16x16_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *left,
                                          int upsample_left, int dy) {
  __m128i dstvec[16], d[16];

  dr_prediction_z1_HxW_internal_sse4_1(16, 16, dstvec, left, upsample_left, dy);
  transpose16x16_sse2(dstvec, d);

  for (int i = 0; i < 16; i++) {
    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
  }
}

static void dr_prediction_z3_32x32_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *left,
                                          int upsample_left, int dy) {
  __m128i dstvec[32], d[32], dstvec_h[32], d_h[32];

  dr_prediction_z1_32xN_internal_sse4_1(32, dstvec, dstvec_h, left,
                                        upsample_left, dy);
  transpose16x16_sse2(dstvec, d);
  transpose16x16_sse2(dstvec_h, d_h);
  transpose16x16_sse2(dstvec + 16, d + 16);
  transpose16x16_sse2(dstvec_h + 16, d_h + 16);
  for (int j = 0; j < 16; j++) {
    _mm_storeu_si128((__m128i *)(dst + j * stride), d[j]);
    _mm_storeu_si128((__m128i *)(dst + j * stride + 16), d[j + 16]);
  }
  for (int j = 0; j < 16; j++) {
    _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride), d_h[j]);
    _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride + 16), d_h[j + 16]);
  }
}

static void dr_prediction_z3_64x64_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *left,
                                          int upsample_left, int dy) {
  uint8_t dstT[64 * 64];
  dr_prediction_z1_64xN_sse4_1(64, dstT, 64, left, upsample_left, dy);
  transpose(dstT, 64, dst, stride, 64, 64);
}
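
// Rather than transposing in xmm registers, the 64x64 kernel above (like
// dr_prediction_z3_32x64, 64x32 and 16x64 below) predicts into a flat
// temporary buffer with a z1 kernel and reorders it with the generic
// transpose().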

static void dr_prediction_z3_16x32_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *left,
                                          int upsample_left, int dy) {
  __m128i dstvec[16], d[16], dstvec_h[16], d_h[16];

  dr_prediction_z1_32xN_internal_sse4_1(16, dstvec, dstvec_h, left,
                                        upsample_left, dy);
  transpose16x16_sse2(dstvec, d);
  transpose16x16_sse2(dstvec_h, d_h);
  // store
  for (int j = 0; j < 16; j++) {
    _mm_storeu_si128((__m128i *)(dst + j * stride), d[j]);
    _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride), d_h[j]);
  }
}

static void dr_prediction_z3_32x16_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *left,
                                          int upsample_left, int dy) {
  __m128i dstvec[32], d[16];

  dr_prediction_z1_HxW_internal_sse4_1(16, 32, dstvec, left, upsample_left, dy);
  for (int i = 0; i < 32; i += 16) {
    transpose16x16_sse2((dstvec + i), d);
    for (int j = 0; j < 16; j++) {
      _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
    }
  }
}

static void dr_prediction_z3_32x64_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *left,
                                          int upsample_left, int dy) {
  uint8_t dstT[64 * 32];
  dr_prediction_z1_64xN_sse4_1(32, dstT, 64, left, upsample_left, dy);
  transpose(dstT, 64, dst, stride, 32, 64);
}

static void dr_prediction_z3_64x32_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *left,
                                          int upsample_left, int dy) {
  uint8_t dstT[32 * 64];
  dr_prediction_z1_32xN_sse4_1(64, dstT, 32, left, upsample_left, dy);
  transpose(dstT, 32, dst, stride, 64, 32);
  return;
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
static void dr_prediction_z3_16x64_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *left,
                                          int upsample_left, int dy) {
  uint8_t dstT[64 * 16];
  dr_prediction_z1_64xN_sse4_1(16, dstT, 64, left, upsample_left, dy);
  transpose(dstT, 64, dst, stride, 16, 64);
}

static void dr_prediction_z3_64x16_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *left,
                                          int upsample_left, int dy) {
  __m128i dstvec[64], d[16];

  dr_prediction_z1_HxW_internal_sse4_1(16, 64, dstvec, left, upsample_left, dy);
  for (int i = 0; i < 64; i += 16) {
    transpose16x16_sse2(dstvec + i, d);
    for (int j = 0; j < 16; j++) {
      _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
    }
  }
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

void av1_dr_prediction_z3_sse4_1(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                                 const uint8_t *above, const uint8_t *left,
                                 int upsample_left, int dx, int dy) {
  (void)above;
  (void)dx;
  assert(dx == 1);
  assert(dy > 0);

  if (bw == bh) {
    switch (bw) {
      case 4:
        dr_prediction_z3_4x4_sse4_1(dst, stride, left, upsample_left, dy);
        break;
      case 8:
        dr_prediction_z3_8x8_sse4_1(dst, stride, left, upsample_left, dy);
        break;
      case 16:
        dr_prediction_z3_16x16_sse4_1(dst, stride, left, upsample_left, dy);
        break;
      case 32:
        dr_prediction_z3_32x32_sse4_1(dst, stride, left, upsample_left, dy);
        break;
      case 64:
        dr_prediction_z3_64x64_sse4_1(dst, stride, left, upsample_left, dy);
        break;
      default: assert(0 && "Invalid block size");
    }
  } else {
    if (bw < bh) {
      if (bw + bw == bh) {
        switch (bw) {
          case 4:
            dr_prediction_z3_4x8_sse4_1(dst, stride, left, upsample_left, dy);
            break;
          case 8:
            dr_prediction_z3_8x16_sse4_1(dst, stride, left, upsample_left, dy);
            break;
          case 16:
            dr_prediction_z3_16x32_sse4_1(dst, stride, left, upsample_left, dy);
            break;
          case 32:
            dr_prediction_z3_32x64_sse4_1(dst, stride, left, upsample_left, dy);
            break;
          default: assert(0 && "Invalid block size");
        }
      } else {
        switch (bw) {
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
          case 4:
            dr_prediction_z3_4x16_sse4_1(dst, stride, left, upsample_left, dy);
            break;
          case 8:
            dr_prediction_z3_8x32_sse4_1(dst, stride, left, upsample_left, dy);
            break;
          case 16:
            dr_prediction_z3_16x64_sse4_1(dst, stride, left, upsample_left, dy);
            break;
          default: assert(0 && "Invalid block size");
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
        }
      }
    } else {
      if (bh + bh == bw) {
        switch (bh) {
          case 4:
            dr_prediction_z3_8x4_sse4_1(dst, stride, left, upsample_left, dy);
            break;
          case 8:
            dr_prediction_z3_16x8_sse4_1(dst, stride, left, upsample_left, dy);
            break;
          case 16:
            dr_prediction_z3_32x16_sse4_1(dst, stride, left, upsample_left, dy);
            break;
          case 32:
            dr_prediction_z3_64x32_sse4_1(dst, stride, left, upsample_left, dy);
            break;
          default: assert(0 && "Invalid block size");
        }
      } else {
        switch (bh) {
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
          case 4:
            dr_prediction_z3_16x4_sse4_1(dst, stride, left, upsample_left, dy);
            break;
          case 8:
            dr_prediction_z3_32x8_sse4_1(dst, stride, left, upsample_left, dy);
            break;
          case 16:
            dr_prediction_z3_64x16_sse4_1(dst, stride, left, upsample_left, dy);
            break;
          default: assert(0 && "Invalid block size");
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
        }
      }
    }
  }
}