/src/ffmpeg/libavcodec/x86/mlpdsp_init.c
Line | Count | Source |
1 | | /* |
2 | | * MLP DSP functions x86-optimized |
3 | | * Copyright (c) 2009 Ramiro Polla |
4 | | * |
5 | | * This file is part of FFmpeg. |
6 | | * |
7 | | * FFmpeg is free software; you can redistribute it and/or |
8 | | * modify it under the terms of the GNU Lesser General Public |
9 | | * License as published by the Free Software Foundation; either |
10 | | * version 2.1 of the License, or (at your option) any later version. |
11 | | * |
12 | | * FFmpeg is distributed in the hope that it will be useful, |
13 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
15 | | * Lesser General Public License for more details. |
16 | | * |
17 | | * You should have received a copy of the GNU Lesser General Public |
18 | | * License along with FFmpeg; if not, write to the Free Software |
19 | | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
20 | | */ |
21 | | |
22 | | #include <stdint.h> |
23 | | #include "config.h" |
24 | | #include "libavutil/attributes.h" |
25 | | #include "libavutil/cpu.h" |
26 | | #include "libavutil/macros.h" |
27 | | #include "libavutil/x86/asm.h" |
28 | | #include "libavutil/x86/cpu.h" |
29 | | #include "libavcodec/mlpdsp.h" |
30 | | #include "libavcodec/mlp.h" |
31 | | |
/*
 * Declare the prototype of ff_mlp_rematrix_channel_<opt>, a rematrix
 * routine that is defined outside this file. All variants share one
 * signature; only the instruction-set suffix differs.
 *
 * The macro expands to a bare declarator (no trailing semicolon), so each
 * use below is terminated like an ordinary declaration.
 */
#define REMATRIX_CHANNEL_FUNC(opt)                                      \
    void ff_mlp_rematrix_channel_##opt(int32_t *samples,                \
                                       const int32_t *coeffs,           \
                                       const uint8_t *bypassed_lsbs,    \
                                       const int8_t *noise_buffer,      \
                                       int index,                       \
                                       unsigned int dest_ch,            \
                                       uint16_t blockpos,               \
                                       unsigned int maxchan,            \
                                       int matrix_noise_shift,          \
                                       int access_unit_size_pow2,       \
                                       int32_t mask)

REMATRIX_CHANNEL_FUNC(sse4);
REMATRIX_CHANNEL_FUNC(avx2_bmi2);
47 | | |
48 | | #if HAVE_7REGS && HAVE_INLINE_ASM && HAVE_INLINE_ASM_NONLOCAL_LABELS |
49 | | |
/*
 * Labels emitted by the inline assembly in mlp_filter_channel_x86().
 * Each one marks the entry point into the unrolled multiply-accumulate
 * chain for the given filter order. They are declared as char only so
 * that their addresses can be taken; they are never read as data.
 */
extern char ff_mlp_firorder_0;
extern char ff_mlp_firorder_1;
extern char ff_mlp_firorder_2;
extern char ff_mlp_firorder_3;
extern char ff_mlp_firorder_4;
extern char ff_mlp_firorder_5;
extern char ff_mlp_firorder_6;
extern char ff_mlp_firorder_7;
extern char ff_mlp_firorder_8;

extern char ff_mlp_iirorder_0;
extern char ff_mlp_iirorder_1;
extern char ff_mlp_iirorder_2;
extern char ff_mlp_iirorder_3;
extern char ff_mlp_iirorder_4;

/* Jump target for the FIR part, indexed by FIR order (0..8). */
static const void * const firtable[9] = {
    &ff_mlp_firorder_0,
    &ff_mlp_firorder_1,
    &ff_mlp_firorder_2,
    &ff_mlp_firorder_3,
    &ff_mlp_firorder_4,
    &ff_mlp_firorder_5,
    &ff_mlp_firorder_6,
    &ff_mlp_firorder_7,
    &ff_mlp_firorder_8,
};

/* Jump target for the IIR part, indexed by IIR order (0..4). */
static const void * const iirtable[5] = {
    &ff_mlp_iirorder_0,
    &ff_mlp_iirorder_1,
    &ff_mlp_iirorder_2,
    &ff_mlp_iirorder_3,
    &ff_mlp_iirorder_4,
};
74 | | |
/*
 * Building blocks for the inline assembly in mlp_filter_channel_x86().
 *
 * Operand numbering used by these fragments (see the asm statement):
 *   %0 = state pointer, %1 = coeff pointer, %7 = filter_shift.
 *
 * MLPMUL(label, offset, offs, offc)
 *     Emits `label`, then multiplies the 32-bit value at offset+offs(%0)
 *     by the 32-bit value at offset+offc(%1) and adds the product to the
 *     running accumulator.
 * FIRMULREG(label, offset, firc)
 *     Like MLPMUL for a FIR tap, but on x86-64 the coefficient is taken
 *     from asm operand number `firc` instead of being loaded from memory.
 * CLEAR_ACCUM / SHIFT_ACCUM
 *     Zero the accumulator / shift it right by filter_shift.
 * ACCUM, RESULT, RESULT32
 *     Register names used by the code that follows SHIFT_ACCUM.
 */
#if ARCH_X86_64

/* The accumulator is a single 64-bit register (rsi). */
#define ACCUM "%%rdx"
#define RESULT "%%rsi"
#define RESULT32 "%%esi"

#define CLEAR_ACCUM \
    "xor %%rsi, %%rsi\n\t"

/* filter_shift is constrained to rcx on x86-64, so cl already holds it. */
#define SHIFT_ACCUM \
    "shr %%cl, %%rsi\n\t"

#define MLPMUL(label, offset, offs, offc) \
    LABEL_MANGLE(label)": \n\t" \
    "movslq "offset"+"offs"(%0), %%rax\n\t" \
    "movslq "offset"+"offc"(%1), %%rdx\n\t" \
    "imul %%rdx, %%rax\n\t" \
    "add %%rax, %%rsi\n\t"

#define FIRMULREG(label, offset, firc)\
    LABEL_MANGLE(label)": \n\t" \
    "movslq "#offset"(%0), %%rax\n\t" \
    "imul %"#firc", %%rax\n\t" \
    "add %%rax, %%rsi\n\t"

#else /* if ARCH_X86_32 */

/* The accumulator is the 64-bit register pair ecx:esi (high:low). */
#define ACCUM "%%edx"
#define RESULT "%%eax"
#define RESULT32 "%%eax"

#define CLEAR_ACCUM \
    "xor %%esi, %%esi\n\t" \
    "xor %%ecx, %%ecx\n\t"

/*
 * Move the accumulator to edx:eax, load the shift count into cl from
 * memory operand %7, and shift the low half right, filling from edx.
 */
#define SHIFT_ACCUM \
    "mov %%ecx, %%edx\n\t" \
    "mov %%esi, %%eax\n\t" \
    "movzbl %7 , %%ecx\n\t" \
    "shrd %%cl, %%edx, %%eax\n\t"

/* One-operand imull leaves the 64-bit product in edx:eax. */
#define MLPMUL(label, offset, offs, offc) \
    LABEL_MANGLE(label)": \n\t" \
    "mov "offset"+"offs"(%0), %%eax\n\t" \
    "imull "offset"+"offc"(%1) \n\t" \
    "add %%eax , %%esi\n\t" \
    "adc %%edx , %%ecx\n\t"

/* No spare register here: the coefficient is read from memory instead. */
#define FIRMULREG(label, offset, firc) \
    MLPMUL(label, #offset, "0", "0")

#endif /* !ARCH_X86_64 */

/* Byte amount added to the sample_buffer pointer after every sample. */
#define BINC AV_STRINGIFY(4* MAX_CHANNELS)
/* Byte offset added to the state pointer for the IIR taps. */
#define IOFFS AV_STRINGIFY(4*(MAX_FIR_ORDER + MAX_BLOCKSIZE))
/* Byte offset added to the coeff pointer for the IIR taps. */
#define IOFFC AV_STRINGIFY(4* MAX_FIR_ORDER)

/* FIR taps use state/coeff directly; IIR taps add IOFFS/IOFFC. */
#define FIRMUL(label, offset) MLPMUL(label, #offset, "0", "0")
#define IIRMUL(label, offset) MLPMUL(label, #offset, IOFFS, IOFFC)
134 | | |
135 | | static void mlp_filter_channel_x86(int32_t *state, const int32_t *coeff, |
136 | | int firorder, int iirorder, |
137 | | unsigned int filter_shift, int32_t mask, |
138 | | int blocksize, int32_t *sample_buffer) |
139 | 79.4k | { |
140 | 79.4k | const void *firjump = firtable[firorder]; |
141 | 79.4k | const void *iirjump = iirtable[iirorder]; |
142 | | |
143 | 79.4k | blocksize = -blocksize; |
144 | | |
145 | 79.4k | __asm__ volatile( |
146 | 79.4k | "1: \n\t" |
147 | 79.4k | CLEAR_ACCUM |
148 | 79.4k | "jmp *%5 \n\t" |
149 | 79.4k | FIRMUL (ff_mlp_firorder_8, 0x1c ) |
150 | 79.4k | FIRMUL (ff_mlp_firorder_7, 0x18 ) |
151 | 79.4k | FIRMUL (ff_mlp_firorder_6, 0x14 ) |
152 | 79.4k | FIRMUL (ff_mlp_firorder_5, 0x10 ) |
153 | 79.4k | FIRMUL (ff_mlp_firorder_4, 0x0c ) |
154 | 79.4k | FIRMUL (ff_mlp_firorder_3, 0x08 ) |
155 | 79.4k | FIRMUL (ff_mlp_firorder_2, 0x04 ) |
156 | 79.4k | FIRMULREG(ff_mlp_firorder_1, 0x00, 8) |
157 | 79.4k | LABEL_MANGLE(ff_mlp_firorder_0)":\n\t" |
158 | 79.4k | "jmp *%6 \n\t" |
159 | 79.4k | IIRMUL (ff_mlp_iirorder_4, 0x0c ) |
160 | 79.4k | IIRMUL (ff_mlp_iirorder_3, 0x08 ) |
161 | 79.4k | IIRMUL (ff_mlp_iirorder_2, 0x04 ) |
162 | 79.4k | IIRMUL (ff_mlp_iirorder_1, 0x00 ) |
163 | 79.4k | LABEL_MANGLE(ff_mlp_iirorder_0)":\n\t" |
164 | 79.4k | SHIFT_ACCUM |
165 | 79.4k | "mov "RESULT" ,"ACCUM" \n\t" |
166 | 79.4k | "add (%2) ,"RESULT" \n\t" |
167 | 79.4k | "and %4 ,"RESULT" \n\t" |
168 | 79.4k | "sub $4 , %0 \n\t" |
169 | 79.4k | "mov "RESULT32", (%0) \n\t" |
170 | 79.4k | "mov "RESULT32", (%2) \n\t" |
171 | 79.4k | "add $"BINC" , %2 \n\t" |
172 | 79.4k | "sub "ACCUM" ,"RESULT" \n\t" |
173 | 79.4k | "mov "RESULT32","IOFFS"(%0) \n\t" |
174 | 79.4k | "incl %3 \n\t" |
175 | 79.4k | "js 1b \n\t" |
176 | 79.4k | : /* 0*/"+r"(state), |
177 | 79.4k | /* 1*/"+r"(coeff), |
178 | 79.4k | /* 2*/"+r"(sample_buffer), |
179 | 79.4k | #if ARCH_X86_64 |
180 | 79.4k | /* 3*/"+r"(blocksize) |
181 | 79.4k | : /* 4*/"r"((x86_reg)mask), /* 5*/"r"(firjump), |
182 | 79.4k | /* 6*/"r"(iirjump) , /* 7*/"c"(filter_shift) |
183 | 79.4k | , /* 8*/"r"((int64_t)coeff[0]) |
184 | 79.4k | : "rax", "rdx", "rsi" |
185 | | #else /* ARCH_X86_32 */ |
186 | | /* 3*/"+m"(blocksize) |
187 | | : /* 4*/"m"( mask), /* 5*/"m"(firjump), |
188 | | /* 6*/"m"(iirjump) , /* 7*/"m"(filter_shift) |
189 | | : "eax", "edx", "esi", "ecx" |
190 | | #endif /* !ARCH_X86_64 */ |
191 | 79.4k | ); |
192 | 79.4k | } |
193 | | |
194 | | #endif /* HAVE_7REGS && HAVE_INLINE_ASM && HAVE_INLINE_ASM_NONLOCAL_LABELS */ |
195 | | |
196 | | av_cold void ff_mlpdsp_init_x86(MLPDSPContext *c) |
197 | 5.77k | { |
198 | 5.77k | int cpu_flags = av_get_cpu_flags(); |
199 | 5.77k | #if HAVE_7REGS && HAVE_INLINE_ASM && HAVE_INLINE_ASM_NONLOCAL_LABELS |
200 | 5.77k | if (INLINE_MMX(cpu_flags)) |
201 | 2.99k | c->mlp_filter_channel = mlp_filter_channel_x86; |
202 | 5.77k | #endif |
203 | 5.77k | if (ARCH_X86_64 && EXTERNAL_SSE4(cpu_flags)) |
204 | 2.99k | c->mlp_rematrix_channel = ff_mlp_rematrix_channel_sse4; |
205 | 5.77k | if (ARCH_X86_64 && EXTERNAL_AVX2_FAST(cpu_flags) && cpu_flags & AV_CPU_FLAG_BMI2) |
206 | 2.99k | c->mlp_rematrix_channel = ff_mlp_rematrix_channel_avx2_bmi2; |
207 | 5.77k | } |