/src/htslib/htscodecs/htscodecs/rANS_static32x16pr_avx512.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2017-2023 Genome Research Ltd. |
3 | | * Author(s): James Bonfield |
4 | | * |
5 | | * Redistribution and use in source and binary forms, with or without |
6 | | * modification, are permitted provided that the following conditions are met: |
7 | | * |
8 | | * 1. Redistributions of source code must retain the above copyright notice, |
9 | | * this list of conditions and the following disclaimer. |
10 | | * |
11 | | * 2. Redistributions in binary form must reproduce the above |
12 | | * copyright notice, this list of conditions and the following |
13 | | * disclaimer in the documentation and/or other materials provided |
14 | | * with the distribution. |
15 | | * |
16 | | * 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger |
17 | | * Institute nor the names of its contributors may be used to endorse |
18 | | * or promote products derived from this software without specific |
19 | | * prior written permission. |
20 | | * |
21 | | * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS |
22 | | * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED |
23 | | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A |
24 | | * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH |
25 | | * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
26 | | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
27 | | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
28 | | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
29 | | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
30 | | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
31 | | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
32 | | */ |
33 | | |
34 | | /* |
35 | | * This is an AVX512 implementation of the 32-way interleaved 16-bit rANS. |
36 | | * For now it only contains an order-0 implementation. The AVX2 code may |
37 | | * be used for order-1. |
38 | | */ |
39 | | |
40 | | #include "config.h" |
41 | | |
42 | | #if defined(__x86_64__) && defined(HAVE_AVX512) |
43 | | |
44 | | #include <stdint.h> |
45 | | #include <stdlib.h> |
46 | | #include <stdio.h> |
47 | | #include <string.h> |
48 | | #include <limits.h> |
49 | | #include <x86intrin.h> |
50 | | |
51 | | #include "rANS_word.h" |
52 | | #include "rANS_static4x16.h" |
53 | | #define ROT32_SIMD |
54 | | #include "rANS_static16_int.h" |
55 | | #include "varint.h" |
56 | | #include "utils.h" |
57 | | |
58 | | #ifndef USE_GATHER |
59 | | |
60 | | // Speeds with Downfall mitigation Off/On and Zen4. |
61 | | // <------ AVX512 ---------> <------- AVX2 --------> |
62 | | // -o4: IntelOff IntelOn AMDZen4 |
63 | | // gcc7 544/1673 562/1711 448/1818 649/1515 645/1525 875/1624 |
64 | | // gcc13 557/1672 576/1711 582/1761 630/1623 629/1652 866/1762 |
65 | | // clang10 541/1547 564/1630 807/1912 620/1456 637/1481 837/1606 |
66 | | // clang16 533/1431 555/1510 890/1611 629/1370 627/1406 996/1432 |
67 | | // |
68 | | // Zen4 encode is particularly slow with gcc, but AVX2 encode is |
69 | | // faster and we use that already. |
70 | 0 | static inline __m512i _mm512_i32gather_epi32x(__m512i idx, void *v, int size) { |
71 | 0 | uint32_t *b = (uint32_t *)v; |
72 | |
|
73 | | #ifndef __clang__ |
74 | | volatile |
75 | | #endif |
76 | 0 | int c[16] __attribute__((aligned(32))); |
77 | | |
78 | | //_mm512_store_si512((__m512i *)c, idx); // equivalent, but slower |
79 | 0 | _mm256_store_si256((__m256i *)(c), _mm512_castsi512_si256(idx)); |
80 | 0 | _mm256_store_si256((__m256i *)(c+8), _mm512_extracti64x4_epi64(idx, 1)); |
81 | |
|
82 | 0 | __m128i x0 = _mm_insert_epi32(_mm_cvtsi32_si128(b[c[0]]), b[c[1]], 1); |
83 | 0 | __m128i x1 = _mm_insert_epi32(_mm_cvtsi32_si128(b[c[2]]), b[c[3]], 1); |
84 | 0 | __m128i x2 = _mm_insert_epi32(_mm_cvtsi32_si128(b[c[4]]), b[c[5]], 1); |
85 | 0 | __m128i x3 = _mm_insert_epi32(_mm_cvtsi32_si128(b[c[6]]), b[c[7]], 1); |
86 | |
|
87 | 0 | __m128i x01 = _mm_unpacklo_epi64(x0, x1); |
88 | 0 | __m128i x23 = _mm_unpacklo_epi64(x2, x3); |
89 | 0 | __m256i y0 =_mm256_castsi128_si256(x01); |
90 | |
|
91 | 0 | __m128i x4 = _mm_insert_epi32(_mm_cvtsi32_si128(b[c[ 8]]), b[c[ 9]], 1); |
92 | 0 | __m128i x5 = _mm_insert_epi32(_mm_cvtsi32_si128(b[c[10]]), b[c[11]], 1); |
93 | 0 | __m128i x6 = _mm_insert_epi32(_mm_cvtsi32_si128(b[c[12]]), b[c[13]], 1); |
94 | 0 | __m128i x7 = _mm_insert_epi32(_mm_cvtsi32_si128(b[c[14]]), b[c[15]], 1); |
95 | |
|
96 | 0 | __m128i x45 = _mm_unpacklo_epi64(x4, x5); |
97 | 0 | __m128i x67 = _mm_unpacklo_epi64(x6, x7); |
98 | 0 | __m256i y1 =_mm256_castsi128_si256(x45); |
99 | |
|
100 | 0 | y0 = _mm256_inserti128_si256(y0, x23, 1); |
101 | 0 | y1 = _mm256_inserti128_si256(y1, x67, 1); |
102 | |
|
103 | 0 | return _mm512_inserti64x4(_mm512_castsi256_si512(y0), y1, 1); |
104 | 0 | } |
105 | | |
106 | | // 32-bit indices, 8-bit quantities into 32-bit lanes |
107 | 0 | static inline __m512i _mm512_i32gather_epi32x1(__m512i idx, void *v) { |
108 | 0 | uint8_t *b = (uint8_t *)v; |
109 | 0 | volatile int c[16] __attribute__((aligned(32))); |
110 | | |
111 | | //_mm512_store_si512((__m512i *)c, idx); // equivalent, but slower |
112 | 0 | _mm256_store_si256((__m256i *)(c), _mm512_castsi512_si256(idx)); |
113 | 0 | _mm256_store_si256((__m256i *)(c+8), _mm512_extracti64x4_epi64(idx, 1)); |
114 | |
|
115 | 0 | return _mm512_set_epi32(b[c[15]], b[c[14]], b[c[13]], b[c[12]], |
116 | 0 | b[c[11]], b[c[10]], b[c[9]], b[c[8]], |
117 | 0 | b[c[7]], b[c[6]], b[c[5]], b[c[4]], |
118 | 0 | b[c[3]], b[c[2]], b[c[1]], b[c[0]]); |
119 | 0 | } |
120 | | |
121 | | #else |
122 | | // real gathers |
123 | | #define _mm512_i32gather_epi32x _mm512_i32gather_epi32 |
124 | | #endif |
125 | | |
126 | | unsigned char *rans_compress_O0_32x16_avx512(unsigned char *in, |
127 | | unsigned int in_size, |
128 | | unsigned char *out, |
129 | 0 | unsigned int *out_size) { |
130 | 0 | unsigned char *cp, *out_end; |
131 | 0 | RansEncSymbol syms[256]; |
132 | 0 | RansState ransN[32] __attribute__((aligned(64))); |
133 | 0 | uint8_t* ptr; |
134 | 0 | uint32_t F[256+MAGIC] = {0}; |
135 | 0 | int i, j, tab_size = 0, x, z; |
136 | | // -20 for order/size/meta |
137 | 0 | uint32_t bound = rans_compress_bound_4x16(in_size,0)-20; |
138 | |
|
139 | 0 | if (!out) { |
140 | 0 | *out_size = bound; |
141 | 0 | out = malloc(*out_size); |
142 | 0 | } |
143 | 0 | if (!out || bound > *out_size) |
144 | 0 | return NULL; |
145 | | |
146 | | // If "out" isn't word aligned, tweak out_end/ptr to ensure it is. |
147 | | // We already added more room in bound to allow for this. |
148 | 0 | if (((size_t)out)&1) |
149 | 0 | bound--; |
150 | 0 | ptr = out_end = out + bound; |
151 | |
|
152 | 0 | if (in_size == 0) |
153 | 0 | goto empty; |
154 | | |
155 | | // Compute statistics |
156 | 0 | if (hist8(in, in_size, F) < 0) |
157 | 0 | return NULL; |
158 | | |
159 | | // Normalise so frequencies sum to power of 2 |
160 | 0 | uint32_t fsum = in_size; |
161 | 0 | uint32_t max_val = round2(fsum); |
162 | 0 | if (max_val > TOTFREQ) |
163 | 0 | max_val = TOTFREQ; |
164 | |
|
165 | 0 | if (normalise_freq(F, fsum, max_val) < 0) |
166 | 0 | return NULL; |
167 | 0 | fsum=max_val; |
168 | |
|
169 | 0 | cp = out; |
170 | 0 | cp += encode_freq(cp, F); |
171 | 0 | tab_size = cp-out; |
172 | | //write(2, out+4, cp-(out+4)); |
173 | |
|
174 | 0 | if (normalise_freq(F, fsum, TOTFREQ) < 0) |
175 | 0 | return NULL; |
176 | | |
177 | | // Encode statistics and build lookup tables for SIMD encoding. |
178 | 0 | uint32_t SB[256], SA[256], SD[256], SC[256]; |
179 | 0 | for (x = j = 0; j < 256; j++) { |
180 | 0 | if (F[j]) { |
181 | 0 | RansEncSymbolInit(&syms[j], x, F[j], TF_SHIFT); |
182 | 0 | SB[j] = syms[j].x_max; |
183 | 0 | SA[j] = syms[j].rcp_freq; |
184 | 0 | SD[j] = (syms[j].cmpl_freq<<0) | ((syms[j].rcp_shift-32)<<16); |
185 | 0 | SC[j] = syms[j].bias; |
186 | 0 | x += F[j]; |
187 | 0 | } |
188 | 0 | } |
189 | |
|
190 | 0 | for (z = 0; z < 32; z++) |
191 | 0 | RansEncInit(&ransN[z]); |
192 | |
|
193 | 0 | z = i = in_size&(32-1); |
194 | 0 | while (z-- > 0) |
195 | 0 | RansEncPutSymbol(&ransN[z], &ptr, &syms[in[in_size-(i-z)]]); |
196 | |
|
197 | 0 | #define LOAD512(a,b) \ |
198 | 0 | __m512i a##1 = _mm512_load_si512((__m512i *)&b[0]); \ |
199 | 0 | __m512i a##2 = _mm512_load_si512((__m512i *)&b[16]); |
200 | |
|
201 | 0 | #define STORE512(a,b) \ |
202 | 0 | _mm512_store_si512((__m256i *)&b[0], a##1); \ |
203 | 0 | _mm512_store_si512((__m256i *)&b[16], a##2); |
204 | |
|
205 | 0 | LOAD512(Rv, ransN); |
206 | |
|
207 | 0 | uint16_t *ptr16 = (uint16_t *)ptr; |
208 | 0 | for (i=(in_size &~(32-1)); i>0; i-=32) { |
209 | 0 | uint8_t *c = &in[i-32]; |
210 | | |
211 | | // GATHER versions |
212 | | // Much faster now we have an efficient loadu mechanism in place, |
213 | | // BUT... |
214 | | // Try this for avx2 variant too? Better way to populate the mm256 |
215 | | // regs for mix of avx2 and avx512 opcodes. |
216 | 0 | __m256i c12 = _mm256_loadu_si256((__m256i const *)c); |
217 | 0 | __m512i c1 = _mm512_cvtepu8_epi32(_mm256_extracti128_si256(c12,0)); |
218 | 0 | __m512i c2 = _mm512_cvtepu8_epi32(_mm256_extracti128_si256(c12,1)); |
219 | 0 | #define SET512(a,b) \ |
220 | 0 | __m512i a##1 = _mm512_i32gather_epi32x(c1, b, 4); \ |
221 | 0 | __m512i a##2 = _mm512_i32gather_epi32x(c2, b, 4) |
222 | |
|
223 | 0 | SET512(xmax, SB); |
224 | |
|
225 | 0 | uint16_t gt_mask1 = _mm512_cmpgt_epi32_mask(Rv1, xmax1); |
226 | 0 | int pc1 = _mm_popcnt_u32(gt_mask1); |
227 | 0 | __m512i Rp1 = _mm512_and_si512(Rv1, _mm512_set1_epi32(0xffff)); |
228 | 0 | __m512i Rp2 = _mm512_and_si512(Rv2, _mm512_set1_epi32(0xffff)); |
229 | 0 | uint16_t gt_mask2 = _mm512_cmpgt_epi32_mask(Rv2, xmax2); |
230 | 0 | SET512(SDv, SD); |
231 | 0 | int pc2 = _mm_popcnt_u32(gt_mask2); |
232 | |
|
233 | 0 | Rp1 = _mm512_maskz_compress_epi32(gt_mask1, Rp1); |
234 | 0 | Rp2 = _mm512_maskz_compress_epi32(gt_mask2, Rp2); |
235 | |
|
236 | 0 | _mm512_mask_cvtepi32_storeu_epi16(ptr16-pc2, (1<<pc2)-1, Rp2); |
237 | 0 | ptr16 -= pc2; |
238 | 0 | _mm512_mask_cvtepi32_storeu_epi16(ptr16-pc1, (1<<pc1)-1, Rp1); |
239 | 0 | ptr16 -= pc1; |
240 | |
|
241 | 0 | SET512(rfv, SA); |
242 | 0 | Rv1 = _mm512_mask_srli_epi32(Rv1, gt_mask1, Rv1, 16); |
243 | 0 | Rv2 = _mm512_mask_srli_epi32(Rv2, gt_mask2, Rv2, 16); |
244 | | |
245 | | // interleaved form of this, helps on icc a bit |
246 | | //rfv1 = _mm512_mulhi_epu32(Rv1, rfv1); |
247 | | //rfv2 = _mm512_mulhi_epu32(Rv2, rfv2); |
248 | | |
249 | | // Alternatives here: |
250 | | // SHIFT right/left instead of AND: (very marginally slower) |
251 | | // rf1_hm = _mm512_and_epi32( |
252 | | // rf1_hm, _mm512_set1_epi64((uint64_t)0xffffffff00000000)); |
253 | | // vs |
254 | | // rf1_hm = _mm512_srli_epi64(rf1_hm, 32); |
255 | | // rf1_hm = _mm512_slli_epi64(rf1_hm, 32); |
256 | 0 | __m512i rf1_hm = _mm512_mul_epu32(_mm512_srli_epi64(Rv1, 32), |
257 | 0 | _mm512_srli_epi64(rfv1, 32)); |
258 | 0 | __m512i rf2_hm = _mm512_mul_epu32(_mm512_srli_epi64(Rv2, 32), |
259 | 0 | _mm512_srli_epi64(rfv2, 32)); |
260 | 0 | __m512i rf1_lm = _mm512_srli_epi64(_mm512_mul_epu32(Rv1, rfv1), 32); |
261 | 0 | __m512i rf2_lm = _mm512_srli_epi64(_mm512_mul_epu32(Rv2, rfv2), 32); |
262 | 0 | rf1_hm = _mm512_and_epi32(rf1_hm, |
263 | 0 | _mm512_set1_epi64((uint64_t)0xffffffff<<32)); |
264 | 0 | rf2_hm = _mm512_and_epi32(rf2_hm, |
265 | 0 | _mm512_set1_epi64((uint64_t)0xffffffff<<32)); |
266 | 0 | rfv1 = _mm512_or_epi32(rf1_lm, rf1_hm); |
267 | 0 | rfv2 = _mm512_or_epi32(rf2_lm, rf2_hm); |
268 | | |
269 | | // Or a pure masked blend approach, sadly slower. |
270 | | // rfv1 = _mm512_mask_blend_epi32(0x5555, |
271 | | // _mm512_mul_epu32( |
272 | | // _mm512_srli_epi64(Rv1, 32), |
273 | | // _mm512_srli_epi64(rfv1, 32)), |
274 | | // _mm512_srli_epi64( |
275 | | // _mm512_mul_epu32(Rv1, rfv1), |
276 | | // 32)); |
277 | | // rfv2 = _mm512_mask_blend_epi32(0xaaaa, |
278 | | // _mm512_srli_epi64( |
279 | | // _mm512_mul_epu32(Rv2, rfv2), |
280 | | // 32), |
281 | | // _mm512_mul_epu32( |
282 | | // _mm512_srli_epi64(Rv2, 32), |
283 | | // _mm512_srli_epi64(rfv2, 32))); |
284 | |
|
285 | 0 | SET512(biasv, SC); |
286 | 0 | __m512i shiftv1 = _mm512_srli_epi32(SDv1, 16); |
287 | 0 | __m512i shiftv2 = _mm512_srli_epi32(SDv2, 16); |
288 | |
|
289 | 0 | __m512i qv1 = _mm512_srlv_epi32(rfv1, shiftv1); |
290 | 0 | __m512i qv2 = _mm512_srlv_epi32(rfv2, shiftv2); |
291 | |
|
292 | 0 | qv1 = _mm512_mullo_epi32(qv1, |
293 | 0 | _mm512_and_si512(SDv1,_mm512_set1_epi32(0xffff))); |
294 | 0 | qv1 = _mm512_add_epi32(qv1, biasv1); |
295 | 0 | Rv1 = _mm512_add_epi32(Rv1, qv1); |
296 | |
|
297 | 0 | qv2 = _mm512_mullo_epi32(qv2, |
298 | 0 | _mm512_and_si512(SDv2, _mm512_set1_epi32(0xffff))); |
299 | 0 | qv2 = _mm512_add_epi32(qv2, biasv2); |
300 | 0 | Rv2 = _mm512_add_epi32(Rv2, qv2); |
301 | 0 | } |
302 | 0 | ptr = (uint8_t *)ptr16; |
303 | 0 | STORE512(Rv, ransN); |
304 | |
|
305 | 0 | for (z = 32-1; z >= 0; z--) |
306 | 0 | RansEncFlush(&ransN[z], &ptr); |
307 | |
|
308 | 0 | empty: |
309 | | // Finalise block size and return it |
310 | 0 | *out_size = (out_end - ptr) + tab_size; |
311 | |
|
312 | 0 | memmove(out + tab_size, ptr, out_end-ptr); |
313 | |
|
314 | 0 | return out; |
315 | 0 | } |
316 | | |
317 | | unsigned char *rans_uncompress_O0_32x16_avx512(unsigned char *in, |
318 | | unsigned int in_size, |
319 | | unsigned char *out, |
320 | 0 | unsigned int out_sz) { |
321 | 0 | if (in_size < 32*4) // 32-states at least |
322 | 0 | return NULL; |
323 | | |
324 | 0 | if (out_sz >= INT_MAX) |
325 | 0 | return NULL; // protect against some overflow cases |
326 | | |
327 | | /* Load in the static tables */ |
328 | 0 | unsigned char *cp = in, *out_free = NULL; |
329 | 0 | unsigned char *cp_end = in + in_size; |
330 | 0 | int i; |
331 | 0 | uint32_t s3[TOTFREQ] __attribute__((aligned(64))); // For TF_SHIFT <= 12 |
332 | |
|
333 | 0 | if (!out) |
334 | 0 | out_free = out = malloc(out_sz); |
335 | 0 | if (!out) |
336 | 0 | return NULL; |
337 | | |
338 | | // Precompute reverse lookup of frequency. |
339 | 0 | uint32_t F[256] = {0}, fsum; |
340 | 0 | int fsz = decode_freq(cp, cp_end, F, &fsum); |
341 | 0 | if (!fsz) |
342 | 0 | goto err; |
343 | 0 | cp += fsz; |
344 | |
|
345 | 0 | normalise_freq_shift(F, fsum, TOTFREQ); |
346 | | |
347 | | // Build symbols; fixme, do as part of decode, see the _d variant |
348 | 0 | if (rans_F_to_s3(F, TF_SHIFT, s3)) |
349 | 0 | goto err; |
350 | | |
351 | 0 | if (cp_end - cp < 32 * 4) |
352 | 0 | goto err; |
353 | | |
354 | 0 | int z; |
355 | 0 | RansState Rv[32] __attribute__((aligned(64))); |
356 | 0 | for (z = 0; z < 32; z++) { |
357 | 0 | RansDecInit(&Rv[z], &cp); |
358 | 0 | if (Rv[z] < RANS_BYTE_L) |
359 | 0 | goto err; |
360 | 0 | } |
361 | | |
362 | 0 | uint16_t *sp = (uint16_t *)cp; |
363 | |
|
364 | 0 | int out_end = (out_sz&~(32-1)); |
365 | 0 | const uint32_t mask = (1u << TF_SHIFT)-1; |
366 | |
|
367 | 0 | __m512i maskv = _mm512_set1_epi32(mask); // set mask in all lanes |
368 | 0 | __m512i R1 = _mm512_load_epi32(&Rv[0]); |
369 | 0 | __m512i R2 = _mm512_load_epi32(&Rv[16]); |
370 | | |
371 | | // Start of the first loop iteration, which we do move to the end of the |
372 | | // loop for the next cycle so we can remove some of the instr. latency. |
373 | 0 | __m512i masked1 = _mm512_and_epi32(R1, maskv); |
374 | 0 | __m512i masked2 = _mm512_and_epi32(R2, maskv); |
375 | 0 | __m512i S1 = _mm512_i32gather_epi32x(masked1, (int *)s3, sizeof(*s3)); |
376 | 0 | __m512i S2 = _mm512_i32gather_epi32x(masked2, (int *)s3, sizeof(*s3)); |
377 | |
|
378 | 0 | uint8_t overflow[64+64] = {0}; |
379 | 0 | for (i=0; i < out_end; i+=32) { |
380 | | //for (z = 0; z < 16; z++) { |
381 | | |
382 | | // Protect against running off the end of in buffer. |
383 | | // We copy it to a worst-case local buffer when near the end. |
384 | 0 | if ((uint8_t *)sp+64 > cp_end) { |
385 | 0 | memmove(overflow, sp, cp_end - (uint8_t *)sp); |
386 | 0 | sp = (uint16_t *)overflow; |
387 | 0 | cp_end = overflow + sizeof(overflow); |
388 | 0 | } |
389 | | |
390 | | //uint32_t S = s3[R[z] & mask]; |
391 | 0 | __m512i renorm_words1 = _mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)sp)); // next 16 words |
392 | | |
393 | | //uint16_t f = S>>(TF_SHIFT+8), b = (S>>8) & mask; |
394 | 0 | __m512i f1 = _mm512_srli_epi32(S1, TF_SHIFT+8); |
395 | 0 | __m512i f2 = _mm512_srli_epi32(S2, TF_SHIFT+8); |
396 | 0 | __m512i b1 = _mm512_and_epi32(_mm512_srli_epi32(S1, 8), maskv); |
397 | 0 | __m512i b2 = _mm512_and_epi32(_mm512_srli_epi32(S2, 8), maskv); |
398 | | |
399 | | //R[z] = f * (R[z] >> TF_SHIFT) + b; |
400 | | // approx 10 cycle latency on mullo. |
401 | 0 | R1 = _mm512_add_epi32( |
402 | 0 | _mm512_mullo_epi32( |
403 | 0 | _mm512_srli_epi32(R1, TF_SHIFT), f1), b1); |
404 | 0 | __mmask16 renorm_mask1, renorm_mask2; |
405 | 0 | renorm_mask1=_mm512_cmplt_epu32_mask(R1, _mm512_set1_epi32(RANS_BYTE_L)); |
406 | 0 | R2 = _mm512_add_epi32( |
407 | 0 | _mm512_mullo_epi32( |
408 | 0 | _mm512_srli_epi32(R2, TF_SHIFT), f2), b2); |
409 | | |
410 | | // renorm. this is the interesting part: |
411 | 0 | renorm_mask2=_mm512_cmplt_epu32_mask(R2, _mm512_set1_epi32(RANS_BYTE_L)); |
412 | | // advance by however many words we actually read |
413 | 0 | sp += _mm_popcnt_u32(renorm_mask1); |
414 | 0 | __m512i renorm_words2 = _mm512_cvtepu16_epi32(_mm256_loadu_si256( |
415 | 0 | (const __m256i *)sp)); |
416 | | |
417 | | // select masked only |
418 | 0 | __m512i renorm_vals1, renorm_vals2; |
419 | |
|
420 | 0 | renorm_vals1 = _mm512_mask_expand_epi32(R1, renorm_mask1, renorm_words1); |
421 | 0 | renorm_vals2 = _mm512_mask_expand_epi32(R2, renorm_mask2, renorm_words2); |
422 | | |
423 | | // For start of next loop iteration. This has been moved here |
424 | | // (and duplicated to before the loop starts) so we can do something |
425 | | // with the latency period of gather, such as finishing up the |
426 | | // renorm offset and writing the results. |
427 | 0 | __m512i S1_ = S1; // temporary copy for use in out[]=S later |
428 | 0 | __m512i S2_ = S2; |
429 | |
|
430 | 0 | masked1 = _mm512_and_epi32(renorm_vals1, maskv); |
431 | 0 | S1 = _mm512_i32gather_epi32x(masked1, (int *)s3, sizeof(*s3)); |
432 | 0 | masked2 = _mm512_and_epi32(renorm_vals2, maskv); |
433 | 0 | S2 = _mm512_i32gather_epi32x(masked2, (int *)s3, sizeof(*s3)); |
434 | |
|
435 | 0 | R1 = _mm512_mask_slli_epi32(R1, renorm_mask1, R1, 16); |
436 | 0 | R2 = _mm512_mask_slli_epi32(R2, renorm_mask2, R2, 16); |
437 | |
|
438 | 0 | __m512i m16 = _mm512_set1_epi32(0xffff); |
439 | 0 | renorm_vals1 = _mm512_maskz_and_epi32(renorm_mask1, renorm_vals1, m16); |
440 | 0 | renorm_vals2 = _mm512_maskz_and_epi32(renorm_mask2, renorm_vals2, m16); |
441 | | |
442 | | // advance by however many words we actually read |
443 | 0 | sp += _mm_popcnt_u32(renorm_mask2); |
444 | |
|
445 | 0 | R1 = _mm512_add_epi32(R1, renorm_vals1); |
446 | 0 | R2 = _mm512_add_epi32(R2, renorm_vals2); |
447 | | |
448 | | //out[i+z] = S; |
449 | 0 | _mm_storeu_si128((__m128i *)(out+i), _mm512_cvtepi32_epi8(S1_)); |
450 | 0 | _mm_storeu_si128((__m128i *)(out+i+16), _mm512_cvtepi32_epi8(S2_)); |
451 | 0 | } |
452 | |
|
453 | 0 | _mm512_store_epi32(&Rv[ 0], R1); |
454 | 0 | _mm512_store_epi32(&Rv[16], R2); |
455 | |
|
456 | 0 | for (z = out_sz & (32-1); z-- > 0; ) |
457 | 0 | out[out_end + z] = s3[Rv[z] & mask]; |
458 | |
|
459 | 0 | return out; |
460 | | |
461 | 0 | err: |
462 | 0 | free(out_free); |
463 | 0 | return NULL; |
464 | 0 | } |
465 | | |
466 | | #define TBUF8 |
467 | | #ifdef TBUF8 |
468 | | // 15% quicker overall O1 decode now due to rot32_simd below. |
469 | | |
470 | | // NB: This uses AVX2 though and we could rewrite using AVX512 for |
471 | | // further speed gains. |
472 | | static inline void transpose_and_copy(uint8_t *out, int iN[32], |
473 | 0 | uint8_t t[32][32]) { |
474 | | // int z; |
475 | | // for (z = 0; z < 32; z++) { |
476 | | // int k; |
477 | | // for (k = 0; k < 32; k++) |
478 | | // out[iN[z]+k] = t[k][z]; |
479 | | // iN[z] += 32; |
480 | | // } |
481 | |
|
482 | 0 | rot32_simd(t, out, iN); |
483 | 0 | } |
484 | | |
485 | | #else |
486 | | // Implemented using AVX512 gathers. |
487 | | // This is faster than a naive scalar implementation, but doesn't beat the |
488 | | // AVX2 vectorised 32x32 transpose function. |
489 | | static inline void transpose_and_copy_avx512(uint8_t *out, int iN[32], |
490 | | uint32_t t32[32][32]) { |
491 | | int z; |
492 | | // for (z = 0; z < 32; z++) { |
493 | | // int k; |
494 | | // for (k = 0; k < 32; k++) |
495 | | // out[iN[z]+k] = t32[k][z]; |
496 | | // iN[z] += 32; |
497 | | // } |
498 | | |
499 | | |
500 | | __m512i v1 = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, |
501 | | 7, 6, 5, 4, 3, 2, 1, 0); |
502 | | v1 = _mm512_slli_epi32(v1, 5); |
503 | | |
504 | | for (z = 0; z < 32; z++) { |
505 | | __m512i t1 = _mm512_i32gather_epi32x(v1, &t32[ 0][z], 4); |
506 | | __m512i t2 = _mm512_i32gather_epi32x(v1, &t32[16][z], 4); |
507 | | _mm_storeu_si128((__m128i*)(&out[iN[z] ]), _mm512_cvtepi32_epi8(t1)); |
508 | | _mm_storeu_si128((__m128i*)(&out[iN[z]+16]), _mm512_cvtepi32_epi8(t2)); |
509 | | iN[z] += 32; |
510 | | } |
511 | | } |
512 | | #endif // TBUF |
513 | | |
514 | | unsigned char *rans_compress_O1_32x16_avx512(unsigned char *in, |
515 | | unsigned int in_size, |
516 | | unsigned char *out, |
517 | 0 | unsigned int *out_size) { |
518 | 0 | unsigned char *cp, *out_end, *out_free = NULL; |
519 | 0 | unsigned int tab_size; |
520 | 0 | uint32_t bound = rans_compress_bound_4x16(in_size,1)-20; |
521 | 0 | int z; |
522 | 0 | RansState ransN[32] __attribute__((aligned(64))); |
523 | |
|
524 | 0 | if (in_size < 32) // force O0 instead |
525 | 0 | return NULL; |
526 | | |
527 | 0 | if (!out) { |
528 | 0 | *out_size = bound; |
529 | 0 | out = malloc(*out_size); |
530 | 0 | } |
531 | 0 | if (!out || bound > *out_size) |
532 | 0 | return NULL; |
533 | | |
534 | 0 | if (((size_t)out)&1) |
535 | 0 | bound--; |
536 | 0 | out_end = out + bound; |
537 | |
|
538 | 0 | RansEncSymbol (*syms)[256] = htscodecs_tls_alloc(256 * (sizeof(*syms))); |
539 | 0 | if (!syms) { |
540 | 0 | free(out_free); |
541 | 0 | return NULL; |
542 | 0 | } |
543 | | |
544 | 0 | cp = out; |
545 | 0 | int shift = encode_freq1(in, in_size, 32, syms, &cp); |
546 | 0 | if (shift < 0) { |
547 | 0 | free(out_free); |
548 | 0 | htscodecs_tls_free(syms); |
549 | 0 | return NULL; |
550 | 0 | } |
551 | 0 | tab_size = cp - out; |
552 | |
|
553 | 0 | for (z = 0; z < 32; z++) |
554 | 0 | RansEncInit(&ransN[z]); |
555 | |
|
556 | 0 | uint8_t* ptr = out_end; |
557 | |
|
558 | 0 | int iN[32] __attribute__((aligned(64))); |
559 | 0 | int isz4 = in_size/32; |
560 | 0 | for (z = 0; z < 32; z++) |
561 | 0 | iN[z] = (z+1)*isz4-2; |
562 | |
|
563 | 0 | uint32_t lN[32] __attribute__((aligned(64))); |
564 | 0 | for (z = 0; z < 32; z++) |
565 | 0 | lN[z] = in[iN[z]+1]; |
566 | | |
567 | | // Deal with the remainder |
568 | 0 | z = 32-1; |
569 | 0 | lN[z] = in[in_size-1]; |
570 | 0 | for (iN[z] = in_size-2; iN[z] > 32*isz4-2; iN[z]--) { |
571 | 0 | unsigned char c = in[iN[z]]; |
572 | 0 | RansEncPutSymbol(&ransN[z], &ptr, &syms[c][lN[z]]); |
573 | 0 | lN[z] = c; |
574 | 0 | } |
575 | |
|
576 | 0 | LOAD512(Rv, ransN); |
577 | |
|
578 | 0 | uint16_t *ptr16 = (uint16_t *)ptr; |
579 | 0 | LOAD512(iN, iN); |
580 | 0 | LOAD512(last, lN); |
581 | |
|
582 | 0 | __m512i c1 = _mm512_i32gather_epi32x1(iN1, in); |
583 | 0 | __m512i c2 = _mm512_i32gather_epi32x1(iN2, in); |
584 | | |
585 | | // We cache the next 64-bytes locally and transpose. |
586 | | // This means we can load 32 ints from t32[x] with load instructions |
587 | | // instead of gathers. The copy, transpose and expand is easier done |
588 | | // in scalar code. |
589 | 0 | #define BATCH 64 |
590 | 0 | uint8_t t32[BATCH][32] __attribute__((aligned(64))); |
591 | 0 | int next_batch; |
592 | 0 | if (iN[0] > BATCH) { |
593 | 0 | int i, j; |
594 | 0 | for (i = 0; i < BATCH; i++) |
595 | | // memcpy(c[i], &in[iN[i]-32], 32); fast mode |
596 | 0 | for (j = 0; j < 32; j++) |
597 | 0 | t32[BATCH-1-i][j] = in[iN[j]-1-i]; |
598 | 0 | next_batch = BATCH; |
599 | 0 | } else { |
600 | 0 | next_batch = -1; |
601 | 0 | } |
602 | |
|
603 | 0 | for (; iN[0] >= 0; iN[0]--) { |
604 | | // Note, consider doing the same approach for the AVX2 encoder. |
605 | | // Maybe we can also get gather working well there? |
606 | | // Gather here is still a major latency bottleneck, consuming |
607 | | // around 40% of CPU cycles overall. |
608 | | |
609 | | // FIXME: maybe we need to cope with in[31] read over-flow |
610 | | // on loop cycles 0, 1, 2 where gather reads 32-bits instead of |
611 | | // 8 bits. Use set instead there on c2? |
612 | | |
613 | | // index into syms[0][0] array, used for x_max, rcp_freq, and bias |
614 | 0 | __m512i vidx1 = _mm512_slli_epi32(c1, 8); |
615 | 0 | __m512i vidx2 = _mm512_slli_epi32(c2, 8); |
616 | 0 | vidx1 = _mm512_add_epi32(vidx1, last1); |
617 | 0 | vidx2 = _mm512_add_epi32(vidx2, last2); |
618 | 0 | vidx1 = _mm512_slli_epi32(vidx1, 2); |
619 | 0 | vidx2 = _mm512_slli_epi32(vidx2, 2); |
620 | | |
621 | | // ------------------------------------------------------------ |
622 | | // for (z = NX-1; z >= 0; z--) { |
623 | | // if (ransN[z] >= x_max[z]) { |
624 | | // *--ptr16 = ransN[z] & 0xffff; |
625 | | // ransN[z] >>= 16; |
626 | | // } |
627 | | // } |
628 | |
|
629 | 0 | #define SET512x(a,x) \ |
630 | 0 | __m512i a##1 = _mm512_i32gather_epi32x(vidx1, &syms[0][0].x, 4); \ |
631 | 0 | __m512i a##2 = _mm512_i32gather_epi32x(vidx2, &syms[0][0].x, 4) |
632 | | |
633 | | // Start of next loop, moved here to remove latency. |
634 | | // last[z] = c[z] |
635 | | // iN[z]-- |
636 | | // c[z] = in[iN[z]] |
637 | 0 | last1 = c1; |
638 | 0 | last2 = c2; |
639 | 0 | iN1 = _mm512_sub_epi32(iN1, _mm512_set1_epi32(1)); |
640 | 0 | iN2 = _mm512_sub_epi32(iN2, _mm512_set1_epi32(1)); |
641 | | |
642 | | // Code below is equivalent to this: |
643 | | // c1 = _mm512_i32gather_epi32(iN1, in, 1); |
644 | | // c2 = _mm512_i32gather_epi32(iN2, in, 1); |
645 | | |
646 | | // Better when we have a power of 2 |
647 | 0 | if (next_batch >= 0) { |
648 | 0 | if (--next_batch < 0 && iN[0] > BATCH) { |
649 | | // Load 32 BATCH blocks of data. |
650 | | // Executed once every BATCH cycles |
651 | 0 | int i, j; |
652 | 0 | uint8_t c[32][BATCH]; |
653 | 0 | iN[0] += BATCH; |
654 | 0 | for (j = 0; j < 32; j++) { |
655 | 0 | iN[j] -= BATCH; |
656 | 0 | memcpy(c[j], &in[iN[j]-BATCH], BATCH); |
657 | 0 | } |
658 | | // transpose matrix from 32xBATCH to BATCHx32 |
659 | 0 | for (j = 0; j < 32; j++) { |
660 | 0 | for (i = 0; i < BATCH; i+=16) { |
661 | 0 | int z; |
662 | 0 | for (z = 0; z < 16; z++) |
663 | 0 | t32[i+z][j] = c[j][i+z]; |
664 | 0 | } |
665 | 0 | } |
666 | 0 | next_batch = BATCH-1; |
667 | 0 | } |
668 | 0 | if (next_batch >= 0) { |
669 | | // Copy from one of our pre-loaded BATCHx32 tables |
670 | | // Executed every cycle |
671 | 0 | __m128i c1_ = _mm_load_si128((__m128i *)&t32[next_batch][0]); |
672 | 0 | __m128i c2_ = _mm_load_si128((__m128i *)&t32[next_batch][16]); |
673 | 0 | c1 = _mm512_cvtepu8_epi32(c1_); |
674 | 0 | c2 = _mm512_cvtepu8_epi32(c2_); |
675 | 0 | } |
676 | 0 | } |
677 | |
|
678 | 0 | if (next_batch < 0 && iN[0]) { |
679 | | // no pre-loaded data as within BATCHx32 of input end |
680 | 0 | c1 = _mm512_i32gather_epi32x1(iN1, in); |
681 | 0 | c2 = _mm512_i32gather_epi32x1(iN2, in); |
682 | | |
683 | | // Speeds up clang, even though not needed any more. |
684 | | // Harmless to leave here. |
685 | 0 | c1 = _mm512_and_si512(c1, _mm512_set1_epi32(0xff)); |
686 | 0 | c2 = _mm512_and_si512(c2, _mm512_set1_epi32(0xff)); |
687 | 0 | } |
688 | | // End of "equivalent to" code block |
689 | |
|
690 | 0 | SET512x(xmax, x_max); // high latency |
691 | |
|
692 | 0 | uint16_t gt_mask1 = _mm512_cmpgt_epi32_mask(Rv1, xmax1); |
693 | 0 | int pc1 = _mm_popcnt_u32(gt_mask1); |
694 | 0 | __m512i Rp1 = _mm512_and_si512(Rv1, _mm512_set1_epi32(0xffff)); |
695 | 0 | __m512i Rp2 = _mm512_and_si512(Rv2, _mm512_set1_epi32(0xffff)); |
696 | 0 | uint16_t gt_mask2 = _mm512_cmpgt_epi32_mask(Rv2, xmax2); |
697 | 0 | SET512x(SDv, cmpl_freq); // good |
698 | 0 | int pc2 = _mm_popcnt_u32(gt_mask2); |
699 | |
|
700 | 0 | Rp1 = _mm512_maskz_compress_epi32(gt_mask1, Rp1); |
701 | 0 | Rp2 = _mm512_maskz_compress_epi32(gt_mask2, Rp2); |
702 | |
|
703 | 0 | _mm512_mask_cvtepi32_storeu_epi16(ptr16-pc2, (1<<pc2)-1, Rp2); |
704 | 0 | ptr16 -= pc2; |
705 | 0 | _mm512_mask_cvtepi32_storeu_epi16(ptr16-pc1, (1<<pc1)-1, Rp1); |
706 | 0 | ptr16 -= pc1; |
707 | |
|
708 | 0 | Rv1 = _mm512_mask_srli_epi32(Rv1, gt_mask1, Rv1, 16); |
709 | 0 | Rv2 = _mm512_mask_srli_epi32(Rv2, gt_mask2, Rv2, 16); |
710 | | |
711 | | // ------------------------------------------------------------ |
712 | | // uint32_t q = (uint32_t) (((uint64_t)ransN[z] * rcp_freq[z]) |
713 | | // >> rcp_shift[z]); |
714 | | // ransN[z] = ransN[z] + bias[z] + q * cmpl_freq[z]; |
715 | 0 | SET512x(rfv, rcp_freq); // good-ish |
716 | |
|
717 | 0 | __m512i rf1_hm = _mm512_mul_epu32(_mm512_srli_epi64(Rv1, 32), |
718 | 0 | _mm512_srli_epi64(rfv1, 32)); |
719 | 0 | __m512i rf2_hm = _mm512_mul_epu32(_mm512_srli_epi64(Rv2, 32), |
720 | 0 | _mm512_srli_epi64(rfv2, 32)); |
721 | 0 | __m512i rf1_lm = _mm512_srli_epi64(_mm512_mul_epu32(Rv1, rfv1), 32); |
722 | 0 | __m512i rf2_lm = _mm512_srli_epi64(_mm512_mul_epu32(Rv2, rfv2), 32); |
723 | |
|
724 | 0 | const __m512i top32 = _mm512_set1_epi64((uint64_t)0xffffffff00000000); |
725 | 0 | rf1_hm = _mm512_and_epi32(rf1_hm, top32); |
726 | 0 | rf2_hm = _mm512_and_epi32(rf2_hm, top32); |
727 | 0 | rfv1 = _mm512_or_epi32(rf1_lm, rf1_hm); |
728 | 0 | rfv2 = _mm512_or_epi32(rf2_lm, rf2_hm); |
729 | |
|
730 | 0 | SET512x(biasv, bias); // good |
731 | 0 | __m512i shiftv1 = _mm512_srli_epi32(SDv1, 16); |
732 | 0 | __m512i shiftv2 = _mm512_srli_epi32(SDv2, 16); |
733 | |
|
734 | 0 | shiftv1 = _mm512_sub_epi32(shiftv1, _mm512_set1_epi32(32)); |
735 | 0 | shiftv2 = _mm512_sub_epi32(shiftv2, _mm512_set1_epi32(32)); |
736 | |
|
737 | 0 | __m512i qv1 = _mm512_srlv_epi32(rfv1, shiftv1); |
738 | 0 | __m512i qv2 = _mm512_srlv_epi32(rfv2, shiftv2); |
739 | |
|
740 | 0 | const __m512i bot16 = _mm512_set1_epi32(0xffff); |
741 | 0 | qv1 = _mm512_mullo_epi32(qv1, _mm512_and_si512(SDv1, bot16)); |
742 | 0 | qv2 = _mm512_mullo_epi32(qv2, _mm512_and_si512(SDv2, bot16)); |
743 | |
|
744 | 0 | qv1 = _mm512_add_epi32(qv1, biasv1); |
745 | 0 | Rv1 = _mm512_add_epi32(Rv1, qv1); |
746 | |
|
747 | 0 | qv2 = _mm512_add_epi32(qv2, biasv2); |
748 | 0 | Rv2 = _mm512_add_epi32(Rv2, qv2); |
749 | 0 | } |
750 | |
|
751 | 0 | STORE512(Rv, ransN); |
752 | 0 | STORE512(last, lN); |
753 | |
|
754 | 0 | ptr = (uint8_t *)ptr16; |
755 | |
|
756 | 0 | for (z = 32-1; z>=0; z--) |
757 | 0 | RansEncPutSymbol(&ransN[z], &ptr, &syms[0][lN[z]]); |
758 | |
|
759 | 0 | for (z = 32-1; z >= 0; z--) |
760 | 0 | RansEncFlush(&ransN[z], &ptr); |
761 | | |
762 | | // Finalise block size and return it |
763 | 0 | *out_size = (out_end - ptr) + tab_size; |
764 | | |
765 | | // cp = out; |
766 | | // *cp++ = (in_size>> 0) & 0xff; |
767 | | // *cp++ = (in_size>> 8) & 0xff; |
768 | | // *cp++ = (in_size>>16) & 0xff; |
769 | | // *cp++ = (in_size>>24) & 0xff; |
770 | |
|
771 | 0 | memmove(out + tab_size, ptr, out_end-ptr); |
772 | |
|
773 | 0 | htscodecs_tls_free(syms); |
774 | 0 | return out; |
775 | 0 | } |
776 | | |
777 | 0 | #define NX 32 |
/*
 * AVX512 order-1 rANS decoder using NX (32) interleaved rANS states,
 * processed as two 16-lane __m512i register pairs (suffixes 1 and 2).
 *
 * in/in_size   compressed block (frequency tables + rANS stream).
 * out/out_sz   output buffer and the exact uncompressed size; if out is
 *              NULL a buffer of out_sz bytes is malloced (ownership
 *              transfers to the caller).
 * Returns out (or the malloced buffer) on success, NULL on error.
 */
unsigned char *rans_uncompress_O1_32x16_avx512(unsigned char *in,
                                               unsigned int in_size,
                                               unsigned char *out,
                                               unsigned int out_sz) {
    if (in_size < NX*4) // 4-states at least
        return NULL;

    if (out_sz >= INT_MAX)
        return NULL; // protect against some overflow cases

    /* Load in the static tables */
    unsigned char *cp = in, *cp_end = in+in_size, *out_free = NULL;
    unsigned char *c_freq = NULL;

    // s3[last_sym][r & mask] packs freq, bias and symbol into one uint32_t:
    // freq in the top bits (>> shift+8), bias in bits 8..shift+7, sym in
    // the low 8 bits.  Allocated for the larger TF_SHIFT_O1 layout.
    uint32_t (*s3)[TOTFREQ_O1] = htscodecs_tls_alloc(256*TOTFREQ_O1*4);
    if (!s3)
        return NULL;
    // Same memory viewed with the smaller TOTFREQ_O1_FAST row stride.
    uint32_t (*s3F)[TOTFREQ_O1_FAST] = (uint32_t (*)[TOTFREQ_O1_FAST])s3;

    if (!out)
        out_free = out = malloc(out_sz);

    if (!out)
        goto err;

    //fprintf(stderr, "out_sz=%d\n", out_sz);

    // compressed header? If so uncompress it
    unsigned char *tab_end = NULL;
    unsigned char *c_freq_end = cp_end;
    unsigned int shift = *cp >> 4;  // TF_SHIFT_O1 or TF_SHIFT_O1_FAST
    if (*cp++ & 1) {
        uint32_t u_freq_sz, c_freq_sz;
        cp += var_get_u32(cp, cp_end, &u_freq_sz);
        cp += var_get_u32(cp, cp_end, &c_freq_sz);
        if (c_freq_sz > cp_end - cp)
            goto err;
        tab_end = cp + c_freq_sz;
        if (!(c_freq = rans_uncompress_O0_4x16(cp, c_freq_sz, NULL,
                                               u_freq_sz)))
            goto err;
        cp = c_freq;
        c_freq_end = c_freq + u_freq_sz;
    }

    // Decode order-0 symbol list; avoids needing in order-1 tables
    cp += decode_freq1(cp, c_freq_end, shift, s3, s3F, NULL, NULL);

    if (tab_end)
        cp = tab_end;
    free(c_freq);
    c_freq = NULL;

    if (cp_end - cp < NX * 4)
        goto err;

    // Initialise the 32 rANS states; each must start >= RANS_BYTE_L or
    // the stream is corrupt.
    RansState R[NX] __attribute__((aligned(64)));
    uint8_t *ptr = cp, *ptr_end = in + in_size;
    int z;
    for (z = 0; z < NX; z++) {
        RansDecInit(&R[z], &ptr);
        if (R[z] < RANS_BYTE_L)
            goto err;
    }

    // Each state decodes an out_sz/NX slice of the output; iN[] are the
    // per-state output cursors, lN[] the per-state previous symbols.
    int isz4 = out_sz/NX;
    int iN[NX], lN[NX] __attribute__((aligned(64))) = {0};
    for (z = 0; z < NX; z++)
        iN[z] = z*isz4;

    // Renormalisation consumes the remaining input as 16-bit words.
    uint16_t *sp = (uint16_t *)ptr;
    const uint32_t mask = (1u << shift)-1;

    __m512i _maskv = _mm512_set1_epi32(mask);
    // LOAD512(x, p) populates the register pair x##1/x##2 (16 lanes each).
    LOAD512(_Rv, R);
    LOAD512(_Lv, lN);

#ifdef TBUF8
    // 32x32 transpose staging buffer: row T holds the 32 symbols decoded
    // in iteration T; columns map back to the per-state output slices.
    union {
        unsigned char tbuf[32][32];
        uint64_t tbuf64[32][4];
    } u __attribute__((aligned(32)));
#else
    uint32_t tbuf[32][32];
#endif

    unsigned int tidx = 0;  // current row in the transpose buffer

    if (shift == TF_SHIFT_O1) {
        // 12-bit frequency variant.
        // SIMD loop stops 64 symbols short of each slice end (and 64 bytes
        // short of the input end) so the unguarded vector loads below can
        // never read past the buffer; a scalar loop finishes the job.
        isz4 -= 64;
        for (; iN[0] < isz4 && (uint8_t *)sp+64 < ptr_end; ) {
            // m[z] = R[z] & mask;
            __m512i _masked1 = _mm512_and_si512(_Rv1, _maskv);
            __m512i _masked2 = _mm512_and_si512(_Rv2, _maskv);

            //  S[z] = s3[lN[z]][m[z]];
            _Lv1 = _mm512_slli_epi32(_Lv1, TF_SHIFT_O1);
            _Lv2 = _mm512_slli_epi32(_Lv2, TF_SHIFT_O1);

            _masked1 = _mm512_add_epi32(_masked1, _Lv1);
            _masked2 = _mm512_add_epi32(_masked2, _Lv2);

            // This is the biggest bottleneck.
            // (s3F aliases s3 and both have 4-byte elements; the index is
            // the flat lN*TOTFREQ_O1 + m computed above, so gathering via
            // s3F here is equivalent to s3.)
            __m512i _Sv1 = _mm512_i32gather_epi32x(_masked1, (int *)&s3F[0][0],
                                                   sizeof(s3F[0][0]));
            __m512i _Sv2 = _mm512_i32gather_epi32x(_masked2, (int *)&s3F[0][0],
                                                   sizeof(s3F[0][0]));

            //  f[z] = S[z]>>(TF_SHIFT_O1+8);
            __m512i _fv1 = _mm512_srli_epi32(_Sv1, TF_SHIFT_O1+8);
            __m512i _fv2 = _mm512_srli_epi32(_Sv2, TF_SHIFT_O1+8);

            //  b[z] = (S[z]>>8) & mask;
            __m512i _bv1 = _mm512_and_si512(_mm512_srli_epi32(_Sv1,8), _maskv);
            __m512i _bv2 = _mm512_and_si512(_mm512_srli_epi32(_Sv2,8), _maskv);

            //  s[z] = S[z] & 0xff;
            __m512i _sv1 = _mm512_and_si512(_Sv1, _mm512_set1_epi32(0xff));
            __m512i _sv2 = _mm512_and_si512(_Sv2, _mm512_set1_epi32(0xff));

            // A maximum frequency of 4096 doesn't fit in our s3 array.
            // as it's 12 bit + 12 bit + 8 bit.  It wraps around to zero.
            // (We don't have this issue for TOTFREQ_O1_FAST.)
            //
            // Solution 1 is to change to spec to forbid freq of 4096.
            // Easy hack is to add an extra symbol so it sums correctly.
            // => 572 MB/s on q40 (deskpro).
            //
            // Solution 2 implemented here is to look for the wrap around
            // and fix it.
            // => 556 MB/s on q40
            // cope with max freq of 4096.  Only 3% hit
            __m512i max_freq = _mm512_set1_epi32(TOTFREQ_O1);
            __m512i zero = _mm512_setzero_si512();
            __mmask16 cmp1 = _mm512_cmpeq_epi32_mask(_fv1, zero);
            __mmask16 cmp2 = _mm512_cmpeq_epi32_mask(_fv2, zero);
            _fv1 = _mm512_mask_blend_epi32(cmp1, _fv1, max_freq);
            _fv2 = _mm512_mask_blend_epi32(cmp2, _fv2, max_freq);

            //  R[z] = f[z] * (R[z] >> TF_SHIFT_O1) + b[z];
            _Rv1 = _mm512_add_epi32(
                       _mm512_mullo_epi32(
                           _mm512_srli_epi32(_Rv1,TF_SHIFT_O1), _fv1), _bv1);
            _Rv2 = _mm512_add_epi32(
                       _mm512_mullo_epi32(
                           _mm512_srli_epi32(_Rv2,TF_SHIFT_O1), _fv2), _bv2);

            //for (z = 0; z < NX; z++) lN[z] = c[z];
            _Lv1 = _sv1;
            _Lv2 = _sv2;

            // RansDecRenorm(&R[z], &ptr);
            // XOR with 0x80000000 converts the unsigned R < RANS_BYTE_L
            // test into a signed compare, which AVX512 supports directly.
            __m512i _renorm_mask1 = _mm512_xor_si512(_Rv1,
                                        _mm512_set1_epi32(0x80000000));
            __m512i _renorm_mask2 = _mm512_xor_si512(_Rv2,
                                        _mm512_set1_epi32(0x80000000));

            int _imask1 =_mm512_cmpgt_epi32_mask
                (_mm512_set1_epi32(RANS_BYTE_L-0x80000000), _renorm_mask1);
            int _imask2 = _mm512_cmpgt_epi32_mask
                (_mm512_set1_epi32(RANS_BYTE_L-0x80000000), _renorm_mask2);

            // Load 16 candidate renorm words and advance sp by only the
            // number of lanes that actually renormalise (popcount of mask).
            __m512i renorm_words1 = _mm512_cvtepu16_epi32
                (_mm256_loadu_si256((const __m256i *)sp));
            sp += _mm_popcnt_u32(_imask1);

            __m512i renorm_words2 = _mm512_cvtepu16_epi32
                (_mm256_loadu_si256((const __m256i *)sp));
            sp += _mm_popcnt_u32(_imask2);

            // Scatter the consumed words to the renormalising lanes, in
            // lane order; other lanes get zero.
            __m512i _renorm_vals1 =
                _mm512_maskz_expand_epi32(_imask1, renorm_words1);
            __m512i _renorm_vals2 =
                _mm512_maskz_expand_epi32(_imask2, renorm_words2);

            // R[z] = (R[z] << 16) | word, for renormalising lanes only.
            _Rv1 = _mm512_mask_slli_epi32(_Rv1, _imask1, _Rv1, 16);
            _Rv2 = _mm512_mask_slli_epi32(_Rv2, _imask2, _Rv2, 16);

            _Rv1 = _mm512_add_epi32(_Rv1, _renorm_vals1);
            _Rv2 = _mm512_add_epi32(_Rv2, _renorm_vals2);

#ifdef TBUF8
            _mm_storeu_si128((__m128i *)(&u.tbuf64[tidx][0]),
                             _mm512_cvtepi32_epi8(_Sv1)); // or _sv1?
            _mm_storeu_si128((__m128i *)(&u.tbuf64[tidx][2]),
                             _mm512_cvtepi32_epi8(_Sv2));
#else
            _mm512_storeu_si512((__m512i *)(&tbuf[tidx][ 0]), _sv1);
            _mm512_storeu_si512((__m512i *)(&tbuf[tidx][16]), _sv2);
#endif

            iN[0]++;
            if (++tidx == 32) {
                iN[0]-=32;

                // We have tidx[x][y] which we want to store in
                // memory in out[y][z] instead.  This is an unrolled
                // transposition.
#ifdef TBUF8
                transpose_and_copy(out, iN, u.tbuf);
#else
                transpose_and_copy_avx512(out, iN, tbuf);
#endif
                tidx = 0;
            }
        }
        isz4 += 64;

        STORE512(_Rv, R);
        STORE512(_Lv, lN);
        ptr = (uint8_t *)sp;

        // Drain any partially filled transpose buffer (tidx rows).
        if (1) {
            iN[0]-=tidx;
            int T;
            for (z = 0; z < NX; z++)
                for (T = 0; T < tidx; T++)
#ifdef TBUF8
                    out[iN[z]++] = u.tbuf[T][z];
#else
                    out[iN[z]++] = tbuf[T][z];
#endif
        }

        // Scalar version for close to the end of in[] array so we don't
        // do SIMD loads beyond the end of the buffer
        for (; iN[0] < isz4;) {
            for (z = 0; z < NX; z++) {
                uint32_t m = R[z] & ((1u<<TF_SHIFT_O1)-1);
                uint32_t S = s3[lN[z]][m];
                unsigned char c = S & 0xff;
                out[iN[z]++] = c;
                uint32_t F = S>>(TF_SHIFT_O1+8);
                // F==0 is the wrapped-around freq of 4096; see above.
                R[z] = (F?F:4096) * (R[z]>>TF_SHIFT_O1) +
                    ((S>>8) & ((1u<<TF_SHIFT_O1)-1));
                RansDecRenormSafe(&R[z], &ptr, ptr_end);
                lN[z] = c;
            }
        }

        // Remainder: out_sz may not divide by NX, so the last state
        // decodes the final out_sz %% NX bytes too.
        z = NX-1;
        for (; iN[z] < out_sz; ) {
            uint32_t m = R[z] & ((1u<<TF_SHIFT_O1)-1);
            uint32_t S = s3[lN[z]][m];
            unsigned char c = S & 0xff;
            out[iN[z]++] = c;
            uint32_t F = S>>(TF_SHIFT_O1+8);
            R[z] = (F?F:4096) * (R[z]>>TF_SHIFT_O1) +
                ((S>>8) & ((1u<<TF_SHIFT_O1)-1));
            RansDecRenormSafe(&R[z], &ptr, ptr_end);
            lN[z] = c;
        }
    } else {
        // TF_SHIFT_O1_FAST.  This is the most commonly used variant.
        // Identical structure to the branch above but with the smaller
        // shift, no freq==4096 wrap fixup needed.

        // SIMD version ends decoding early as it reads at most 64 bytes
        // from input via 4 vectorised loads.
        isz4 -= 64;
        for (; iN[0] < isz4 && (uint8_t *)sp+64 < ptr_end; ) {
            // m[z] = R[z] & mask;
            __m512i _masked1 = _mm512_and_si512(_Rv1, _maskv);
            __m512i _masked2 = _mm512_and_si512(_Rv2, _maskv);

            //  S[z] = s3[lN[z]][m[z]];
            _Lv1 = _mm512_slli_epi32(_Lv1, TF_SHIFT_O1_FAST);
            _Lv2 = _mm512_slli_epi32(_Lv2, TF_SHIFT_O1_FAST);

            _masked1 = _mm512_add_epi32(_masked1, _Lv1);
            _masked2 = _mm512_add_epi32(_masked2, _Lv2);

            // This is the biggest bottleneck
            __m512i _Sv1 = _mm512_i32gather_epi32x(_masked1, (int *)&s3F[0][0],
                                                   sizeof(s3F[0][0]));
            __m512i _Sv2 = _mm512_i32gather_epi32x(_masked2, (int *)&s3F[0][0],
                                                   sizeof(s3F[0][0]));

            //  f[z] = S[z]>>(TF_SHIFT_O1+8);
            __m512i _fv1 = _mm512_srli_epi32(_Sv1, TF_SHIFT_O1_FAST+8);
            __m512i _fv2 = _mm512_srli_epi32(_Sv2, TF_SHIFT_O1_FAST+8);

            //  b[z] = (S[z]>>8) & mask;
            __m512i _bv1 = _mm512_and_si512(_mm512_srli_epi32(_Sv1,8), _maskv);
            __m512i _bv2 = _mm512_and_si512(_mm512_srli_epi32(_Sv2,8), _maskv);

            //  s[z] = S[z] & 0xff;
            __m512i _sv1 = _mm512_and_si512(_Sv1, _mm512_set1_epi32(0xff));
            __m512i _sv2 = _mm512_and_si512(_Sv2, _mm512_set1_epi32(0xff));

            //  R[z] = f[z] * (R[z] >> TF_SHIFT_O1) + b[z];
            _Rv1 = _mm512_add_epi32(
                       _mm512_mullo_epi32(
                           _mm512_srli_epi32(_Rv1,TF_SHIFT_O1_FAST),
                           _fv1), _bv1);
            _Rv2 = _mm512_add_epi32(
                       _mm512_mullo_epi32(
                           _mm512_srli_epi32(_Rv2,TF_SHIFT_O1_FAST),
                           _fv2), _bv2);

            //for (z = 0; z < NX; z++) lN[z] = c[z];
            _Lv1 = _sv1;
            _Lv2 = _sv2;

            // RansDecRenorm(&R[z], &ptr);
            // Signed-compare trick as in the branch above.
            __m512i _renorm_mask1 = _mm512_xor_si512(_Rv1,
                                        _mm512_set1_epi32(0x80000000));
            __m512i _renorm_mask2 = _mm512_xor_si512(_Rv2,
                                        _mm512_set1_epi32(0x80000000));

            int _imask1 =_mm512_cmpgt_epi32_mask
                (_mm512_set1_epi32(RANS_BYTE_L-0x80000000), _renorm_mask1);
            int _imask2 = _mm512_cmpgt_epi32_mask
                (_mm512_set1_epi32(RANS_BYTE_L-0x80000000), _renorm_mask2);

            __m512i renorm_words1 = _mm512_cvtepu16_epi32
                (_mm256_loadu_si256((const __m256i *)sp));
            sp += _mm_popcnt_u32(_imask1);

            __m512i renorm_words2 = _mm512_cvtepu16_epi32
                (_mm256_loadu_si256((const __m256i *)sp));
            sp += _mm_popcnt_u32(_imask2);

            __m512i _renorm_vals1 =
                _mm512_maskz_expand_epi32(_imask1, renorm_words1);
            __m512i _renorm_vals2 =
                _mm512_maskz_expand_epi32(_imask2, renorm_words2);

            _Rv1 = _mm512_mask_slli_epi32(_Rv1, _imask1, _Rv1, 16);
            _Rv2 = _mm512_mask_slli_epi32(_Rv2, _imask2, _Rv2, 16);

            _Rv1 = _mm512_add_epi32(_Rv1, _renorm_vals1);
            _Rv2 = _mm512_add_epi32(_Rv2, _renorm_vals2);

#ifdef TBUF8
            _mm_storeu_si128((__m128i *)(&u.tbuf64[tidx][0]),
                             _mm512_cvtepi32_epi8(_Sv1)); // or _sv1?
            _mm_storeu_si128((__m128i *)(&u.tbuf64[tidx][2]),
                             _mm512_cvtepi32_epi8(_Sv2));
#else
            _mm512_storeu_si512((__m512i *)(&tbuf[tidx][ 0]), _sv1);
            _mm512_storeu_si512((__m512i *)(&tbuf[tidx][16]), _sv2);
#endif

            iN[0]++;
            if (++tidx == 32) {
                iN[0]-=32;
#ifdef TBUF8
                transpose_and_copy(out, iN, u.tbuf);
#else
                transpose_and_copy_avx512(out, iN, tbuf);
#endif
                tidx = 0;
            }
        }
        isz4 += 64;

        STORE512(_Rv, R);
        STORE512(_Lv, lN);
        ptr = (uint8_t *)sp;

        // Drain any partially filled transpose buffer (tidx rows).
        if (1) {
            iN[0]-=tidx;
            int T;
            for (z = 0; z < NX; z++)
                for (T = 0; T < tidx; T++)
#ifdef TBUF8
                    out[iN[z]++] = u.tbuf[T][z];
#else
                    out[iN[z]++] = tbuf[T][z];
#endif
        }

        // Scalar version for close to the end of in[] array so we don't
        // do SIMD loads beyond the end of the buffer
        for (; iN[0] < isz4;) {
            for (z = 0; z < NX; z++) {
                uint32_t m = R[z] & ((1u<<TF_SHIFT_O1_FAST)-1);
                uint32_t S = s3F[lN[z]][m];
                unsigned char c = S & 0xff;
                out[iN[z]++] = c;
                R[z] = (S>>(TF_SHIFT_O1_FAST+8)) * (R[z]>>TF_SHIFT_O1_FAST) +
                    ((S>>8) & ((1u<<TF_SHIFT_O1_FAST)-1));
                RansDecRenormSafe(&R[z], &ptr, ptr_end);
                lN[z] = c;
            }
        }

        // Remainder: the last state also covers the out_sz %% NX tail.
        z = NX-1;
        for (; iN[z] < out_sz; ) {
            uint32_t m = R[z] & ((1u<<TF_SHIFT_O1_FAST)-1);
            uint32_t S = s3F[lN[z]][m];
            unsigned char c = S & 0xff;
            out[iN[z]++] = c;
            R[z] = (S>>(TF_SHIFT_O1_FAST+8)) * (R[z]>>TF_SHIFT_O1_FAST) +
                ((S>>8) & ((1u<<TF_SHIFT_O1_FAST)-1));
            RansDecRenormSafe(&R[z], &ptr, ptr_end);
            lN[z] = c;
        }
    }

    htscodecs_tls_free(s3);
    return out;

 err:
    htscodecs_tls_free(s3);
    free(out_free);  // free(NULL) is a no-op if caller supplied out
    free(c_freq);

    return NULL;
}
1189 | | #else // HAVE_AVX512 |
1190 | | // Prevent "empty translation unit" errors when building without AVX512 |
1191 | | const char *rANS_static32x16pr_avx512_disabled = "No AVX512"; |
1192 | | #endif // HAVE_AVX512 |