/src/htslib/htscodecs/htscodecs/rANS_static32x16pr_avx512.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2017-2021 Genome Research Ltd. |
3 | | * Author(s): James Bonfield |
4 | | * |
5 | | * Redistribution and use in source and binary forms, with or without |
6 | | * modification, are permitted provided that the following conditions are met: |
7 | | * |
8 | | * 1. Redistributions of source code must retain the above copyright notice, |
9 | | * this list of conditions and the following disclaimer. |
10 | | * |
11 | | * 2. Redistributions in binary form must reproduce the above |
12 | | * copyright notice, this list of conditions and the following |
13 | | * disclaimer in the documentation and/or other materials provided |
14 | | * with the distribution. |
15 | | * |
16 | | * 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger |
17 | | * Institute nor the names of its contributors may be used to endorse |
18 | | * or promote products derived from this software without specific |
19 | | * prior written permission. |
20 | | * |
21 | | * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS |
22 | | * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED |
23 | | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A |
24 | | * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH |
25 | | * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
26 | | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
27 | | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
28 | | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
29 | | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
30 | | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
31 | | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
32 | | */ |
33 | | |
34 | | /* |
35 | | * This is an AVX512 implementation of the 32-way interleaved 16-bit rANS. |
36 | | * It contains both order-0 and order-1 implementations; the order-1 |
37 | | * decoder still uses the AVX2 rot32_simd helper for its transpose. |
38 | | */ |
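 | |
 | | /*
 | |  * For orientation, a scalar sketch of the per-state rANS step that the
 | |  * SIMD code below vectorises (illustrative only; names follow the usual
 | |  * htscodecs conventions rather than any exact function here):
 | |  *
 | |  *   // Encode symbol s into state x, renormalising first:
 | |  *   if (x > x_max[s]) { *--ptr16 = x & 0xffff; x >>= 16; }
 | |  *   x = ((x / freq[s]) << TF_SHIFT) + cumfreq[s] + (x % freq[s]);
 | |  *
 | |  *   // Decode one symbol from state x, renormalising after:
 | |  *   m = x & ((1 << TF_SHIFT) - 1);
 | |  *   s = sym[m];
 | |  *   x = freq[s] * (x >> TF_SHIFT) + m - cumfreq[s];
 | |  *   if (x < RANS_BYTE_L) x = (x << 16) | *sp++;
 | |  *
 | |  * The 32 interleaved states live in two 16-lane 512-bit registers.
 | |  */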
39 | | |
40 | | #include "config.h" |
41 | | |
42 | | #ifdef HAVE_AVX512 |
43 | | |
44 | | #include <stdint.h> |
45 | | #include <stdlib.h> |
46 | | #include <stdio.h> |
47 | | #include <string.h> |
48 | | #include <limits.h> |
49 | | #include <x86intrin.h> |
50 | | |
51 | | #include "rANS_word.h" |
52 | | #include "rANS_static4x16.h" |
53 | | #define ROT32_SIMD |
54 | | #include "rANS_static16_int.h" |
55 | | #include "varint.h" |
56 | | #include "utils.h" |
57 | | |
58 | | unsigned char *rans_compress_O0_32x16_avx512(unsigned char *in, |
59 | | unsigned int in_size, |
60 | | unsigned char *out, |
61 | 0 | unsigned int *out_size) { |
62 | 0 | unsigned char *cp, *out_end; |
63 | 0 | RansEncSymbol syms[256]; |
64 | 0 | RansState ransN[32] __attribute__((aligned(64))); |
65 | 0 | uint8_t* ptr; |
66 | 0 | uint32_t F[256+MAGIC] = {0}; |
67 | 0 | int i, j, tab_size = 0, x, z; |
68 | | // -20 for order/size/meta |
69 | 0 | uint32_t bound = rans_compress_bound_4x16(in_size,0)-20; |
70 | |
|
71 | 0 | if (!out) { |
72 | 0 | *out_size = bound; |
73 | 0 | out = malloc(*out_size); |
74 | 0 | } |
75 | 0 | if (!out || bound > *out_size) |
76 | 0 | return NULL; |
77 | | |
78 | | // If "out" isn't word aligned, tweak out_end/ptr to ensure it is.
79 | | // We already added extra slack in bound to allow for this.
80 | 0 | if (((size_t)out)&1) |
81 | 0 | bound--; |
82 | 0 | ptr = out_end = out + bound; |
83 | |
|
84 | 0 | if (in_size == 0) |
85 | 0 | goto empty; |
86 | | |
87 | | // Compute statistics |
88 | 0 | hist8(in, in_size, F); |
89 | | |
90 | | // Normalise so frequencies sum to a power of 2
91 | 0 | uint32_t fsum = in_size; |
92 | 0 | uint32_t max_val = round2(fsum); |
93 | 0 | if (max_val > TOTFREQ) |
94 | 0 | max_val = TOTFREQ; |
95 | |
|
96 | 0 | if (normalise_freq(F, fsum, max_val) < 0) |
97 | 0 | return NULL; |
98 | 0 | fsum=max_val; |
99 | |
|
100 | 0 | cp = out; |
101 | 0 | cp += encode_freq(cp, F); |
102 | 0 | tab_size = cp-out; |
103 | | //write(2, out+4, cp-(out+4)); |
104 | |
|
105 | 0 | if (normalise_freq(F, fsum, TOTFREQ) < 0) |
106 | 0 | return NULL; |
107 | | |
108 | | // Encode statistics and build lookup tables for SIMD encoding. |
109 | 0 | uint32_t SB[256], SA[256], SD[256], SC[256]; |
110 | 0 | for (x = j = 0; j < 256; j++) { |
111 | 0 | if (F[j]) { |
112 | 0 | RansEncSymbolInit(&syms[j], x, F[j], TF_SHIFT); |
113 | 0 | SB[j] = syms[j].x_max; |
114 | 0 | SA[j] = syms[j].rcp_freq; |
115 | 0 | SD[j] = (syms[j].cmpl_freq<<0) | ((syms[j].rcp_shift-32)<<16); |
116 | 0 | SC[j] = syms[j].bias; |
117 | 0 | x += F[j]; |
118 | 0 | } |
119 | 0 | } |
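 | |
 | | // The four flat tables mirror RansEncSymbol fields so the per-lane
 | | // gathers below can fetch them by symbol:
 | | //   SB[j] = x_max                 (renormalisation threshold)
 | | //   SA[j] = rcp_freq              (fixed-point reciprocal frequency)
 | | //   SD[j] = cmpl_freq | (rcp_shift-32) << 16
 | | //   SC[j] = bias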
120 | |
|
121 | 0 | for (z = 0; z < 32; z++) |
122 | 0 | RansEncInit(&ransN[z]); |
123 | |
|
124 | 0 | z = i = in_size&(32-1); |
125 | 0 | while (z-- > 0) |
126 | 0 | RansEncPutSymbol(&ransN[z], &ptr, &syms[in[in_size-(i-z)]]); |
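 | |
 | | // rANS emits in reverse, so the in_size%32 tail symbols are pushed
 | | // first (they decode last); the SIMD loop below then walks backwards
 | | // over the remaining multiple-of-32 prefix.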
127 | |
|
128 | 0 | #define LOAD512(a,b) \ |
129 | 0 | __m512i a##1 = _mm512_load_si512((__m512i *)&b[0]); \ |
130 | 0 | __m512i a##2 = _mm512_load_si512((__m512i *)&b[16]); |
131 | |
|
132 | 0 | #define STORE512(a,b) \ |
133 | 0 | _mm512_store_si512((__m512i *)&b[0], a##1); \
134 | 0 | _mm512_store_si512((__m512i *)&b[16], a##2);
135 | |
|
136 | 0 | LOAD512(Rv, ransN); |
137 | |
|
138 | 0 | uint16_t *ptr16 = (uint16_t *)ptr; |
139 | 0 | for (i=(in_size &~(32-1)); i>0; i-=32) { |
140 | 0 | uint8_t *c = &in[i-32]; |
141 | | |
142 | | // GATHER versions.
143 | | // Much faster now we have an efficient loadu mechanism in place,
144 | | // though the gathers themselves remain a latency bottleneck.
145 | | // Try this for the avx2 variant too? It may be a better way to
146 | | // populate the mm256 regs for a mix of avx2 and avx512 opcodes.
147 | 0 | __m256i c12 = _mm256_loadu_si256((__m256i const *)c); |
148 | 0 | __m512i c1 = _mm512_cvtepu8_epi32(_mm256_extracti128_si256(c12,0)); |
149 | 0 | __m512i c2 = _mm512_cvtepu8_epi32(_mm256_extracti128_si256(c12,1)); |
150 | 0 | #define SET512(a,b) \ |
151 | 0 | __m512i a##1 = _mm512_i32gather_epi32(c1, b, 4); \ |
152 | 0 | __m512i a##2 = _mm512_i32gather_epi32(c2, b, 4) |
153 | |
|
154 | 0 | SET512(xmax, SB); |
155 | |
|
156 | 0 | uint16_t gt_mask1 = _mm512_cmpgt_epi32_mask(Rv1, xmax1); |
157 | 0 | int pc1 = _mm_popcnt_u32(gt_mask1); |
158 | 0 | __m512i Rp1 = _mm512_and_si512(Rv1, _mm512_set1_epi32(0xffff)); |
159 | 0 | __m512i Rp2 = _mm512_and_si512(Rv2, _mm512_set1_epi32(0xffff)); |
160 | 0 | uint16_t gt_mask2 = _mm512_cmpgt_epi32_mask(Rv2, xmax2); |
161 | 0 | SET512(SDv, SD); |
162 | 0 | int pc2 = _mm_popcnt_u32(gt_mask2); |
163 | | |
165 | 0 | Rp1 = _mm512_maskz_compress_epi32(gt_mask1, Rp1); |
166 | 0 | Rp2 = _mm512_maskz_compress_epi32(gt_mask2, Rp2); |
167 | |
|
168 | 0 | _mm512_mask_cvtepi32_storeu_epi16(ptr16-pc2, (1<<pc2)-1, Rp2); |
169 | 0 | ptr16 -= pc2; |
170 | 0 | _mm512_mask_cvtepi32_storeu_epi16(ptr16-pc1, (1<<pc1)-1, Rp1); |
171 | 0 | ptr16 -= pc1; |
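 | |
 | | // Scalar equivalent of the compress + masked-store renormalisation:
 | | //   for (z = 31; z >= 0; z--)
 | | //       if (ransN[z] > x_max[z]) *--ptr16 = ransN[z] & 0xffff;
 | | // The matching ransN[z] >>= 16 is the mask_srli further below.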
172 | |
|
173 | 0 | SET512(rfv, SA); |
174 | 0 | Rv1 = _mm512_mask_srli_epi32(Rv1, gt_mask1, Rv1, 16); |
175 | 0 | Rv2 = _mm512_mask_srli_epi32(Rv2, gt_mask2, Rv2, 16); |
176 | | |
177 | | // The block below is an interleaved form of a 32-bit mulhi (there is
178 | | // no mulhi_epu32 intrinsic); the interleaving helps a little on icc:
179 | | //   rfv = mulhi_epu32(Rv, rfv)
180 | | |
181 | | // Alternatives here: |
182 | | // SHIFT right/left instead of AND: (very marginally slower) |
183 | | // rf1_hm = _mm512_and_epi32( |
184 | | // rf1_hm, _mm512_set1_epi64((uint64_t)0xffffffff00000000)); |
185 | | // vs |
186 | | // rf1_hm = _mm512_srli_epi64(rf1_hm, 32); |
187 | | // rf1_hm = _mm512_slli_epi64(rf1_hm, 32); |
188 | 0 | __m512i rf1_hm = _mm512_mul_epu32(_mm512_srli_epi64(Rv1, 32), |
189 | 0 | _mm512_srli_epi64(rfv1, 32)); |
190 | 0 | __m512i rf2_hm = _mm512_mul_epu32(_mm512_srli_epi64(Rv2, 32), |
191 | 0 | _mm512_srli_epi64(rfv2, 32)); |
192 | 0 | __m512i rf1_lm = _mm512_srli_epi64(_mm512_mul_epu32(Rv1, rfv1), 32); |
193 | 0 | __m512i rf2_lm = _mm512_srli_epi64(_mm512_mul_epu32(Rv2, rfv2), 32); |
194 | 0 | rf1_hm = _mm512_and_epi32(rf1_hm, |
195 | 0 | _mm512_set1_epi64((uint64_t)0xffffffff<<32)); |
196 | 0 | rf2_hm = _mm512_and_epi32(rf2_hm, |
197 | 0 | _mm512_set1_epi64((uint64_t)0xffffffff<<32)); |
198 | 0 | rfv1 = _mm512_or_epi32(rf1_lm, rf1_hm); |
199 | 0 | rfv2 = _mm512_or_epi32(rf2_lm, rf2_hm); |
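 | |
 | | // The mul_epu32 pairs above emulate that mulhi across all 16 lanes:
 | | // even lanes widen-multiply and take the top 32 bits; odd lanes are
 | | // moved down, multiplied, and their top halves kept in place. With
 | | // the srlv below this computes, per lane,
 | | //   q = (uint32_t)(((uint64_t)ransN[z] * rcp_freq[z]) >> rcp_shift[z]);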
200 | | |
201 | | // Or a pure masked blend approach, sadly slower. |
202 | | // rfv1 = _mm512_mask_blend_epi32(0x5555, |
203 | | // _mm512_mul_epu32( |
204 | | // _mm512_srli_epi64(Rv1, 32), |
205 | | // _mm512_srli_epi64(rfv1, 32)), |
206 | | // _mm512_srli_epi64( |
207 | | // _mm512_mul_epu32(Rv1, rfv1), |
208 | | // 32)); |
209 | | // rfv2 = _mm512_mask_blend_epi32(0xaaaa, |
210 | | // _mm512_srli_epi64( |
211 | | // _mm512_mul_epu32(Rv2, rfv2), |
212 | | // 32), |
213 | | // _mm512_mul_epu32( |
214 | | // _mm512_srli_epi64(Rv2, 32), |
215 | | // _mm512_srli_epi64(rfv2, 32))); |
216 | |
|
217 | 0 | SET512(biasv, SC); |
218 | 0 | __m512i shiftv1 = _mm512_srli_epi32(SDv1, 16); |
219 | 0 | __m512i shiftv2 = _mm512_srli_epi32(SDv2, 16); |
220 | |
|
221 | 0 | __m512i qv1 = _mm512_srlv_epi32(rfv1, shiftv1); |
222 | 0 | __m512i qv2 = _mm512_srlv_epi32(rfv2, shiftv2); |
223 | |
|
224 | 0 | qv1 = _mm512_mullo_epi32(qv1, |
225 | 0 | _mm512_and_si512(SDv1,_mm512_set1_epi32(0xffff))); |
226 | 0 | qv1 = _mm512_add_epi32(qv1, biasv1); |
227 | 0 | Rv1 = _mm512_add_epi32(Rv1, qv1); |
228 | |
|
229 | 0 | qv2 = _mm512_mullo_epi32(qv2, |
230 | 0 | _mm512_and_si512(SDv2, _mm512_set1_epi32(0xffff))); |
231 | 0 | qv2 = _mm512_add_epi32(qv2, biasv2); |
232 | 0 | Rv2 = _mm512_add_epi32(Rv2, qv2); |
233 | 0 | } |
234 | 0 | ptr = (uint8_t *)ptr16; |
235 | 0 | STORE512(Rv, ransN); |
236 | |
|
237 | 0 | for (z = 32-1; z >= 0; z--) |
238 | 0 | RansEncFlush(&ransN[z], &ptr); |
239 | | |
240 | 0 | empty: |
241 | | // Finalise block size and return it |
242 | 0 | *out_size = (out_end - ptr) + tab_size; |
243 | |
|
244 | 0 | memmove(out + tab_size, ptr, out_end-ptr); |
245 | |
|
246 | 0 | return out; |
247 | 0 | } |
248 | | |
249 | | unsigned char *rans_uncompress_O0_32x16_avx512(unsigned char *in, |
250 | | unsigned int in_size, |
251 | | unsigned char *out, |
252 | 0 | unsigned int out_sz) { |
253 | 0 | if (in_size < 32*4) // 32 states, 4 bytes each
254 | 0 | return NULL; |
255 | | |
256 | 0 | if (out_sz >= INT_MAX) |
257 | 0 | return NULL; // protect against some overflow cases |
258 | | |
259 | | /* Load in the static tables */ |
260 | 0 | unsigned char *cp = in, *out_free = NULL; |
261 | 0 | unsigned char *cp_end = in + in_size; |
262 | 0 | int i; |
263 | 0 | uint32_t s3[TOTFREQ] __attribute__((aligned(64))); // For TF_SHIFT <= 12 |
264 | |
|
265 | 0 | if (!out) |
266 | 0 | out_free = out = malloc(out_sz); |
267 | 0 | if (!out) |
268 | 0 | return NULL; |
269 | | |
270 | | // Precompute reverse lookup of frequency. |
271 | 0 | uint32_t F[256] = {0}, fsum; |
272 | 0 | int fsz = decode_freq(cp, cp_end, F, &fsum); |
273 | 0 | if (!fsz) |
274 | 0 | goto err; |
275 | 0 | cp += fsz; |
276 | |
|
277 | 0 | normalise_freq_shift(F, fsum, TOTFREQ); |
278 | | |
279 | | // Build symbols; FIXME: do as part of decode, see the _d variant
280 | 0 | if (rans_F_to_s3(F, TF_SHIFT, s3)) |
281 | 0 | goto err; |
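 | |
 | | // Each s3[] entry packs everything needed to decode slot m, exactly
 | | // as unpicked in the loop below:
 | | //   s3[m] = freq << (TF_SHIFT+8) | (m - cumfreq) << 8 | symbol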
282 | | |
283 | 0 | if (cp_end - cp < 32 * 4) |
284 | 0 | goto err; |
285 | | |
286 | 0 | int z; |
287 | 0 | RansState Rv[32] __attribute__((aligned(64))); |
288 | 0 | for (z = 0; z < 32; z++) { |
289 | 0 | RansDecInit(&Rv[z], &cp); |
290 | 0 | if (Rv[z] < RANS_BYTE_L) |
291 | 0 | goto err; |
292 | 0 | } |
293 | | |
294 | 0 | uint16_t *sp = (uint16_t *)cp; |
295 | |
|
296 | 0 | int out_end = (out_sz&~(32-1)); |
297 | 0 | const uint32_t mask = (1u << TF_SHIFT)-1; |
298 | |
|
299 | 0 | __m512i maskv = _mm512_set1_epi32(mask); // set mask in all lanes |
300 | 0 | __m512i R1 = _mm512_load_epi32(&Rv[0]); |
301 | 0 | __m512i R2 = _mm512_load_epi32(&Rv[16]); |
302 | | |
303 | | // Start of the first loop iteration; later iterations compute this at
304 | | // the end of the loop body instead, to hide some instruction latency.
305 | 0 | __m512i masked1 = _mm512_and_epi32(R1, maskv); |
306 | 0 | __m512i masked2 = _mm512_and_epi32(R2, maskv); |
307 | 0 | __m512i S1 = _mm512_i32gather_epi32(masked1, (int *)s3, sizeof(*s3)); |
308 | 0 | __m512i S2 = _mm512_i32gather_epi32(masked2, (int *)s3, sizeof(*s3)); |
309 | |
|
310 | 0 | uint8_t overflow[64+64] = {0}; |
311 | 0 | for (i=0; i < out_end; i+=32) { |
312 | | //for (z = 0; z < 16; z++) { |
313 | | |
314 | | // Protect against running off the end of the in buffer.
315 | | // We copy the tail to a worst-case local buffer when near the end.
316 | 0 | if ((uint8_t *)sp+64 > cp_end) { |
317 | 0 | memmove(overflow, sp, cp_end - (uint8_t *)sp); |
318 | 0 | sp = (uint16_t *)overflow; |
319 | 0 | cp_end = overflow + sizeof(overflow); |
320 | 0 | } |
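 | |
 | | // Worst case per iteration is two 32-byte renorm loads, so the
 | | // 128-byte zero-padded copy keeps every vector load in bounds.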
321 | | |
322 | | //uint32_t S = s3[R[z] & mask]; |
323 | 0 | __m512i renorm_words1 = _mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)sp)); // next 16 words |
324 | | |
325 | | //uint16_t f = S>>(TF_SHIFT+8), b = (S>>8) & mask; |
326 | 0 | __m512i f1 = _mm512_srli_epi32(S1, TF_SHIFT+8); |
327 | 0 | __m512i f2 = _mm512_srli_epi32(S2, TF_SHIFT+8); |
328 | 0 | __m512i b1 = _mm512_and_epi32(_mm512_srli_epi32(S1, 8), maskv); |
329 | 0 | __m512i b2 = _mm512_and_epi32(_mm512_srli_epi32(S2, 8), maskv); |
330 | | |
331 | | //R[z] = f * (R[z] >> TF_SHIFT) + b; |
332 | | // approx 10 cycle latency on mullo. |
333 | 0 | R1 = _mm512_add_epi32( |
334 | 0 | _mm512_mullo_epi32( |
335 | 0 | _mm512_srli_epi32(R1, TF_SHIFT), f1), b1); |
336 | 0 | R2 = _mm512_add_epi32( |
337 | 0 | _mm512_mullo_epi32( |
338 | 0 | _mm512_srli_epi32(R2, TF_SHIFT), f2), b2); |
339 | | |
340 | | // Renormalisation. This is the interesting part:
341 | 0 | __mmask16 renorm_mask1, renorm_mask2; |
342 | 0 | renorm_mask1=_mm512_cmplt_epu32_mask(R1, _mm512_set1_epi32(RANS_BYTE_L)); |
343 | 0 | renorm_mask2=_mm512_cmplt_epu32_mask(R2, _mm512_set1_epi32(RANS_BYTE_L)); |
344 | | // advance by however many words we actually read |
345 | 0 | sp += _mm_popcnt_u32(renorm_mask1); |
346 | 0 | __m512i renorm_words2 = _mm512_cvtepu16_epi32(_mm256_loadu_si256( |
347 | 0 | (const __m256i *)sp)); |
348 | | |
349 | | // select masked only |
350 | 0 | __m512i renorm_vals1, renorm_vals2; |
351 | 0 | renorm_vals1 = _mm512_maskz_expand_epi32(renorm_mask1, renorm_words1); |
352 | 0 | renorm_vals2 = _mm512_maskz_expand_epi32(renorm_mask2, renorm_words2); |
353 | | // shift & add selected words |
354 | 0 | R1 = _mm512_mask_slli_epi32(R1, renorm_mask1, R1, 16); |
355 | 0 | R2 = _mm512_mask_slli_epi32(R2, renorm_mask2, R2, 16); |
356 | 0 | R1 = _mm512_add_epi32(R1, renorm_vals1); |
357 | 0 | R2 = _mm512_add_epi32(R2, renorm_vals2); |
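 | |
 | | // The expand + shift + add above is the vector form of the scalar
 | | // renorm:
 | | //   if (R[z] < RANS_BYTE_L) R[z] = (R[z] << 16) | *sp++;
 | | // maskz_expand routes the consumed words to exactly the lanes whose
 | | // mask bit is set.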
358 | | |
359 | | // Start of the next loop iteration. This has been moved here (and
360 | | // duplicated before the loop starts) so we can do useful work during
361 | | // the gather's latency period, such as finishing up the renorm
362 | | // offset and writing the results.
363 | 0 | __m512i S1_ = S1; // temporary copy for use in out[]=S later |
364 | 0 | __m512i S2_ = S2; |
365 | |
|
366 | 0 | masked1 = _mm512_and_epi32(R1, maskv); |
367 | 0 | masked2 = _mm512_and_epi32(R2, maskv); |
368 | | // The gather is the slow bit (half the total time): ~30 cycle latency.
369 | 0 | S1 = _mm512_i32gather_epi32(masked1, (int *)s3, sizeof(*s3)); |
370 | 0 | S2 = _mm512_i32gather_epi32(masked2, (int *)s3, sizeof(*s3)); |
371 | | |
372 | | // advance by however many words we actually read |
373 | 0 | sp += _mm_popcnt_u32(renorm_mask2); |
374 | | |
375 | | //out[i+z] = S; |
376 | 0 | _mm_storeu_si128((__m128i *)(out+i), _mm512_cvtepi32_epi8(S1_)); |
377 | 0 | _mm_storeu_si128((__m128i *)(out+i+16), _mm512_cvtepi32_epi8(S2_)); |
378 | 0 | } |
379 | |
|
380 | 0 | _mm512_store_epi32(&Rv[ 0], R1); |
381 | 0 | _mm512_store_epi32(&Rv[16], R2); |
382 | |
|
383 | 0 | for (z = out_sz & (32-1); z-- > 0; ) |
384 | 0 | out[out_end + z] = s3[Rv[z] & mask]; |
385 | |
|
386 | 0 | return out; |
387 | | |
388 | 0 | err: |
389 | 0 | free(out_free); |
390 | 0 | return NULL; |
391 | 0 | } |
392 | | |
393 | | #define TBUF8 |
394 | | #ifdef TBUF8 |
395 | | // 15% quicker overall O1 decode now due to rot32_simd below. |
396 | | |
397 | | // NB: This uses AVX2 though and we could rewrite using AVX512 for |
398 | | // further speed gains. |
399 | | static inline void transpose_and_copy(uint8_t *out, int iN[32], |
400 | 0 | uint8_t t[32][32]) { |
401 | | // int z; |
402 | | // for (z = 0; z < 32; z++) { |
403 | | // int k; |
404 | | // for (k = 0; k < 32; k++) |
405 | | // out[iN[z]+k] = t[k][z]; |
406 | | // iN[z] += 32; |
407 | | // } |
408 | |
|
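 | | // rot32_simd performs the same 32x32 byte transpose with AVX2:
 | | //   t[k][z] -> out[iN[z]+k], advancing each iN[z] by 32.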
409 | 0 | rot32_simd(t, out, iN); |
410 | 0 | } |
411 | | |
412 | | #else |
413 | | // Implemented using AVX512 gathers. |
414 | | // This is faster than a naive scalar implementation, but doesn't beat the |
415 | | // AVX2 vectorised 32x32 transpose function. |
416 | | static inline void transpose_and_copy_avx512(uint8_t *out, int iN[32], |
417 | | uint32_t t32[32][32]) { |
418 | | int z; |
419 | | // for (z = 0; z < 32; z++) { |
420 | | // int k; |
421 | | // for (k = 0; k < 32; k++) |
422 | | // out[iN[z]+k] = t32[k][z]; |
423 | | // iN[z] += 32; |
424 | | // } |
425 | | |
426 | | |
427 | | __m512i v1 = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, |
428 | | 7, 6, 5, 4, 3, 2, 1, 0); |
429 | | v1 = _mm512_slli_epi32(v1, 5); |
430 | | |
431 | | for (z = 0; z < 32; z++) { |
432 | | __m512i t1 = _mm512_i32gather_epi32(v1, &t32[ 0][z], 4); |
433 | | __m512i t2 = _mm512_i32gather_epi32(v1, &t32[16][z], 4); |
434 | | _mm_storeu_si128((__m128i*)(&out[iN[z] ]), _mm512_cvtepi32_epi8(t1)); |
435 | | _mm_storeu_si128((__m128i*)(&out[iN[z]+16]), _mm512_cvtepi32_epi8(t2)); |
436 | | iN[z] += 32; |
437 | | } |
438 | | } |
439 | | #endif // TBUF8
440 | | |
441 | | unsigned char *rans_compress_O1_32x16_avx512(unsigned char *in, |
442 | | unsigned int in_size, |
443 | | unsigned char *out, |
444 | 0 | unsigned int *out_size) { |
445 | 0 | unsigned char *cp, *out_end, *out_free = NULL; |
446 | 0 | unsigned int tab_size; |
447 | 0 | uint32_t bound = rans_compress_bound_4x16(in_size,1)-20; |
448 | 0 | int z; |
449 | 0 | RansState ransN[32] __attribute__((aligned(64))); |
450 | |
|
451 | 0 | if (in_size < 32) // force O0 instead |
452 | 0 | return NULL; |
453 | | |
454 | 0 | if (!out) { |
455 | 0 | *out_size = bound; |
456 | 0 | out = malloc(*out_size); |
457 | 0 | } |
458 | 0 | if (!out || bound > *out_size) |
459 | 0 | return NULL; |
460 | | |
461 | 0 | if (((size_t)out)&1) |
462 | 0 | bound--; |
463 | 0 | out_end = out + bound; |
464 | |
|
465 | 0 | RansEncSymbol (*syms)[256] = htscodecs_tls_alloc(256 * (sizeof(*syms))); |
466 | 0 | if (!syms) { |
467 | 0 | free(out_free); |
468 | 0 | return NULL; |
469 | 0 | } |
470 | | |
471 | 0 | cp = out; |
472 | 0 | int shift = encode_freq1(in, in_size, 32, syms, &cp); |
473 | 0 | if (shift < 0) { |
474 | 0 | free(out_free); |
475 | 0 | htscodecs_tls_free(syms); |
476 | 0 | return NULL; |
477 | 0 | } |
478 | 0 | tab_size = cp - out; |
479 | |
|
480 | 0 | for (z = 0; z < 32; z++) |
481 | 0 | RansEncInit(&ransN[z]); |
482 | |
|
483 | 0 | uint8_t* ptr = out_end; |
484 | |
|
485 | 0 | int iN[32], isz4 = in_size/32; |
486 | 0 | for (z = 0; z < 32; z++) |
487 | 0 | iN[z] = (z+1)*isz4-2; |
488 | |
|
489 | 0 | uint32_t lN[32] __attribute__((aligned(64))); |
490 | 0 | for (z = 0; z < 32; z++) |
491 | 0 | lN[z] = in[iN[z]+1]; |
492 | | |
493 | | // Deal with the remainder |
494 | 0 | z = 32-1; |
495 | 0 | lN[z] = in[in_size-1]; |
496 | 0 | for (iN[z] = in_size-2; iN[z] > 32*isz4-2; iN[z]--) { |
497 | 0 | unsigned char c = in[iN[z]]; |
498 | 0 | RansEncPutSymbol(&ransN[z], &ptr, &syms[c][lN[z]]); |
499 | 0 | lN[z] = c; |
500 | 0 | } |
501 | |
|
502 | 0 | LOAD512(Rv, ransN); |
503 | |
|
504 | 0 | uint16_t *ptr16 = (uint16_t *)ptr; |
505 | 0 | __m512i last2 = _mm512_set_epi32(lN[31], lN[30], lN[29], lN[28], |
506 | 0 | lN[27], lN[26], lN[25], lN[24], |
507 | 0 | lN[23], lN[22], lN[21], lN[20], |
508 | 0 | lN[19], lN[18], lN[17], lN[16]); |
509 | 0 | __m512i last1 = _mm512_set_epi32(lN[15], lN[14], lN[13], lN[12], |
510 | 0 | lN[11], lN[10], lN[ 9], lN[ 8], |
511 | 0 | lN[ 7], lN[ 6], lN[ 5], lN[ 4], |
512 | 0 | lN[ 3], lN[ 2], lN[ 1], lN[ 0]); |
513 | | |
514 | 0 | __m512i iN2 = _mm512_set_epi32(iN[31], iN[30], iN[29], iN[28], |
515 | 0 | iN[27], iN[26], iN[25], iN[24], |
516 | 0 | iN[23], iN[22], iN[21], iN[20], |
517 | 0 | iN[19], iN[18], iN[17], iN[16]); |
518 | 0 | __m512i iN1 = _mm512_set_epi32(iN[15], iN[14], iN[13], iN[12], |
519 | 0 | iN[11], iN[10], iN[ 9], iN[ 8], |
520 | 0 | iN[ 7], iN[ 6], iN[ 5], iN[ 4], |
521 | 0 | iN[ 3], iN[ 2], iN[ 1], iN[ 0]); |
522 | |
|
523 | 0 | __m512i c1 = _mm512_i32gather_epi32(iN1, in, 1); |
524 | 0 | __m512i c2 = _mm512_i32gather_epi32(iN2, in, 1); |
525 | |
|
526 | 0 | for (; iN[0] >= 0; iN[0]--) { |
527 | | // Note: consider using the same approach for the AVX2 encoder.
528 | | // Maybe we can also get gather working well there? |
529 | | // Gather here is still a major latency bottleneck, consuming |
530 | | // around 40% of CPU cycles overall. |
531 | | |
532 | | // FIXME: we may need to cope with the in[] read overflowing via
533 | | // lane 31 on loop cycles 0, 1 and 2, where the gather reads 32 bits
534 | | // instead of 8. Use a set instead of a gather for c2 there?
535 | 0 | c1 = _mm512_and_si512(c1, _mm512_set1_epi32(0xff)); |
536 | 0 | c2 = _mm512_and_si512(c2, _mm512_set1_epi32(0xff)); |
537 | | |
538 | | // index into syms[0][0] array, used for x_max, rcp_freq, and bias |
539 | 0 | __m512i vidx1 = _mm512_slli_epi32(c1, 8); |
540 | 0 | __m512i vidx2 = _mm512_slli_epi32(c2, 8); |
541 | 0 | vidx1 = _mm512_add_epi32(vidx1, last1); |
542 | 0 | vidx2 = _mm512_add_epi32(vidx2, last2); |
543 | 0 | vidx1 = _mm512_slli_epi32(vidx1, 2); |
544 | 0 | vidx2 = _mm512_slli_epi32(vidx2, 2); |
545 | | |
546 | | // ------------------------------------------------------------ |
547 | | // for (z = NX-1; z >= 0; z--) { |
548 | | // if (ransN[z] >= x_max[z]) { |
549 | | // *--ptr16 = ransN[z] & 0xffff; |
550 | | // ransN[z] >>= 16; |
551 | | // } |
552 | | // } |
553 | |
|
554 | 0 | #define SET512x(a,x) \ |
555 | 0 | __m512i a##1 = _mm512_i32gather_epi32(vidx1, &syms[0][0].x, 4); \ |
556 | 0 | __m512i a##2 = _mm512_i32gather_epi32(vidx2, &syms[0][0].x, 4) |
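 | |
 | | // vidx is ((c<<8) + last) << 2 and the gather scale is 4, giving a
 | | // byte offset of (c*256 + last) * 16, i.e. an index into the 256x256
 | | // table of RansEncSymbol structs (assumed 16 bytes each), with .x
 | | // picking the field.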
557 | | |
558 | | // Start of the next iteration, moved here to hide gather latency.
559 | | // last[z] = c[z] |
560 | | // iN[z]-- |
561 | | // c[z] = in[iN[z]] |
562 | 0 | last1 = c1; |
563 | 0 | last2 = c2; |
564 | 0 | iN1 = _mm512_sub_epi32(iN1, _mm512_set1_epi32(1)); |
565 | 0 | iN2 = _mm512_sub_epi32(iN2, _mm512_set1_epi32(1)); |
566 | 0 | c1 = _mm512_i32gather_epi32(iN1, in, 1); |
567 | 0 | c2 = _mm512_i32gather_epi32(iN2, in, 1); |
568 | |
|
569 | 0 | SET512x(xmax, x_max); // high latency |
570 | |
|
571 | 0 | uint16_t gt_mask1 = _mm512_cmpgt_epi32_mask(Rv1, xmax1); |
572 | 0 | int pc1 = _mm_popcnt_u32(gt_mask1); |
573 | 0 | __m512i Rp1 = _mm512_and_si512(Rv1, _mm512_set1_epi32(0xffff)); |
574 | 0 | __m512i Rp2 = _mm512_and_si512(Rv2, _mm512_set1_epi32(0xffff)); |
575 | 0 | uint16_t gt_mask2 = _mm512_cmpgt_epi32_mask(Rv2, xmax2); |
576 | 0 | SET512x(SDv, cmpl_freq); // good |
577 | 0 | int pc2 = _mm_popcnt_u32(gt_mask2); |
578 | |
|
579 | 0 | Rp1 = _mm512_maskz_compress_epi32(gt_mask1, Rp1); |
580 | 0 | Rp2 = _mm512_maskz_compress_epi32(gt_mask2, Rp2); |
581 | |
|
582 | 0 | _mm512_mask_cvtepi32_storeu_epi16(ptr16-pc2, (1<<pc2)-1, Rp2); |
583 | 0 | ptr16 -= pc2; |
584 | 0 | _mm512_mask_cvtepi32_storeu_epi16(ptr16-pc1, (1<<pc1)-1, Rp1); |
585 | 0 | ptr16 -= pc1; |
586 | |
|
587 | 0 | Rv1 = _mm512_mask_srli_epi32(Rv1, gt_mask1, Rv1, 16); |
588 | 0 | Rv2 = _mm512_mask_srli_epi32(Rv2, gt_mask2, Rv2, 16); |
589 | | |
590 | | // ------------------------------------------------------------ |
591 | | // uint32_t q = (uint32_t) (((uint64_t)ransN[z] * rcp_freq[z]) |
592 | | // >> rcp_shift[z]); |
593 | | // ransN[z] = ransN[z] + bias[z] + q * cmpl_freq[z]; |
594 | 0 | SET512x(rfv, rcp_freq); // good-ish |
595 | |
|
596 | 0 | __m512i rf1_hm = _mm512_mul_epu32(_mm512_srli_epi64(Rv1, 32), |
597 | 0 | _mm512_srli_epi64(rfv1, 32)); |
598 | 0 | __m512i rf2_hm = _mm512_mul_epu32(_mm512_srli_epi64(Rv2, 32), |
599 | 0 | _mm512_srli_epi64(rfv2, 32)); |
600 | 0 | __m512i rf1_lm = _mm512_srli_epi64(_mm512_mul_epu32(Rv1, rfv1), 32); |
601 | 0 | __m512i rf2_lm = _mm512_srli_epi64(_mm512_mul_epu32(Rv2, rfv2), 32); |
602 | |
|
603 | 0 | const __m512i top32 = _mm512_set1_epi64((uint64_t)0xffffffff00000000); |
604 | 0 | rf1_hm = _mm512_and_epi32(rf1_hm, top32); |
605 | 0 | rf2_hm = _mm512_and_epi32(rf2_hm, top32); |
606 | 0 | rfv1 = _mm512_or_epi32(rf1_lm, rf1_hm); |
607 | 0 | rfv2 = _mm512_or_epi32(rf2_lm, rf2_hm); |
608 | |
|
609 | 0 | SET512x(biasv, bias); // good |
610 | 0 | __m512i shiftv1 = _mm512_srli_epi32(SDv1, 16); |
611 | 0 | __m512i shiftv2 = _mm512_srli_epi32(SDv2, 16); |
612 | |
|
613 | 0 | shiftv1 = _mm512_sub_epi32(shiftv1, _mm512_set1_epi32(32)); |
614 | 0 | shiftv2 = _mm512_sub_epi32(shiftv2, _mm512_set1_epi32(32)); |
615 | |
|
616 | 0 | __m512i qv1 = _mm512_srlv_epi32(rfv1, shiftv1); |
617 | 0 | __m512i qv2 = _mm512_srlv_epi32(rfv2, shiftv2); |
618 | |
|
619 | 0 | const __m512i bot16 = _mm512_set1_epi32(0xffff); |
620 | 0 | qv1 = _mm512_mullo_epi32(qv1, _mm512_and_si512(SDv1, bot16)); |
621 | 0 | qv2 = _mm512_mullo_epi32(qv2, _mm512_and_si512(SDv2, bot16)); |
622 | |
|
623 | 0 | qv1 = _mm512_add_epi32(qv1, biasv1); |
624 | 0 | Rv1 = _mm512_add_epi32(Rv1, qv1); |
625 | |
|
626 | 0 | qv2 = _mm512_add_epi32(qv2, biasv2); |
627 | 0 | Rv2 = _mm512_add_epi32(Rv2, qv2); |
628 | 0 | } |
629 | |
|
630 | 0 | STORE512(Rv, ransN); |
631 | 0 | STORE512(last, lN); |
632 | |
|
633 | 0 | ptr = (uint8_t *)ptr16; |
634 | |
|
635 | 0 | for (z = 32-1; z>=0; z--) |
636 | 0 | RansEncPutSymbol(&ransN[z], &ptr, &syms[0][lN[z]]); |
637 | |
|
638 | 0 | for (z = 32-1; z >= 0; z--) |
639 | 0 | RansEncFlush(&ransN[z], &ptr); |
640 | | |
641 | | // Finalise block size and return it |
642 | 0 | *out_size = (out_end - ptr) + tab_size; |
643 | | |
644 | | // cp = out; |
645 | | // *cp++ = (in_size>> 0) & 0xff; |
646 | | // *cp++ = (in_size>> 8) & 0xff; |
647 | | // *cp++ = (in_size>>16) & 0xff; |
648 | | // *cp++ = (in_size>>24) & 0xff; |
649 | |
|
650 | 0 | memmove(out + tab_size, ptr, out_end-ptr); |
651 | |
|
652 | 0 | htscodecs_tls_free(syms); |
653 | 0 | return out; |
654 | 0 | } |
655 | | |
656 | 0 | #define NX 32 |
657 | | unsigned char *rans_uncompress_O1_32x16_avx512(unsigned char *in, |
658 | | unsigned int in_size, |
659 | | unsigned char *out, |
660 | 0 | unsigned int out_sz) { |
661 | 0 | if (in_size < NX*4) // NX states, 4 bytes each
662 | 0 | return NULL; |
663 | | |
664 | 0 | if (out_sz >= INT_MAX) |
665 | 0 | return NULL; // protect against some overflow cases |
666 | | |
667 | | /* Load in the static tables */ |
668 | 0 | unsigned char *cp = in, *cp_end = in+in_size, *out_free = NULL; |
669 | 0 | unsigned char *c_freq = NULL; |
670 | |
|
671 | 0 | uint32_t (*s3)[TOTFREQ_O1] = htscodecs_tls_alloc(256*TOTFREQ_O1*4); |
672 | 0 | if (!s3) |
673 | 0 | return NULL; |
674 | 0 | uint32_t (*s3F)[TOTFREQ_O1_FAST] = (uint32_t (*)[TOTFREQ_O1_FAST])s3; |
675 | |
|
676 | 0 | if (!out) |
677 | 0 | out_free = out = malloc(out_sz); |
678 | |
|
679 | 0 | if (!out) |
680 | 0 | goto err; |
681 | | |
682 | | //fprintf(stderr, "out_sz=%d\n", out_sz); |
683 | | |
684 | | // compressed header? If so uncompress it |
685 | 0 | unsigned char *tab_end = NULL; |
686 | 0 | unsigned char *c_freq_end = cp_end; |
687 | 0 | unsigned int shift = *cp >> 4; |
688 | 0 | if (*cp++ & 1) { |
689 | 0 | uint32_t u_freq_sz, c_freq_sz; |
690 | 0 | cp += var_get_u32(cp, cp_end, &u_freq_sz); |
691 | 0 | cp += var_get_u32(cp, cp_end, &c_freq_sz); |
692 | 0 | if (c_freq_sz >= cp_end - cp - 16) |
693 | 0 | goto err; |
694 | 0 | tab_end = cp + c_freq_sz; |
695 | 0 | if (!(c_freq = rans_uncompress_O0_4x16(cp, c_freq_sz, NULL, |
696 | 0 | u_freq_sz))) |
697 | 0 | goto err; |
698 | 0 | cp = c_freq; |
699 | 0 | c_freq_end = c_freq + u_freq_sz; |
700 | 0 | } |
701 | | |
702 | | // Decode order-0 symbol list; avoids needing in order-1 tables |
703 | 0 | cp += decode_freq1(cp, c_freq_end, shift, s3, s3F, NULL, NULL); |
704 | |
|
705 | 0 | if (tab_end) |
706 | 0 | cp = tab_end; |
707 | 0 | free(c_freq); |
708 | 0 | c_freq = NULL; |
709 | |
|
710 | 0 | if (cp_end - cp < NX * 4) |
711 | 0 | goto err; |
712 | | |
713 | 0 | RansState R[NX] __attribute__((aligned(64))); |
714 | 0 | uint8_t *ptr = cp, *ptr_end = in + in_size; |
715 | 0 | int z; |
716 | 0 | for (z = 0; z < NX; z++) { |
717 | 0 | RansDecInit(&R[z], &ptr); |
718 | 0 | if (R[z] < RANS_BYTE_L) |
719 | 0 | goto err; |
720 | 0 | } |
721 | | |
722 | 0 | int isz4 = out_sz/NX; |
723 | 0 | int iN[NX], lN[NX] __attribute__((aligned(64))) = {0}; |
724 | 0 | for (z = 0; z < NX; z++) |
725 | 0 | iN[z] = z*isz4; |
726 | |
|
727 | 0 | uint16_t *sp = (uint16_t *)ptr; |
728 | 0 | const uint32_t mask = (1u << shift)-1; |
729 | |
|
730 | 0 | __m512i _maskv = _mm512_set1_epi32(mask); |
731 | 0 | LOAD512(_Rv, R); |
732 | 0 | LOAD512(_Lv, lN); |
733 | |
|
734 | 0 | #ifdef TBUF8 |
735 | 0 | union { |
736 | 0 | unsigned char tbuf[32][32]; |
737 | 0 | uint64_t tbuf64[32][4]; |
738 | 0 | } u; |
739 | | #else |
740 | | uint32_t tbuf[32][32]; |
741 | | #endif |
742 | |
|
743 | 0 | unsigned int tidx = 0; |
744 | |
|
745 | 0 | if (shift == TF_SHIFT_O1) { |
746 | 0 | isz4 -= 64; |
747 | 0 | for (; iN[0] < isz4 && (uint8_t *)sp+64 < ptr_end; ) { |
748 | | // m[z] = R[z] & mask; |
749 | 0 | __m512i _masked1 = _mm512_and_si512(_Rv1, _maskv); |
750 | 0 | __m512i _masked2 = _mm512_and_si512(_Rv2, _maskv); |
751 | | |
752 | | // S[z] = s3[lN[z]][m[z]]; |
753 | 0 | _Lv1 = _mm512_slli_epi32(_Lv1, TF_SHIFT_O1); |
754 | 0 | _Lv2 = _mm512_slli_epi32(_Lv2, TF_SHIFT_O1); |
755 | |
|
756 | 0 | _masked1 = _mm512_add_epi32(_masked1, _Lv1); |
757 | 0 | _masked2 = _mm512_add_epi32(_masked2, _Lv2); |
758 | | |
759 | | // This is the biggest bottleneck |
760 | 0 | __m512i _Sv1 = _mm512_i32gather_epi32(_masked1, (int *)&s3F[0][0], |
761 | 0 | sizeof(s3F[0][0])); |
762 | 0 | __m512i _Sv2 = _mm512_i32gather_epi32(_masked2, (int *)&s3F[0][0], |
763 | 0 | sizeof(s3F[0][0])); |
764 | | |
765 | | // f[z] = S[z]>>(TF_SHIFT_O1+8); |
766 | 0 | __m512i _fv1 = _mm512_srli_epi32(_Sv1, TF_SHIFT_O1+8); |
767 | 0 | __m512i _fv2 = _mm512_srli_epi32(_Sv2, TF_SHIFT_O1+8); |
768 | | |
769 | | // b[z] = (S[z]>>8) & mask; |
770 | 0 | __m512i _bv1 = _mm512_and_si512(_mm512_srli_epi32(_Sv1,8), _maskv); |
771 | 0 | __m512i _bv2 = _mm512_and_si512(_mm512_srli_epi32(_Sv2,8), _maskv); |
772 | | |
773 | | // s[z] = S[z] & 0xff; |
774 | 0 | __m512i _sv1 = _mm512_and_si512(_Sv1, _mm512_set1_epi32(0xff)); |
775 | 0 | __m512i _sv2 = _mm512_and_si512(_Sv2, _mm512_set1_epi32(0xff)); |
776 | | |
777 | | // A maximum frequency of 4096 doesn't fit in our s3 array,
778 | | // as it's 12 bit + 12 bit + 8 bit; it wraps around to zero.
779 | | // (We don't have this issue for TOTFREQ_O1_FAST.)
780 | | //
781 | | // Solution 1 is to change the spec to forbid a freq of 4096.
782 | | // An easy hack is to add an extra symbol so it sums correctly.
783 | | // => 572 MB/s on q40 (deskpro).
784 | | //
785 | | // Solution 2, implemented here, is to look for the wrap-around
786 | | // and fix it.
787 | | // => 556 MB/s on q40.
788 | | // Coping with a max freq of 4096 is only a ~3% hit.
789 | 0 | __m512i max_freq = _mm512_set1_epi32(TOTFREQ_O1); |
790 | 0 | __m512i zero = _mm512_setzero_si512(); |
791 | 0 | __mmask16 cmp1 = _mm512_cmpeq_epi32_mask(_fv1, zero); |
792 | 0 | __mmask16 cmp2 = _mm512_cmpeq_epi32_mask(_fv2, zero); |
793 | 0 | _fv1 = _mm512_mask_blend_epi32(cmp1, _fv1, max_freq); |
794 | 0 | _fv2 = _mm512_mask_blend_epi32(cmp2, _fv2, max_freq); |
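 | |
 | | // Scalar equivalent, as used in the tail loops further down:
 | | //   F = S >> (TF_SHIFT_O1+8);  f = F ? F : 4096;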
795 | | |
796 | | // R[z] = f[z] * (R[z] >> TF_SHIFT_O1) + b[z]; |
797 | 0 | _Rv1 = _mm512_add_epi32( |
798 | 0 | _mm512_mullo_epi32( |
799 | 0 | _mm512_srli_epi32(_Rv1,TF_SHIFT_O1), _fv1), _bv1); |
800 | 0 | _Rv2 = _mm512_add_epi32( |
801 | 0 | _mm512_mullo_epi32( |
802 | 0 | _mm512_srli_epi32(_Rv2,TF_SHIFT_O1), _fv2), _bv2); |
803 | | |
804 | | //for (z = 0; z < NX; z++) lN[z] = c[z]; |
805 | 0 | _Lv1 = _sv1; |
806 | 0 | _Lv2 = _sv2; |
807 | | |
808 | | // RansDecRenorm(&R[z], &ptr); |
809 | 0 | __m512i _renorm_mask1 = _mm512_xor_si512(_Rv1, |
810 | 0 | _mm512_set1_epi32(0x80000000)); |
811 | 0 | __m512i _renorm_mask2 = _mm512_xor_si512(_Rv2, |
812 | 0 | _mm512_set1_epi32(0x80000000)); |
813 | |
|
814 | 0 | int _imask1 =_mm512_cmpgt_epi32_mask |
815 | 0 | (_mm512_set1_epi32(RANS_BYTE_L-0x80000000), _renorm_mask1); |
816 | 0 | int _imask2 = _mm512_cmpgt_epi32_mask |
817 | 0 | (_mm512_set1_epi32(RANS_BYTE_L-0x80000000), _renorm_mask2); |
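 | |
 | | // XORing with 0x80000000 biases both sides so this signed compare
 | | // implements the unsigned R[z] < RANS_BYTE_L test; the O0 decoder
 | | // above uses _mm512_cmplt_epu32_mask directly instead.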
818 | |
|
819 | 0 | __m512i renorm_words1 = _mm512_cvtepu16_epi32 |
820 | 0 | (_mm256_loadu_si256((const __m256i *)sp)); |
821 | 0 | sp += _mm_popcnt_u32(_imask1); |
822 | |
|
823 | 0 | __m512i renorm_words2 = _mm512_cvtepu16_epi32 |
824 | 0 | (_mm256_loadu_si256((const __m256i *)sp)); |
825 | 0 | sp += _mm_popcnt_u32(_imask2); |
826 | |
|
827 | 0 | __m512i _renorm_vals1 = |
828 | 0 | _mm512_maskz_expand_epi32(_imask1, renorm_words1); |
829 | 0 | __m512i _renorm_vals2 = |
830 | 0 | _mm512_maskz_expand_epi32(_imask2, renorm_words2); |
831 | |
|
832 | 0 | _Rv1 = _mm512_mask_slli_epi32(_Rv1, _imask1, _Rv1, 16); |
833 | 0 | _Rv2 = _mm512_mask_slli_epi32(_Rv2, _imask2, _Rv2, 16); |
834 | |
|
835 | 0 | _Rv1 = _mm512_add_epi32(_Rv1, _renorm_vals1); |
836 | 0 | _Rv2 = _mm512_add_epi32(_Rv2, _renorm_vals2); |
837 | |
|
838 | 0 | #ifdef TBUF8 |
839 | 0 | _mm_storeu_si128((__m128i *)(&u.tbuf64[tidx][0]), |
840 | 0 | _mm512_cvtepi32_epi8(_Sv1)); // or _sv1? |
841 | 0 | _mm_storeu_si128((__m128i *)(&u.tbuf64[tidx][2]), |
842 | 0 | _mm512_cvtepi32_epi8(_Sv2)); |
843 | | #else |
844 | | _mm512_storeu_si512((__m512i *)(&tbuf[tidx][ 0]), _sv1); |
845 | | _mm512_storeu_si512((__m512i *)(&tbuf[tidx][16]), _sv2); |
846 | | #endif |
847 | |
|
848 | 0 | iN[0]++; |
849 | 0 | if (++tidx == 32) { |
850 | 0 | iN[0]-=32; |
851 | | |
852 | | // We have tidx[x][y] which we want to store in |
853 | | // memory in out[y][z] instead. This is an unrolled |
854 | | // transposition. |
855 | 0 | #ifdef TBUF8 |
856 | 0 | transpose_and_copy(out, iN, u.tbuf); |
857 | | #else |
858 | | transpose_and_copy_avx512(out, iN, tbuf); |
859 | | #endif |
860 | 0 | tidx = 0; |
861 | 0 | } |
862 | 0 | } |
863 | 0 | isz4 += 64; |
864 | |
|
865 | 0 | STORE512(_Rv, R); |
866 | 0 | STORE512(_Lv, lN); |
867 | 0 | ptr = (uint8_t *)sp; |
868 | |
|
869 | 0 | if (1) { |
870 | 0 | iN[0]-=tidx; |
871 | 0 | int T; |
872 | 0 | for (z = 0; z < NX; z++) |
873 | 0 | for (T = 0; T < tidx; T++) |
874 | 0 | #ifdef TBUF8 |
875 | 0 | out[iN[z]++] = u.tbuf[T][z]; |
876 | | #else |
877 | | out[iN[z]++] = tbuf[T][z]; |
878 | | #endif |
879 | 0 | } |
880 | | |
881 | | // Scalar version for close to the end of in[] array so we don't |
882 | | // do SIMD loads beyond the end of the buffer |
883 | 0 | for (; iN[0] < isz4;) { |
884 | 0 | for (z = 0; z < NX; z++) { |
885 | 0 | uint32_t m = R[z] & ((1u<<TF_SHIFT_O1)-1); |
886 | 0 | uint32_t S = s3[lN[z]][m]; |
887 | 0 | unsigned char c = S & 0xff; |
888 | 0 | out[iN[z]++] = c; |
889 | 0 | uint32_t F = S>>(TF_SHIFT_O1+8); |
890 | 0 | R[z] = (F?F:4096) * (R[z]>>TF_SHIFT_O1) + |
891 | 0 | ((S>>8) & ((1u<<TF_SHIFT_O1)-1)); |
892 | 0 | RansDecRenormSafe(&R[z], &ptr, ptr_end); |
893 | 0 | lN[z] = c; |
894 | 0 | } |
895 | 0 | } |
896 | | |
897 | | // Remainder |
898 | 0 | z = NX-1; |
899 | 0 | for (; iN[z] < out_sz; ) { |
900 | 0 | uint32_t m = R[z] & ((1u<<TF_SHIFT_O1)-1); |
901 | 0 | uint32_t S = s3[lN[z]][m]; |
902 | 0 | unsigned char c = S & 0xff; |
903 | 0 | out[iN[z]++] = c; |
904 | 0 | uint32_t F = S>>(TF_SHIFT_O1+8); |
905 | 0 | R[z] = (F?F:4096) * (R[z]>>TF_SHIFT_O1) + |
906 | 0 | ((S>>8) & ((1u<<TF_SHIFT_O1)-1)); |
907 | 0 | RansDecRenormSafe(&R[z], &ptr, ptr_end); |
908 | 0 | lN[z] = c; |
909 | 0 | } |
910 | 0 | } else { |
911 | | // TF_SHIFT_O1_FAST. This is the most commonly used variant. |
912 | | |
913 | | // The SIMD version ends decoding early as it reads at most 64 bytes
914 | | // per iteration from the input via two 32-byte vector loads.
915 | 0 | isz4 -= 64; |
916 | 0 | for (; iN[0] < isz4 && (uint8_t *)sp+64 < ptr_end; ) { |
917 | | // m[z] = R[z] & mask; |
918 | 0 | __m512i _masked1 = _mm512_and_si512(_Rv1, _maskv); |
919 | 0 | __m512i _masked2 = _mm512_and_si512(_Rv2, _maskv); |
920 | | |
921 | | // S[z] = s3[lN[z]][m[z]]; |
922 | 0 | _Lv1 = _mm512_slli_epi32(_Lv1, TF_SHIFT_O1_FAST); |
923 | 0 | _Lv2 = _mm512_slli_epi32(_Lv2, TF_SHIFT_O1_FAST); |
924 | |
|
925 | 0 | _masked1 = _mm512_add_epi32(_masked1, _Lv1); |
926 | 0 | _masked2 = _mm512_add_epi32(_masked2, _Lv2); |
927 | | |
928 | | // This is the biggest bottleneck |
929 | 0 | __m512i _Sv1 = _mm512_i32gather_epi32(_masked1, (int *)&s3F[0][0], |
930 | 0 | sizeof(s3F[0][0])); |
931 | 0 | __m512i _Sv2 = _mm512_i32gather_epi32(_masked2, (int *)&s3F[0][0], |
932 | 0 | sizeof(s3F[0][0])); |
933 | | |
934 | | // f[z] = S[z]>>(TF_SHIFT_O1_FAST+8);
935 | 0 | __m512i _fv1 = _mm512_srli_epi32(_Sv1, TF_SHIFT_O1_FAST+8); |
936 | 0 | __m512i _fv2 = _mm512_srli_epi32(_Sv2, TF_SHIFT_O1_FAST+8); |
937 | | |
938 | | // b[z] = (S[z]>>8) & mask; |
939 | 0 | __m512i _bv1 = _mm512_and_si512(_mm512_srli_epi32(_Sv1,8), _maskv); |
940 | 0 | __m512i _bv2 = _mm512_and_si512(_mm512_srli_epi32(_Sv2,8), _maskv); |
941 | | |
942 | | // s[z] = S[z] & 0xff; |
943 | 0 | __m512i _sv1 = _mm512_and_si512(_Sv1, _mm512_set1_epi32(0xff)); |
944 | 0 | __m512i _sv2 = _mm512_and_si512(_Sv2, _mm512_set1_epi32(0xff)); |
945 | | |
946 | | // R[z] = f[z] * (R[z] >> TF_SHIFT_O1_FAST) + b[z];
947 | 0 | _Rv1 = _mm512_add_epi32( |
948 | 0 | _mm512_mullo_epi32( |
949 | 0 | _mm512_srli_epi32(_Rv1,TF_SHIFT_O1_FAST), |
950 | 0 | _fv1), _bv1); |
951 | 0 | _Rv2 = _mm512_add_epi32( |
952 | 0 | _mm512_mullo_epi32( |
953 | 0 | _mm512_srli_epi32(_Rv2,TF_SHIFT_O1_FAST), |
954 | 0 | _fv2), _bv2); |
955 | | |
956 | | //for (z = 0; z < NX; z++) lN[z] = c[z]; |
957 | 0 | _Lv1 = _sv1; |
958 | 0 | _Lv2 = _sv2; |
959 | | |
960 | | // RansDecRenorm(&R[z], &ptr); |
961 | 0 | __m512i _renorm_mask1 = _mm512_xor_si512(_Rv1, |
962 | 0 | _mm512_set1_epi32(0x80000000)); |
963 | 0 | __m512i _renorm_mask2 = _mm512_xor_si512(_Rv2, |
964 | 0 | _mm512_set1_epi32(0x80000000)); |
965 | |
|
966 | 0 | int _imask1 =_mm512_cmpgt_epi32_mask |
967 | 0 | (_mm512_set1_epi32(RANS_BYTE_L-0x80000000), _renorm_mask1); |
968 | 0 | int _imask2 = _mm512_cmpgt_epi32_mask |
969 | 0 | (_mm512_set1_epi32(RANS_BYTE_L-0x80000000), _renorm_mask2); |
970 | |
|
971 | 0 | __m512i renorm_words1 = _mm512_cvtepu16_epi32 |
972 | 0 | (_mm256_loadu_si256((const __m256i *)sp)); |
973 | 0 | sp += _mm_popcnt_u32(_imask1); |
974 | |
|
975 | 0 | __m512i renorm_words2 = _mm512_cvtepu16_epi32 |
976 | 0 | (_mm256_loadu_si256((const __m256i *)sp)); |
977 | 0 | sp += _mm_popcnt_u32(_imask2); |
978 | |
|
979 | 0 | __m512i _renorm_vals1 = |
980 | 0 | _mm512_maskz_expand_epi32(_imask1, renorm_words1); |
981 | 0 | __m512i _renorm_vals2 = |
982 | 0 | _mm512_maskz_expand_epi32(_imask2, renorm_words2); |
983 | |
|
984 | 0 | _Rv1 = _mm512_mask_slli_epi32(_Rv1, _imask1, _Rv1, 16); |
985 | 0 | _Rv2 = _mm512_mask_slli_epi32(_Rv2, _imask2, _Rv2, 16); |
986 | |
|
987 | 0 | _Rv1 = _mm512_add_epi32(_Rv1, _renorm_vals1); |
988 | 0 | _Rv2 = _mm512_add_epi32(_Rv2, _renorm_vals2); |
989 | |
|
990 | 0 | #ifdef TBUF8 |
991 | 0 | _mm_storeu_si128((__m128i *)(&u.tbuf64[tidx][0]), |
992 | 0 | _mm512_cvtepi32_epi8(_Sv1)); // or _sv1? |
993 | 0 | _mm_storeu_si128((__m128i *)(&u.tbuf64[tidx][2]), |
994 | 0 | _mm512_cvtepi32_epi8(_Sv2)); |
995 | | #else |
996 | | _mm512_storeu_si512((__m512i *)(&tbuf[tidx][ 0]), _sv1); |
997 | | _mm512_storeu_si512((__m512i *)(&tbuf[tidx][16]), _sv2); |
998 | | #endif |
999 | |
|
1000 | 0 | iN[0]++; |
1001 | 0 | if (++tidx == 32) { |
1002 | 0 | iN[0]-=32; |
1003 | 0 | #ifdef TBUF8 |
1004 | 0 | transpose_and_copy(out, iN, u.tbuf); |
1005 | | #else |
1006 | | transpose_and_copy_avx512(out, iN, tbuf); |
1007 | | #endif |
1008 | 0 | tidx = 0; |
1009 | 0 | } |
1010 | 0 | } |
1011 | 0 | isz4 += 64; |
1012 | |
|
1013 | 0 | STORE512(_Rv, R); |
1014 | 0 | STORE512(_Lv, lN); |
1015 | 0 | ptr = (uint8_t *)sp; |
1016 | |
|
1017 | 0 | if (1) { |
1018 | 0 | iN[0]-=tidx; |
1019 | 0 | int T; |
1020 | 0 | for (z = 0; z < NX; z++) |
1021 | 0 | for (T = 0; T < tidx; T++) |
1022 | 0 | #ifdef TBUF8 |
1023 | 0 | out[iN[z]++] = u.tbuf[T][z]; |
1024 | | #else |
1025 | | out[iN[z]++] = tbuf[T][z]; |
1026 | | #endif |
1027 | 0 | } |
1028 | | |
1029 | | // Scalar version for close to the end of in[] array so we don't |
1030 | | // do SIMD loads beyond the end of the buffer |
1031 | 0 | for (; iN[0] < isz4;) { |
1032 | 0 | for (z = 0; z < NX; z++) { |
1033 | 0 | uint32_t m = R[z] & ((1u<<TF_SHIFT_O1_FAST)-1); |
1034 | 0 | uint32_t S = s3F[lN[z]][m]; |
1035 | 0 | unsigned char c = S & 0xff; |
1036 | 0 | out[iN[z]++] = c; |
1037 | 0 | R[z] = (S>>(TF_SHIFT_O1_FAST+8)) * (R[z]>>TF_SHIFT_O1_FAST) + |
1038 | 0 | ((S>>8) & ((1u<<TF_SHIFT_O1_FAST)-1)); |
1039 | 0 | RansDecRenormSafe(&R[z], &ptr, ptr_end); |
1040 | 0 | lN[z] = c; |
1041 | 0 | } |
1042 | 0 | } |
1043 | | |
1044 | | // Remainder |
1045 | 0 | z = NX-1; |
1046 | 0 | for (; iN[z] < out_sz; ) { |
1047 | 0 | uint32_t m = R[z] & ((1u<<TF_SHIFT_O1_FAST)-1); |
1048 | 0 | uint32_t S = s3F[lN[z]][m]; |
1049 | 0 | unsigned char c = S & 0xff; |
1050 | 0 | out[iN[z]++] = c; |
1051 | 0 | R[z] = (S>>(TF_SHIFT_O1_FAST+8)) * (R[z]>>TF_SHIFT_O1_FAST) + |
1052 | 0 | ((S>>8) & ((1u<<TF_SHIFT_O1_FAST)-1)); |
1053 | 0 | RansDecRenormSafe(&R[z], &ptr, ptr_end); |
1054 | 0 | lN[z] = c; |
1055 | 0 | } |
1056 | 0 | } |
1057 | |
|
1058 | 0 | htscodecs_tls_free(s3); |
1059 | 0 | return out; |
1060 | | |
1061 | 0 | err: |
1062 | 0 | htscodecs_tls_free(s3); |
1063 | 0 | free(out_free); |
1064 | 0 | free(c_freq); |
1065 | |
|
1066 | 0 | return NULL; |
1067 | 0 | } |
1068 | | |
1069 | | #endif // HAVE_AVX512 |