Coverage Report

Created: 2025-11-09 07:19

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/htscodecs/htscodecs/rANS_static32x16pr_avx512.c
Line
Count
Source
1
/*
2
 * Copyright (c) 2017-2023 Genome Research Ltd.
3
 * Author(s): James Bonfield
4
 *
5
 * Redistribution and use in source and binary forms, with or without
6
 * modification, are permitted provided that the following conditions are met:
7
 *
8
 *    1. Redistributions of source code must retain the above copyright notice,
9
 *       this list of conditions and the following disclaimer.
10
 *
11
 *    2. Redistributions in binary form must reproduce the above
12
 *       copyright notice, this list of conditions and the following
13
 *       disclaimer in the documentation and/or other materials provided
14
 *       with the distribution.
15
 *
16
 *    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
17
 *       Institute nor the names of its contributors may be used to endorse
18
 *       or promote products derived from this software without specific
19
 *       prior written permission.
20
 *
21
 * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
22
 * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24
 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
25
 * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32
 */
33
34
/*
35
 * This is an AVX512 implementation of the 32-way interleaved 16-bit rANS.
36
 * For now it only contains an order-0 implementation.  The AVX2 code may
37
 * be used for order-1.
38
 */
39
40
#include "config.h"
41
42
#if defined(__x86_64__) && defined(HAVE_AVX512)
43
44
#include <stdint.h>
45
#include <stdlib.h>
46
#include <stdio.h>
47
#include <string.h>
48
#include <limits.h>
49
#include <x86intrin.h>
50
51
#include "rANS_word.h"
52
#include "rANS_static4x16.h"
53
#define ROT32_SIMD
54
#include "rANS_static16_int.h"
55
#include "varint.h"
56
#include "utils.h"
57
58
#ifndef USE_GATHER
59
60
// Speeds with Downfall mitigation Off/On and Zen4.
61
//         <------ AVX512 --------->   <------- AVX2 -------->
62
// -o4:    IntelOff IntelOn  AMDZen4   
63
// gcc7    544/1673 562/1711 448/1818  649/1515 645/1525 875/1624
64
// gcc13   557/1672 576/1711 582/1761  630/1623 629/1652 866/1762
65
// clang10 541/1547 564/1630 807/1912  620/1456 637/1481 837/1606
66
// clang16 533/1431 555/1510 890/1611  629/1370 627/1406 996/1432
67
//
68
// Zen4 encode is particularly slow with gcc, but AVX2 encode is
69
// faster and we use that already.
70
0
static inline __m512i _mm512_i32gather_epi32x(__m512i idx, void *v, int size) {
71
0
    uint32_t *b = (uint32_t *)v;
72
73
#ifndef __clang__
74
    volatile
75
#endif
76
0
    int c[16] __attribute__((aligned(32)));
77
78
    //_mm512_store_si512((__m512i *)c, idx); // equivalent, but slower
79
0
    _mm256_store_si256((__m256i *)(c),   _mm512_castsi512_si256(idx));
80
0
    _mm256_store_si256((__m256i *)(c+8), _mm512_extracti64x4_epi64(idx, 1));
81
82
0
    __m128i x0 = _mm_insert_epi32(_mm_cvtsi32_si128(b[c[0]]), b[c[1]], 1);
83
0
    __m128i x1 = _mm_insert_epi32(_mm_cvtsi32_si128(b[c[2]]), b[c[3]], 1);
84
0
    __m128i x2 = _mm_insert_epi32(_mm_cvtsi32_si128(b[c[4]]), b[c[5]], 1);
85
0
    __m128i x3 = _mm_insert_epi32(_mm_cvtsi32_si128(b[c[6]]), b[c[7]], 1);
86
87
0
    __m128i x01 = _mm_unpacklo_epi64(x0, x1);
88
0
    __m128i x23 = _mm_unpacklo_epi64(x2, x3);
89
0
    __m256i y0 =_mm256_castsi128_si256(x01);
90
91
0
    __m128i x4 = _mm_insert_epi32(_mm_cvtsi32_si128(b[c[ 8]]), b[c[ 9]], 1);
92
0
    __m128i x5 = _mm_insert_epi32(_mm_cvtsi32_si128(b[c[10]]), b[c[11]], 1);
93
0
    __m128i x6 = _mm_insert_epi32(_mm_cvtsi32_si128(b[c[12]]), b[c[13]], 1);
94
0
    __m128i x7 = _mm_insert_epi32(_mm_cvtsi32_si128(b[c[14]]), b[c[15]], 1);
95
96
0
    __m128i x45 = _mm_unpacklo_epi64(x4, x5);
97
0
    __m128i x67 = _mm_unpacklo_epi64(x6, x7);
98
0
    __m256i y1 =_mm256_castsi128_si256(x45);
99
100
0
    y0 = _mm256_inserti128_si256(y0, x23, 1);
101
0
    y1 = _mm256_inserti128_si256(y1, x67, 1);
102
103
0
    return _mm512_inserti64x4(_mm512_castsi256_si512(y0), y1, 1);
104
0
}
105
106
// 32-bit indices, 8-bit quantities into 32-bit lanes
107
0
static inline __m512i _mm512_i32gather_epi32x1(__m512i idx, void *v) {
108
0
    uint8_t *b = (uint8_t *)v;
109
0
    volatile int c[16] __attribute__((aligned(32)));
110
111
    //_mm512_store_si512((__m512i *)c, idx); // equivalent, but slower
112
0
    _mm256_store_si256((__m256i *)(c),   _mm512_castsi512_si256(idx));
113
0
    _mm256_store_si256((__m256i *)(c+8), _mm512_extracti64x4_epi64(idx, 1));
114
115
0
    return _mm512_set_epi32(b[c[15]], b[c[14]], b[c[13]], b[c[12]],
116
0
                            b[c[11]], b[c[10]], b[c[9]], b[c[8]],
117
0
                            b[c[7]], b[c[6]], b[c[5]], b[c[4]],
118
0
                            b[c[3]], b[c[2]], b[c[1]], b[c[0]]);
119
0
}
120
121
#else
122
// real gathers
123
#define _mm512_i32gather_epi32x _mm512_i32gather_epi32
124
#endif
125
126
unsigned char *rans_compress_O0_32x16_avx512(unsigned char *in,
127
                                             unsigned int in_size,
128
                                             unsigned char *out,
129
0
                                             unsigned int *out_size) {
130
0
    unsigned char *cp, *out_end;
131
0
    RansEncSymbol syms[256];
132
0
    RansState ransN[32] __attribute__((aligned(64)));
133
0
    uint8_t* ptr;
134
0
    uint32_t F[256+MAGIC] = {0};
135
0
    int i, j, tab_size = 0, x, z;
136
    // -20 for order/size/meta
137
0
    uint32_t bound = rans_compress_bound_4x16(in_size,0)-20;
138
139
0
    if (!out) {
140
0
        *out_size = bound;
141
0
        out = malloc(*out_size);
142
0
    }
143
0
    if (!out || bound > *out_size)
144
0
        return NULL;
145
146
    // If "out" isn't word aligned, tweak out_end/ptr to ensure it is.
147
    // We already added more room in bound to allow for this.
148
0
    if (((size_t)out)&1)
149
0
        bound--;
150
0
    ptr = out_end = out + bound;
151
152
0
    if (in_size == 0)
153
0
        goto empty;
154
155
    // Compute statistics
156
0
    if (hist8(in, in_size, F) < 0)
157
0
        return NULL;
158
159
    // Normalise so frequencies sum to a power of 2
160
0
    uint32_t fsum = in_size;
161
0
    uint32_t max_val = round2(fsum);
162
0
    if (max_val > TOTFREQ)
163
0
        max_val = TOTFREQ;
164
165
0
    if (normalise_freq(F, fsum, max_val) < 0)
166
0
        return NULL;
167
0
    fsum=max_val;
168
169
0
    cp = out;
170
0
    cp += encode_freq(cp, F);
171
0
    tab_size = cp-out;
172
    //write(2, out+4, cp-(out+4));
173
174
0
    if (normalise_freq(F, fsum, TOTFREQ) < 0)
175
0
        return NULL;
176
177
    // Encode statistics and build lookup tables for SIMD encoding.
178
0
    uint32_t SB[256], SA[256], SD[256], SC[256];
179
0
    for (x = j = 0; j < 256; j++) {
180
0
        if (F[j]) {
181
0
            RansEncSymbolInit(&syms[j], x, F[j], TF_SHIFT);
182
0
            SB[j] = syms[j].x_max;
183
0
            SA[j] = syms[j].rcp_freq;
184
0
            SD[j] = (syms[j].cmpl_freq<<0) | ((syms[j].rcp_shift-32)<<16);
185
0
            SC[j] = syms[j].bias;
186
0
            x += F[j];
187
0
        }
188
0
    }
189
190
0
    for (z = 0; z < 32; z++)
191
0
      RansEncInit(&ransN[z]);
192
193
0
    z = i = in_size&(32-1);
194
0
    while (z-- > 0)
195
0
      RansEncPutSymbol(&ransN[z], &ptr, &syms[in[in_size-(i-z)]]);
196
197
0
#define LOAD512(a,b)                                     \
198
0
    __m512i a##1 = _mm512_load_si512((__m512i *)&b[0]); \
199
0
    __m512i a##2 = _mm512_load_si512((__m512i *)&b[16]);
200
201
0
#define STORE512(a,b)                           \
202
0
    _mm512_store_si512((__m256i *)&b[0], a##1); \
203
0
    _mm512_store_si512((__m256i *)&b[16], a##2);
204
205
0
    LOAD512(Rv, ransN);
206
207
0
    uint16_t *ptr16 = (uint16_t *)ptr;
208
0
    for (i=(in_size &~(32-1)); i>0; i-=32) {
209
0
        uint8_t *c = &in[i-32];
210
211
        // GATHER versions
212
        // Much faster now we have an efficient loadu mechanism in place,
213
        // BUT...
214
        // Try this for avx2 variant too?  Better way to populate the mm256
215
        // regs for mix of avx2 and avx512 opcodes.
216
0
        __m256i c12 = _mm256_loadu_si256((__m256i const *)c);
217
0
        __m512i c1 = _mm512_cvtepu8_epi32(_mm256_extracti128_si256(c12,0));
218
0
        __m512i c2 = _mm512_cvtepu8_epi32(_mm256_extracti128_si256(c12,1));
219
0
#define SET512(a,b) \
220
0
        __m512i a##1 = _mm512_i32gather_epi32x(c1, b, 4); \
221
0
        __m512i a##2 = _mm512_i32gather_epi32x(c2, b, 4)
222
223
0
        SET512(xmax, SB);
224
225
0
        uint16_t gt_mask1 = _mm512_cmpgt_epi32_mask(Rv1, xmax1);
226
0
        int pc1 = _mm_popcnt_u32(gt_mask1);
227
0
        __m512i Rp1 = _mm512_and_si512(Rv1, _mm512_set1_epi32(0xffff));
228
0
        __m512i Rp2 = _mm512_and_si512(Rv2, _mm512_set1_epi32(0xffff));
229
0
        uint16_t gt_mask2 = _mm512_cmpgt_epi32_mask(Rv2, xmax2);
230
0
        SET512(SDv,  SD);
231
0
        int pc2 = _mm_popcnt_u32(gt_mask2);
232
233
0
        Rp1 = _mm512_maskz_compress_epi32(gt_mask1, Rp1);
234
0
        Rp2 = _mm512_maskz_compress_epi32(gt_mask2, Rp2);
235
236
0
        _mm512_mask_cvtepi32_storeu_epi16(ptr16-pc2, (1<<pc2)-1, Rp2);
237
0
        ptr16 -= pc2;
238
0
        _mm512_mask_cvtepi32_storeu_epi16(ptr16-pc1, (1<<pc1)-1, Rp1);
239
0
        ptr16 -= pc1;
240
241
0
        SET512(rfv,  SA);
242
0
        Rv1 = _mm512_mask_srli_epi32(Rv1, gt_mask1, Rv1, 16);
243
0
        Rv2 = _mm512_mask_srli_epi32(Rv2, gt_mask2, Rv2, 16);
244
245
        // interleaved form of this, helps on icc a bit
246
        //rfv1 = _mm512_mulhi_epu32(Rv1, rfv1);
247
        //rfv2 = _mm512_mulhi_epu32(Rv2, rfv2);
248
249
        // Alternatives here:
250
        // SHIFT right/left instead of AND: (very marginally slower)
251
        //   rf1_hm = _mm512_and_epi32(
252
        //           rf1_hm, _mm512_set1_epi64((uint64_t)0xffffffff00000000));
253
        // vs
254
        //   rf1_hm = _mm512_srli_epi64(rf1_hm, 32);
255
        //   rf1_hm = _mm512_slli_epi64(rf1_hm, 32);
256
0
        __m512i rf1_hm = _mm512_mul_epu32(_mm512_srli_epi64(Rv1,  32),
257
0
                                          _mm512_srli_epi64(rfv1, 32));
258
0
        __m512i rf2_hm = _mm512_mul_epu32(_mm512_srli_epi64(Rv2,  32),
259
0
                                          _mm512_srli_epi64(rfv2, 32));
260
0
        __m512i rf1_lm = _mm512_srli_epi64(_mm512_mul_epu32(Rv1, rfv1), 32);
261
0
        __m512i rf2_lm = _mm512_srli_epi64(_mm512_mul_epu32(Rv2, rfv2), 32);
262
0
        rf1_hm = _mm512_and_epi32(rf1_hm,
263
0
                                  _mm512_set1_epi64((uint64_t)0xffffffff<<32));
264
0
        rf2_hm = _mm512_and_epi32(rf2_hm,
265
0
                                  _mm512_set1_epi64((uint64_t)0xffffffff<<32));
266
0
        rfv1 = _mm512_or_epi32(rf1_lm, rf1_hm);
267
0
        rfv2 = _mm512_or_epi32(rf2_lm, rf2_hm);
268
269
        // Or a pure masked blend approach, sadly slower.
270
        // rfv1 = _mm512_mask_blend_epi32(0x5555,
271
        //            _mm512_mul_epu32(
272
        //                _mm512_srli_epi64(Rv1, 32),
273
        //                _mm512_srli_epi64(rfv1, 32)),
274
        //            _mm512_srli_epi64(
275
        //                _mm512_mul_epu32(Rv1, rfv1),
276
        //                32));
277
        // rfv2 = _mm512_mask_blend_epi32(0xaaaa,
278
        //            _mm512_srli_epi64(
279
        //                _mm512_mul_epu32(Rv2, rfv2),
280
        //                32),
281
        //            _mm512_mul_epu32(
282
        //                _mm512_srli_epi64(Rv2, 32),
283
        //                _mm512_srli_epi64(rfv2, 32)));
284
285
0
        SET512(biasv, SC);
286
0
        __m512i shiftv1 = _mm512_srli_epi32(SDv1, 16);
287
0
        __m512i shiftv2 = _mm512_srli_epi32(SDv2, 16);
288
289
0
        __m512i qv1 = _mm512_srlv_epi32(rfv1, shiftv1);
290
0
        __m512i qv2 = _mm512_srlv_epi32(rfv2, shiftv2);
291
292
0
        qv1 = _mm512_mullo_epi32(qv1,
293
0
                  _mm512_and_si512(SDv1,_mm512_set1_epi32(0xffff)));
294
0
        qv1 = _mm512_add_epi32(qv1, biasv1);
295
0
        Rv1 = _mm512_add_epi32(Rv1, qv1);
296
297
0
        qv2 = _mm512_mullo_epi32(qv2,
298
0
                  _mm512_and_si512(SDv2, _mm512_set1_epi32(0xffff)));
299
0
        qv2 = _mm512_add_epi32(qv2, biasv2);
300
0
        Rv2 = _mm512_add_epi32(Rv2, qv2);
301
0
    }
302
0
    ptr = (uint8_t *)ptr16;
303
0
    STORE512(Rv, ransN);
304
305
0
    for (z = 32-1; z >= 0; z--)
306
0
        RansEncFlush(&ransN[z], &ptr);
307
308
0
 empty:
309
    // Finalise block size and return it
310
0
    *out_size = (out_end - ptr) + tab_size;
311
312
0
    memmove(out + tab_size, ptr, out_end-ptr);
313
314
0
    return out;
315
0
}
316
317
unsigned char *rans_uncompress_O0_32x16_avx512(unsigned char *in,
318
                                               unsigned int in_size,
319
                                               unsigned char *out,
320
0
                                               unsigned int out_sz) {
321
0
    if (in_size < 32*4) // 32-states at least
322
0
        return NULL;
323
324
0
    if (out_sz >= INT_MAX)
325
0
        return NULL; // protect against some overflow cases
326
327
    /* Load in the static tables */
328
0
    unsigned char *cp = in, *out_free = NULL;
329
0
    unsigned char *cp_end = in + in_size;
330
0
    int i;
331
0
    uint32_t s3[TOTFREQ]  __attribute__((aligned(64))); // For TF_SHIFT <= 12
332
333
0
    if (!out)
334
0
        out_free = out = malloc(out_sz);
335
0
    if (!out)
336
0
        return NULL;
337
338
    // Precompute reverse lookup of frequency.
339
0
    uint32_t F[256] = {0}, fsum;
340
0
    int fsz = decode_freq(cp, cp_end, F, &fsum);
341
0
    if (!fsz)
342
0
        goto err;
343
0
    cp += fsz;
344
345
0
    normalise_freq_shift(F, fsum, TOTFREQ);
346
347
    // Build symbols; fixme, do as part of decode, see the _d variant
348
0
    if (rans_F_to_s3(F, TF_SHIFT, s3))
349
0
        goto err;
350
351
0
    if (cp_end - cp < 32 * 4)
352
0
        goto err;
353
354
0
    int z;
355
0
    RansState Rv[32] __attribute__((aligned(64)));
356
0
    for (z = 0; z < 32; z++) {
357
0
        RansDecInit(&Rv[z], &cp);
358
0
        if (Rv[z] < RANS_BYTE_L)
359
0
            goto err;
360
0
    }
361
362
0
    uint16_t *sp = (uint16_t *)cp;
363
364
0
    int out_end = (out_sz&~(32-1));
365
0
    const uint32_t mask = (1u << TF_SHIFT)-1;
366
367
0
    __m512i maskv = _mm512_set1_epi32(mask); // set mask in all lanes
368
0
    __m512i R1 = _mm512_load_epi32(&Rv[0]);
369
0
    __m512i R2 = _mm512_load_epi32(&Rv[16]);
370
371
    // Start of the first loop iteration, which we do move to the end of the
372
    // loop for the next cycle so we can remove some of the instr. latency.
373
0
    __m512i masked1 = _mm512_and_epi32(R1, maskv);
374
0
    __m512i masked2 = _mm512_and_epi32(R2, maskv);
375
0
    __m512i S1 = _mm512_i32gather_epi32x(masked1, (int *)s3, sizeof(*s3));
376
0
    __m512i S2 = _mm512_i32gather_epi32x(masked2, (int *)s3, sizeof(*s3));
377
378
0
    uint8_t overflow[64+64] = {0};
379
0
    for (i=0; i < out_end; i+=32) {
380
      //for (z = 0; z < 16; z++) {
381
382
      // Protect against running off the end of in buffer.
383
      // We copy it to a worst-case local buffer when near the end.
384
0
      if ((uint8_t *)sp+64 > cp_end) {
385
0
        memmove(overflow, sp, cp_end - (uint8_t *)sp);
386
0
        sp = (uint16_t *)overflow;
387
0
        cp_end = overflow + sizeof(overflow);
388
0
      }
389
390
      //uint32_t S = s3[R[z] & mask];
391
0
      __m512i renorm_words1 = _mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)sp)); // next 16 words
392
393
      //uint16_t f = S>>(TF_SHIFT+8), b = (S>>8) & mask;
394
0
      __m512i f1 = _mm512_srli_epi32(S1, TF_SHIFT+8);
395
0
      __m512i f2 = _mm512_srli_epi32(S2, TF_SHIFT+8);
396
0
      __m512i b1 = _mm512_and_epi32(_mm512_srli_epi32(S1, 8), maskv);
397
0
      __m512i b2 = _mm512_and_epi32(_mm512_srli_epi32(S2, 8), maskv);
398
399
      //R[z] = f * (R[z] >> TF_SHIFT) + b;
400
      // approx 10 cycle latency on mullo.
401
0
      R1 = _mm512_add_epi32(
402
0
               _mm512_mullo_epi32(
403
0
                   _mm512_srli_epi32(R1, TF_SHIFT), f1), b1);
404
0
      __mmask16 renorm_mask1, renorm_mask2;
405
0
      renorm_mask1=_mm512_cmplt_epu32_mask(R1, _mm512_set1_epi32(RANS_BYTE_L));
406
0
      R2 = _mm512_add_epi32(
407
0
               _mm512_mullo_epi32(
408
0
                   _mm512_srli_epi32(R2, TF_SHIFT), f2), b2);
409
410
      // renorm. this is the interesting part:
411
0
      renorm_mask2=_mm512_cmplt_epu32_mask(R2, _mm512_set1_epi32(RANS_BYTE_L));
412
      // advance by however many words we actually read
413
0
      sp += _mm_popcnt_u32(renorm_mask1);
414
0
      __m512i renorm_words2 = _mm512_cvtepu16_epi32(_mm256_loadu_si256(
415
0
                                      (const __m256i *)sp));
416
417
      // select masked only
418
0
      __m512i renorm_vals1, renorm_vals2;
419
420
0
      renorm_vals1 = _mm512_mask_expand_epi32(R1, renorm_mask1, renorm_words1);
421
0
      renorm_vals2 = _mm512_mask_expand_epi32(R2, renorm_mask2, renorm_words2);
422
423
      // For start of next loop iteration.  This has been moved here
424
      // (and duplicated to before the loop starts) so we can do something
425
      // with the latency period of gather, such as finishing up the
426
      // renorm offset and writing the results.
427
0
      __m512i S1_ = S1; // temporary copy for use in out[]=S later
428
0
      __m512i S2_ = S2;
429
430
0
      masked1 = _mm512_and_epi32(renorm_vals1, maskv);
431
0
      S1 = _mm512_i32gather_epi32x(masked1, (int *)s3, sizeof(*s3));
432
0
      masked2 = _mm512_and_epi32(renorm_vals2, maskv);
433
0
      S2 = _mm512_i32gather_epi32x(masked2, (int *)s3, sizeof(*s3));
434
435
0
      R1 = _mm512_mask_slli_epi32(R1, renorm_mask1, R1, 16);
436
0
      R2 = _mm512_mask_slli_epi32(R2, renorm_mask2, R2, 16);
437
438
0
      __m512i m16 = _mm512_set1_epi32(0xffff);
439
0
      renorm_vals1 = _mm512_maskz_and_epi32(renorm_mask1, renorm_vals1, m16);
440
0
      renorm_vals2 = _mm512_maskz_and_epi32(renorm_mask2, renorm_vals2, m16);
441
442
      // advance by however many words we actually read
443
0
      sp += _mm_popcnt_u32(renorm_mask2);
444
445
0
      R1 = _mm512_add_epi32(R1, renorm_vals1);
446
0
      R2 = _mm512_add_epi32(R2, renorm_vals2);
447
448
      //out[i+z] = S;
449
0
      _mm_storeu_si128((__m128i *)(out+i),    _mm512_cvtepi32_epi8(S1_));
450
0
      _mm_storeu_si128((__m128i *)(out+i+16), _mm512_cvtepi32_epi8(S2_));
451
0
    }
452
453
0
    _mm512_store_epi32(&Rv[ 0], R1);
454
0
    _mm512_store_epi32(&Rv[16], R2);
455
456
0
    for (z = out_sz & (32-1); z-- > 0; )
457
0
      out[out_end + z] = s3[Rv[z] & mask];
458
459
0
    return out;
460
461
0
 err:
462
0
    free(out_free);
463
0
    return NULL;
464
0
}
465
466
#define TBUF8
467
#ifdef TBUF8
468
// 15% quicker overall O1 decode now due to rot32_simd below.
469
470
// NB: This uses AVX2 though and we could rewrite using AVX512 for
471
// further speed gains.
472
static inline void transpose_and_copy(uint8_t *out, int iN[32],
473
0
                                      uint8_t t[32][32]) {
474
//  int z;
475
//  for (z = 0; z < 32; z++) {
476
//      int k;
477
//      for (k = 0; k < 32; k++)
478
//          out[iN[z]+k] = t[k][z];
479
//      iN[z] += 32;
480
//  }
481
482
0
    rot32_simd(t, out, iN);
483
0
}
484
485
#else
486
// Implemented using AVX512 gathers.
487
// This is faster than a naive scalar implementation, but doesn't beat the
488
// AVX2 vectorised 32x32 transpose function.
489
static inline void transpose_and_copy_avx512(uint8_t *out, int iN[32],
490
                                             uint32_t t32[32][32]) {
491
    int z;
492
//  for (z = 0; z < 32; z++) {
493
//      int k;
494
//      for (k = 0; k < 32; k++)
495
//          out[iN[z]+k] = t32[k][z];
496
//      iN[z] += 32;
497
//  }
498
499
500
    __m512i v1 = _mm512_set_epi32(15, 14, 13, 12, 11, 10,  9,  8,
501
                                   7,  6,  5,  4,  3,  2,  1,  0);
502
    v1 = _mm512_slli_epi32(v1, 5);
503
504
    for (z = 0; z < 32; z++) {
505
        __m512i t1 = _mm512_i32gather_epi32x(v1, &t32[ 0][z], 4);
506
        __m512i t2 = _mm512_i32gather_epi32x(v1, &t32[16][z], 4);
507
        _mm_storeu_si128((__m128i*)(&out[iN[z]   ]), _mm512_cvtepi32_epi8(t1));
508
        _mm_storeu_si128((__m128i*)(&out[iN[z]+16]), _mm512_cvtepi32_epi8(t2));
509
        iN[z] += 32;
510
    }
511
}
512
#endif // TBUF
513
514
unsigned char *rans_compress_O1_32x16_avx512(unsigned char *in,
515
                                             unsigned int in_size,
516
                                             unsigned char *out,
517
0
                                             unsigned int *out_size) {
518
0
    unsigned char *cp, *out_end, *out_free = NULL;
519
0
    unsigned int tab_size;
520
0
    uint32_t bound = rans_compress_bound_4x16(in_size,1)-20;
521
0
    int z;
522
0
    RansState ransN[32] __attribute__((aligned(64)));
523
524
0
    if (in_size < 32) // force O0 instead
525
0
        return NULL;
526
527
0
    if (!out) {
528
0
        *out_size = bound;
529
0
        out = malloc(*out_size);
530
0
    }
531
0
    if (!out || bound > *out_size)
532
0
        return NULL;
533
534
0
    if (((size_t)out)&1)
535
0
        bound--;
536
0
    out_end = out + bound;
537
538
0
    RansEncSymbol (*syms)[256] = htscodecs_tls_alloc(256 * (sizeof(*syms)));
539
0
    if (!syms) {
540
0
        free(out_free);
541
0
        return NULL;
542
0
    }
543
544
0
    cp = out;
545
0
    int shift = encode_freq1(in, in_size, 32, syms, &cp);
546
0
    if (shift < 0) {
547
0
        free(out_free);
548
0
        htscodecs_tls_free(syms);
549
0
        return NULL;
550
0
    }
551
0
    tab_size = cp - out;
552
553
0
    for (z = 0; z < 32; z++)
554
0
      RansEncInit(&ransN[z]);
555
556
0
    uint8_t* ptr = out_end;
557
558
0
    int iN[32] __attribute__((aligned(64)));
559
0
    int isz4 = in_size/32;
560
0
    for (z = 0; z < 32; z++)
561
0
        iN[z] = (z+1)*isz4-2;
562
563
0
    uint32_t lN[32] __attribute__((aligned(64)));
564
0
    for (z = 0; z < 32; z++)
565
0
        lN[z] = in[iN[z]+1];
566
567
    // Deal with the remainder
568
0
    z = 32-1;
569
0
    lN[z] = in[in_size-1];
570
0
    for (iN[z] = in_size-2; iN[z] > 32*isz4-2; iN[z]--) {
571
0
        unsigned char c = in[iN[z]];
572
0
        RansEncPutSymbol(&ransN[z], &ptr, &syms[c][lN[z]]);
573
0
        lN[z] = c;
574
0
    }
575
576
0
    LOAD512(Rv, ransN);
577
578
0
    uint16_t *ptr16 = (uint16_t *)ptr;
579
0
    LOAD512(iN, iN);
580
0
    LOAD512(last, lN);
581
582
0
    __m512i c1 = _mm512_i32gather_epi32x1(iN1, in);
583
0
    __m512i c2 = _mm512_i32gather_epi32x1(iN2, in);
584
585
    // We cache the next 64-bytes locally and transpose.
586
    // This means we can load 32 ints from t32[x] with load instructions
587
    // instead of gathers.  The copy, transpose and expand is easier done
588
    // in scalar code.
589
0
#define BATCH 64
590
0
    uint8_t t32[BATCH][32] __attribute__((aligned(64)));
591
0
    int next_batch;
592
0
    if (iN[0] > BATCH) {
593
0
        int i, j;
594
0
        for (i = 0; i < BATCH; i++)
595
            // memcpy(c[i], &in[iN[i]-32], 32); fast mode
596
0
            for (j = 0; j < 32; j++)
597
0
                t32[BATCH-1-i][j] = in[iN[j]-1-i];
598
0
        next_batch = BATCH;
599
0
    } else {
600
0
        next_batch = -1;
601
0
    }
602
603
0
    for (; iN[0] >= 0; iN[0]--) {
604
        // Note, consider doing the same approach for the AVX2 encoder.
605
        // Maybe we can also get gather working well there?
606
        // Gather here is still a major latency bottleneck, consuming
607
        // around 40% of CPU cycles overall.
608
609
        // FIXME: maybe we need to cope with in[31] read over-flow
610
        // on loop cycles 0, 1, 2 where gather reads 32-bits instead of
611
        // 8 bits.  Use set instead there on c2?
612
613
        // index into syms[0][0] array, used for x_max, rcp_freq, and bias
614
0
        __m512i vidx1 = _mm512_slli_epi32(c1, 8);
615
0
        __m512i vidx2 = _mm512_slli_epi32(c2, 8);
616
0
        vidx1 = _mm512_add_epi32(vidx1, last1);
617
0
        vidx2 = _mm512_add_epi32(vidx2, last2);
618
0
        vidx1 = _mm512_slli_epi32(vidx1, 2);
619
0
        vidx2 = _mm512_slli_epi32(vidx2, 2);
620
621
        // ------------------------------------------------------------
622
        //      for (z = NX-1; z >= 0; z--) {
623
        //          if (ransN[z] >= x_max[z]) {
624
        //              *--ptr16 = ransN[z] & 0xffff;
625
        //              ransN[z] >>= 16;
626
        //          }
627
        //      }
628
629
0
#define SET512x(a,x) \
630
0
        __m512i a##1 = _mm512_i32gather_epi32x(vidx1, &syms[0][0].x, 4); \
631
0
        __m512i a##2 = _mm512_i32gather_epi32x(vidx2, &syms[0][0].x, 4)
632
633
        // Start of next loop, moved here to remove latency.
634
        // last[z] = c[z]
635
        // iN[z]--
636
        // c[z] = in[iN[z]]
637
0
        last1 = c1;
638
0
        last2 = c2;
639
0
        iN1 = _mm512_sub_epi32(iN1, _mm512_set1_epi32(1));
640
0
        iN2 = _mm512_sub_epi32(iN2, _mm512_set1_epi32(1));
641
642
        // Code below is equivalent to this:
643
        // c1 = _mm512_i32gather_epi32(iN1, in, 1);
644
        // c2 = _mm512_i32gather_epi32(iN2, in, 1);
645
646
        // Better when we have a power of 2
647
0
        if (next_batch >= 0) {
648
0
            if (--next_batch < 0 && iN[0] > BATCH) {
649
                // Load 32 BATCH blocks of data.
650
                // Executed once every BATCH cycles
651
0
                int i, j;
652
0
                uint8_t c[32][BATCH];
653
0
                iN[0] += BATCH;
654
0
                for (j = 0; j < 32; j++) {
655
0
                    iN[j] -= BATCH;
656
0
                    memcpy(c[j], &in[iN[j]-BATCH], BATCH);
657
0
                }
658
                // transpose matrix from 32xBATCH to BATCHx32
659
0
                for (j = 0; j < 32; j++) {
660
0
                    for (i = 0; i < BATCH; i+=16) {
661
0
                        int z;
662
0
                        for (z = 0; z < 16; z++)
663
0
                            t32[i+z][j] = c[j][i+z];
664
0
                    }
665
0
                }
666
0
                next_batch = BATCH-1;
667
0
            }
668
0
            if (next_batch >= 0) {
669
                // Copy from one of our pre-loaded BATCHx32 tables
670
                // Executed every cycle
671
0
                __m128i c1_ = _mm_load_si128((__m128i *)&t32[next_batch][0]);
672
0
                __m128i c2_ = _mm_load_si128((__m128i *)&t32[next_batch][16]);
673
0
                c1 = _mm512_cvtepu8_epi32(c1_);
674
0
                c2 = _mm512_cvtepu8_epi32(c2_);
675
0
            }
676
0
        }
677
678
0
        if (next_batch < 0 && iN[0]) {
679
            // no pre-loaded data as within BATCHx32 of input end
680
0
            c1 = _mm512_i32gather_epi32x1(iN1, in);
681
0
            c2 = _mm512_i32gather_epi32x1(iN2, in);
682
683
            // Speeds up clang, even though not needed any more.
684
            // Harmless to leave here.
685
0
            c1 = _mm512_and_si512(c1, _mm512_set1_epi32(0xff));
686
0
            c2 = _mm512_and_si512(c2, _mm512_set1_epi32(0xff));
687
0
        }
688
        // End of "equivalent to" code block
689
690
0
        SET512x(xmax, x_max); // high latency
691
692
0
        uint16_t gt_mask1 = _mm512_cmpgt_epi32_mask(Rv1, xmax1);
693
0
        int pc1 = _mm_popcnt_u32(gt_mask1);
694
0
        __m512i Rp1 = _mm512_and_si512(Rv1, _mm512_set1_epi32(0xffff));
695
0
        __m512i Rp2 = _mm512_and_si512(Rv2, _mm512_set1_epi32(0xffff));
696
0
        uint16_t gt_mask2 = _mm512_cmpgt_epi32_mask(Rv2, xmax2);
697
0
        SET512x(SDv, cmpl_freq); // good
698
0
        int pc2 = _mm_popcnt_u32(gt_mask2);
699
700
0
        Rp1 = _mm512_maskz_compress_epi32(gt_mask1, Rp1);
701
0
        Rp2 = _mm512_maskz_compress_epi32(gt_mask2, Rp2);
702
703
0
        _mm512_mask_cvtepi32_storeu_epi16(ptr16-pc2, (1<<pc2)-1, Rp2);
704
0
        ptr16 -= pc2;
705
0
        _mm512_mask_cvtepi32_storeu_epi16(ptr16-pc1, (1<<pc1)-1, Rp1);
706
0
        ptr16 -= pc1;
707
708
0
        Rv1 = _mm512_mask_srli_epi32(Rv1, gt_mask1, Rv1, 16);
709
0
        Rv2 = _mm512_mask_srli_epi32(Rv2, gt_mask2, Rv2, 16);
710
711
        // ------------------------------------------------------------
712
        // uint32_t q = (uint32_t) (((uint64_t)ransN[z] * rcp_freq[z])
713
        //                          >> rcp_shift[z]);
714
        // ransN[z] = ransN[z] + bias[z] + q * cmpl_freq[z];
715
0
        SET512x(rfv, rcp_freq); // good-ish
716
717
0
        __m512i rf1_hm = _mm512_mul_epu32(_mm512_srli_epi64(Rv1, 32),
718
0
                                          _mm512_srli_epi64(rfv1, 32));
719
0
        __m512i rf2_hm = _mm512_mul_epu32(_mm512_srli_epi64(Rv2, 32),
720
0
                                          _mm512_srli_epi64(rfv2, 32));
721
0
        __m512i rf1_lm = _mm512_srli_epi64(_mm512_mul_epu32(Rv1, rfv1), 32);
722
0
        __m512i rf2_lm = _mm512_srli_epi64(_mm512_mul_epu32(Rv2, rfv2), 32);
723
724
0
        const __m512i top32 = _mm512_set1_epi64((uint64_t)0xffffffff00000000);
725
0
        rf1_hm = _mm512_and_epi32(rf1_hm, top32);
726
0
        rf2_hm = _mm512_and_epi32(rf2_hm, top32);
727
0
        rfv1 = _mm512_or_epi32(rf1_lm, rf1_hm);
728
0
        rfv2 = _mm512_or_epi32(rf2_lm, rf2_hm);
729
730
0
        SET512x(biasv, bias); // good
731
0
        __m512i shiftv1 = _mm512_srli_epi32(SDv1, 16);
732
0
        __m512i shiftv2 = _mm512_srli_epi32(SDv2, 16);
733
734
0
        shiftv1 = _mm512_sub_epi32(shiftv1, _mm512_set1_epi32(32));
735
0
        shiftv2 = _mm512_sub_epi32(shiftv2, _mm512_set1_epi32(32));
736
737
0
        __m512i qv1 = _mm512_srlv_epi32(rfv1, shiftv1);
738
0
        __m512i qv2 = _mm512_srlv_epi32(rfv2, shiftv2);
739
740
0
        const __m512i bot16 = _mm512_set1_epi32(0xffff);
741
0
        qv1 = _mm512_mullo_epi32(qv1, _mm512_and_si512(SDv1, bot16));
742
0
        qv2 = _mm512_mullo_epi32(qv2, _mm512_and_si512(SDv2, bot16));
743
744
0
        qv1 = _mm512_add_epi32(qv1, biasv1);
745
0
        Rv1 = _mm512_add_epi32(Rv1, qv1);
746
747
0
        qv2 = _mm512_add_epi32(qv2, biasv2);
748
0
        Rv2 = _mm512_add_epi32(Rv2, qv2);
749
0
    }
750
751
0
    STORE512(Rv, ransN);
752
0
    STORE512(last, lN);
753
754
0
    ptr = (uint8_t *)ptr16;
755
756
0
    for (z = 32-1; z>=0; z--)
757
0
        RansEncPutSymbol(&ransN[z], &ptr, &syms[0][lN[z]]);
758
759
0
    for (z = 32-1; z >= 0; z--)
760
0
        RansEncFlush(&ransN[z], &ptr);
761
762
    // Finalise block size and return it
763
0
    *out_size = (out_end - ptr) + tab_size;
764
765
//    cp = out;
766
//    *cp++ = (in_size>> 0) & 0xff;
767
//    *cp++ = (in_size>> 8) & 0xff;
768
//    *cp++ = (in_size>>16) & 0xff;
769
//    *cp++ = (in_size>>24) & 0xff;
770
771
0
    memmove(out + tab_size, ptr, out_end-ptr);
772
773
0
    htscodecs_tls_free(syms);
774
0
    return out;
775
0
}
776
777
0
#define NX 32

/*
 * Order-1 (one byte of left context) rANS decoder with NX=32 interleaved
 * rANS states, vectorised with AVX512 as two 16-lane vectors per step.
 *
 * Input block layout (as parsed below):
 *   byte 0, low bit:     1 => frequency tables are themselves compressed
 *                        with the order-0 rANS codec; 0 => stored raw.
 *   byte 0, high nibble: frequency precision "shift" (TF_SHIFT_O1 or
 *                        TF_SHIFT_O1_FAST), selecting which decode loop runs.
 *   Then the (possibly compressed) frequency tables, then the NX initial
 *   rANS states (4 bytes each), then the interleaved 16-bit stream words.
 *
 * in, in_size: compressed input block.
 * out, out_sz: destination buffer and expected uncompressed size.  If out
 *              is NULL a buffer of out_sz bytes is malloc()ed here and
 *              returned; the caller owns and frees it.
 *
 * Returns out (or the new allocation) on success; NULL on error, in which
 * case any output buffer allocated here has been freed.
 */
unsigned char *rans_uncompress_O1_32x16_avx512(unsigned char *in,
                                               unsigned int in_size,
                                               unsigned char *out,
                                               unsigned int out_sz) {
    if (in_size < NX*4) // 4-states at least
        return NULL;

    if (out_sz >= INT_MAX)
        return NULL; // protect against some overflow cases

    /* Load in the static tables */
    unsigned char *cp = in, *cp_end = in+in_size, *out_free = NULL;
    unsigned char *c_freq = NULL;

    // s3[prev_sym][slot] lookup table, also aliased as s3F for the smaller
    // TOTFREQ_O1_FAST layout.  Each 32-bit entry packs: symbol (bits 0-7),
    // cumulative-frequency offset (next "shift" bits) and frequency (top
    // bits), as unpacked by the decode loops below.
    uint32_t (*s3)[TOTFREQ_O1] = htscodecs_tls_alloc(256*TOTFREQ_O1*4);
    if (!s3)
        return NULL;
    uint32_t (*s3F)[TOTFREQ_O1_FAST] = (uint32_t (*)[TOTFREQ_O1_FAST])s3;

    if (!out)
        out_free = out = malloc(out_sz);

    if (!out)
        goto err;

    //fprintf(stderr, "out_sz=%d\n", out_sz);

    // compressed header? If so uncompress it
    unsigned char *tab_end = NULL;
    unsigned char *c_freq_end = cp_end;
    unsigned int shift = *cp >> 4;
    if (*cp++ & 1) {
        uint32_t u_freq_sz, c_freq_sz;
        cp += var_get_u32(cp, cp_end, &u_freq_sz);
        cp += var_get_u32(cp, cp_end, &c_freq_sz);
        if (c_freq_sz > cp_end - cp)
            goto err;
        tab_end = cp + c_freq_sz;
        if (!(c_freq = rans_uncompress_O0_4x16(cp, c_freq_sz, NULL,
                                               u_freq_sz)))
            goto err;
        cp = c_freq;
        c_freq_end = c_freq + u_freq_sz;
    }

    // Decode order-0 symbol list; avoids needing in order-1 tables
    cp += decode_freq1(cp, c_freq_end, shift, s3, s3F, NULL, NULL);

    if (tab_end)
        cp = tab_end;
    free(c_freq);
    c_freq = NULL;

    if (cp_end - cp < NX * 4)
        goto err;

    // Initialise the NX interleaved rANS states from the stream; a state
    // below RANS_BYTE_L is invalid input.
    RansState R[NX] __attribute__((aligned(64)));
    uint8_t *ptr = cp, *ptr_end = in + in_size;
    int z;
    for (z = 0; z < NX; z++) {
        RansDecInit(&R[z], &ptr);
        if (R[z] < RANS_BYTE_L)
            goto err;
    }

    // Lane z decodes the slice out[z*isz4 .. (z+1)*isz4).  iN[] tracks each
    // lane's output index; lN[] holds its previous symbol (order-1 context).
    int isz4 = out_sz/NX;
    int iN[NX], lN[NX] __attribute__((aligned(64))) = {0};
    for (z = 0; z < NX; z++)
        iN[z] = z*isz4;

    uint16_t *sp = (uint16_t *)ptr;  // renormalisation words, 16-bit each
    const uint32_t mask = (1u << shift)-1;

    __m512i _maskv  = _mm512_set1_epi32(mask);
    LOAD512(_Rv, R);
    LOAD512(_Lv, lN);

    // Staging tile of 32 iterations x 32 lanes of decoded symbols; it is
    // transposed when full so each lane's bytes land contiguously in out[].
#ifdef TBUF8
    union {
        unsigned char tbuf[32][32];
        uint64_t tbuf64[32][4];
    } u __attribute__((aligned(32)));
#else
    uint32_t tbuf[32][32];
#endif

    unsigned int tidx = 0;

    if (shift == TF_SHIFT_O1) {
        // Full-precision (TF_SHIFT_O1) variant.
        // The SIMD loop backs off 64 symbols per lane and stops 64 bytes
        // before the end of in[], so its unchecked vector loads from sp
        // cannot overrun; a scalar loop mops up the tail.
        isz4 -= 64;
        for (; iN[0] < isz4 && (uint8_t *)sp+64 < ptr_end; ) {
            // m[z] = R[z] & mask;
            __m512i _masked1 = _mm512_and_si512(_Rv1, _maskv);
            __m512i _masked2 = _mm512_and_si512(_Rv2, _maskv);

            //  S[z] = s3[lN[z]][m[z]];
            _Lv1 = _mm512_slli_epi32(_Lv1, TF_SHIFT_O1);
            _Lv2 = _mm512_slli_epi32(_Lv2, TF_SHIFT_O1);

            _masked1 = _mm512_add_epi32(_masked1, _Lv1);
            _masked2 = _mm512_add_epi32(_masked2, _Lv2);

            // This is the biggest bottleneck
            __m512i _Sv1 = _mm512_i32gather_epi32x(_masked1, (int *)&s3F[0][0],
                                                   sizeof(s3F[0][0]));
            __m512i _Sv2 = _mm512_i32gather_epi32x(_masked2, (int *)&s3F[0][0],
                                                   sizeof(s3F[0][0]));

            //  f[z] = S[z]>>(TF_SHIFT_O1+8);
            __m512i _fv1 = _mm512_srli_epi32(_Sv1, TF_SHIFT_O1+8);
            __m512i _fv2 = _mm512_srli_epi32(_Sv2, TF_SHIFT_O1+8);

            //  b[z] = (S[z]>>8) & mask;
            __m512i _bv1 = _mm512_and_si512(_mm512_srli_epi32(_Sv1,8), _maskv);
            __m512i _bv2 = _mm512_and_si512(_mm512_srli_epi32(_Sv2,8), _maskv);

            //  s[z] = S[z] & 0xff;
            __m512i _sv1 = _mm512_and_si512(_Sv1, _mm512_set1_epi32(0xff));
            __m512i _sv2 = _mm512_and_si512(_Sv2, _mm512_set1_epi32(0xff));

            // A maximum frequency of 4096 doesn't fit in our s3 array.
            // as it's 12 bit + 12 bit + 8 bit.  It wraps around to zero.
            // (We don't have this issue for TOTFREQ_O1_FAST.)
            //
            // Solution 1 is to change to spec to forbid freq of 4096.
            // Easy hack is to add an extra symbol so it sums correctly.
            // => 572 MB/s on q40 (deskpro).
            //
            // Solution 2 implemented here is to look for the wrap around
            // and fix it.
            // => 556 MB/s on q40
            // cope with max freq of 4096.  Only 3% hit
            __m512i max_freq = _mm512_set1_epi32(TOTFREQ_O1);
            __m512i zero = _mm512_setzero_si512();
            __mmask16 cmp1 = _mm512_cmpeq_epi32_mask(_fv1, zero);
            __mmask16 cmp2 = _mm512_cmpeq_epi32_mask(_fv2, zero);
            _fv1 = _mm512_mask_blend_epi32(cmp1, _fv1, max_freq);
            _fv2 = _mm512_mask_blend_epi32(cmp2, _fv2, max_freq);

            //  R[z] = f[z] * (R[z] >> TF_SHIFT_O1) + b[z];
            _Rv1 = _mm512_add_epi32(
                       _mm512_mullo_epi32(
                           _mm512_srli_epi32(_Rv1,TF_SHIFT_O1), _fv1), _bv1);
            _Rv2 = _mm512_add_epi32(
                       _mm512_mullo_epi32(
                           _mm512_srli_epi32(_Rv2,TF_SHIFT_O1), _fv2), _bv2);

            //for (z = 0; z < NX; z++) lN[z] = c[z];
            _Lv1 = _sv1;
            _Lv2 = _sv2;

            // RansDecRenorm(&R[z], &ptr);
            // Lanes with R < RANS_BYTE_L must pull in a new 16-bit word.
            // XORing with 0x80000000 biases the unsigned states so the
            // signed compare below acts as an unsigned "R < RANS_BYTE_L".
            __m512i _renorm_mask1 = _mm512_xor_si512(_Rv1,
                                        _mm512_set1_epi32(0x80000000));
            __m512i _renorm_mask2 = _mm512_xor_si512(_Rv2,
                                        _mm512_set1_epi32(0x80000000));

            int _imask1 =_mm512_cmpgt_epi32_mask
                (_mm512_set1_epi32(RANS_BYTE_L-0x80000000), _renorm_mask1);
            int _imask2 = _mm512_cmpgt_epi32_mask
                (_mm512_set1_epi32(RANS_BYTE_L-0x80000000), _renorm_mask2);

            // Load the next 16 stream words and advance sp by the number
            // of lanes that actually renormalise (popcount of the mask).
            __m512i renorm_words1 = _mm512_cvtepu16_epi32
                (_mm256_loadu_si256((const __m256i *)sp));
            sp += _mm_popcnt_u32(_imask1);

            __m512i renorm_words2 = _mm512_cvtepu16_epi32
                (_mm256_loadu_si256((const __m256i *)sp));
            sp += _mm_popcnt_u32(_imask2);

            // Scatter the consumed words to the renormalising lanes only.
            __m512i _renorm_vals1 =
                _mm512_maskz_expand_epi32(_imask1, renorm_words1);
            __m512i _renorm_vals2 =
                _mm512_maskz_expand_epi32(_imask2, renorm_words2);

            _Rv1 = _mm512_mask_slli_epi32(_Rv1, _imask1, _Rv1, 16);
            _Rv2 = _mm512_mask_slli_epi32(_Rv2, _imask2, _Rv2, 16);

            _Rv1 = _mm512_add_epi32(_Rv1, _renorm_vals1);
            _Rv2 = _mm512_add_epi32(_Rv2, _renorm_vals2);

#ifdef TBUF8
            _mm_storeu_si128((__m128i *)(&u.tbuf64[tidx][0]),
                             _mm512_cvtepi32_epi8(_Sv1)); // or _sv1?
            _mm_storeu_si128((__m128i *)(&u.tbuf64[tidx][2]),
                             _mm512_cvtepi32_epi8(_Sv2));
#else
            _mm512_storeu_si512((__m512i *)(&tbuf[tidx][ 0]), _sv1);
            _mm512_storeu_si512((__m512i *)(&tbuf[tidx][16]), _sv2);
#endif

            iN[0]++;
            if (++tidx == 32) {
                iN[0]-=32;

                // We have tidx[x][y] which we want to store in
                // memory in out[y][z] instead.  This is an unrolled
                // transposition.
#ifdef TBUF8
                transpose_and_copy(out, iN, u.tbuf);
#else
                transpose_and_copy_avx512(out, iN, tbuf);
#endif
                tidx = 0;
            }
        }
        isz4 += 64;

        STORE512(_Rv, R);
        STORE512(_Lv, lN);
        ptr = (uint8_t *)sp;

        // Flush any partially filled staging tile column-by-column.
        if (1) {
            iN[0]-=tidx;
            int T;
            for (z = 0; z < NX; z++)
                for (T = 0; T < tidx; T++)
#ifdef TBUF8
                    out[iN[z]++] = u.tbuf[T][z];
#else
                    out[iN[z]++] = tbuf[T][z];
#endif
        }

        // Scalar version for close to the end of in[] array so we don't
        // do SIMD loads beyond the end of the buffer
        for (; iN[0] < isz4;) {
            for (z = 0; z < NX; z++) {
                uint32_t m = R[z] & ((1u<<TF_SHIFT_O1)-1);
                uint32_t S = s3[lN[z]][m];
                unsigned char c = S & 0xff;
                out[iN[z]++] = c;
                uint32_t F = S>>(TF_SHIFT_O1+8);
                // F==0 is the wrapped-around encoding of a 4096 frequency.
                R[z] = (F?F:4096) * (R[z]>>TF_SHIFT_O1) +
                    ((S>>8) & ((1u<<TF_SHIFT_O1)-1));
                RansDecRenormSafe(&R[z], &ptr, ptr_end);
                lN[z] = c;
            }
        }

        // Remainder: out_sz may not divide by NX; the last lane decodes
        // the leftover bytes on its own.
        z = NX-1;
        for (; iN[z] < out_sz; ) {
            uint32_t m = R[z] & ((1u<<TF_SHIFT_O1)-1);
            uint32_t S = s3[lN[z]][m];
            unsigned char c = S & 0xff;
            out[iN[z]++] = c;
            uint32_t F = S>>(TF_SHIFT_O1+8);
            R[z] = (F?F:4096) * (R[z]>>TF_SHIFT_O1) +
                ((S>>8) & ((1u<<TF_SHIFT_O1)-1));
            RansDecRenormSafe(&R[z], &ptr, ptr_end);
            lN[z] = c;
        }
    } else {
        // TF_SHIFT_O1_FAST.  This is the most commonly used variant.

        // SIMD version ends decoding early as it reads at most 64 bytes
        // from input via 4 vectorised loads.
        isz4 -= 64;
        for (; iN[0] < isz4 && (uint8_t *)sp+64 < ptr_end; ) {
            // m[z] = R[z] & mask;
            __m512i _masked1 = _mm512_and_si512(_Rv1, _maskv);
            __m512i _masked2 = _mm512_and_si512(_Rv2, _maskv);

            //  S[z] = s3[lN[z]][m[z]];
            _Lv1 = _mm512_slli_epi32(_Lv1, TF_SHIFT_O1_FAST);
            _Lv2 = _mm512_slli_epi32(_Lv2, TF_SHIFT_O1_FAST);

            _masked1 = _mm512_add_epi32(_masked1, _Lv1);
            _masked2 = _mm512_add_epi32(_masked2, _Lv2);

            // This is the biggest bottleneck
            __m512i _Sv1 = _mm512_i32gather_epi32x(_masked1, (int *)&s3F[0][0],
                                                   sizeof(s3F[0][0]));
            __m512i _Sv2 = _mm512_i32gather_epi32x(_masked2, (int *)&s3F[0][0],
                                                   sizeof(s3F[0][0]));

            //  f[z] = S[z]>>(TF_SHIFT_O1+8);
            __m512i _fv1 = _mm512_srli_epi32(_Sv1, TF_SHIFT_O1_FAST+8);
            __m512i _fv2 = _mm512_srli_epi32(_Sv2, TF_SHIFT_O1_FAST+8);

            //  b[z] = (S[z]>>8) & mask;
            __m512i _bv1 = _mm512_and_si512(_mm512_srli_epi32(_Sv1,8), _maskv);
            __m512i _bv2 = _mm512_and_si512(_mm512_srli_epi32(_Sv2,8), _maskv);

            //  s[z] = S[z] & 0xff;
            __m512i _sv1 = _mm512_and_si512(_Sv1, _mm512_set1_epi32(0xff));
            __m512i _sv2 = _mm512_and_si512(_Sv2, _mm512_set1_epi32(0xff));

            // No freq==4096 wrap-around fixup is needed here: the FAST
            // table's precision is small enough that f always fits.

            //  R[z] = f[z] * (R[z] >> TF_SHIFT_O1) + b[z];
            _Rv1 = _mm512_add_epi32(
                       _mm512_mullo_epi32(
                           _mm512_srli_epi32(_Rv1,TF_SHIFT_O1_FAST),
                           _fv1), _bv1);
            _Rv2 = _mm512_add_epi32(
                       _mm512_mullo_epi32(
                           _mm512_srli_epi32(_Rv2,TF_SHIFT_O1_FAST),
                           _fv2), _bv2);

            //for (z = 0; z < NX; z++) lN[z] = c[z];
            _Lv1 = _sv1;
            _Lv2 = _sv2;

            // RansDecRenorm(&R[z], &ptr);
            // Same biased-compare renormalisation as the TF_SHIFT_O1 loop.
            __m512i _renorm_mask1 = _mm512_xor_si512(_Rv1,
                                        _mm512_set1_epi32(0x80000000));
            __m512i _renorm_mask2 = _mm512_xor_si512(_Rv2,
                                        _mm512_set1_epi32(0x80000000));

            int _imask1 =_mm512_cmpgt_epi32_mask
                (_mm512_set1_epi32(RANS_BYTE_L-0x80000000), _renorm_mask1);
            int _imask2 = _mm512_cmpgt_epi32_mask
                (_mm512_set1_epi32(RANS_BYTE_L-0x80000000), _renorm_mask2);

            __m512i renorm_words1 = _mm512_cvtepu16_epi32
                (_mm256_loadu_si256((const __m256i *)sp));
            sp += _mm_popcnt_u32(_imask1);

            __m512i renorm_words2 = _mm512_cvtepu16_epi32
                (_mm256_loadu_si256((const __m256i *)sp));
            sp += _mm_popcnt_u32(_imask2);

            __m512i _renorm_vals1 =
                _mm512_maskz_expand_epi32(_imask1, renorm_words1);
            __m512i _renorm_vals2 =
                _mm512_maskz_expand_epi32(_imask2, renorm_words2);

            _Rv1 = _mm512_mask_slli_epi32(_Rv1, _imask1, _Rv1, 16);
            _Rv2 = _mm512_mask_slli_epi32(_Rv2, _imask2, _Rv2, 16);

            _Rv1 = _mm512_add_epi32(_Rv1, _renorm_vals1);
            _Rv2 = _mm512_add_epi32(_Rv2, _renorm_vals2);

#ifdef TBUF8
            _mm_storeu_si128((__m128i *)(&u.tbuf64[tidx][0]),
                             _mm512_cvtepi32_epi8(_Sv1)); // or _sv1?
            _mm_storeu_si128((__m128i *)(&u.tbuf64[tidx][2]),
                             _mm512_cvtepi32_epi8(_Sv2));
#else
            _mm512_storeu_si512((__m512i *)(&tbuf[tidx][ 0]), _sv1);
            _mm512_storeu_si512((__m512i *)(&tbuf[tidx][16]), _sv2);
#endif

            iN[0]++;
            if (++tidx == 32) {
                iN[0]-=32;
#ifdef TBUF8
                transpose_and_copy(out, iN, u.tbuf);
#else
                transpose_and_copy_avx512(out, iN, tbuf);
#endif
                tidx = 0;
            }
        }
        isz4 += 64;

        STORE512(_Rv, R);
        STORE512(_Lv, lN);
        ptr = (uint8_t *)sp;

        // Flush any partially filled staging tile column-by-column.
        if (1) {
            iN[0]-=tidx;
            int T;
            for (z = 0; z < NX; z++)
                for (T = 0; T < tidx; T++)
#ifdef TBUF8
                    out[iN[z]++] = u.tbuf[T][z];
#else
                    out[iN[z]++] = tbuf[T][z];
#endif
        }

        // Scalar version for close to the end of in[] array so we don't
        // do SIMD loads beyond the end of the buffer
        for (; iN[0] < isz4;) {
            for (z = 0; z < NX; z++) {
                uint32_t m = R[z] & ((1u<<TF_SHIFT_O1_FAST)-1);
                uint32_t S = s3F[lN[z]][m];
                unsigned char c = S & 0xff;
                out[iN[z]++] = c;
                R[z] = (S>>(TF_SHIFT_O1_FAST+8)) * (R[z]>>TF_SHIFT_O1_FAST) +
                    ((S>>8) & ((1u<<TF_SHIFT_O1_FAST)-1));
                RansDecRenormSafe(&R[z], &ptr, ptr_end);
                lN[z] = c;
            }
        }

        // Remainder: out_sz may not divide by NX; the last lane decodes
        // the leftover bytes on its own.
        z = NX-1;
        for (; iN[z] < out_sz; ) {
            uint32_t m = R[z] & ((1u<<TF_SHIFT_O1_FAST)-1);
            uint32_t S = s3F[lN[z]][m];
            unsigned char c = S & 0xff;
            out[iN[z]++] = c;
            R[z] = (S>>(TF_SHIFT_O1_FAST+8)) * (R[z]>>TF_SHIFT_O1_FAST) +
                ((S>>8) & ((1u<<TF_SHIFT_O1_FAST)-1));
            RansDecRenormSafe(&R[z], &ptr, ptr_end);
            lN[z] = c;
        }
    }

    htscodecs_tls_free(s3);
    return out;

 err:
    htscodecs_tls_free(s3);
    free(out_free);
    free(c_freq);

    return NULL;
}
1189
#else  // HAVE_AVX512
1190
// Prevent "empty translation unit" errors when building without AVX512
1191
const char *rANS_static32x16pr_avx512_disabled = "No AVX512";
1192
#endif // HAVE_AVX512