Coverage Report

Created: 2023-01-17 06:24

/src/htslib/htscodecs/htscodecs/rANS_static32x16pr_avx512.c
All executable lines in this file have an execution count of 0 (entirely uncovered).
/*
 * Copyright (c) 2017-2021 Genome Research Ltd.
 * Author(s): James Bonfield
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *    1. Redistributions of source code must retain the above copyright notice,
 *       this list of conditions and the following disclaimer.
 *
 *    2. Redistributions in binary form must reproduce the above
 *       copyright notice, this list of conditions and the following
 *       disclaimer in the documentation and/or other materials provided
 *       with the distribution.
 *
 *    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
 *       Institute nor the names of its contributors may be used to endorse
 *       or promote products derived from this software without specific
 *       prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
 * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
 * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * This is an AVX512 implementation of the 32-way interleaved 16-bit rANS.
 * It contains both order-0 and order-1 implementations.
 */
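
/*
 * Reference sketch (not part of the build) of the scalar rANS step that
 * each of the 32 interleaved lanes below performs.  Frequencies sum to
 * 1<<TF_SHIFT and RANS_BYTE_L is the renormalisation lower bound:
 *
 *     // Encode symbol s into state R, emitting 16-bit words downwards
 *     if (R >= x_max[s]) {    // renormalise
 *         *--ptr16 = R & 0xffff;
 *         R >>= 16;
 *     }
 *     uint32_t q = (uint32_t)(((uint64_t)R * rcp_freq[s]) >> rcp_shift[s]);
 *     R += bias[s] + q * cmpl_freq[s];
 *
 *     // Decode one symbol from state R
 *     uint32_t S = s3[R & ((1u<<TF_SHIFT)-1)];  // packed freq/bias/symbol
 *     R = (S>>(TF_SHIFT+8)) * (R>>TF_SHIFT) + ((S>>8) & ((1u<<TF_SHIFT)-1));
 *     if (R < RANS_BYTE_L)    // renormalise
 *         R = (R<<16) | *sp++;
 *
 * The SIMD code below evaluates this for 32 independent states at once,
 * split across two 16-lane ZMM registers.
 */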

#include "config.h"

#ifdef HAVE_AVX512

#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <limits.h>
#include <x86intrin.h>

#include "rANS_word.h"
#include "rANS_static4x16.h"
#define ROT32_SIMD
#include "rANS_static16_int.h"
#include "varint.h"
#include "utils.h"

unsigned char *rans_compress_O0_32x16_avx512(unsigned char *in,
                                             unsigned int in_size,
                                             unsigned char *out,
                                             unsigned int *out_size) {
    unsigned char *cp, *out_end;
    RansEncSymbol syms[256];
    RansState ransN[32] __attribute__((aligned(64)));
    uint8_t* ptr;
    uint32_t F[256+MAGIC] = {0};
    int i, j, tab_size = 0, x, z;
    // -20 for order/size/meta
    uint32_t bound = rans_compress_bound_4x16(in_size,0)-20;

    if (!out) {
        *out_size = bound;
        out = malloc(*out_size);
    }
    if (!out || bound > *out_size)
        return NULL;

    // If "out" isn't word aligned, tweak out_end/ptr to ensure it is.
    // We already added more room in bound to allow for this.
    if (((size_t)out)&1)
        bound--;
    ptr = out_end = out + bound;

    if (in_size == 0)
        goto empty;

    // Compute statistics
    hist8(in, in_size, F);

    // Normalise so the frequencies sum to a power of 2
    uint32_t fsum = in_size;
    uint32_t max_val = round2(fsum);
    if (max_val > TOTFREQ)
        max_val = TOTFREQ;

    if (normalise_freq(F, fsum, max_val) < 0)
        return NULL;
    fsum=max_val;

    cp = out;
    cp += encode_freq(cp, F);
    tab_size = cp-out;
    //write(2, out+4, cp-(out+4));

    if (normalise_freq(F, fsum, TOTFREQ) < 0)
        return NULL;

    // Encode statistics and build lookup tables for SIMD encoding.
    uint32_t SB[256], SA[256], SD[256], SC[256];
    for (x = j = 0; j < 256; j++) {
        if (F[j]) {
            RansEncSymbolInit(&syms[j], x, F[j], TF_SHIFT);
            SB[j] = syms[j].x_max;
            SA[j] = syms[j].rcp_freq;
            SD[j] = (syms[j].cmpl_freq<<0) | ((syms[j].rcp_shift-32)<<16);
            SC[j] = syms[j].bias;
            x += F[j];
        }
    }

    for (z = 0; z < 32; z++)
      RansEncInit(&ransN[z]);

    z = i = in_size&(32-1);
    while (z-- > 0)
      RansEncPutSymbol(&ransN[z], &ptr, &syms[in[in_size-(i-z)]]);

#define LOAD512(a,b)                                     \
    __m512i a##1 = _mm512_load_si512((__m512i *)&b[0]); \
    __m512i a##2 = _mm512_load_si512((__m512i *)&b[16]);

#define STORE512(a,b)                           \
    _mm512_store_si512((__m512i *)&b[0], a##1); \
    _mm512_store_si512((__m512i *)&b[16], a##2);
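
// LOAD512/STORE512 move the 32 interleaved rANS states between memory and
// a pair of 16-lane ZMM registers (lanes 0-15 and 16-31 respectively).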

    LOAD512(Rv, ransN);

    uint16_t *ptr16 = (uint16_t *)ptr;
    for (i=(in_size &~(32-1)); i>0; i-=32) {
        uint8_t *c = &in[i-32];

        // GATHER versions
        // Much faster now we have an efficient loadu mechanism in place,
        // BUT...
        // Try this for the avx2 variant too?  It may be a better way to
        // populate the mm256 regs for a mix of avx2 and avx512 opcodes.
        __m256i c12 = _mm256_loadu_si256((__m256i const *)c);
        __m512i c1 = _mm512_cvtepu8_epi32(_mm256_extracti128_si256(c12,0));
        __m512i c2 = _mm512_cvtepu8_epi32(_mm256_extracti128_si256(c12,1));
#define SET512(a,b) \
        __m512i a##1 = _mm512_i32gather_epi32(c1, b, 4); \
        __m512i a##2 = _mm512_i32gather_epi32(c2, b, 4)

        SET512(xmax, SB);

        uint16_t gt_mask1 = _mm512_cmpgt_epi32_mask(Rv1, xmax1);
        int pc1 = _mm_popcnt_u32(gt_mask1);
        __m512i Rp1 = _mm512_and_si512(Rv1, _mm512_set1_epi32(0xffff));
        __m512i Rp2 = _mm512_and_si512(Rv2, _mm512_set1_epi32(0xffff));
        uint16_t gt_mask2 = _mm512_cmpgt_epi32_mask(Rv2, xmax2);
        SET512(SDv,  SD);
        int pc2 = _mm_popcnt_u32(gt_mask2);

        Rp1 = _mm512_maskz_compress_epi32(gt_mask1, Rp1);
        Rp2 = _mm512_maskz_compress_epi32(gt_mask2, Rp2);

        _mm512_mask_cvtepi32_storeu_epi16(ptr16-pc2, (1<<pc2)-1, Rp2);
        ptr16 -= pc2;
        _mm512_mask_cvtepi32_storeu_epi16(ptr16-pc1, (1<<pc1)-1, Rp1);
        ptr16 -= pc1;
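
        // Scalar equivalent of the masked renormalisation above, for
        // reference:
        //     for (z = 31; z >= 0; z--) {
        //         if (ransN[z] >= x_max[z]) {
        //             *--ptr16 = ransN[z] & 0xffff;
        //             ransN[z] >>= 16;
        //         }
        //     }
        // maskz_compress packs just the renormalising lanes together, so
        // one masked store emits the same words the scalar loop would.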

        SET512(rfv,  SA);
        Rv1 = _mm512_mask_srli_epi32(Rv1, gt_mask1, Rv1, 16);
        Rv2 = _mm512_mask_srli_epi32(Rv2, gt_mask2, Rv2, 16);

        // interleaved form of this, helps on icc a bit
        //rfv1 = _mm512_mulhi_epu32(Rv1, rfv1);
        //rfv2 = _mm512_mulhi_epu32(Rv2, rfv2);

        // Alternatives here:
        // SHIFT right/left instead of AND: (very marginally slower)
        //   rf1_hm = _mm512_and_epi32(
        //           rf1_hm, _mm512_set1_epi64((uint64_t)0xffffffff00000000));
        // vs
        //   rf1_hm = _mm512_srli_epi64(rf1_hm, 32);
        //   rf1_hm = _mm512_slli_epi64(rf1_hm, 32);
        __m512i rf1_hm = _mm512_mul_epu32(_mm512_srli_epi64(Rv1,  32),
                                          _mm512_srli_epi64(rfv1, 32));
        __m512i rf2_hm = _mm512_mul_epu32(_mm512_srli_epi64(Rv2,  32),
                                          _mm512_srli_epi64(rfv2, 32));
        __m512i rf1_lm = _mm512_srli_epi64(_mm512_mul_epu32(Rv1, rfv1), 32);
        __m512i rf2_lm = _mm512_srli_epi64(_mm512_mul_epu32(Rv2, rfv2), 32);
        rf1_hm = _mm512_and_epi32(rf1_hm,
                                  _mm512_set1_epi64((uint64_t)0xffffffff<<32));
        rf2_hm = _mm512_and_epi32(rf2_hm,
                                  _mm512_set1_epi64((uint64_t)0xffffffff<<32));
        rfv1 = _mm512_or_epi32(rf1_lm, rf1_hm);
        rfv2 = _mm512_or_epi32(rf2_lm, rf2_hm);

        // Or a pure masked blend approach, sadly slower.
        // rfv1 = _mm512_mask_blend_epi32(0x5555,
        //            _mm512_mul_epu32(
        //                _mm512_srli_epi64(Rv1, 32),
        //                _mm512_srli_epi64(rfv1, 32)),
        //            _mm512_srli_epi64(
        //                _mm512_mul_epu32(Rv1, rfv1),
        //                32));
        // rfv2 = _mm512_mask_blend_epi32(0xaaaa,
        //            _mm512_srli_epi64(
        //                _mm512_mul_epu32(Rv2, rfv2),
        //                32),
        //            _mm512_mul_epu32(
        //                _mm512_srli_epi64(Rv2, 32),
        //                _mm512_srli_epi64(rfv2, 32)));
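
        // All of the above emulates an unsigned 32-bit "high multiply"
        // (there is no _mm512_mulhi_epu32 intrinsic): rfv now holds the
        // top 32 bits of R * rcp_freq per lane.  With the variable shift
        // below, this is the usual reciprocal division
        //     q = (uint32_t)(((uint64_t)R * rcp_freq) >> rcp_shift);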

        SET512(biasv, SC);
        __m512i shiftv1 = _mm512_srli_epi32(SDv1, 16);
        __m512i shiftv2 = _mm512_srli_epi32(SDv2, 16);

        __m512i qv1 = _mm512_srlv_epi32(rfv1, shiftv1);
        __m512i qv2 = _mm512_srlv_epi32(rfv2, shiftv2);

        qv1 = _mm512_mullo_epi32(qv1,
                  _mm512_and_si512(SDv1,_mm512_set1_epi32(0xffff)));
        qv1 = _mm512_add_epi32(qv1, biasv1);
        Rv1 = _mm512_add_epi32(Rv1, qv1);

        qv2 = _mm512_mullo_epi32(qv2,
                  _mm512_and_si512(SDv2, _mm512_set1_epi32(0xffff)));
        qv2 = _mm512_add_epi32(qv2, biasv2);
        Rv2 = _mm512_add_epi32(Rv2, qv2);
    }
    ptr = (uint8_t *)ptr16;
    STORE512(Rv, ransN);

    for (z = 32-1; z >= 0; z--)
        RansEncFlush(&ransN[z], &ptr);

 empty:
    // Finalise block size and return it
    *out_size = (out_end - ptr) + tab_size;

    memmove(out + tab_size, ptr, out_end-ptr);

    return out;
}

unsigned char *rans_uncompress_O0_32x16_avx512(unsigned char *in,
                                               unsigned int in_size,
                                               unsigned char *out,
                                               unsigned int out_sz) {
    if (in_size < 32*4) // 32 states, 4 bytes each, at least
        return NULL;

    if (out_sz >= INT_MAX)
        return NULL; // protect against some overflow cases

    /* Load in the static tables */
    unsigned char *cp = in, *out_free = NULL;
    unsigned char *cp_end = in + in_size;
    int i;
    uint32_t s3[TOTFREQ]  __attribute__((aligned(64))); // For TF_SHIFT <= 12

    if (!out)
        out_free = out = malloc(out_sz);
    if (!out)
        return NULL;

    // Precompute reverse lookup of frequency.
    uint32_t F[256] = {0}, fsum;
    int fsz = decode_freq(cp, cp_end, F, &fsum);
    if (!fsz)
        goto err;
    cp += fsz;

    normalise_freq_shift(F, fsum, TOTFREQ);

    // Build symbols; fixme, do as part of decode, see the _d variant
    if (rans_F_to_s3(F, TF_SHIFT, s3))
        goto err;

    if (cp_end - cp < 32 * 4)
        goto err;

    int z;
    RansState Rv[32] __attribute__((aligned(64)));
    for (z = 0; z < 32; z++) {
        RansDecInit(&Rv[z], &cp);
        if (Rv[z] < RANS_BYTE_L)
            goto err;
    }

    uint16_t *sp = (uint16_t *)cp;

    int out_end = (out_sz&~(32-1));
    const uint32_t mask = (1u << TF_SHIFT)-1;

    __m512i maskv = _mm512_set1_epi32(mask); // set mask in all lanes
    __m512i R1 = _mm512_load_epi32(&Rv[0]);
    __m512i R2 = _mm512_load_epi32(&Rv[16]);

    // Start of the first loop iteration; this work is repeated at the end
    // of the loop body for the next cycle, so we can hide some of the
    // instruction latency.
    __m512i masked1 = _mm512_and_epi32(R1, maskv);
    __m512i masked2 = _mm512_and_epi32(R2, maskv);
    __m512i S1 = _mm512_i32gather_epi32(masked1, (int *)s3, sizeof(*s3));
    __m512i S2 = _mm512_i32gather_epi32(masked2, (int *)s3, sizeof(*s3));
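
    // Each s3[] entry packs (freq << (TF_SHIFT+8)) | (bias << 8) | symbol,
    // so a single gather fetches everything needed to advance one lane.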

    uint8_t overflow[64+64] = {0};
    for (i=0; i < out_end; i+=32) {
      //for (z = 0; z < 16; z++) {

      // Protect against running off the end of the in buffer.
      // We copy it to a worst-case local buffer when near the end.
      if ((uint8_t *)sp+64 > cp_end) {
        memmove(overflow, sp, cp_end - (uint8_t *)sp);
        sp = (uint16_t *)overflow;
        cp_end = overflow + sizeof(overflow);
      }

      //uint32_t S = s3[R[z] & mask];
      __m512i renorm_words1 = _mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)sp)); // next 16 words

      //uint16_t f = S>>(TF_SHIFT+8), b = (S>>8) & mask;
      __m512i f1 = _mm512_srli_epi32(S1, TF_SHIFT+8);
      __m512i f2 = _mm512_srli_epi32(S2, TF_SHIFT+8);
      __m512i b1 = _mm512_and_epi32(_mm512_srli_epi32(S1, 8), maskv);
      __m512i b2 = _mm512_and_epi32(_mm512_srli_epi32(S2, 8), maskv);

      //R[z] = f * (R[z] >> TF_SHIFT) + b;
      // approx 10 cycle latency on mullo.
      R1 = _mm512_add_epi32(
               _mm512_mullo_epi32(
                   _mm512_srli_epi32(R1, TF_SHIFT), f1), b1);
      R2 = _mm512_add_epi32(
               _mm512_mullo_epi32(
                   _mm512_srli_epi32(R2, TF_SHIFT), f2), b2);

      // Renormalisation; this is the interesting part:
      __mmask16 renorm_mask1, renorm_mask2;
      renorm_mask1=_mm512_cmplt_epu32_mask(R1, _mm512_set1_epi32(RANS_BYTE_L));
      renorm_mask2=_mm512_cmplt_epu32_mask(R2, _mm512_set1_epi32(RANS_BYTE_L));
      // advance by however many words we actually read
      sp += _mm_popcnt_u32(renorm_mask1);
      __m512i renorm_words2 = _mm512_cvtepu16_epi32(_mm256_loadu_si256(
                                      (const __m256i *)sp));

      // select masked only
      __m512i renorm_vals1, renorm_vals2;
      renorm_vals1 = _mm512_maskz_expand_epi32(renorm_mask1, renorm_words1);
      renorm_vals2 = _mm512_maskz_expand_epi32(renorm_mask2, renorm_words2);
      // shift & add selected words
      R1 = _mm512_mask_slli_epi32(R1, renorm_mask1, R1, 16);
      R2 = _mm512_mask_slli_epi32(R2, renorm_mask2, R2, 16);
      R1 = _mm512_add_epi32(R1, renorm_vals1);
      R2 = _mm512_add_epi32(R2, renorm_vals2);
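
      // Scalar equivalent, per lane:
      //     if (R[z] < RANS_BYTE_L)
      //         R[z] = (R[z] << 16) | *sp++;
      // maskz_expand distributes the sequentially read words to just the
      // lanes that renormalise.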

      // For the start of the next loop iteration.  This has been moved here
      // (and duplicated before the loop starts) so we can do something
      // with the latency period of the gather, such as finishing up the
      // renorm offset and writing the results.
      __m512i S1_ = S1; // temporary copy for use in out[]=S later
      __m512i S2_ = S2;

      masked1 = _mm512_and_epi32(R1, maskv);
      masked2 = _mm512_and_epi32(R2, maskv);
      // Gather is the slow bit (half the total time) - 30 cycle latency.
      S1 = _mm512_i32gather_epi32(masked1, (int *)s3, sizeof(*s3));
      S2 = _mm512_i32gather_epi32(masked2, (int *)s3, sizeof(*s3));

      // advance by however many words we actually read
      sp += _mm_popcnt_u32(renorm_mask2);

      //out[i+z] = S;
      _mm_storeu_si128((__m128i *)(out+i),    _mm512_cvtepi32_epi8(S1_));
      _mm_storeu_si128((__m128i *)(out+i+16), _mm512_cvtepi32_epi8(S2_));
    }

    _mm512_store_epi32(&Rv[ 0], R1);
    _mm512_store_epi32(&Rv[16], R2);

    // Decode the remaining out_sz % 32 bytes using the scalar states.
    for (z = out_sz & (32-1); z-- > 0; )
      out[out_end + z] = s3[Rv[z] & mask];

    return out;

 err:
    free(out_free);
    return NULL;
}

#define TBUF8
#ifdef TBUF8
// 15% quicker overall O1 decode now due to rot32_simd below.

// NB: This uses AVX2 though and we could rewrite using AVX512 for
// further speed gains.
static inline void transpose_and_copy(uint8_t *out, int iN[32],
                                      uint8_t t[32][32]) {
//  int z;
//  for (z = 0; z < 32; z++) {
//      int k;
//      for (k = 0; k < 32; k++)
//          out[iN[z]+k] = t[k][z];
//      iN[z] += 32;
//  }

    rot32_simd(t, out, iN);
}
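
// Why a transpose is needed: each SIMD decode iteration below produces one
// byte for each of the 32 interleaved output streams, so bytes are buffered
// row-by-row into a 32x32 tile and then written out column-by-column to the
// 32 per-stream output positions.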

#else
// Implemented using AVX512 gathers.
// This is faster than a naive scalar implementation, but doesn't beat the
// AVX2 vectorised 32x32 transpose function.
static inline void transpose_and_copy_avx512(uint8_t *out, int iN[32],
                                             uint32_t t32[32][32]) {
    int z;
//  for (z = 0; z < 32; z++) {
//      int k;
//      for (k = 0; k < 32; k++)
//          out[iN[z]+k] = t32[k][z];
//      iN[z] += 32;
//  }

    __m512i v1 = _mm512_set_epi32(15, 14, 13, 12, 11, 10,  9,  8,
                                   7,  6,  5,  4,  3,  2,  1,  0);
    v1 = _mm512_slli_epi32(v1, 5);

    for (z = 0; z < 32; z++) {
        __m512i t1 = _mm512_i32gather_epi32(v1, &t32[ 0][z], 4);
        __m512i t2 = _mm512_i32gather_epi32(v1, &t32[16][z], 4);
        _mm_storeu_si128((__m128i*)(&out[iN[z]   ]), _mm512_cvtepi32_epi8(t1));
        _mm_storeu_si128((__m128i*)(&out[iN[z]+16]), _mm512_cvtepi32_epi8(t2));
        iN[z] += 32;
    }
}
#endif // TBUF8

unsigned char *rans_compress_O1_32x16_avx512(unsigned char *in,
                                             unsigned int in_size,
                                             unsigned char *out,
                                             unsigned int *out_size) {
    unsigned char *cp, *out_end, *out_free = NULL;
    unsigned int tab_size;
    uint32_t bound = rans_compress_bound_4x16(in_size,1)-20;
    int z;
    RansState ransN[32] __attribute__((aligned(64)));

    if (in_size < 32) // force O0 instead
        return NULL;

    if (!out) {
        *out_size = bound;
        out = malloc(*out_size);
    }
    if (!out || bound > *out_size)
        return NULL;

    if (((size_t)out)&1)
        bound--;
    out_end = out + bound;

    RansEncSymbol (*syms)[256] = htscodecs_tls_alloc(256 * (sizeof(*syms)));
    if (!syms) {
        free(out_free);
        return NULL;
    }

    cp = out;
    int shift = encode_freq1(in, in_size, 32, syms, &cp);
    if (shift < 0) {
        free(out_free);
        htscodecs_tls_free(syms);
        return NULL;
    }
    tab_size = cp - out;

    for (z = 0; z < 32; z++)
      RansEncInit(&ransN[z]);

    uint8_t* ptr = out_end;

    int iN[32], isz4 = in_size/32;
    for (z = 0; z < 32; z++)
        iN[z] = (z+1)*isz4-2;

    uint32_t lN[32] __attribute__((aligned(64)));
    for (z = 0; z < 32; z++)
        lN[z] = in[iN[z]+1];

    // Deal with the remainder
    z = 32-1;
    lN[z] = in[in_size-1];
    for (iN[z] = in_size-2; iN[z] > 32*isz4-2; iN[z]--) {
        unsigned char c = in[iN[z]];
        RansEncPutSymbol(&ransN[z], &ptr, &syms[c][lN[z]]);
        lN[z] = c;
    }

    LOAD512(Rv, ransN);

    uint16_t *ptr16 = (uint16_t *)ptr;
    __m512i last2 = _mm512_set_epi32(lN[31], lN[30], lN[29], lN[28],
                                     lN[27], lN[26], lN[25], lN[24],
                                     lN[23], lN[22], lN[21], lN[20],
                                     lN[19], lN[18], lN[17], lN[16]);
    __m512i last1 = _mm512_set_epi32(lN[15], lN[14], lN[13], lN[12],
                                     lN[11], lN[10], lN[ 9], lN[ 8],
                                     lN[ 7], lN[ 6], lN[ 5], lN[ 4],
                                     lN[ 3], lN[ 2], lN[ 1], lN[ 0]);

    __m512i iN2 = _mm512_set_epi32(iN[31], iN[30], iN[29], iN[28],
                                   iN[27], iN[26], iN[25], iN[24],
                                   iN[23], iN[22], iN[21], iN[20],
                                   iN[19], iN[18], iN[17], iN[16]);
    __m512i iN1 = _mm512_set_epi32(iN[15], iN[14], iN[13], iN[12],
                                   iN[11], iN[10], iN[ 9], iN[ 8],
                                   iN[ 7], iN[ 6], iN[ 5], iN[ 4],
                                   iN[ 3], iN[ 2], iN[ 1], iN[ 0]);

    __m512i c1 = _mm512_i32gather_epi32(iN1, in, 1);
    __m512i c2 = _mm512_i32gather_epi32(iN2, in, 1);

    for (; iN[0] >= 0; iN[0]--) {
        // Note, consider doing the same approach for the AVX2 encoder.
        // Maybe we can also get gather working well there?
        // Gather here is still a major latency bottleneck, consuming
        // around 40% of CPU cycles overall.

        // FIXME: maybe we need to cope with in[31] read overflow
        // on loop cycles 0, 1, 2 where gather reads 32 bits instead of
        // 8 bits.  Use set instead there on c2?
        c1 = _mm512_and_si512(c1, _mm512_set1_epi32(0xff));
        c2 = _mm512_and_si512(c2, _mm512_set1_epi32(0xff));

        // index into syms[0][0] array, used for x_max, rcp_freq, and bias
        __m512i vidx1 = _mm512_slli_epi32(c1, 8);
        __m512i vidx2 = _mm512_slli_epi32(c2, 8);
        vidx1 = _mm512_add_epi32(vidx1, last1);
        vidx2 = _mm512_add_epi32(vidx2, last2);
        vidx1 = _mm512_slli_epi32(vidx1, 2);
        vidx2 = _mm512_slli_epi32(vidx2, 2);
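
        // vidx[z] is now ((c[z]<<8) + last[z]) << 2.  With the gather
        // scale of 4 used in SET512x below, this is a byte offset of
        // ((c<<8)+last) * 16, i.e. the start of the 16-byte RansEncSymbol
        // syms[c[z]][last[z]]; the base pointer selects which field to
        // fetch.  (Assumes sizeof(RansEncSymbol) == 16.)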

        // ------------------------------------------------------------
        //      for (z = NX-1; z >= 0; z--) {
        //          if (ransN[z] >= x_max[z]) {
        //              *--ptr16 = ransN[z] & 0xffff;
        //              ransN[z] >>= 16;
        //          }
        //      }

#define SET512x(a,x) \
        __m512i a##1 = _mm512_i32gather_epi32(vidx1, &syms[0][0].x, 4); \
        __m512i a##2 = _mm512_i32gather_epi32(vidx2, &syms[0][0].x, 4)

        // Start of next loop, moved here to remove latency.
        // last[z] = c[z]
        // iN[z]--
        // c[z] = in[iN[z]]
        last1 = c1;
        last2 = c2;
        iN1 = _mm512_sub_epi32(iN1, _mm512_set1_epi32(1));
        iN2 = _mm512_sub_epi32(iN2, _mm512_set1_epi32(1));
        c1 = _mm512_i32gather_epi32(iN1, in, 1);
        c2 = _mm512_i32gather_epi32(iN2, in, 1);

        SET512x(xmax, x_max); // high latency

        uint16_t gt_mask1 = _mm512_cmpgt_epi32_mask(Rv1, xmax1);
        int pc1 = _mm_popcnt_u32(gt_mask1);
        __m512i Rp1 = _mm512_and_si512(Rv1, _mm512_set1_epi32(0xffff));
        __m512i Rp2 = _mm512_and_si512(Rv2, _mm512_set1_epi32(0xffff));
        uint16_t gt_mask2 = _mm512_cmpgt_epi32_mask(Rv2, xmax2);
        SET512x(SDv, cmpl_freq); // good
        int pc2 = _mm_popcnt_u32(gt_mask2);

        Rp1 = _mm512_maskz_compress_epi32(gt_mask1, Rp1);
        Rp2 = _mm512_maskz_compress_epi32(gt_mask2, Rp2);

        _mm512_mask_cvtepi32_storeu_epi16(ptr16-pc2, (1<<pc2)-1, Rp2);
        ptr16 -= pc2;
        _mm512_mask_cvtepi32_storeu_epi16(ptr16-pc1, (1<<pc1)-1, Rp1);
        ptr16 -= pc1;

        Rv1 = _mm512_mask_srli_epi32(Rv1, gt_mask1, Rv1, 16);
        Rv2 = _mm512_mask_srli_epi32(Rv2, gt_mask2, Rv2, 16);

        // ------------------------------------------------------------
        // uint32_t q = (uint32_t) (((uint64_t)ransN[z] * rcp_freq[z])
        //                          >> rcp_shift[z]);
        // ransN[z] = ransN[z] + bias[z] + q * cmpl_freq[z];
        SET512x(rfv, rcp_freq); // good-ish

        __m512i rf1_hm = _mm512_mul_epu32(_mm512_srli_epi64(Rv1, 32),
                                          _mm512_srli_epi64(rfv1, 32));
        __m512i rf2_hm = _mm512_mul_epu32(_mm512_srli_epi64(Rv2, 32),
                                          _mm512_srli_epi64(rfv2, 32));
        __m512i rf1_lm = _mm512_srli_epi64(_mm512_mul_epu32(Rv1, rfv1), 32);
        __m512i rf2_lm = _mm512_srli_epi64(_mm512_mul_epu32(Rv2, rfv2), 32);

        const __m512i top32 = _mm512_set1_epi64((uint64_t)0xffffffff00000000);
        rf1_hm = _mm512_and_epi32(rf1_hm, top32);
        rf2_hm = _mm512_and_epi32(rf2_hm, top32);
        rfv1 = _mm512_or_epi32(rf1_lm, rf1_hm);
        rfv2 = _mm512_or_epi32(rf2_lm, rf2_hm);

        SET512x(biasv, bias); // good
        __m512i shiftv1 = _mm512_srli_epi32(SDv1, 16);
        __m512i shiftv2 = _mm512_srli_epi32(SDv2, 16);

        shiftv1 = _mm512_sub_epi32(shiftv1, _mm512_set1_epi32(32));
        shiftv2 = _mm512_sub_epi32(shiftv2, _mm512_set1_epi32(32));

        __m512i qv1 = _mm512_srlv_epi32(rfv1, shiftv1);
        __m512i qv2 = _mm512_srlv_epi32(rfv2, shiftv2);

        const __m512i bot16 = _mm512_set1_epi32(0xffff);
        qv1 = _mm512_mullo_epi32(qv1, _mm512_and_si512(SDv1, bot16));
        qv2 = _mm512_mullo_epi32(qv2, _mm512_and_si512(SDv2, bot16));

        qv1 = _mm512_add_epi32(qv1, biasv1);
        Rv1 = _mm512_add_epi32(Rv1, qv1);

        qv2 = _mm512_add_epi32(qv2, biasv2);
        Rv2 = _mm512_add_epi32(Rv2, qv2);
    }

    STORE512(Rv, ransN);
    STORE512(last, lN);

    ptr = (uint8_t *)ptr16;

    for (z = 32-1; z>=0; z--)
        RansEncPutSymbol(&ransN[z], &ptr, &syms[0][lN[z]]);

    for (z = 32-1; z >= 0; z--)
        RansEncFlush(&ransN[z], &ptr);

    // Finalise block size and return it
    *out_size = (out_end - ptr) + tab_size;

//    cp = out;
//    *cp++ = (in_size>> 0) & 0xff;
//    *cp++ = (in_size>> 8) & 0xff;
//    *cp++ = (in_size>>16) & 0xff;
//    *cp++ = (in_size>>24) & 0xff;

    memmove(out + tab_size, ptr, out_end-ptr);

    htscodecs_tls_free(syms);
    return out;
}

#define NX 32
unsigned char *rans_uncompress_O1_32x16_avx512(unsigned char *in,
                                               unsigned int in_size,
                                               unsigned char *out,
                                               unsigned int out_sz) {
    if (in_size < NX*4) // NX states, 4 bytes each, at least
        return NULL;

    if (out_sz >= INT_MAX)
        return NULL; // protect against some overflow cases

    /* Load in the static tables */
    unsigned char *cp = in, *cp_end = in+in_size, *out_free = NULL;
    unsigned char *c_freq = NULL;

    uint32_t (*s3)[TOTFREQ_O1] = htscodecs_tls_alloc(256*TOTFREQ_O1*4);
    if (!s3)
        return NULL;
    uint32_t (*s3F)[TOTFREQ_O1_FAST] = (uint32_t (*)[TOTFREQ_O1_FAST])s3;

    if (!out)
        out_free = out = malloc(out_sz);

    if (!out)
        goto err;

    //fprintf(stderr, "out_sz=%d\n", out_sz);

    // Compressed header? If so, uncompress it.
    unsigned char *tab_end = NULL;
    unsigned char *c_freq_end = cp_end;
    unsigned int shift = *cp >> 4;
    if (*cp++ & 1) {
        uint32_t u_freq_sz, c_freq_sz;
        cp += var_get_u32(cp, cp_end, &u_freq_sz);
        cp += var_get_u32(cp, cp_end, &c_freq_sz);
        if (c_freq_sz >= cp_end - cp - 16)
            goto err;
        tab_end = cp + c_freq_sz;
        if (!(c_freq = rans_uncompress_O0_4x16(cp, c_freq_sz, NULL,
                                               u_freq_sz)))
            goto err;
        cp = c_freq;
        c_freq_end = c_freq + u_freq_sz;
    }

    // Decode the order-0 symbol list; this avoids having to store it in
    // the order-1 tables.
    cp += decode_freq1(cp, c_freq_end, shift, s3, s3F, NULL, NULL);

    if (tab_end)
        cp = tab_end;
    free(c_freq);
    c_freq = NULL;

    if (cp_end - cp < NX * 4)
        goto err;

    RansState R[NX] __attribute__((aligned(64)));
    uint8_t *ptr = cp, *ptr_end = in + in_size;
    int z;
    for (z = 0; z < NX; z++) {
        RansDecInit(&R[z], &ptr);
        if (R[z] < RANS_BYTE_L)
            goto err;
    }

    int isz4 = out_sz/NX;
    int iN[NX], lN[NX] __attribute__((aligned(64))) = {0};
    for (z = 0; z < NX; z++)
        iN[z] = z*isz4;

    uint16_t *sp = (uint16_t *)ptr;
    const uint32_t mask = (1u << shift)-1;

    __m512i _maskv  = _mm512_set1_epi32(mask);
    LOAD512(_Rv, R);
    LOAD512(_Lv, lN);

#ifdef TBUF8
    union {
        unsigned char tbuf[32][32];
        uint64_t tbuf64[32][4];
    } u;
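
    // tbuf64 aliases tbuf: each pair of 16-byte _mm512_cvtepi32_epi8
    // stores below fills one 32-byte row of tbuf.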
#else
    uint32_t tbuf[32][32];
#endif

    unsigned int tidx = 0;

    if (shift == TF_SHIFT_O1) {
        isz4 -= 64;
        for (; iN[0] < isz4 && (uint8_t *)sp+64 < ptr_end; ) {
            // m[z] = R[z] & mask;
            __m512i _masked1 = _mm512_and_si512(_Rv1, _maskv);
            __m512i _masked2 = _mm512_and_si512(_Rv2, _maskv);

            //  S[z] = s3[lN[z]][m[z]];
            _Lv1 = _mm512_slli_epi32(_Lv1, TF_SHIFT_O1);
            _Lv2 = _mm512_slli_epi32(_Lv2, TF_SHIFT_O1);

            _masked1 = _mm512_add_epi32(_masked1, _Lv1);
            _masked2 = _mm512_add_epi32(_masked2, _Lv2);

            // This gather is the biggest bottleneck
            __m512i _Sv1 = _mm512_i32gather_epi32(_masked1, (int *)&s3F[0][0],
                                                  sizeof(s3F[0][0]));
            __m512i _Sv2 = _mm512_i32gather_epi32(_masked2, (int *)&s3F[0][0],
                                                  sizeof(s3F[0][0]));

            //  f[z] = S[z]>>(TF_SHIFT_O1+8);
            __m512i _fv1 = _mm512_srli_epi32(_Sv1, TF_SHIFT_O1+8);
            __m512i _fv2 = _mm512_srli_epi32(_Sv2, TF_SHIFT_O1+8);

            //  b[z] = (S[z]>>8) & mask;
            __m512i _bv1 = _mm512_and_si512(_mm512_srli_epi32(_Sv1,8), _maskv);
            __m512i _bv2 = _mm512_and_si512(_mm512_srli_epi32(_Sv2,8), _maskv);

            //  s[z] = S[z] & 0xff;
            __m512i _sv1 = _mm512_and_si512(_Sv1, _mm512_set1_epi32(0xff));
            __m512i _sv2 = _mm512_and_si512(_Sv2, _mm512_set1_epi32(0xff));

            // A maximum frequency of 4096 doesn't fit in our s3 array,
            // as it's 12 bit + 12 bit + 8 bit.  It wraps around to zero.
            // (We don't have this issue for TOTFREQ_O1_FAST.)
            //
            // Solution 1 is to change the spec to forbid a freq of 4096.
            // An easy hack is to add an extra symbol so it sums correctly.
            // => 572 MB/s on q40 (deskpro).
            //
            // Solution 2, implemented here, is to look for the wrap around
            // and fix it.
            // => 556 MB/s on q40
            // Cope with the max freq of 4096; only a 3% hit.
            __m512i max_freq = _mm512_set1_epi32(TOTFREQ_O1);
            __m512i zero = _mm512_setzero_si512();
            __mmask16 cmp1 = _mm512_cmpeq_epi32_mask(_fv1, zero);
            __mmask16 cmp2 = _mm512_cmpeq_epi32_mask(_fv2, zero);
            _fv1 = _mm512_mask_blend_epi32(cmp1, _fv1, max_freq);
            _fv2 = _mm512_mask_blend_epi32(cmp2, _fv2, max_freq);

            //  R[z] = f[z] * (R[z] >> TF_SHIFT_O1) + b[z];
            _Rv1 = _mm512_add_epi32(
                       _mm512_mullo_epi32(
                           _mm512_srli_epi32(_Rv1,TF_SHIFT_O1), _fv1), _bv1);
            _Rv2 = _mm512_add_epi32(
                       _mm512_mullo_epi32(
                           _mm512_srli_epi32(_Rv2,TF_SHIFT_O1), _fv2), _bv2);

            //for (z = 0; z < NX; z++) lN[z] = c[z];
            _Lv1 = _sv1;
            _Lv2 = _sv2;

            // RansDecRenorm(&R[z], &ptr);
            __m512i _renorm_mask1 = _mm512_xor_si512(_Rv1,
                                        _mm512_set1_epi32(0x80000000));
            __m512i _renorm_mask2 = _mm512_xor_si512(_Rv2,
                                        _mm512_set1_epi32(0x80000000));

            int _imask1 = _mm512_cmpgt_epi32_mask
                (_mm512_set1_epi32(RANS_BYTE_L-0x80000000), _renorm_mask1);
            int _imask2 = _mm512_cmpgt_epi32_mask
                (_mm512_set1_epi32(RANS_BYTE_L-0x80000000), _renorm_mask2);
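
            // The xor by 0x80000000 biases both sides so that the signed
            // compare above performs an unsigned R[z] < RANS_BYTE_L test,
            // flagging the lanes which need renormalising.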

            __m512i renorm_words1 = _mm512_cvtepu16_epi32
                (_mm256_loadu_si256((const __m256i *)sp));
            sp += _mm_popcnt_u32(_imask1);

            __m512i renorm_words2 = _mm512_cvtepu16_epi32
                (_mm256_loadu_si256((const __m256i *)sp));
            sp += _mm_popcnt_u32(_imask2);

            __m512i _renorm_vals1 =
                _mm512_maskz_expand_epi32(_imask1, renorm_words1);
            __m512i _renorm_vals2 =
                _mm512_maskz_expand_epi32(_imask2, renorm_words2);

            _Rv1 = _mm512_mask_slli_epi32(_Rv1, _imask1, _Rv1, 16);
            _Rv2 = _mm512_mask_slli_epi32(_Rv2, _imask2, _Rv2, 16);

            _Rv1 = _mm512_add_epi32(_Rv1, _renorm_vals1);
            _Rv2 = _mm512_add_epi32(_Rv2, _renorm_vals2);

#ifdef TBUF8
            _mm_storeu_si128((__m128i *)(&u.tbuf64[tidx][0]),
                             _mm512_cvtepi32_epi8(_Sv1)); // or _sv1?
            _mm_storeu_si128((__m128i *)(&u.tbuf64[tidx][2]),
                             _mm512_cvtepi32_epi8(_Sv2));
#else
            _mm512_storeu_si512((__m512i *)(&tbuf[tidx][ 0]), _sv1);
            _mm512_storeu_si512((__m512i *)(&tbuf[tidx][16]), _sv2);
#endif

            iN[0]++;
            if (++tidx == 32) {
                iN[0]-=32;

                // We have tbuf[x][y], which we want to store in
                // memory as out[y][x] instead.  This is an unrolled
                // transposition.
#ifdef TBUF8
                transpose_and_copy(out, iN, u.tbuf);
#else
                transpose_and_copy_avx512(out, iN, tbuf);
#endif
                tidx = 0;
            }
        }
        isz4 += 64;

        STORE512(_Rv, R);
        STORE512(_Lv, lN);
        ptr = (uint8_t *)sp;

        if (1) {
            // Flush any partially filled tbuf rows.
            iN[0]-=tidx;
            int T;
            for (z = 0; z < NX; z++)
                for (T = 0; T < tidx; T++)
#ifdef TBUF8
                    out[iN[z]++] = u.tbuf[T][z];
#else
                    out[iN[z]++] = tbuf[T][z];
#endif
        }

        // Scalar version for use close to the end of the in[] array, so
        // we don't do SIMD loads beyond the end of the buffer.
        for (; iN[0] < isz4;) {
            for (z = 0; z < NX; z++) {
                uint32_t m = R[z] & ((1u<<TF_SHIFT_O1)-1);
                uint32_t S = s3[lN[z]][m];
                unsigned char c = S & 0xff;
                out[iN[z]++] = c;
                uint32_t F = S>>(TF_SHIFT_O1+8);
                R[z] = (F?F:4096) * (R[z]>>TF_SHIFT_O1) +
                    ((S>>8) & ((1u<<TF_SHIFT_O1)-1));
                RansDecRenormSafe(&R[z], &ptr, ptr_end);
                lN[z] = c;
            }
        }

        // Remainder
        z = NX-1;
        for (; iN[z] < out_sz; ) {
            uint32_t m = R[z] & ((1u<<TF_SHIFT_O1)-1);
            uint32_t S = s3[lN[z]][m];
            unsigned char c = S & 0xff;
            out[iN[z]++] = c;
            uint32_t F = S>>(TF_SHIFT_O1+8);
            R[z] = (F?F:4096) * (R[z]>>TF_SHIFT_O1) +
                ((S>>8) & ((1u<<TF_SHIFT_O1)-1));
            RansDecRenormSafe(&R[z], &ptr, ptr_end);
            lN[z] = c;
        }
    } else {
        // TF_SHIFT_O1_FAST.  This is the most commonly used variant.

        // The SIMD version ends decoding early, as it reads up to 64 bytes
        // of input per iteration via its vectorised loads.
        isz4 -= 64;
        for (; iN[0] < isz4 && (uint8_t *)sp+64 < ptr_end; ) {
            // m[z] = R[z] & mask;
            __m512i _masked1 = _mm512_and_si512(_Rv1, _maskv);
            __m512i _masked2 = _mm512_and_si512(_Rv2, _maskv);

            //  S[z] = s3[lN[z]][m[z]];
            _Lv1 = _mm512_slli_epi32(_Lv1, TF_SHIFT_O1_FAST);
            _Lv2 = _mm512_slli_epi32(_Lv2, TF_SHIFT_O1_FAST);

            _masked1 = _mm512_add_epi32(_masked1, _Lv1);
            _masked2 = _mm512_add_epi32(_masked2, _Lv2);

            // This gather is the biggest bottleneck
            __m512i _Sv1 = _mm512_i32gather_epi32(_masked1, (int *)&s3F[0][0],
                                                  sizeof(s3F[0][0]));
            __m512i _Sv2 = _mm512_i32gather_epi32(_masked2, (int *)&s3F[0][0],
                                                  sizeof(s3F[0][0]));

            //  f[z] = S[z]>>(TF_SHIFT_O1_FAST+8);
            __m512i _fv1 = _mm512_srli_epi32(_Sv1, TF_SHIFT_O1_FAST+8);
            __m512i _fv2 = _mm512_srli_epi32(_Sv2, TF_SHIFT_O1_FAST+8);

            //  b[z] = (S[z]>>8) & mask;
            __m512i _bv1 = _mm512_and_si512(_mm512_srli_epi32(_Sv1,8), _maskv);
            __m512i _bv2 = _mm512_and_si512(_mm512_srli_epi32(_Sv2,8), _maskv);

            //  s[z] = S[z] & 0xff;
            __m512i _sv1 = _mm512_and_si512(_Sv1, _mm512_set1_epi32(0xff));
            __m512i _sv2 = _mm512_and_si512(_Sv2, _mm512_set1_epi32(0xff));

            //  R[z] = f[z] * (R[z] >> TF_SHIFT_O1_FAST) + b[z];
            _Rv1 = _mm512_add_epi32(
                       _mm512_mullo_epi32(
                           _mm512_srli_epi32(_Rv1,TF_SHIFT_O1_FAST),
                           _fv1), _bv1);
            _Rv2 = _mm512_add_epi32(
                       _mm512_mullo_epi32(
                           _mm512_srli_epi32(_Rv2,TF_SHIFT_O1_FAST),
                           _fv2), _bv2);

            //for (z = 0; z < NX; z++) lN[z] = c[z];
            _Lv1 = _sv1;
            _Lv2 = _sv2;

            // RansDecRenorm(&R[z], &ptr);
            __m512i _renorm_mask1 = _mm512_xor_si512(_Rv1,
                                        _mm512_set1_epi32(0x80000000));
            __m512i _renorm_mask2 = _mm512_xor_si512(_Rv2,
                                        _mm512_set1_epi32(0x80000000));

            int _imask1 = _mm512_cmpgt_epi32_mask
                (_mm512_set1_epi32(RANS_BYTE_L-0x80000000), _renorm_mask1);
            int _imask2 = _mm512_cmpgt_epi32_mask
                (_mm512_set1_epi32(RANS_BYTE_L-0x80000000), _renorm_mask2);

            __m512i renorm_words1 = _mm512_cvtepu16_epi32
                (_mm256_loadu_si256((const __m256i *)sp));
            sp += _mm_popcnt_u32(_imask1);

            __m512i renorm_words2 = _mm512_cvtepu16_epi32
                (_mm256_loadu_si256((const __m256i *)sp));
            sp += _mm_popcnt_u32(_imask2);

            __m512i _renorm_vals1 =
                _mm512_maskz_expand_epi32(_imask1, renorm_words1);
            __m512i _renorm_vals2 =
                _mm512_maskz_expand_epi32(_imask2, renorm_words2);

            _Rv1 = _mm512_mask_slli_epi32(_Rv1, _imask1, _Rv1, 16);
            _Rv2 = _mm512_mask_slli_epi32(_Rv2, _imask2, _Rv2, 16);

            _Rv1 = _mm512_add_epi32(_Rv1, _renorm_vals1);
            _Rv2 = _mm512_add_epi32(_Rv2, _renorm_vals2);

#ifdef TBUF8
            _mm_storeu_si128((__m128i *)(&u.tbuf64[tidx][0]),
                             _mm512_cvtepi32_epi8(_Sv1)); // or _sv1?
            _mm_storeu_si128((__m128i *)(&u.tbuf64[tidx][2]),
                             _mm512_cvtepi32_epi8(_Sv2));
#else
            _mm512_storeu_si512((__m512i *)(&tbuf[tidx][ 0]), _sv1);
            _mm512_storeu_si512((__m512i *)(&tbuf[tidx][16]), _sv2);
#endif

            iN[0]++;
            if (++tidx == 32) {
                iN[0]-=32;
#ifdef TBUF8
                transpose_and_copy(out, iN, u.tbuf);
#else
                transpose_and_copy_avx512(out, iN, tbuf);
#endif
                tidx = 0;
            }
        }
        isz4 += 64;

        STORE512(_Rv, R);
        STORE512(_Lv, lN);
        ptr = (uint8_t *)sp;

        if (1) {
            // Flush any partially filled tbuf rows.
            iN[0]-=tidx;
            int T;
            for (z = 0; z < NX; z++)
                for (T = 0; T < tidx; T++)
#ifdef TBUF8
                    out[iN[z]++] = u.tbuf[T][z];
#else
                    out[iN[z]++] = tbuf[T][z];
#endif
        }

        // Scalar version for use close to the end of the in[] array, so
        // we don't do SIMD loads beyond the end of the buffer.
        for (; iN[0] < isz4;) {
            for (z = 0; z < NX; z++) {
                uint32_t m = R[z] & ((1u<<TF_SHIFT_O1_FAST)-1);
                uint32_t S = s3F[lN[z]][m];
                unsigned char c = S & 0xff;
                out[iN[z]++] = c;
                R[z] = (S>>(TF_SHIFT_O1_FAST+8)) * (R[z]>>TF_SHIFT_O1_FAST) +
                    ((S>>8) & ((1u<<TF_SHIFT_O1_FAST)-1));
                RansDecRenormSafe(&R[z], &ptr, ptr_end);
                lN[z] = c;
            }
        }

        // Remainder
        z = NX-1;
        for (; iN[z] < out_sz; ) {
            uint32_t m = R[z] & ((1u<<TF_SHIFT_O1_FAST)-1);
            uint32_t S = s3F[lN[z]][m];
            unsigned char c = S & 0xff;
            out[iN[z]++] = c;
            R[z] = (S>>(TF_SHIFT_O1_FAST+8)) * (R[z]>>TF_SHIFT_O1_FAST) +
                ((S>>8) & ((1u<<TF_SHIFT_O1_FAST)-1));
            RansDecRenormSafe(&R[z], &ptr, ptr_end);
            lN[z] = c;
        }
    }

    htscodecs_tls_free(s3);
    return out;

 err:
    htscodecs_tls_free(s3);
    free(out_free);
    free(c_freq);

    return NULL;
}

#endif // HAVE_AVX512