_Z17setup_fake_randomPKhm:
   53|    316|setup_fake_random(const unsigned char * seed, const size_t seed_size) {
   54|    316|  SEED_DATA = seed;
   55|    316|  SEED_SIZE = seed_size;
   56|       |
   57|    316|  int fake_random_set = randombytes_set_implementation(&fake_random);
   58|    316|  assert(fake_random_set == 0);
   59|       |
   60|    316|  assert(strcmp(randombytes_implementation_name(), "fake_random") == 0);
   61|    316|  int initialized = sodium_init();
   62|    316|  assert(initialized >= 0);
   63|    316|}
secretbox_easy_fuzzer.cc:_ZL24fake_implementation_namev:
   28|    316|fake_implementation_name(void) {
   29|    316|  return "fake_random";
   30|    316|}
secretbox_easy_fuzzer.cc:_ZL18fake_random_bufferPvm:
   33|    632|fake_random_buffer(void * const buf, const size_t size) {
   34|    632|  static unsigned char seed[randombytes_SEEDBYTES];
   35|    632|  memset(seed, '0', randombytes_SEEDBYTES);
  ------------------
  |  |   30|    632|#define randombytes_SEEDBYTES 32U
  ------------------
   36|       |
   37|    632|  size_t boundary = std::min((size_t) randombytes_SEEDBYTES, SEED_SIZE);
  ------------------
  |  |   30|    632|#define randombytes_SEEDBYTES 32U
  ------------------
   38|    632|  memcpy(&seed, SEED_DATA, boundary);
   39|       |
   40|    632|  randombytes_buf_deterministic(buf, size, seed);
   41|    632|}

_crypto_aead_aegis128l_pick_best_implementation:
  142|      1|{
  143|      1|    implementation = &aegis128l_soft_implementation;
  144|       |
  145|       |#if defined(HAVE_ARMCRYPTO) && defined(NATIVE_LITTLE_ENDIAN)
  146|       |    if (sodium_runtime_has_armcrypto()) {
  147|       |        implementation = &aegis128l_armcrypto_implementation;
  148|       |        return 0;
  149|       |    }
  150|       |#endif
  151|       |
  152|      1|#if defined(HAVE_AVXINTRIN_H) && defined(HAVE_WMMINTRIN_H)
  153|      1|    if (sodium_runtime_has_aesni() & sodium_runtime_has_avx()) {
  ------------------
  |  Branch (153:9): [True: 1, False: 0]
  ------------------
  154|      1|        implementation = &aegis128l_aesni_implementation;
  155|      1|        return 0;
  156|      1|    }
  157|      0|#endif
  158|      0|    return 0; /* LCOV_EXCL_LINE */
  159|      1|}

_crypto_aead_aegis256_pick_best_implementation:
  141|      1|{
  142|      1|    implementation = &aegis256_soft_implementation;
  143|       |
  144|       |#if defined(HAVE_ARMCRYPTO) && defined(NATIVE_LITTLE_ENDIAN)
  145|       |    if (sodium_runtime_has_armcrypto()) {
  146|       |        implementation = &aegis256_armcrypto_implementation;
  147|       |        return 0;
  148|       |    }
  149|       |#endif
  150|       |
  151|      1|#if defined(HAVE_AVXINTRIN_H) && defined(HAVE_WMMINTRIN_H)
  152|      1|    if (sodium_runtime_has_aesni() & sodium_runtime_has_avx()) {
  ------------------
  |  Branch (152:9): [True: 1, False: 0]
  ------------------
  153|      1|        implementation = &aegis256_aesni_implementation;
  154|      1|        return 0;
  155|      1|    }
  156|      0|#endif
  157|      0|    return 0; /* LCOV_EXCL_LINE */
  158|      1|}

crypto_core_hsalsa20:
   21|    632|{
   22|    632|    uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8,
   23|    632|             x9, x10, x11, x12, x13, x14,  x15;
   24|    632|    int      i;
   25|       |
   26|    632|    if (c == NULL) {
  ------------------
  |  Branch (26:9): [True: 632, False: 0]
  ------------------
   27|    632|        x0 = U32C(0x61707865);
  ------------------
  |  |   14|    632|#define U32C(v) (v##U)
  ------------------
   28|    632|        x5 = U32C(0x3320646e);
  ------------------
  |  |   14|    632|#define U32C(v) (v##U)
  ------------------
   29|    632|        x10 = U32C(0x79622d32);
  ------------------
  |  |   14|    632|#define U32C(v) (v##U)
  ------------------
   30|    632|        x15 = U32C(0x6b206574);
  ------------------
  |  |   14|    632|#define U32C(v) (v##U)
  ------------------
   31|    632|    } else {
   32|      0|        x0 = LOAD32_LE(c + 0);
  ------------------
  |  |  111|      0|#define LOAD32_LE(SRC) load32_le(SRC)
  ------------------
   33|      0|        x5 = LOAD32_LE(c + 4);
  ------------------
  |  |  111|      0|#define LOAD32_LE(SRC) load32_le(SRC)
  ------------------
   34|      0|        x10 = LOAD32_LE(c + 8);
  ------------------
  |  |  111|      0|#define LOAD32_LE(SRC) load32_le(SRC)
  ------------------
   35|      0|        x15 = LOAD32_LE(c + 12);
  ------------------
  |  |  111|      0|#define LOAD32_LE(SRC) load32_le(SRC)
  ------------------
   36|      0|    }
   37|    632|    x1 = LOAD32_LE(k + 0);
  ------------------
  |  |  111|    632|#define LOAD32_LE(SRC) load32_le(SRC)
  ------------------
   38|    632|    x2 = LOAD32_LE(k + 4);
  ------------------
  |  |  111|    632|#define LOAD32_LE(SRC) load32_le(SRC)
  ------------------
   39|    632|    x3 = LOAD32_LE(k + 8);
  ------------------
  |  |  111|    632|#define LOAD32_LE(SRC) load32_le(SRC)
  ------------------
   40|    632|    x4 = LOAD32_LE(k + 12);
  ------------------
  |  |  111|    632|#define LOAD32_LE(SRC) load32_le(SRC)
  ------------------
   41|    632|    x11 = LOAD32_LE(k + 16);
  ------------------
  |  |  111|    632|#define LOAD32_LE(SRC) load32_le(SRC)
  ------------------
   42|    632|    x12 = LOAD32_LE(k + 20);
  ------------------
  |  |  111|    632|#define LOAD32_LE(SRC) load32_le(SRC)
  ------------------
   43|    632|    x13 = LOAD32_LE(k + 24);
  ------------------
  |  |  111|    632|#define LOAD32_LE(SRC) load32_le(SRC)
  ------------------
   44|    632|    x14 = LOAD32_LE(k + 28);
  ------------------
  |  |  111|    632|#define LOAD32_LE(SRC) load32_le(SRC)
  ------------------
   45|    632|    x6 = LOAD32_LE(in + 0);
  ------------------
  |  |  111|    632|#define LOAD32_LE(SRC) load32_le(SRC)
  ------------------
   46|    632|    x7 = LOAD32_LE(in + 4);
  ------------------
  |  |  111|    632|#define LOAD32_LE(SRC) load32_le(SRC)
  ------------------
   47|    632|    x8 = LOAD32_LE(in + 8);
  ------------------
  |  |  111|    632|#define LOAD32_LE(SRC) load32_le(SRC)
  ------------------
   48|    632|    x9 = LOAD32_LE(in + 12);
  ------------------
  |  |  111|    632|#define LOAD32_LE(SRC) load32_le(SRC)
  ------------------
   49|       |
   50|  6.95k|    for (i = ROUNDS; i > 0; i -= 2) {
  ------------------
  |  |   13|    632|#define ROUNDS 20
  ------------------
  |  Branch (50:22): [True: 6.32k, False: 632]
  ------------------
   51|  6.32k|        x4 ^= ROTL32(x0 + x12, 7);
  ------------------
  |  |   42|  6.32k|# define ROTL32(X, B) rotl32((X), (B))
  ------------------
   52|  6.32k|        x8 ^= ROTL32(x4 + x0, 9);
  ------------------
  |  |   42|  6.32k|# define ROTL32(X, B) rotl32((X), (B))
  ------------------
   53|  6.32k|        x12 ^= ROTL32(x8 + x4, 13);
  ------------------
  |  |   42|  6.32k|# define ROTL32(X, B) rotl32((X), (B))
  ------------------
   54|  6.32k|        x0 ^= ROTL32(x12 + x8, 18);
  ------------------
  |  |   42|  6.32k|# define ROTL32(X, B) rotl32((X), (B))
  ------------------
   55|  6.32k|        x9 ^= ROTL32(x5 + x1, 7);
  ------------------
  |  |   42|  6.32k|# define ROTL32(X, B) rotl32((X), (B))
  ------------------
   56|  6.32k|        x13 ^= ROTL32(x9 + x5, 9);
  ------------------
  |  |   42|  6.32k|# define ROTL32(X, B) rotl32((X), (B))
  ------------------
   57|  6.32k|        x1 ^= ROTL32(x13 + x9, 13);
  ------------------
  |  |   42|  6.32k|# define ROTL32(X, B) rotl32((X), (B))
  ------------------
   58|  6.32k|        x5 ^= ROTL32(x1 + x13, 18);
  ------------------
  |  |   42|  6.32k|# define ROTL32(X, B) rotl32((X), (B))
  ------------------
   59|  6.32k|        x14 ^= ROTL32(x10 + x6, 7);
  ------------------
  |  |   42|  6.32k|# define ROTL32(X, B) rotl32((X), (B))
  ------------------
   60|  6.32k|        x2 ^= ROTL32(x14 + x10, 9);
  ------------------
  |  |   42|  6.32k|# define ROTL32(X, B) rotl32((X), (B))
  ------------------
   61|  6.32k|        x6 ^= ROTL32(x2 + x14, 13);
  ------------------
  |  |   42|  6.32k|# define ROTL32(X, B) rotl32((X), (B))
  ------------------
   62|  6.32k|        x10 ^= ROTL32(x6 + x2, 18);
  ------------------
  |  |   42|  6.32k|# define ROTL32(X, B) rotl32((X), (B))
  ------------------
   63|  6.32k|        x3 ^= ROTL32(x15 + x11, 7);
  ------------------
  |  |   42|  6.32k|# define ROTL32(X, B) rotl32((X), (B))
  ------------------
   64|  6.32k|        x7 ^= ROTL32(x3 + x15, 9);
  ------------------
  |  |   42|  6.32k|# define ROTL32(X, B) rotl32((X), (B))
  ------------------
   65|  6.32k|        x11 ^= ROTL32(x7 + x3, 13);
  ------------------
  |  |   42|  6.32k|# define ROTL32(X, B) rotl32((X), (B))
  ------------------
   66|  6.32k|        x15 ^= ROTL32(x11 + x7, 18);
  ------------------
  |  |   42|  6.32k|# define ROTL32(X, B) rotl32((X), (B))
  ------------------
   67|  6.32k|        x1 ^= ROTL32(x0 + x3, 7);
  ------------------
  |  |   42|  6.32k|# define ROTL32(X, B) rotl32((X), (B))
  ------------------
   68|  6.32k|        x2 ^= ROTL32(x1 + x0, 9);
  ------------------
  |  |   42|  6.32k|# define ROTL32(X, B) rotl32((X), (B))
  ------------------
   69|  6.32k|        x3 ^= ROTL32(x2 + x1, 13);
  ------------------
  |  |   42|  6.32k|# define ROTL32(X, B) rotl32((X), (B))
  ------------------
   70|  6.32k|        x0 ^= ROTL32(x3 + x2, 18);
  ------------------
  |  |   42|  6.32k|# define ROTL32(X, B) rotl32((X), (B))
  ------------------
   71|  6.32k|        x6 ^= ROTL32(x5 + x4, 7);
  ------------------
  |  |   42|  6.32k|# define ROTL32(X, B) rotl32((X), (B))
  ------------------
   72|  6.32k|        x7 ^= ROTL32(x6 + x5, 9);
  ------------------
  |  |   42|  6.32k|# define ROTL32(X, B) rotl32((X), (B))
  ------------------
   73|  6.32k|        x4 ^= ROTL32(x7 + x6, 13);
  ------------------
  |  |   42|  6.32k|# define ROTL32(X, B) rotl32((X), (B))
  ------------------
   74|  6.32k|        x5 ^= ROTL32(x4 + x7, 18);
  ------------------
  |  |   42|  6.32k|# define ROTL32(X, B) rotl32((X), (B))
  ------------------
   75|  6.32k|        x11 ^= ROTL32(x10 + x9, 7);
  ------------------
  |  |   42|  6.32k|# define ROTL32(X, B) rotl32((X), (B))
  ------------------
   76|  6.32k|        x8 ^= ROTL32(x11 + x10, 9);
  ------------------
  |  |   42|  6.32k|# define ROTL32(X, B) rotl32((X), (B))
  ------------------
   77|  6.32k|        x9 ^= ROTL32(x8 + x11, 13);
  ------------------
  |  |   42|  6.32k|# define ROTL32(X, B) rotl32((X), (B))
  ------------------
   78|  6.32k|        x10 ^= ROTL32(x9 + x8, 18);
  ------------------
  |  |   42|  6.32k|# define ROTL32(X, B) rotl32((X), (B))
  ------------------
   79|  6.32k|        x12 ^= ROTL32(x15 + x14, 7);
  ------------------
  |  |   42|  6.32k|# define ROTL32(X, B) rotl32((X), (B))
  ------------------
   80|  6.32k|        x13 ^= ROTL32(x12 + x15, 9);
  ------------------
  |  |   42|  6.32k|# define ROTL32(X, B) rotl32((X), (B))
  ------------------
   81|  6.32k|        x14 ^= ROTL32(x13 + x12, 13);
  ------------------
  |  |   42|  6.32k|# define ROTL32(X, B) rotl32((X), (B))
  ------------------
   82|  6.32k|        x15 ^= ROTL32(x14 + x13, 18);
  ------------------
  |  |   42|  6.32k|# define ROTL32(X, B) rotl32((X), (B))
  ------------------
   83|  6.32k|    }
   84|       |
   85|    632|    STORE32_LE(out + 0, x0);
  ------------------
  |  |  128|    632|#define STORE32_LE(DST, W) store32_le((DST), (W))
  ------------------
   86|    632|    STORE32_LE(out + 4, x5);
  ------------------
  |  |  128|    632|#define STORE32_LE(DST, W) store32_le((DST), (W))
  ------------------
   87|    632|    STORE32_LE(out + 8, x10);
  ------------------
  |  |  128|    632|#define STORE32_LE(DST, W) store32_le((DST), (W))
  ------------------
   88|    632|    STORE32_LE(out + 12, x15);
  ------------------
  |  |  128|    632|#define STORE32_LE(DST, W) store32_le((DST), (W))
  ------------------
   89|    632|    STORE32_LE(out + 16, x6);
  ------------------
  |  |  128|    632|#define STORE32_LE(DST, W) store32_le((DST), (W))
  ------------------
   90|    632|    STORE32_LE(out + 20, x7);
  ------------------
  |  |  128|    632|#define STORE32_LE(DST, W) store32_le((DST), (W))
  ------------------
   91|    632|    STORE32_LE(out + 24, x8);
  ------------------
  |  |  128|    632|#define STORE32_LE(DST, W) store32_le((DST), (W))
  ------------------
   92|    632|    STORE32_LE(out + 28, x9);
  ------------------
  |  |  128|    632|#define STORE32_LE(DST, W) store32_le((DST), (W))
  ------------------
   93|       |
   94|    632|    return 0;
   95|    632|}

blake2b_pick_best_implementation:
  412|      1|{
  413|       |/* LCOV_EXCL_START */
  414|      1|#if defined(HAVE_AVX2INTRIN_H) && defined(HAVE_TMMINTRIN_H) && \
  415|      1|    defined(HAVE_SMMINTRIN_H)
  416|      1|    if (sodium_runtime_has_avx2()) {
  ------------------
  |  Branch (416:9): [True: 1, False: 0]
  ------------------
  417|      1|        blake2b_compress = blake2b_compress_avx2;
  418|      1|        return 0;
  419|      1|    }
  420|      0|#endif
  421|      0|#if defined(HAVE_EMMINTRIN_H) && defined(HAVE_TMMINTRIN_H) && \
  422|      0|    defined(HAVE_SMMINTRIN_H)
  423|      0|    if (sodium_runtime_has_sse41()) {
  ------------------
  |  Branch (423:9): [True: 0, False: 0]
  ------------------
  424|      0|        blake2b_compress = blake2b_compress_sse41;
  425|      0|        return 0;
  426|      0|    }
  427|      0|#endif
  428|      0|#if defined(HAVE_EMMINTRIN_H) && defined(HAVE_TMMINTRIN_H)
  429|      0|    if (sodium_runtime_has_ssse3()) {
  ------------------
  |  Branch (429:9): [True: 0, False: 0]
  ------------------
  430|      0|        blake2b_compress = blake2b_compress_ssse3;
  431|      0|        return 0;
  432|      0|    }
  433|      0|#endif
  434|      0|    blake2b_compress = blake2b_compress_ref;
  435|       |
  436|      0|    return 0;
  437|       |    /* LCOV_EXCL_STOP */
  438|      0|}

_crypto_generichash_blake2b_pick_best_implementation:
  114|      1|{
  115|      1|    return blake2b_pick_best_implementation();
  116|      1|}

crypto_onetimeauth_poly1305_verify:
   29|    316|{
   30|    316|    return implementation->onetimeauth_verify(h, in, inlen, k);
   31|    316|}
crypto_onetimeauth_poly1305_init:
   36|    316|{
   37|    316|    return implementation->onetimeauth_init(state, key);
   38|    316|}
crypto_onetimeauth_poly1305_update:
   44|    316|{
   45|    316|    return implementation->onetimeauth_update(state, in, inlen);
   46|    316|}
crypto_onetimeauth_poly1305_final:
   51|    316|{
   52|    316|    return implementation->onetimeauth_final(state, out);
   53|    316|}
_crypto_onetimeauth_poly1305_pick_best_implementation:
   82|      1|{
   83|      1|    implementation = &crypto_onetimeauth_poly1305_donna_implementation;
   84|      1|#if defined(HAVE_TI_MODE) && defined(HAVE_EMMINTRIN_H)
   85|      1|    if (sodium_runtime_has_sse2()) {
  ------------------
  |  Branch (85:9): [True: 1, False: 0]
  ------------------
   86|      1|        implementation = &crypto_onetimeauth_poly1305_sse2_implementation;
   87|      1|    }
   88|      1|#endif
   89|      1|    return 0;
   90|      1|}

poly1305_sse2.c:crypto_onetimeauth_poly1305_sse2:
  913|    316|{
  914|    316|    CRYPTO_ALIGN(64) poly1305_state_internal_t st;
  ------------------
  |  |   50|    316|#  define CRYPTO_ALIGN(x) __attribute__ ((aligned(x)))
  ------------------
  915|    316|    unsigned long long                         blocks;
  916|       |
  917|    316|    poly1305_init_ext(&st, key, inlen);
  918|    316|    blocks = inlen & ~31;
  919|    316|    if (blocks > 0) {
  ------------------
  |  Branch (919:9): [True: 243, False: 73]
  ------------------
  920|    243|        poly1305_blocks(&st, m, blocks);
  921|    243|        m += blocks;
  922|    243|        inlen -= blocks;
  923|    243|    }
  924|    316|    poly1305_finish_ext(&st, m, inlen, out);
  925|       |
  926|    316|    return 0;
  927|    316|}
poly1305_sse2.c:poly1305_init_ext:
  107|    632|{
  108|    632|    uint32_t          *R;
  109|    632|    uint128_t          d[3];
  110|    632|    uint64_t           r0, r1, r2;
  111|    632|    uint64_t           rt0, rt1, rt2, st2, c;
  112|    632|    uint64_t           t0, t1;
  113|    632|    unsigned long long i;
  114|       |
  115|    632|    if (!bytes) {
  ------------------
  |  Branch (115:9): [True: 316, False: 316]
  ------------------
  116|    316|        bytes = ~(unsigned long long) 0;
  117|    316|    }
  118|       |    /* H = 0 */
  119|    632|    _mm_storeu_si128((xmmi *) (void *) &st->H.hh[0], _mm_setzero_si128());
  120|    632|    _mm_storeu_si128((xmmi *) (void *) &st->H.hh[4], _mm_setzero_si128());
  121|    632|    _mm_storeu_si128((xmmi *) (void *) &st->H.hh[8], _mm_setzero_si128());
  122|       |
  123|       |    /* clamp key */
  124|    632|    memcpy(&t0, key, 8);
  125|    632|    memcpy(&t1, key + 8, 8);
  126|    632|    r0 = t0 & 0xffc0fffffff;
  127|    632|    t0 >>= 44;
  128|    632|    t0 |= t1 << 20;
  129|    632|    r1 = t0 & 0xfffffc0ffff;
  130|    632|    t1 >>= 24;
  131|    632|    r2 = t1 & 0x00ffffffc0f;
  132|       |
  133|       |    /* r^1 */
  134|    632|    R    = st->R;
  135|    632|    R[0] = (uint32_t)(r0) &0x3ffffff;
  136|    632|    R[1] = (uint32_t)((r0 >> 26) | (r1 << 18)) & 0x3ffffff;
  137|    632|    R[2] = (uint32_t)((r1 >> 8)) & 0x3ffffff;
  138|    632|    R[3] = (uint32_t)((r1 >> 34) | (r2 << 10)) & 0x3ffffff;
  139|    632|    R[4] = (uint32_t)((r2 >> 16));
  140|       |
  141|       |    /* save pad */
  142|    632|    memcpy(&st->pad[0], key + 16, 8);
  143|    632|    memcpy(&st->pad[1], key + 24, 8);
  144|       |
  145|    632|    rt0 = r0;
  146|    632|    rt1 = r1;
  147|    632|    rt2 = r2;
  148|       |
  149|       |    /* r^2, r^4 */
  150|  1.75k|    for (i = 0; i < 2; i++) {
  ------------------
  |  Branch (150:17): [True: 1.21k, False: 543]
  ------------------
  151|  1.21k|        if (i == 0) {
  ------------------
  |  Branch (151:13): [True: 632, False: 583]
  ------------------
  152|    632|            R = st->R2;
  153|    632|            if (bytes <= 16) {
  ------------------
  |  Branch (153:17): [True: 49, False: 583]
  ------------------
  154|     49|                break;
  155|     49|            }
  156|    632|        } else if (i == 1) {
  ------------------
  |  Branch (156:20): [True: 583, False: 0]
  ------------------
  157|    583|            R = st->R4;
  158|    583|            if (bytes < 96) {
  ------------------
  |  Branch (158:17): [True: 40, False: 543]
  ------------------
  159|     40|                break;
  160|     40|            }
  161|    583|        }
  162|  1.12k|        st2 = rt2 * (5 << 2);
  163|       |
  164|  1.12k|        d[0] = ((uint128_t) rt0 * rt0) + ((uint128_t)(rt1 * 2) * st2);
  165|  1.12k|        d[1] = ((uint128_t) rt2 * st2) + ((uint128_t)(rt0 * 2) * rt1);
  166|  1.12k|        d[2] = ((uint128_t) rt1 * rt1) + ((uint128_t)(rt2 * 2) * rt0);
  167|       |
  168|  1.12k|        rt0 = (uint64_t) d[0] & 0xfffffffffff;
  169|  1.12k|        c   = (uint64_t)(d[0] >> 44);
  170|  1.12k|        d[1] += c;
  171|       |
  172|  1.12k|        rt1 = (uint64_t) d[1] & 0xfffffffffff;
  173|  1.12k|        c   = (uint64_t)(d[1] >> 44);
  174|  1.12k|        d[2] += c;
  175|       |
  176|  1.12k|        rt2 = (uint64_t) d[2] & 0x3ffffffffff;
  177|  1.12k|        c   = (uint64_t)(d[2] >> 42);
  178|  1.12k|        rt0 += c * 5;
  179|  1.12k|        c   = (rt0 >> 44);
  180|  1.12k|        rt0 = rt0 & 0xfffffffffff;
  181|  1.12k|        rt1 += c;
  182|  1.12k|        c   = (rt1 >> 44);
  183|  1.12k|        rt1 = rt1 & 0xfffffffffff;
  184|  1.12k|        rt2 += c; /* even if rt2 overflows, it will still fit in rp4 safely, and
  185|       |                     is safe to multiply with */
  186|       |
  187|  1.12k|        R[0] = (uint32_t)(rt0) &0x3ffffff;
  188|  1.12k|        R[1] = (uint32_t)((rt0 >> 26) | (rt1 << 18)) & 0x3ffffff;
  189|  1.12k|        R[2] = (uint32_t)((rt1 >> 8)) & 0x3ffffff;
  190|  1.12k|        R[3] = (uint32_t)((rt1 >> 34) | (rt2 << 10)) & 0x3ffffff;
  191|  1.12k|        R[4] = (uint32_t)((rt2 >> 16));
  192|  1.12k|    }
  193|    632|    st->flags    = 0;
  194|    632|    st->leftover = 0U;
  195|    632|}
poly1305_sse2.c:poly1305_blocks:
  202|  1.58k|{
  203|  1.58k|    CRYPTO_ALIGN(64)
  ------------------
  |  |   50|  1.58k|#  define CRYPTO_ALIGN(x) __attribute__ ((aligned(x)))
  ------------------
  204|  1.58k|    xmmi HIBIT =
  205|  1.58k|        _mm_shuffle_epi32(_mm_cvtsi32_si128(1 << 24), _MM_SHUFFLE(1, 0, 1, 0));
  206|  1.58k|    const xmmi MMASK = _mm_shuffle_epi32(_mm_cvtsi32_si128((1 << 26) - 1),
  207|  1.58k|                                         _MM_SHUFFLE(1, 0, 1, 0));
  208|  1.58k|    const xmmi FIVE =
  209|  1.58k|        _mm_shuffle_epi32(_mm_cvtsi32_si128(5), _MM_SHUFFLE(1, 0, 1, 0));
  210|  1.58k|    xmmi H0, H1, H2, H3, H4;
  211|  1.58k|    xmmi T0, T1, T2, T3, T4, T5, T6, T7, T8;
  212|  1.58k|    xmmi M0, M1, M2, M3, M4;
  213|  1.58k|    xmmi M5, M6, M7, M8;
  214|  1.58k|    xmmi C1, C2;
  215|  1.58k|    xmmi R20, R21, R22, R23, R24, S21, S22, S23, S24;
  216|  1.58k|    xmmi R40, R41, R42, R43, R44, S41, S42, S43, S44;
  217|       |
  218|  1.58k|    if (st->flags & poly1305_final_shift8) {
  ------------------
  |  Branch (218:9): [True: 504, False: 1.08k]
  ------------------
  219|    504|        HIBIT = _mm_srli_si128(HIBIT, 8);
  220|    504|    }
  221|  1.58k|    if (st->flags & poly1305_final_shift16) {
  ------------------
  |  Branch (221:9): [True: 428, False: 1.15k]
  ------------------
  222|    428|        HIBIT = _mm_setzero_si128();
  223|    428|    }
  224|  1.58k|    if (!(st->flags & poly1305_started)) {
  ------------------
  |  Branch (224:9): [True: 632, False: 952]
  ------------------
  225|       |        /* H = [Mx,My] */
  226|    632|        T5 = _mm_unpacklo_epi64(
  227|    632|            _mm_loadl_epi64((const xmmi *) (const void *) (m + 0)),
  228|    632|            _mm_loadl_epi64((const xmmi *) (const void *) (m + 16)));
  229|    632|        T6 = _mm_unpacklo_epi64(
  230|    632|            _mm_loadl_epi64((const xmmi *) (const void *) (m + 8)),
  231|    632|            _mm_loadl_epi64((const xmmi *) (const void *) (m + 24)));
  232|    632|        H0 = _mm_and_si128(MMASK, T5);
  233|    632|        H1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
  234|    632|        T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
  235|    632|        H2 = _mm_and_si128(MMASK, T5);
  236|    632|        H3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
  237|    632|        H4 = _mm_srli_epi64(T6, 40);
  238|    632|        H4 = _mm_or_si128(H4, HIBIT);
  239|    632|        m += 32;
  240|    632|        bytes -= 32;
  241|    632|        st->flags |= poly1305_started;
  242|    952|    } else {
  243|    952|        T0 = _mm_loadu_si128((const xmmi *) (const void *) &st->H.hh[0]);
  244|    952|        T1 = _mm_loadu_si128((const xmmi *) (const void *) &st->H.hh[4]);
  245|    952|        T2 = _mm_loadu_si128((const xmmi *) (const void *) &st->H.hh[8]);
  246|    952|        H0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1, 1, 0, 0));
  247|    952|        H1 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 3, 2, 2));
  248|    952|        H2 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(1, 1, 0, 0));
  249|    952|        H3 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(3, 3, 2, 2));
  250|    952|        H4 = _mm_shuffle_epi32(T2, _MM_SHUFFLE(1, 1, 0, 0));
  251|    952|    }
  252|  1.58k|    if (st->flags & (poly1305_final_r2_r | poly1305_final_r_1)) {
  ------------------
  |  Branch (252:9): [True: 632, False: 952]
  ------------------
  253|    632|        if (st->flags & poly1305_final_r2_r) {
  ------------------
  |  Branch (253:13): [True: 384, False: 248]
  ------------------
  254|       |            /* use [r^2, r] */
  255|    384|            T2  = _mm_loadu_si128((const xmmi *) (const void *) &st->R[0]);
  256|    384|            T3  = _mm_cvtsi32_si128(st->R[4]);
  257|    384|            T0  = _mm_loadu_si128((const xmmi *) (const void *) &st->R2[0]);
  258|    384|            T1  = _mm_cvtsi32_si128(st->R2[4]);
  259|    384|            T4  = _mm_unpacklo_epi32(T0, T2);
  260|    384|            T5  = _mm_unpackhi_epi32(T0, T2);
  261|    384|            R24 = _mm_unpacklo_epi64(T1, T3);
  262|    384|        } else {
  263|       |            /* use [r^1, 1] */
  264|    248|            T0  = _mm_loadu_si128((const xmmi *) (const void *) &st->R[0]);
  265|    248|            T1  = _mm_cvtsi32_si128(st->R[4]);
  266|    248|            T2  = _mm_cvtsi32_si128(1);
  267|    248|            T4  = _mm_unpacklo_epi32(T0, T2);
  268|    248|            T5  = _mm_unpackhi_epi32(T0, T2);
  269|    248|            R24 = T1;
  270|    248|        }
  271|    632|        R20 = _mm_shuffle_epi32(T4, _MM_SHUFFLE(1, 1, 0, 0));
  272|    632|        R21 = _mm_shuffle_epi32(T4, _MM_SHUFFLE(3, 3, 2, 2));
  273|    632|        R22 = _mm_shuffle_epi32(T5, _MM_SHUFFLE(1, 1, 0, 0));
  274|    632|        R23 = _mm_shuffle_epi32(T5, _MM_SHUFFLE(3, 3, 2, 2));
  275|    952|    } else {
  276|       |        /* use [r^2, r^2] */
  277|    952|        T0  = _mm_loadu_si128((const xmmi *) (const void *) &st->R2[0]);
  278|    952|        T1  = _mm_cvtsi32_si128(st->R2[4]);
  279|    952|        R20 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(0, 0, 0, 0));
  280|    952|        R21 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1, 1, 1, 1));
  281|    952|        R22 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(2, 2, 2, 2));
  282|    952|        R23 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 3, 3, 3));
  283|    952|        R24 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(0, 0, 0, 0));
  284|    952|    }
  285|  1.58k|    S21 = _mm_mul_epu32(R21, FIVE);
  286|  1.58k|    S22 = _mm_mul_epu32(R22, FIVE);
  287|  1.58k|    S23 = _mm_mul_epu32(R23, FIVE);
  288|  1.58k|    S24 = _mm_mul_epu32(R24, FIVE);
  289|       |
  290|  1.58k|    if (bytes >= 64) {
  ------------------
  |  Branch (290:9): [True: 454, False: 1.13k]
  ------------------
  291|    454|        T0  = _mm_loadu_si128((const xmmi *) (const void *) &st->R4[0]);
  292|    454|        T1  = _mm_cvtsi32_si128(st->R4[4]);
  293|    454|        R40 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(0, 0, 0, 0));
  294|    454|        R41 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1, 1, 1, 1));
  295|    454|        R42 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(2, 2, 2, 2));
  296|    454|        R43 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 3, 3, 3));
  297|    454|        R44 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(0, 0, 0, 0));
  298|    454|        S41 = _mm_mul_epu32(R41, FIVE);
  299|    454|        S42 = _mm_mul_epu32(R42, FIVE);
  300|    454|        S43 = _mm_mul_epu32(R43, FIVE);
  301|    454|        S44 = _mm_mul_epu32(R44, FIVE);
  302|       |
  303|  1.42M|        while (bytes >= 64) {
  ------------------
  |  Branch (303:16): [True: 1.42M, False: 454]
  ------------------
  304|  1.42M|            xmmi v00, v01, v02, v03, v04;
  305|  1.42M|            xmmi v10, v11, v12, v13, v14;
  306|  1.42M|            xmmi v20, v21, v22, v23, v24;
  307|  1.42M|            xmmi v30, v31, v32, v33, v34;
  308|  1.42M|            xmmi v40, v41, v42, v43, v44;
  309|  1.42M|            xmmi T14, T15;
  310|       |
  311|       |            /* H *= [r^4,r^4], preload [Mx,My] */
  312|  1.42M|            T15 = S42;
  313|  1.42M|            T0  = H4;
  314|  1.42M|            T0  = _mm_mul_epu32(T0, S41);
  315|  1.42M|            v01 = H3;
  316|  1.42M|            v01 = _mm_mul_epu32(v01, T15);
  317|  1.42M|            T14 = S43;
  318|  1.42M|            T1  = H4;
  319|  1.42M|            T1  = _mm_mul_epu32(T1, T15);
  320|  1.42M|            v11 = H3;
  321|  1.42M|            v11 = _mm_mul_epu32(v11, T14);
  322|  1.42M|            T2  = H4;
  323|  1.42M|            T2  = _mm_mul_epu32(T2, T14);
  324|  1.42M|            T0  = _mm_add_epi64(T0, v01);
  325|  1.42M|            T15 = S44;
  326|  1.42M|            v02 = H2;
  327|  1.42M|            v02 = _mm_mul_epu32(v02, T14);
  328|  1.42M|            T3  = H4;
  329|  1.42M|            T3  = _mm_mul_epu32(T3, T15);
  330|  1.42M|            T1  = _mm_add_epi64(T1, v11);
  331|  1.42M|            v03 = H1;
  332|  1.42M|            v03 = _mm_mul_epu32(v03, T15);
  333|  1.42M|            v12 = H2;
  334|  1.42M|            v12 = _mm_mul_epu32(v12, T15);
  335|  1.42M|            T0  = _mm_add_epi64(T0, v02);
  336|  1.42M|            T14 = R40;
  337|  1.42M|            v21 = H3;
  338|  1.42M|            v21 = _mm_mul_epu32(v21, T15);
  339|  1.42M|            v31 = H3;
  340|  1.42M|            v31 = _mm_mul_epu32(v31, T14);
  341|  1.42M|            T0  = _mm_add_epi64(T0, v03);
  342|  1.42M|            T4  = H4;
  343|  1.42M|            T4  = _mm_mul_epu32(T4, T14);
  344|  1.42M|            T1  = _mm_add_epi64(T1, v12);
  345|  1.42M|            v04 = H0;
  346|  1.42M|            v04 = _mm_mul_epu32(v04, T14);
  347|  1.42M|            T2  = _mm_add_epi64(T2, v21);
  348|  1.42M|            v13 = H1;
  349|  1.42M|            v13 = _mm_mul_epu32(v13, T14);
  350|  1.42M|            T3  = _mm_add_epi64(T3, v31);
  351|  1.42M|            T15 = R41;
  352|  1.42M|            v22 = H2;
  353|  1.42M|            v22 = _mm_mul_epu32(v22, T14);
  354|  1.42M|            v32 = H2;
  355|  1.42M|            v32 = _mm_mul_epu32(v32, T15);
  356|  1.42M|            T0  = _mm_add_epi64(T0, v04);
  357|  1.42M|            v41 = H3;
  358|  1.42M|            v41 = _mm_mul_epu32(v41, T15);
  359|  1.42M|            T1  = _mm_add_epi64(T1, v13);
  360|  1.42M|            v14 = H0;
  361|  1.42M|            v14 = _mm_mul_epu32(v14, T15);
  362|  1.42M|            T2  = _mm_add_epi64(T2, v22);
  363|  1.42M|            T14 = R42;
  364|  1.42M|            T5  = _mm_unpacklo_epi64(
  365|  1.42M|                _mm_loadl_epi64((const xmmi *) (const void *) (m + 0)),
  366|  1.42M|                _mm_loadl_epi64((const xmmi *) (const void *) (m + 16)));
  367|  1.42M|            v23 = H1;
  368|  1.42M|            v23 = _mm_mul_epu32(v23, T15);
  369|  1.42M|            T3  = _mm_add_epi64(T3, v32);
  370|  1.42M|            v33 = H1;
  371|  1.42M|            v33 = _mm_mul_epu32(v33, T14);
  372|  1.42M|            T4  = _mm_add_epi64(T4, v41);
  373|  1.42M|            v42 = H2;
  374|  1.42M|            v42 = _mm_mul_epu32(v42, T14);
  375|  1.42M|            T1  = _mm_add_epi64(T1, v14);
  376|  1.42M|            T15 = R43;
  377|  1.42M|            T6  = _mm_unpacklo_epi64(
  378|  1.42M|                _mm_loadl_epi64((const xmmi *) (const void *) (m + 8)),
  379|  1.42M|                _mm_loadl_epi64((const xmmi *) (const void *) (m + 24)));
  380|  1.42M|            v24 = H0;
  381|  1.42M|            v24 = _mm_mul_epu32(v24, T14);
  382|  1.42M|            T2  = _mm_add_epi64(T2, v23);
  383|  1.42M|            v34 = H0;
  384|  1.42M|            v34 = _mm_mul_epu32(v34, T15);
  385|  1.42M|            T3  = _mm_add_epi64(T3, v33);
  386|  1.42M|            M0  = _mm_and_si128(MMASK, T5);
  387|  1.42M|            v43 = H1;
  388|  1.42M|            v43 = _mm_mul_epu32(v43, T15);
  389|  1.42M|            T4  = _mm_add_epi64(T4, v42);
  390|  1.42M|            M1  = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
  391|  1.42M|            v44 = H0;
  392|  1.42M|            v44 = _mm_mul_epu32(v44, R44);
  393|  1.42M|            T2  = _mm_add_epi64(T2, v24);
  394|  1.42M|            T5  = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
  395|  1.42M|            T3  = _mm_add_epi64(T3, v34);
  396|  1.42M|            M3  = _mm_and_si128(MMASK, _mm_srli_epi64(T6, 14));
  397|  1.42M|            T4  = _mm_add_epi64(T4, v43);
  398|  1.42M|            M2  = _mm_and_si128(MMASK, T5);
  399|  1.42M|            T4  = _mm_add_epi64(T4, v44);
  400|  1.42M|            M4  = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);
  401|       |
  402|       |            /* H += [Mx',My'] */
  403|  1.42M|            T5 = _mm_loadu_si128((const xmmi *) (const void *) (m + 32));
  404|  1.42M|            T6 = _mm_loadu_si128((const xmmi *) (const void *) (m + 48));
  405|  1.42M|            T7 = _mm_unpacklo_epi32(T5, T6);
  406|  1.42M|            T8 = _mm_unpackhi_epi32(T5, T6);
  407|  1.42M|            M5 = _mm_unpacklo_epi32(T7, _mm_setzero_si128());
  408|  1.42M|            M6 = _mm_unpackhi_epi32(T7, _mm_setzero_si128());
  409|  1.42M|            M7 = _mm_unpacklo_epi32(T8, _mm_setzero_si128());
  410|  1.42M|            M8 = _mm_unpackhi_epi32(T8, _mm_setzero_si128());
  411|  1.42M|            M6 = _mm_slli_epi64(M6, 6);
  412|  1.42M|            M7 = _mm_slli_epi64(M7, 12);
  413|  1.42M|            M8 = _mm_slli_epi64(M8, 18);
  414|  1.42M|            T0 = _mm_add_epi64(T0, M5);
  415|  1.42M|            T1 = _mm_add_epi64(T1, M6);
  416|  1.42M|            T2 = _mm_add_epi64(T2, M7);
  417|  1.42M|            T3 = _mm_add_epi64(T3, M8);
  418|  1.42M|            T4 = _mm_add_epi64(T4, HIBIT);
  419|       |
  420|       |            /* H += [Mx,My]*[r^2,r^2] */
  421|  1.42M|            T15 = S22;
  422|  1.42M|            v00 = M4;
  423|  1.42M|            v00 = _mm_mul_epu32(v00, S21);
  424|  1.42M|            v01 = M3;
  425|  1.42M|            v01 = _mm_mul_epu32(v01, T15);
  426|  1.42M|            T14 = S23;
  427|  1.42M|            v10 = M4;
  428|  1.42M|            v10 = _mm_mul_epu32(v10, T15);
  429|  1.42M|            v11 = M3;
  430|  1.42M|            v11 = _mm_mul_epu32(v11, T14);
  431|  1.42M|            T0  = _mm_add_epi64(T0, v00);
  432|  1.42M|            v20 = M4;
  433|  1.42M|            v20 = _mm_mul_epu32(v20, T14);
  434|  1.42M|            T0  = _mm_add_epi64(T0, v01);
  435|  1.42M|            T15 = S24;
  436|  1.42M|            v02 = M2;
  437|  1.42M|            v02 = _mm_mul_epu32(v02, T14);
  438|  1.42M|            T1  = _mm_add_epi64(T1, v10);
  439|  1.42M|            v30 = M4;
  440|  1.42M|            v30 = _mm_mul_epu32(v30, T15);
  441|  1.42M|            T1  = _mm_add_epi64(T1, v11);
  442|  1.42M|            v03 = M1;
  443|  1.42M|            v03 = _mm_mul_epu32(v03, T15);
  444|  1.42M|            T2  = _mm_add_epi64(T2, v20);
  445|  1.42M|            v12 = M2;
  446|  1.42M|            v12 = _mm_mul_epu32(v12, T15);
  447|  1.42M|            T0  = _mm_add_epi64(T0, v02);
  448|  1.42M|            T14 = R20;
  449|  1.42M|            v21 = M3;
  450|  1.42M|            v21 = _mm_mul_epu32(v21, T15);
  451|  1.42M|            T3  = _mm_add_epi64(T3, v30);
  452|  1.42M|            v31 = M3;
  453|  1.42M|            v31 = _mm_mul_epu32(v31, T14);
  454|  1.42M|            T0  = _mm_add_epi64(T0, v03);
  455|  1.42M|            v40 = M4;
  456|  1.42M|            v40 = _mm_mul_epu32(v40, T14);
  457|  1.42M|            T1  = _mm_add_epi64(T1, v12);
  458|  1.42M|            v04 = M0;
  459|  1.42M|            v04 = _mm_mul_epu32(v04, T14);
  460|  1.42M|            T2  = _mm_add_epi64(T2, v21);
  461|  1.42M|            v13 = M1;
  462|  1.42M|            v13 = _mm_mul_epu32(v13, T14);
  463|  1.42M|            T3  = _mm_add_epi64(T3, v31);
  464|  1.42M|            T15 = R21;
  465|  1.42M|            v22 = M2;
  466|  1.42M|            v22 = _mm_mul_epu32(v22, T14);
  467|  1.42M|            T4  = _mm_add_epi64(T4, v40);
  468|  1.42M|            v32 = M2;
  469|  1.42M|            v32 = _mm_mul_epu32(v32, T15);
  470|  1.42M|            T0  = _mm_add_epi64(T0, v04);
  471|  1.42M|            v41 = M3;
  472|  1.42M|            v41 = _mm_mul_epu32(v41, T15);
  473|  1.42M|            T1  = _mm_add_epi64(T1, v13);
  474|  1.42M|            v14 = M0;
  475|  1.42M|            v14 = _mm_mul_epu32(v14, T15);
  476|  1.42M|            T2  = _mm_add_epi64(T2, v22);
  477|  1.42M|            T14 = R22;
  478|  1.42M|            v23 = M1;
  479|  1.42M|            v23 = _mm_mul_epu32(v23, T15);
  480|  1.42M|            T3  = _mm_add_epi64(T3, v32);
  481|  1.42M|            v33 = M1;
  482|  1.42M|            v33 = _mm_mul_epu32(v33, T14);
  483|  1.42M|            T4  = _mm_add_epi64(T4, v41);
  484|  1.42M|            v42 = M2;
  485|  1.42M|            v42 = _mm_mul_epu32(v42, T14);
  486|  1.42M|            T1  = _mm_add_epi64(T1, v14);
  487|  1.42M|            T15 = R23;
  488|  1.42M|            v24 = M0;
  489|  1.42M|            v24 = _mm_mul_epu32(v24, T14);
  490|  1.42M|            T2  = _mm_add_epi64(T2, v23);
  491|  1.42M|            v34 = M0;
  492|  1.42M|            v34 = _mm_mul_epu32(v34, T15);
  493|  1.42M|            T3  = _mm_add_epi64(T3, v33);
  494|  1.42M|            v43 = M1;
  495|  1.42M|            v43 = _mm_mul_epu32(v43, T15);
  496|  1.42M|            T4  = _mm_add_epi64(T4, v42);
  497|  1.42M|            v44 = M0;
  498|  1.42M|            v44 = _mm_mul_epu32(v44, R24);
  499|  1.42M|            T2  = _mm_add_epi64(T2, v24);
  500|  1.42M|            T3  = _mm_add_epi64(T3, v34);
  501|  1.42M|            T4  = _mm_add_epi64(T4, v43);
  502|  1.42M|            T4  = _mm_add_epi64(T4, v44);
  503|       |
  504|       |            /* reduce */
  505|  1.42M|            C1 = _mm_srli_epi64(T0, 26);
  506|  1.42M|            C2 = _mm_srli_epi64(T3, 26);
  507|  1.42M|            T0 = _mm_and_si128(T0, MMASK);
  508|  1.42M|            T3 = _mm_and_si128(T3, MMASK);
  509|  1.42M|            T1 = _mm_add_epi64(T1, C1);
  510|  1.42M|            T4 = _mm_add_epi64(T4, C2);
  511|  1.42M|            C1 = _mm_srli_epi64(T1, 26);
  512|  1.42M|            C2 = _mm_srli_epi64(T4, 26);
  513|  1.42M|            T1 = _mm_and_si128(T1, MMASK);
  514|  1.42M|            T4 = _mm_and_si128(T4, MMASK);
  515|  1.42M|            T2 = _mm_add_epi64(T2, C1);
  516|  1.42M|            T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
  517|  1.42M|            C1 = _mm_srli_epi64(T2, 26);
  518|  1.42M|            C2 = _mm_srli_epi64(T0, 26);
  519|  1.42M|            T2 = _mm_and_si128(T2, MMASK);
  520|  1.42M|            T0 = _mm_and_si128(T0, MMASK);
  521|  1.42M|            T3 = _mm_add_epi64(T3, C1);
  522|  1.42M|            T1 = _mm_add_epi64(T1, C2);
  523|  1.42M|            C1 = _mm_srli_epi64(T3, 26);
  524|  1.42M|            T3 = _mm_and_si128(T3, MMASK);
  525|  1.42M|            T4 = _mm_add_epi64(T4, C1);
  526|       |
  527|       |            /* Final: H = (H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx',My']) */
  528|  1.42M|            H0 = T0;
  529|  1.42M|            H1 = T1;
  530|  1.42M|            H2 = T2;
  531|  1.42M|            H3 = T3;
  532|  1.42M|            H4 = T4;
  533|       |
  534|  1.42M|            m += 64;
  535|  1.42M|            bytes -= 64;
  536|  1.42M|        }
  537|    454|    }
  538|       |
  539|  1.58k|    if (bytes >= 32) {
  ------------------
  |  Branch (539:9): [True: 1.16k, False: 416]
  ------------------
  540|  1.16k|        xmmi v01, v02, v03, v04;
  541|  1.16k|        xmmi v11, v12, v13, v14;
  542|  1.16k|        xmmi v21, v22, v23, v24;
  543|  1.16k|        xmmi v31, v32, v33, v34;
  544|  1.16k|        xmmi v41, v42, v43, v44;
  545|  1.16k|        xmmi T14, T15;
  546|       |
  547|       |        /* H *= [r^2,r^2] */
  548|  1.16k|        T15 = S22;
  549|  1.16k|        T0  = H4;
  550|  1.16k|        T0  = _mm_mul_epu32(T0, S21);
  551|  1.16k|        v01 = H3;
  552|  1.16k|        v01 = _mm_mul_epu32(v01, T15);
  553|  1.16k|        T14 = S23;
  554|  1.16k|        T1  = H4;
  555|  1.16k|        T1  = _mm_mul_epu32(T1, T15);
  556|  1.16k|        v11 = H3;
  557|  1.16k|        v11 = _mm_mul_epu32(v11, T14);
  558|  1.16k|        T2  = H4;
  559|  1.16k|        T2  = _mm_mul_epu32(T2, T14);
  560|  1.16k|        T0  = _mm_add_epi64(T0, v01);
  561|  1.16k|        T15 = S24;
  562|  1.16k|        v02 = H2;
  563|  1.16k|        v02 = _mm_mul_epu32(v02, T14);
  564|  1.16k|        T3  = H4;
  565|  1.16k|        T3  = _mm_mul_epu32(T3, T15);
  566|  1.16k|        T1  = _mm_add_epi64(T1, v11);
  567|  1.16k|        v03 = H1;
  568|  1.16k|        v03 = _mm_mul_epu32(v03, T15);
  569|  1.16k|        v12 = H2;
  570|  1.16k|        v12 = _mm_mul_epu32(v12, T15);
  571|  1.16k|        T0  = _mm_add_epi64(T0, v02);
  572|  1.16k|        T14 = R20;
  573|  1.16k|        v21 = H3;
  574|  1.16k|        v21 = _mm_mul_epu32(v21, T15);
  575|  1.16k|        v31 = H3;
  576|  1.16k|        v31 = _mm_mul_epu32(v31, T14);
  577|  1.16k|        T0  = _mm_add_epi64(T0, v03);
  578|  1.16k|        T4  = H4;
  579|  1.16k|        T4  = _mm_mul_epu32(T4, T14);
  580|  1.16k|        T1  = _mm_add_epi64(T1, v12);
  581|  1.16k|        v04 = H0;
  582|  1.16k|        v04 = _mm_mul_epu32(v04, T14);
  583|  1.16k|        T2  = _mm_add_epi64(T2, v21);
  584|  1.16k|        v13 = H1;
  585|  1.16k|        v13 = _mm_mul_epu32(v13, T14);
  586|  1.16k|        T3  = _mm_add_epi64(T3, v31);
  587|  1.16k|        T15 = R21;
  588|  1.16k|        v22 = H2;
  589|  1.16k|        v22 = _mm_mul_epu32(v22, T14);
  590|  1.16k|        v32 = H2;
  591|  1.16k|        v32 = _mm_mul_epu32(v32, T15);
  592|  1.16k|        T0  = _mm_add_epi64(T0, v04);
  593|  1.16k|        v41 = H3;
  594|  1.16k|        v41 = _mm_mul_epu32(v41, T15);
  595|  1.16k|        T1  = _mm_add_epi64(T1, v13);
  596|  1.16k|        v14 = H0;
  597|  1.16k|        v14 = _mm_mul_epu32(v14, T15);
  598|  1.16k|        T2  = _mm_add_epi64(T2, v22);
  599|  1.16k|        T14 = R22;
  600|  1.16k|        v23 = H1;
  601|  1.16k|        v23 = _mm_mul_epu32(v23, T15);
  602|  1.16k|        T3  = _mm_add_epi64(T3, v32);
  603|  1.16k|        v33 = H1;
  604|  1.16k|        v33 = _mm_mul_epu32(v33, T14);
  605|  1.16k|        T4  = _mm_add_epi64(T4, v41);
  606|  1.16k|        v42 = H2;
  607|  1.16k|        v42 = _mm_mul_epu32(v42, T14);
  608|  1.16k|        T1  = _mm_add_epi64(T1, v14);
  609|  1.16k|        T15 = R23;
  610|  1.16k|        v24 = H0;
  611|  1.16k|        v24 = _mm_mul_epu32(v24, T14);
  612|  1.16k|        T2  = _mm_add_epi64(T2, v23);
  613|  1.16k|        v34 = H0;
  614|  1.16k|        v34 = _mm_mul_epu32(v34, T15);
  615|  1.16k|        T3  = _mm_add_epi64(T3, v33);
  616|  1.16k|        v43 = H1;
  617|  1.16k|        v43 = _mm_mul_epu32(v43, T15);
  618|  1.16k|        T4  = _mm_add_epi64(T4, v42);
  619|  1.16k|        v44 = H0;
  620|  1.16k|        v44 = _mm_mul_epu32(v44, R24);
  621|  1.16k|        T2  = _mm_add_epi64(T2, v24);
  622|  1.16k|        T3  = _mm_add_epi64(T3, v34);
  623|  1.16k|        T4  = _mm_add_epi64(T4, v43);
  624|  1.16k|        T4  = _mm_add_epi64(T4, v44);
  625|       |
  626|       |        /* H += [Mx,My] */
  627|  1.16k|        if (m) {
  ------------------
  |  Branch (627:13): [True: 536, False: 632]
  ------------------
  628|    536|            T5 = _mm_loadu_si128((const xmmi *) (const void *) (m + 0));
  629|    536|            T6 = _mm_loadu_si128((const xmmi *) (const void *) (m + 16));
  630|    536|            T7 = _mm_unpacklo_epi32(T5, T6);
  631|    536|            T8 = _mm_unpackhi_epi32(T5, T6);
  632|    536|            M0 = _mm_unpacklo_epi32(T7, _mm_setzero_si128());
  633|    536|            M1 = _mm_unpackhi_epi32(T7, _mm_setzero_si128());
  634|    536|            M2 = _mm_unpacklo_epi32(T8, _mm_setzero_si128());
  635|    536|            M3 = _mm_unpackhi_epi32(T8, _mm_setzero_si128());
  636|    536|            M1 = _mm_slli_epi64(M1, 6);
  637|    536|            M2 = _mm_slli_epi64(M2, 12);
  638|    536|            M3 = _mm_slli_epi64(M3, 18);
  639|    536|            T0 = _mm_add_epi64(T0, M0);
  640|    536|            T1 = _mm_add_epi64(T1, M1);
  641|    536|            T2 = _mm_add_epi64(T2, M2);
  642|    536|            T3 = _mm_add_epi64(T3, M3);
  643|    536|            T4 = _mm_add_epi64(T4, HIBIT);
  644|    536|        }
  645|       |
  646|       |        /* reduce */
  647|  1.16k|        C1 = _mm_srli_epi64(T0, 26);
  648|  1.16k|        C2 = _mm_srli_epi64(T3, 26);
  649|  1.16k|        T0 = _mm_and_si128(T0, MMASK);
  650|  1.16k|        T3 = _mm_and_si128(T3, MMASK);
  651|  1.16k|        T1 = _mm_add_epi64(T1, C1);
  652|  1.16k|        T4 = _mm_add_epi64(T4, C2);
  653|  1.16k|        C1 = _mm_srli_epi64(T1, 26);
  654|  1.16k|        C2 = _mm_srli_epi64(T4, 26);
  655|  1.16k|        T1 = _mm_and_si128(T1, MMASK);
  656|  1.16k|        T4 = _mm_and_si128(T4, MMASK);
  657|  1.16k|        T2 = _mm_add_epi64(T2, C1);
  658|  1.16k|        T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
  659|  1.16k|        C1 = _mm_srli_epi64(T2, 26);
  660|  1.16k|        C2 = _mm_srli_epi64(T0, 26);
  661|  1.16k|        T2 = _mm_and_si128(T2, MMASK);
  662|  1.16k|        T0 = _mm_and_si128(T0, MMASK);
  663|  1.16k|        T3 = _mm_add_epi64(T3, C1);
  664|  1.16k|        T1 = _mm_add_epi64(T1, C2);
  665|  1.16k|        C1 = _mm_srli_epi64(T3, 26);
  666|  1.16k|        T3 = _mm_and_si128(T3, MMASK);
  667|  1.16k|        T4 = _mm_add_epi64(T4, C1);
  668|       |
  669|       |        /* H = (H*[r^2,r^2] + [Mx,My]) */
  670|  1.16k|        H0 = T0;
  671|  1.16k|        H1 = T1;
  672|  1.16k|        H2 = T2;
  673|  1.16k|        H3 = T3;
  674|  1.16k|        H4 = T4;
  675|  1.16k|    }
  676|       |
  677|  1.58k|    if (m) {
  ------------------
  |  Branch (677:9): [True: 952, False: 632]
  ------------------
  678|    952|        T0 = _mm_shuffle_epi32(H0, _MM_SHUFFLE(0, 0, 2, 0));
  679|    952|        T1 = _mm_shuffle_epi32(H1, _MM_SHUFFLE(0, 0, 2, 0));
  680|    952|        T2 = _mm_shuffle_epi32(H2, _MM_SHUFFLE(0, 0, 2, 0));
  681|    952|        T3 = _mm_shuffle_epi32(H3, _MM_SHUFFLE(0, 0, 2, 0));
  682|    952|        T4 = _mm_shuffle_epi32(H4, _MM_SHUFFLE(0, 0, 2, 0));
  683|    952|        T0 = _mm_unpacklo_epi64(T0, T1);
  684|    952|        T1 = _mm_unpacklo_epi64(T2, T3);
  685|    952|        _mm_storeu_si128((xmmi *) (void *) &st->H.hh[0], T0);
  686|    952|        _mm_storeu_si128((xmmi *) (void *) &st->H.hh[4], T1);
  687|    952|        _mm_storel_epi64((xmmi *) (void *) &st->H.hh[8], T4);
  688|    952|    } else {
  689|    632|        uint32_t t0, t1, t2, t3, t4, b;
  690|    632|        uint64_t h0, h1, h2, g0, g1, g2, c, nc;
  691|       |
  692|       |        /* H = H[0]+H[1] */
  693|    632|        T0 = H0;
  694|    632|        T1 = H1;
  695|    632|        T2 = H2;
  696|    632|        T3 = H3;
  697|    632|        T4 = H4;
  698|       |
  699|    632|        T0 = _mm_add_epi64(T0, _mm_srli_si128(T0, 8));
  700|    632|        T1 = _mm_add_epi64(T1, _mm_srli_si128(T1, 8));
  701|    632|        T2 = _mm_add_epi64(T2, _mm_srli_si128(T2, 8));
  702|    632|        T3 = _mm_add_epi64(T3, _mm_srli_si128(T3, 8));
  703|    632|        T4 = _mm_add_epi64(T4, _mm_srli_si128(T4, 8));
  704|       |
  705|    632|        t0 = _mm_cvtsi128_si32(T0);
  706|    632|        b  = (t0 >> 26);
  707|    632|        t0 &= 0x3ffffff;
  708|    632|        t1 = _mm_cvtsi128_si32(T1) + b;
  709|    632|        b  = (t1 >> 26);
  710|    632|        t1 &= 0x3ffffff;
  711|    632|        t2 = _mm_cvtsi128_si32(T2) + b;
  712|    632|        b  = (t2 >> 26);
  713|    632|        t2 &= 0x3ffffff;
  714|    632|        t3 = _mm_cvtsi128_si32(T3) + b;
  715|    632|        b  = (t3 >> 26);
  716|    632|        t3 &= 0x3ffffff;
  717|    632|        t4 = _mm_cvtsi128_si32(T4) + b;
  718|       |
  719|       |        /* everything except t4 is in range, so this is all safe */
  720|    632|        h0 = (((uint64_t) t0) | ((uint64_t) t1 << 26)) & 0xfffffffffffull;
  721|    632|        h1 = (((uint64_t) t1 >> 18) | ((uint64_t) t2 << 8) |
  722|    632|              ((uint64_t) t3 << 34)) &
  723|    632|             0xfffffffffffull;
  724|    632|        h2 = (((uint64_t) t3 >> 10) | ((uint64_t) t4 << 16));
  725|       |
  726|    632|        c = (h2 >> 42);
  727|    632|        h2 &= 0x3ffffffffff;
  728|    632|        h0 += c * 5;
  729|    632|        c = (h0 >> 44);
  730|    632|        h0 &= 0xfffffffffff;
  731|    632|        h1 += c;
  732|    632|        c = (h1 >> 44);
  733|    632|        h1 &= 0xfffffffffff;
  734|    632|        h2 += c;
  735|    632|        c = (h2 >> 42);
  736|    632|        h2 &= 0x3ffffffffff;
  737|    632|        h0 += c * 5;
  738|    632|        c = (h0 >> 44);
  739|    632|        h0 &= 0xfffffffffff;
  740|    632|        h1 += c;
  741|       |
  742|    632|        g0 = h0 + 5;
  743|    632|        c  = (g0 >> 44);
  744|    632|        g0 &= 0xfffffffffff;
  745|    632|        g1 = h1 + c;
  746|    632|        c  = (g1 >> 44);
  747|    632|        g1 &= 0xfffffffffff;
  748|    632|        g2 = h2 + c - ((uint64_t) 1 << 42);
  749|       |
  750|    632|        c  = (((g2 >> 61) ^ optblocker_u64) >> 2) - 1;
  751|    632|        nc = ~c;
  752|    632|        h0 = (h0 & nc) | (g0 & c);
  753|    632|        h1 = (h1 & nc) | (g1 & c);
  754|    632|        h2 = (h2 & nc) | (g2 & c);
  755|       |
  756|    632|        st->H.h[0] = h0;
  757|    632|        st->H.h[1] = h1;
  758|    632|        st->H.h[2] = h2;
  759|    632|    }
  760|  1.58k|}
poly1305_sse2.c:poly1305_finish_ext:
  809|    632|{ /* Finish the MAC: absorb any remaining input at m, run the finalizing poly1305_blocks pass, add the pad, write 16 bytes to mac and wipe *st. */
  810|    632|    uint64_t h0, h1, h2;
  811|       |
  812|    632|    if (leftover) { /* unprocessed input bytes remain at m */
  ------------------
  |  Branch (812:9): [True: 466, False: 166]
  ------------------
  813|    466|        CRYPTO_ALIGN(16) unsigned char final[32] = { 0 }; /* zero-filled, 16-byte aligned staging block */
  ------------------
  |  |   50|    466|#  define CRYPTO_ALIGN(x) __attribute__ ((aligned(x)))
  ------------------
  814|       |
  815|    466|        poly1305_block_copy31(final, m, leftover);
  816|    466|        if (leftover != 16) { /* a 1 byte is written right after the data, except when exactly 16 bytes are left */
  ------------------
  |  Branch (816:13): [True: 432, False: 34]
  ------------------
  817|    432|            final[leftover] = 1;
  818|    432|        }
  819|    466|        st->flags |=
  820|    466|            (leftover >= 16) ? poly1305_final_shift8 : poly1305_final_shift16; /* flag choice depends on whether at least 16 bytes are left */
  ------------------
  |  Branch (820:13): [True: 252, False: 214]
  ------------------
  821|    466|        poly1305_blocks(st, final, 32); /* the staging block is always processed as a full 32 bytes */
  822|    466|    }
  823|       |
  824|    632|    if (st->flags & poly1305_started) { /* only when the poly1305_started flag is set in st->flags */
  ------------------
  |  Branch (824:9): [True: 632, False: 0]
  ------------------
  825|       |        /* finalize, H *= [r^2,r], or H *= [r,1]; the flag set below selects which one */
  826|    632|        if (!leftover || (leftover > 16)) { /* no leftover at all, or more than 16 leftover bytes */
  ------------------
  |  Branch (826:13): [True: 166, False: 466]
  |  Branch (826:26): [True: 218, False: 248]
  ------------------
  827|    384|            st->flags |= poly1305_final_r2_r;
  828|    384|        } else {
  829|    248|            st->flags |= poly1305_final_r_1;
  830|    248|        }
  831|    632|        poly1305_blocks(st, NULL, 32); /* m == NULL: no message is loaded and the result is stored to st->H.h[0..2] */
  832|    632|    }
  833|       |
  834|    632|    h0 = st->H.h[0];
  835|    632|    h1 = st->H.h[1];
  836|    632|    h2 = st->H.h[2];
  837|       |
  838|       |    /* pad: repack the three limbs into two 64-bit words, then add st->pad */
  839|    632|    h0 = ((h0) | (h1 << 44));
  840|    632|    h1 = ((h1 >> 20) | (h2 << 24));
  841|    632|#ifdef HAVE_AMD64_ASM
  842|    632|    __asm__ __volatile__( /* 128-bit add of st->pad[1]:st->pad[0] into h1:h0; adcq carries out of the low word */
  843|    632|        "addq %2, %0 ;\n"
  844|    632|        "adcq %3, %1 ;\n"
  845|    632|        : "+r"(h0), "+r"(h1)
  846|    632|        : "r"(st->pad[0]), "r"(st->pad[1])
  847|    632|        : "flags", "cc");
  848|       |#else
  849|       |    {
  850|       |        uint128_t h;
  851|       |
  852|       |        memcpy(&h, &st->pad[0], 16);
  853|       |        h += ((uint128_t) h1 << 64) | h0;
  854|       |        h0 = (uint64_t) h;
  855|       |        h1 = (uint64_t)(h >> 64);
  856|       |    }
  857|       |#endif
  858|    632|    _mm_storeu_si128((xmmi *) (void *) st + 0, _mm_setzero_si128()); /* eight unaligned xmmi-sized stores zero the start of *st */
  859|    632|    _mm_storeu_si128((xmmi *) (void *) st + 1, _mm_setzero_si128());
  860|    632|    _mm_storeu_si128((xmmi *) (void *) st + 2, _mm_setzero_si128());
  861|    632|    _mm_storeu_si128((xmmi *) (void *) st + 3, _mm_setzero_si128());
  862|    632|    _mm_storeu_si128((xmmi *) (void *) st + 4, _mm_setzero_si128());
  863|    632|    _mm_storeu_si128((xmmi *) (void *) st + 5, _mm_setzero_si128());
  864|    632|    _mm_storeu_si128((xmmi *) (void *) st + 6, _mm_setzero_si128());
  865|    632|    _mm_storeu_si128((xmmi *) (void *) st + 7, _mm_setzero_si128());
  866|       |
  867|    632|    memcpy(&mac[0], &h0, 8); /* tag bytes 0..7 come from h0, 8..15 from h1, copied in native byte order */
  868|    632|    memcpy(&mac[8], &h1, 8);
  869|       |
  870|    632|    sodium_memzero((void *) st, sizeof *st); /* clears all sizeof *st bytes of the state */
  871|    632|}
poly1305_sse2.c:poly1305_block_copy31:
   77|    466|{ /* Copy (bytes & 31) bytes from src to dst, one power-of-two sized chunk per set bit of bytes, largest first. */
   78|    466|    if (bytes & 16) {
  ------------------
  |  Branch (78:9): [True: 252, False: 214]
  ------------------
   79|    252|        _mm_store_si128((xmmi *) (void *) dst, /* aligned store: dst must be 16-byte aligned at this point */
   80|    252|                        _mm_loadu_si128((const xmmi *) (const void *) src)); /* unaligned load: src may have any alignment */
   81|    252|        src += 16;
   82|    252|        dst += 16;
   83|    252|    }
   84|    466|    if (bytes & 8) {
  ------------------
  |  Branch (84:9): [True: 264, False: 202]
  ------------------
   85|    264|        memcpy(dst, src, 8);
   86|    264|        src += 8;
   87|    264|        dst += 8;
   88|    264|    }
   89|    466|    if (bytes & 4) {
  ------------------
  |  Branch (89:9): [True: 276, False: 190]
  ------------------
   90|    276|        memcpy(dst, src, 4);
   91|    276|        src += 4;
   92|    276|        dst += 4;
   93|    276|    }
   94|    466|    if (bytes & 2) {
  ------------------
  |  Branch (94:9): [True: 280, False: 186]
  ------------------
   95|    280|        memcpy(dst, src, 2);
   96|    280|        src += 2;
   97|    280|        dst += 2;
   98|    280|    }
   99|    466|    if (bytes & 1) { /* last single byte; the pointers are not advanced afterwards */
  ------------------
  |  Branch (99:9): [True: 340, False: 126]
  ------------------
  100|    340|        *dst = *src;
  101|    340|    }
  102|    466|}
poly1305_sse2.c:crypto_onetimeauth_poly1305_sse2_verify:
  934|    316|{ /* Recompute the 16-byte tag of in (inlen bytes) under key k and compare it with h; returns the result of crypto_verify_16. */
  935|    316|    unsigned char correct[16]; /* freshly computed tag */
  936|       |
  937|    316|    crypto_onetimeauth_poly1305_sse2(correct, in, inlen, k);
  938|       |
  939|    316|    return crypto_verify_16(h, correct); /* NOTE(review): presumably a constant-time compare; confirm in crypto_verify_16 */
  940|    316|}
poly1305_sse2.c:crypto_onetimeauth_poly1305_sse2_init:
  882|    316|{ /* Initialize state for key through poly1305_init_ext; always returns 0. */
  883|    316|    COMPILER_ASSERT(sizeof(crypto_onetimeauth_poly1305_state) >=
  ------------------
  |  |   23|    316|#define COMPILER_ASSERT(X) (void) sizeof(char[(X) ? 1 : -1])
  ------------------
  884|    316|                    sizeof(poly1305_state_internal_t)); /* compile-time check: the public state is large enough to hold the internal state */
  885|    316|    poly1305_init_ext((poly1305_state_internal_t *) (void *) state, key, 0U); /* public state storage is reinterpreted as the internal state; NOTE(review): meaning of the trailing 0U is not visible here */
  886|       |
  887|    316|    return 0;
  888|    316|}
poly1305_sse2.c:crypto_onetimeauth_poly1305_sse2_update:
  894|    316|{ /* Absorb inlen bytes from in into state via poly1305_update; always returns 0. */
  895|    316|    poly1305_update((poly1305_state_internal_t *) (void *) state, in, inlen); /* public state storage is reinterpreted as the internal state */
  896|       |
  897|    316|    return 0;
  898|    316|}
poly1305_sse2.c:poly1305_update:
  765|    316|{ /* Feed bytes bytes from m into st: complete and flush a buffered partial block, process whole 32-byte blocks directly from m, then buffer the remainder. */
  766|    316|    unsigned long long i;
  767|       |
  768|       |    /* handle leftover: top up st->buffer with new input before anything else */
  769|    316|    if (st->leftover) {
  ------------------
  |  Branch (769:9): [True: 0, False: 316]
  ------------------
  770|      0|        unsigned long long want = (poly1305_block_size - st->leftover); /* bytes missing to complete a buffered block */
  ------------------
  |  |   32|      0|# define poly1305_block_size 32
  ------------------
  771|       |
  772|      0|        if (want > bytes) { /* never take more than the caller supplied */
  ------------------
  |  Branch (772:13): [True: 0, False: 0]
  ------------------
  773|      0|            want = bytes;
  774|      0|        }
  775|      0|        for (i = 0; i < want; i++) {
  ------------------
  |  Branch (775:21): [True: 0, False: 0]
  ------------------
  776|      0|            st->buffer[st->leftover + i] = m[i];
  777|      0|        }
  778|      0|        bytes -= want;
  779|      0|        m += want;
  780|      0|        st->leftover += want;
  781|      0|        if (st->leftover < poly1305_block_size) { /* buffer still not full: all input was consumed, nothing to process yet */
  ------------------
  |  |   32|      0|# define poly1305_block_size 32
  ------------------
  |  Branch (781:13): [True: 0, False: 0]
  ------------------
  782|      0|            return;
  783|      0|        }
  784|      0|        poly1305_blocks(st, st->buffer, poly1305_block_size); /* buffer is full: process it as one block */
  ------------------
  |  |   32|      0|# define poly1305_block_size 32
  ------------------
  785|      0|        st->leftover = 0;
  786|      0|    }
  787|       |
  788|       |    /* process full blocks straight from m, without going through st->buffer */
  789|    316|    if (bytes >= poly1305_block_size) {
  ------------------
  |  |   32|    316|# define poly1305_block_size 32
  ------------------
  |  Branch (789:9): [True: 243, False: 73]
  ------------------
  790|    243|        unsigned long long want = (bytes & ~(poly1305_block_size - 1)); /* bytes rounded down to a multiple of 32 */
  ------------------
  |  |   32|    243|# define poly1305_block_size 32
  ------------------
  791|       |
  792|    243|        poly1305_blocks(st, m, want);
  793|    243|        m += want;
  794|    243|        bytes -= want;
  795|    243|    }
  796|       |
  797|       |    /* store leftover: fewer than 32 bytes remain and st->leftover is 0 here */
  798|    316|    if (bytes) {
  ------------------
  |  Branch (798:9): [True: 233, False: 83]
  ------------------
  799|  4.30k|        for (i = 0; i < bytes; i++) {
  ------------------
  |  Branch (799:21): [True: 4.07k, False: 233]
  ------------------
  800|  4.07k|            st->buffer[st->leftover + i] = m[i];
  801|  4.07k|        }
  802|    233|        st->leftover += bytes;
  803|    233|    }
  804|    316|}
poly1305_sse2.c:crypto_onetimeauth_poly1305_sse2_final:
  903|    316|{ /* Finish the MAC held in state via poly1305_finish, which writes the tag to out; always returns 0. */
  904|    316|    poly1305_finish((poly1305_state_internal_t *) (void *) state, out); /* public state storage is reinterpreted as the internal state */
  905|       |
  906|    316|    return 0;
  907|    316|}
poly1305_sse2.c:poly1305_finish:
  875|    316|{ /* Finish the MAC using the bytes still buffered in st and write the tag to mac. */
  876|    316|    poly1305_finish_ext(st, st->buffer, st->leftover, mac); /* st->buffer holds st->leftover pending bytes */
  877|    316|}

_crypto_pwhash_argon2_pick_best_implementation:
  554|      1|{ /* Thin wrapper: forwards to argon2_pick_best_implementation and returns its result. */
  555|      1|    return argon2_pick_best_implementation();
  556|      1|}
argon2-core.c:argon2_pick_best_implementation:
  523|      1|{ /* Point fill_segment at the first variant that is both compiled in and reported by the runtime checks, tried in the order AVX-512F, AVX2, SSSE3, ref; always returns 0. */
  524|       |/* LCOV_EXCL_START */
  525|      1|#if defined(HAVE_AVX512FINTRIN_H) && defined(HAVE_AVX2INTRIN_H) && \
  526|      1|    defined(HAVE_TMMINTRIN_H) && defined(HAVE_SMMINTRIN_H) && \
  527|      1|    !defined(__APPLE__)
  528|      1|    if (sodium_runtime_has_avx512f()) { /* compiled out entirely on __APPLE__ builds */
  ------------------
  |  Branch (528:9): [True: 0, False: 1]
  ------------------
  529|      0|        fill_segment = argon2_fill_segment_avx512f;
  530|      0|        return 0;
  531|      0|    }
  532|      1|#endif
  533|      1|#if defined(HAVE_AVX2INTRIN_H) && defined(HAVE_TMMINTRIN_H) && \
  534|      1|    defined(HAVE_SMMINTRIN_H)
  535|      1|    if (sodium_runtime_has_avx2()) {
  ------------------
  |  Branch (535:9): [True: 1, False: 0]
  ------------------
  536|      1|        fill_segment = argon2_fill_segment_avx2;
  537|      1|        return 0;
  538|      1|    }
  539|      0|#endif
  540|      0|#if defined(HAVE_EMMINTRIN_H) && defined(HAVE_TMMINTRIN_H)
  541|      0|    if (sodium_runtime_has_ssse3()) {
  ------------------
  |  Branch (541:9): [True: 0, False: 0]
  ------------------
  542|      0|        fill_segment = argon2_fill_segment_ssse3;
  543|      0|        return 0;
  544|      0|    }
  545|      0|#endif
  546|      0|    fill_segment = argon2_fill_segment_ref; /* fallback when none of the runtime checks above returned early */
  547|       |
  548|      0|    return 0;
  549|       |    /* LCOV_EXCL_STOP */
  550|      0|}

_crypto_scalarmult_curve25519_pick_best_implementation:
   51|      1|{ /* Select the curve25519 scalarmult implementation: ref10 by default, sandy2x when built with HAVE_AVX_ASM and AVX is reported at run time; always returns 0. */
   52|      1|    implementation = &crypto_scalarmult_curve25519_ref10_implementation; /* default, kept unless overridden below */
   53|       |
   54|      1|#ifdef HAVE_AVX_ASM
   55|      1|    if (sodium_runtime_has_avx()) {
  ------------------
  |  Branch (55:9): [True: 1, False: 0]
  ------------------
   56|      1|        implementation = &crypto_scalarmult_curve25519_sandy2x_implementation;
   57|      1|    }
   58|      1|#endif
   59|      1|    return 0;
   60|      1|}

crypto_secretbox_keygen:
   65|    316|{ /* Fill k with crypto_secretbox_KEYBYTES bytes obtained from randombytes_buf. */
   66|    316|    randombytes_buf(k, crypto_secretbox_KEYBYTES); /* the constant expands to 32U, so k must have room for 32 bytes */
  ------------------
  |  |   16|    316|#define crypto_secretbox_KEYBYTES crypto_secretbox_xsalsa20poly1305_KEYBYTES
  |  |  ------------------
  |  |  |  |   15|    316|#define crypto_secretbox_xsalsa20poly1305_KEYBYTES 32U
  |  |  ------------------
  ------------------
   67|    316|}

crypto_secretbox_detached:
   21|    316|{
   22|    316|    crypto_onetimeauth_poly1305_state state;
   23|    316|    unsigned char                     block0[64U];
   24|    316|    unsigned char                     subkey[crypto_stream_salsa20_KEYBYTES];
   25|    316|    unsigned long long                i;
   26|    316|    unsigned long long                mlen0;
   27|       |
   28|    316|    crypto_core_hsalsa20(subkey, n, k, NULL);
   29|       |
   30|    316|    if (((uintptr_t) c > (uintptr_t) m &&
  ------------------
  |  Branch (30:10): [True: 306, False: 10]
  ------------------
   31|    316|         (uintptr_t) c - (uintptr_t) m < mlen) ||
  ------------------
  |  Branch (31:10): [True: 0, False: 306]
  ------------------
   32|    316|        ((uintptr_t) m > (uintptr_t) c &&
  ------------------
  |  Branch (32:10): [True: 10, False: 306]
  ------------------
   33|    316|         (uintptr_t) m - (uintptr_t) c < mlen)) { /* LCOV_EXCL_LINE */
  ------------------
  |  Branch (33:10): [True: 0, False: 10]
  ------------------
   34|      0|        memmove(c, m, mlen);
   35|      0|        m = c;
   36|      0|    }
   37|    316|    memset(block0, 0U, crypto_secretbox_ZEROBYTES);
  ------------------
  |  |   70|    316|#define crypto_secretbox_ZEROBYTES crypto_secretbox_xsalsa20poly1305_ZEROBYTES
  |  |  ------------------
  |  |  |  |   60|    316|    (crypto_secretbox_xsalsa20poly1305_BOXZEROBYTES + \
  |  |  |  |  ------------------
  |  |  |  |  |  |   55|    316|#define crypto_secretbox_xsalsa20poly1305_BOXZEROBYTES 16U
  |  |  |  |  ------------------
  |  |  |  |   61|    316|     crypto_secretbox_xsalsa20poly1305_MACBYTES)
  |  |  |  |  ------------------
  |  |  |  |  |  |   23|    316|#define crypto_secretbox_xsalsa20poly1305_MACBYTES 16U
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   38|    316|    COMPILER_ASSERT(64U >= crypto_secretbox_ZEROBYTES);
  ------------------
  |  |   23|    316|#define COMPILER_ASSERT(X) (void) sizeof(char[(X) ? 1 : -1])
  ------------------
   39|    316|    mlen0 = mlen;
   40|    316|    if (mlen0 > 64U - crypto_secretbox_ZEROBYTES) {
  ------------------
  |  |   70|    316|#define crypto_secretbox_ZEROBYTES crypto_secretbox_xsalsa20poly1305_ZEROBYTES
  |  |  ------------------
  |  |  |  |   60|    316|    (crypto_secretbox_xsalsa20poly1305_BOXZEROBYTES + \
  |  |  |  |  ------------------
  |  |  |  |  |  |   55|    316|#define crypto_secretbox_xsalsa20poly1305_BOXZEROBYTES 16U
  |  |  |  |  ------------------
  |  |  |  |   61|    316|     crypto_secretbox_xsalsa20poly1305_MACBYTES)
  |  |  |  |  ------------------
  |  |  |  |  |  |   23|    316|#define crypto_secretbox_xsalsa20poly1305_MACBYTES 16U
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  |  Branch (40:9): [True: 242, False: 74]
  ------------------
   41|    242|        mlen0 = 64U - crypto_secretbox_ZEROBYTES;
  ------------------
  |  |   70|    242|#define crypto_secretbox_ZEROBYTES crypto_secretbox_xsalsa20poly1305_ZEROBYTES
  |  |  ------------------
  |  |  |  |   60|    242|    (crypto_secretbox_xsalsa20poly1305_BOXZEROBYTES + \
  |  |  |  |  ------------------
  |  |  |  |  |  |   55|    242|#define crypto_secretbox_xsalsa20poly1305_BOXZEROBYTES 16U
  |  |  |  |  ------------------
  |  |  |  |   61|    242|     crypto_secretbox_xsalsa20poly1305_MACBYTES)
  |  |  |  |  ------------------
  |  |  |  |  |  |   23|    242|#define crypto_secretbox_xsalsa20poly1305_MACBYTES 16U
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   42|    242|    }
   43|  8.84k|    for (i = 0U; i < mlen0; i++) {
  ------------------
  |  Branch (43:18): [True: 8.53k, False: 316]
  ------------------
   44|  8.53k|        block0[i + crypto_secretbox_ZEROBYTES] = m[i];
  ------------------
  |  |   70|  8.53k|#define crypto_secretbox_ZEROBYTES crypto_secretbox_xsalsa20poly1305_ZEROBYTES
  |  |  ------------------
  |  |  |  |   60|  8.53k|    (crypto_secretbox_xsalsa20poly1305_BOXZEROBYTES + \
  |  |  |  |  ------------------
  |  |  |  |  |  |   55|  8.53k|#define crypto_secretbox_xsalsa20poly1305_BOXZEROBYTES 16U
  |  |  |  |  ------------------
  |  |  |  |   61|  8.53k|     crypto_secretbox_xsalsa20poly1305_MACBYTES)
  |  |  |  |  ------------------
  |  |  |  |  |  |   23|  8.53k|#define crypto_secretbox_xsalsa20poly1305_MACBYTES 16U
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   45|  8.53k|    }
   46|    316|    crypto_stream_salsa20_xor(block0, block0,
   47|    316|                              mlen0 + crypto_secretbox_ZEROBYTES,
  ------------------
  |  |   70|    316|#define crypto_secretbox_ZEROBYTES crypto_secretbox_xsalsa20poly1305_ZEROBYTES
  |  |  ------------------
  |  |  |  |   60|    316|    (crypto_secretbox_xsalsa20poly1305_BOXZEROBYTES + \
  |  |  |  |  ------------------
  |  |  |  |  |  |   55|    316|#define crypto_secretbox_xsalsa20poly1305_BOXZEROBYTES 16U
  |  |  |  |  ------------------
  |  |  |  |   61|    316|     crypto_secretbox_xsalsa20poly1305_MACBYTES)
  |  |  |  |  ------------------
  |  |  |  |  |  |   23|    316|#define crypto_secretbox_xsalsa20poly1305_MACBYTES 16U
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   48|    316|                              n + 16, subkey);
   49|    316|    COMPILER_ASSERT(crypto_secretbox_ZEROBYTES >=
  ------------------
  |  |   23|    316|#define COMPILER_ASSERT(X) (void) sizeof(char[(X) ? 1 : -1])
  ------------------
   50|    316|                    crypto_onetimeauth_poly1305_KEYBYTES);
   51|    316|    crypto_onetimeauth_poly1305_init(&state, block0);
   52|       |
   53|  8.84k|    for (i = 0U; i < mlen0; i++) {
  ------------------
  |  Branch (53:18): [True: 8.53k, False: 316]
  ------------------
   54|  8.53k|        c[i] = block0[crypto_secretbox_ZEROBYTES + i];
  ------------------
  |  |   70|  8.53k|#define crypto_secretbox_ZEROBYTES crypto_secretbox_xsalsa20poly1305_ZEROBYTES
  |  |  ------------------
  |  |  |  |   60|  8.53k|    (crypto_secretbox_xsalsa20poly1305_BOXZEROBYTES + \
  |  |  |  |  ------------------
  |  |  |  |  |  |   55|  8.53k|#define crypto_secretbox_xsalsa20poly1305_BOXZEROBYTES 16U
  |  |  |  |  ------------------
  |  |  |  |   61|  8.53k|     crypto_secretbox_xsalsa20poly1305_MACBYTES)
  |  |  |  |  ------------------
  |  |  |  |  |  |   23|  8.53k|#define crypto_secretbox_xsalsa20poly1305_MACBYTES 16U
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   55|  8.53k|    }
   56|    316|    sodium_memzero(block0, sizeof block0);
   57|    316|    if (mlen > mlen0) {
  ------------------
  |  Branch (57:9): [True: 242, False: 74]
  ------------------
   58|    242|        crypto_stream_salsa20_xor_ic(c + mlen0, m + mlen0, mlen - mlen0,
   59|    242|                                     n + 16, 1U, subkey);
   60|    242|    }
   61|    316|    sodium_memzero(subkey, sizeof subkey);
   62|       |
   63|    316|    crypto_onetimeauth_poly1305_update(&state, c, mlen);
   64|    316|    crypto_onetimeauth_poly1305_final(&state, mac);
   65|    316|    sodium_memzero(&state, sizeof state);
   66|       |
   67|    316|    return 0;
   68|    316|}
crypto_secretbox_easy:
   74|    316|{
   75|    316|    if (mlen > crypto_secretbox_MESSAGEBYTES_MAX) {
  ------------------
  |  |   32|    316|#define crypto_secretbox_MESSAGEBYTES_MAX crypto_secretbox_xsalsa20poly1305_MESSAGEBYTES_MAX
  |  |  ------------------
  |  |  |  |   29|    316|    (crypto_stream_xsalsa20_MESSAGEBYTES_MAX - crypto_secretbox_xsalsa20poly1305_MACBYTES)
  |  |  |  |  ------------------
  |  |  |  |  |  |   31|    316|#define crypto_stream_xsalsa20_MESSAGEBYTES_MAX SODIUM_SIZE_MAX
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   55|    316|#define SODIUM_SIZE_MAX SODIUM_MIN(UINT64_MAX, SIZE_MAX)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   54|    316|#define SODIUM_MIN(A, B) ((A) < (B) ? (A) : (B))
  |  |  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |  |  Branch (54:27): [Folded - Ignored]
  |  |  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |                   (crypto_stream_xsalsa20_MESSAGEBYTES_MAX - crypto_secretbox_xsalsa20poly1305_MACBYTES)
  |  |  |  |  ------------------
  |  |  |  |  |  |   23|    316|#define crypto_secretbox_xsalsa20poly1305_MACBYTES 16U
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  |  Branch (75:9): [True: 0, False: 316]
  ------------------
   76|      0|        sodium_misuse();
   77|      0|    }
   78|    316|    return crypto_secretbox_detached(c + crypto_secretbox_MACBYTES,
  ------------------
  |  |   24|    316|#define crypto_secretbox_MACBYTES crypto_secretbox_xsalsa20poly1305_MACBYTES
  |  |  ------------------
  |  |  |  |   23|    316|#define crypto_secretbox_xsalsa20poly1305_MACBYTES 16U
  |  |  ------------------
  ------------------
   79|    316|                                     c, m, mlen, n, k);
   80|    316|}
crypto_secretbox_open_detached:
   88|    316|{
   89|    316|    unsigned char      block0[64U];
   90|    316|    unsigned char      subkey[crypto_stream_salsa20_KEYBYTES];
   91|    316|    unsigned long long i;
   92|    316|    unsigned long long mlen0;
   93|       |
   94|    316|    crypto_core_hsalsa20(subkey, n, k, NULL);
   95|    316|    crypto_stream_salsa20(block0, crypto_stream_salsa20_KEYBYTES,
  ------------------
  |  |   23|    316|#define crypto_stream_salsa20_KEYBYTES 32U
  ------------------
   96|    316|                          n + 16, subkey);
   97|    316|    if (crypto_onetimeauth_poly1305_verify(mac, c, clen, block0) != 0) {
  ------------------
  |  Branch (97:9): [True: 0, False: 316]
  ------------------
   98|      0|        sodium_memzero(subkey, sizeof subkey);
   99|      0|        return -1;
  100|      0|    }
  101|    316|    if (m == NULL) {
  ------------------
  |  Branch (101:9): [True: 0, False: 316]
  ------------------
  102|      0|        return 0;
  103|      0|    }
  104|    316|    if (((uintptr_t) c > (uintptr_t) m &&
  ------------------
  |  Branch (104:10): [True: 71, False: 245]
  ------------------
  105|    316|         (uintptr_t) c - (uintptr_t) m < clen) ||
  ------------------
  |  Branch (105:10): [True: 0, False: 71]
  ------------------
  106|    316|        ((uintptr_t) m > (uintptr_t) c &&
  ------------------
  |  Branch (106:10): [True: 245, False: 71]
  ------------------
  107|    316|         (uintptr_t) m - (uintptr_t) c < clen)) { /* LCOV_EXCL_LINE */
  ------------------
  |  Branch (107:10): [True: 0, False: 245]
  ------------------
  108|      0|        memmove(m, c, clen);
  109|      0|        c = m;
  110|      0|    }
  111|    316|    mlen0 = clen;
  112|    316|    if (mlen0 > 64U - crypto_secretbox_ZEROBYTES) {
  ------------------
  |  |   70|    316|#define crypto_secretbox_ZEROBYTES crypto_secretbox_xsalsa20poly1305_ZEROBYTES
  |  |  ------------------
  |  |  |  |   60|    316|    (crypto_secretbox_xsalsa20poly1305_BOXZEROBYTES + \
  |  |  |  |  ------------------
  |  |  |  |  |  |   55|    316|#define crypto_secretbox_xsalsa20poly1305_BOXZEROBYTES 16U
  |  |  |  |  ------------------
  |  |  |  |   61|    316|     crypto_secretbox_xsalsa20poly1305_MACBYTES)
  |  |  |  |  ------------------
  |  |  |  |  |  |   23|    316|#define crypto_secretbox_xsalsa20poly1305_MACBYTES 16U
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  |  Branch (112:9): [True: 242, False: 74]
  ------------------
  113|    242|        mlen0 = 64U - crypto_secretbox_ZEROBYTES;
  ------------------
  |  |   70|    242|#define crypto_secretbox_ZEROBYTES crypto_secretbox_xsalsa20poly1305_ZEROBYTES
  |  |  ------------------
  |  |  |  |   60|    242|    (crypto_secretbox_xsalsa20poly1305_BOXZEROBYTES + \
  |  |  |  |  ------------------
  |  |  |  |  |  |   55|    242|#define crypto_secretbox_xsalsa20poly1305_BOXZEROBYTES 16U
  |  |  |  |  ------------------
  |  |  |  |   61|    242|     crypto_secretbox_xsalsa20poly1305_MACBYTES)
  |  |  |  |  ------------------
  |  |  |  |  |  |   23|    242|#define crypto_secretbox_xsalsa20poly1305_MACBYTES 16U
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  114|    242|    }
  115|  8.84k|    for (i = 0U; i < mlen0; i++) {
  ------------------
  |  Branch (115:18): [True: 8.53k, False: 316]
  ------------------
  116|  8.53k|        block0[crypto_secretbox_ZEROBYTES + i] = c[i];
  ------------------
  |  |   70|  8.53k|#define crypto_secretbox_ZEROBYTES crypto_secretbox_xsalsa20poly1305_ZEROBYTES
  |  |  ------------------
  |  |  |  |   60|  8.53k|    (crypto_secretbox_xsalsa20poly1305_BOXZEROBYTES + \
  |  |  |  |  ------------------
  |  |  |  |  |  |   55|  8.53k|#define crypto_secretbox_xsalsa20poly1305_BOXZEROBYTES 16U
  |  |  |  |  ------------------
  |  |  |  |   61|  8.53k|     crypto_secretbox_xsalsa20poly1305_MACBYTES)
  |  |  |  |  ------------------
  |  |  |  |  |  |   23|  8.53k|#define crypto_secretbox_xsalsa20poly1305_MACBYTES 16U
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  117|  8.53k|    }
  118|    316|    crypto_stream_salsa20_xor(block0, block0,
  119|    316|                              crypto_secretbox_ZEROBYTES + mlen0,
  ------------------
  |  |   70|    316|#define crypto_secretbox_ZEROBYTES crypto_secretbox_xsalsa20poly1305_ZEROBYTES
  |  |  ------------------
  |  |  |  |   60|    316|    (crypto_secretbox_xsalsa20poly1305_BOXZEROBYTES + \
  |  |  |  |  ------------------
  |  |  |  |  |  |   55|    316|#define crypto_secretbox_xsalsa20poly1305_BOXZEROBYTES 16U
  |  |  |  |  ------------------
  |  |  |  |   61|    316|     crypto_secretbox_xsalsa20poly1305_MACBYTES)
  |  |  |  |  ------------------
  |  |  |  |  |  |   23|    316|#define crypto_secretbox_xsalsa20poly1305_MACBYTES 16U
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  120|    316|                              n + 16, subkey);
  121|  8.84k|    for (i = 0U; i < mlen0; i++) {
  ------------------
  |  Branch (121:18): [True: 8.53k, False: 316]
  ------------------
  122|  8.53k|        m[i] = block0[i + crypto_secretbox_ZEROBYTES];
  ------------------
  |  |   70|  8.53k|#define crypto_secretbox_ZEROBYTES crypto_secretbox_xsalsa20poly1305_ZEROBYTES
  |  |  ------------------
  |  |  |  |   60|  8.53k|    (crypto_secretbox_xsalsa20poly1305_BOXZEROBYTES + \
  |  |  |  |  ------------------
  |  |  |  |  |  |   55|  8.53k|#define crypto_secretbox_xsalsa20poly1305_BOXZEROBYTES 16U
  |  |  |  |  ------------------
  |  |  |  |   61|  8.53k|     crypto_secretbox_xsalsa20poly1305_MACBYTES)
  |  |  |  |  ------------------
  |  |  |  |  |  |   23|  8.53k|#define crypto_secretbox_xsalsa20poly1305_MACBYTES 16U
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  123|  8.53k|    }
  124|    316|    sodium_memzero(block0, sizeof block0);
  125|    316|    if (clen > mlen0) {
  ------------------
  |  Branch (125:9): [True: 242, False: 74]
  ------------------
  126|    242|        crypto_stream_salsa20_xor_ic(m + mlen0, c + mlen0, clen - mlen0,
  127|    242|                                     n + 16, 1U, subkey);
  128|    242|    }
  129|    316|    sodium_memzero(subkey, sizeof subkey);
  130|       |
  131|    316|    return 0;
  132|    316|}
crypto_secretbox_open_easy:
  138|    316|{
  139|    316|    if (clen < crypto_secretbox_MACBYTES) {
  ------------------
  |  |   24|    316|#define crypto_secretbox_MACBYTES crypto_secretbox_xsalsa20poly1305_MACBYTES
  |  |  ------------------
  |  |  |  |   23|    316|#define crypto_secretbox_xsalsa20poly1305_MACBYTES 16U
  |  |  ------------------
  ------------------
  |  Branch (139:9): [True: 0, False: 316]
  ------------------
  140|      0|        return -1;
  141|      0|    }
  142|    316|    return crypto_secretbox_open_detached(m, c + crypto_secretbox_MACBYTES, c,
  ------------------
  |  |   24|    316|#define crypto_secretbox_MACBYTES crypto_secretbox_xsalsa20poly1305_MACBYTES
  |  |  ------------------
  |  |  |  |   23|    316|#define crypto_secretbox_xsalsa20poly1305_MACBYTES 16U
  |  |  ------------------
  ------------------
  143|    316|                                          clen - crypto_secretbox_MACBYTES,
  ------------------
  |  |   24|    316|#define crypto_secretbox_MACBYTES crypto_secretbox_xsalsa20poly1305_MACBYTES
  |  |  ------------------
  |  |  |  |   23|    316|#define crypto_secretbox_xsalsa20poly1305_MACBYTES 16U
  |  |  ------------------
  ------------------
  144|    316|                                          n, k);
  145|    316|}

chacha20_dolbeau-avx2.c:chacha_keysetup:
   37|    632|{
   38|    632|    ctx->input[0]  = 0x61707865;
   39|    632|    ctx->input[1]  = 0x3320646e;
   40|    632|    ctx->input[2]  = 0x79622d32;
   41|    632|    ctx->input[3]  = 0x6b206574;
   42|    632|    ctx->input[4]  = LOAD32_LE(k + 0);
  ------------------
  |  |  111|    632|#define LOAD32_LE(SRC) load32_le(SRC)
  ------------------
   43|    632|    ctx->input[5]  = LOAD32_LE(k + 4);
  ------------------
  |  |  111|    632|#define LOAD32_LE(SRC) load32_le(SRC)
  ------------------
   44|    632|    ctx->input[6]  = LOAD32_LE(k + 8);
  ------------------
  |  |  111|    632|#define LOAD32_LE(SRC) load32_le(SRC)
  ------------------
   45|    632|    ctx->input[7]  = LOAD32_LE(k + 12);
  ------------------
  |  |  111|    632|#define LOAD32_LE(SRC) load32_le(SRC)
  ------------------
   46|    632|    ctx->input[8]  = LOAD32_LE(k + 16);
  ------------------
  |  |  111|    632|#define LOAD32_LE(SRC) load32_le(SRC)
  ------------------
   47|    632|    ctx->input[9]  = LOAD32_LE(k + 20);
  ------------------
  |  |  111|    632|#define LOAD32_LE(SRC) load32_le(SRC)
  ------------------
   48|    632|    ctx->input[10] = LOAD32_LE(k + 24);
  ------------------
  |  |  111|    632|#define LOAD32_LE(SRC) load32_le(SRC)
  ------------------
   49|    632|    ctx->input[11] = LOAD32_LE(k + 28);
  ------------------
  |  |  111|    632|#define LOAD32_LE(SRC) load32_le(SRC)
  ------------------
   50|    632|}
chacha20_dolbeau-avx2.c:chacha20_encrypt_bytes:
   73|    632|{
   74|    632|    uint32_t * const x = &ctx->input[0];
   75|       |
   76|    632|    if (!bytes) {
  ------------------
  |  Branch (76:9): [True: 0, False: 632]
  ------------------
   77|      0|        return; /* LCOV_EXCL_LINE */
   78|      0|    }
   79|    632|# include "u8.h"
  ------------------
  |  |    1|    632|
  |  |    2|    632|#define VEC8_ROT(A, IMM) \
  |  |    3|    632|    _mm256_or_si256(_mm256_slli_epi32(A, IMM), _mm256_srli_epi32(A, (32 - IMM)))
  |  |    4|       |
  |  |    5|       |/* same, but replace 2 of the shift/shift/or "rotation" by byte shuffles (8 &
  |  |    6|       | * 16) (better) */
  |  |    7|    632|#define VEC8_QUARTERROUND_SHUFFLE(A, B, C, D)  \
  |  |    8|    632|    x_##A = _mm256_add_epi32(x_##A, x_##B);    \
  |  |    9|    632|    t_##A = _mm256_xor_si256(x_##D, x_##A);    \
  |  |   10|    632|    x_##D = _mm256_shuffle_epi8(t_##A, rot16); \
  |  |   11|    632|    x_##C = _mm256_add_epi32(x_##C, x_##D);    \
  |  |   12|    632|    t_##C = _mm256_xor_si256(x_##B, x_##C);    \
  |  |   13|    632|    x_##B = VEC8_ROT(t_##C, 12);               \
  |  |   14|    632|    x_##A = _mm256_add_epi32(x_##A, x_##B);    \
  |  |   15|    632|    t_##A = _mm256_xor_si256(x_##D, x_##A);    \
  |  |   16|    632|    x_##D = _mm256_shuffle_epi8(t_##A, rot8);  \
  |  |   17|    632|    x_##C = _mm256_add_epi32(x_##C, x_##D);    \
  |  |   18|    632|    t_##C = _mm256_xor_si256(x_##B, x_##C);    \
  |  |   19|    632|    x_##B = VEC8_ROT(t_##C, 7)
  |  |   20|       |
  |  |   21|    632|#define VEC8_QUARTERROUND(A, B, C, D) VEC8_QUARTERROUND_SHUFFLE(A, B, C, D)
  |  |   22|       |
  |  |   23|    632|#define VEC8_LINE1(A, B, C, D)              \
  |  |   24|    632|    x_##A = _mm256_add_epi32(x_##A, x_##B); \
  |  |   25|    632|    x_##D = _mm256_shuffle_epi8(_mm256_xor_si256(x_##D, x_##A), rot16)
  |  |   26|    632|#define VEC8_LINE2(A, B, C, D)              \
  |  |   27|    632|    x_##C = _mm256_add_epi32(x_##C, x_##D); \
  |  |   28|    632|    x_##B = VEC8_ROT(_mm256_xor_si256(x_##B, x_##C), 12)
  |  |   29|    632|#define VEC8_LINE3(A, B, C, D)              \
  |  |   30|    632|    x_##A = _mm256_add_epi32(x_##A, x_##B); \
  |  |   31|    632|    x_##D = _mm256_shuffle_epi8(_mm256_xor_si256(x_##D, x_##A), rot8)
  |  |   32|    632|#define VEC8_LINE4(A, B, C, D)              \
  |  |   33|    632|    x_##C = _mm256_add_epi32(x_##C, x_##D); \
  |  |   34|    632|    x_##B = VEC8_ROT(_mm256_xor_si256(x_##B, x_##C), 7)
  |  |   35|       |
  |  |   36|    632|#define VEC8_ROUND_SEQ(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, B4, \
  |  |   37|    632|                       C4, D4)                                                 \
  |  |   38|    632|    VEC8_LINE1(A1, B1, C1, D1);                                                \
  |  |   39|    632|    VEC8_LINE1(A2, B2, C2, D2);                                                \
  |  |   40|    632|    VEC8_LINE1(A3, B3, C3, D3);                                                \
  |  |   41|    632|    VEC8_LINE1(A4, B4, C4, D4);                                                \
  |  |   42|    632|    VEC8_LINE2(A1, B1, C1, D1);                                                \
  |  |   43|    632|    VEC8_LINE2(A2, B2, C2, D2);                                                \
  |  |   44|    632|    VEC8_LINE2(A3, B3, C3, D3);                                                \
  |  |   45|    632|    VEC8_LINE2(A4, B4, C4, D4);                                                \
  |  |   46|    632|    VEC8_LINE3(A1, B1, C1, D1);                                                \
  |  |   47|    632|    VEC8_LINE3(A2, B2, C2, D2);                                                \
  |  |   48|    632|    VEC8_LINE3(A3, B3, C3, D3);                                                \
  |  |   49|    632|    VEC8_LINE3(A4, B4, C4, D4);                                                \
  |  |   50|    632|    VEC8_LINE4(A1, B1, C1, D1);                                                \
  |  |   51|    632|    VEC8_LINE4(A2, B2, C2, D2);                                                \
  |  |   52|    632|    VEC8_LINE4(A3, B3, C3, D3);                                                \
  |  |   53|    632|    VEC8_LINE4(A4, B4, C4, D4)
  |  |   54|       |
  |  |   55|    632|#define VEC8_ROUND_HALF(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, \
  |  |   56|    632|                        B4, C4, D4)                                         \
  |  |   57|    632|    VEC8_LINE1(A1, B1, C1, D1);                                             \
  |  |   58|    632|    VEC8_LINE1(A2, B2, C2, D2);                                             \
  |  |   59|    632|    VEC8_LINE2(A1, B1, C1, D1);                                             \
  |  |   60|    632|    VEC8_LINE2(A2, B2, C2, D2);                                             \
  |  |   61|    632|    VEC8_LINE3(A1, B1, C1, D1);                                             \
  |  |   62|    632|    VEC8_LINE3(A2, B2, C2, D2);                                             \
  |  |   63|    632|    VEC8_LINE4(A1, B1, C1, D1);                                             \
  |  |   64|    632|    VEC8_LINE4(A2, B2, C2, D2);                                             \
  |  |   65|    632|    VEC8_LINE1(A3, B3, C3, D3);                                             \
  |  |   66|    632|    VEC8_LINE1(A4, B4, C4, D4);                                             \
  |  |   67|    632|    VEC8_LINE2(A3, B3, C3, D3);                                             \
  |  |   68|    632|    VEC8_LINE2(A4, B4, C4, D4);                                             \
  |  |   69|    632|    VEC8_LINE3(A3, B3, C3, D3);                                             \
  |  |   70|    632|    VEC8_LINE3(A4, B4, C4, D4);                                             \
  |  |   71|    632|    VEC8_LINE4(A3, B3, C3, D3);                                             \
  |  |   72|    632|    VEC8_LINE4(A4, B4, C4, D4)
  |  |   73|       |
  |  |   74|    632|#define VEC8_ROUND_HALFANDHALF(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, \
  |  |   75|    632|                               A4, B4, C4, D4)                                 \
  |  |   76|    632|    VEC8_LINE1(A1, B1, C1, D1);                                                \
  |  |   77|    632|    VEC8_LINE1(A2, B2, C2, D2);                                                \
  |  |   78|    632|    VEC8_LINE2(A1, B1, C1, D1);                                                \
  |  |   79|    632|    VEC8_LINE2(A2, B2, C2, D2);                                                \
  |  |   80|    632|    VEC8_LINE1(A3, B3, C3, D3);                                                \
  |  |   81|    632|    VEC8_LINE1(A4, B4, C4, D4);                                                \
  |  |   82|    632|    VEC8_LINE2(A3, B3, C3, D3);                                                \
  |  |   83|    632|    VEC8_LINE2(A4, B4, C4, D4);                                                \
  |  |   84|    632|    VEC8_LINE3(A1, B1, C1, D1);                                                \
  |  |   85|    632|    VEC8_LINE3(A2, B2, C2, D2);                                                \
  |  |   86|    632|    VEC8_LINE4(A1, B1, C1, D1);                                                \
  |  |   87|    632|    VEC8_LINE4(A2, B2, C2, D2);                                                \
  |  |   88|    632|    VEC8_LINE3(A3, B3, C3, D3);                                                \
  |  |   89|    632|    VEC8_LINE3(A4, B4, C4, D4);                                                \
  |  |   90|    632|    VEC8_LINE4(A3, B3, C3, D3);                                                \
  |  |   91|    632|    VEC8_LINE4(A4, B4, C4, D4)
  |  |   92|       |
  |  |   93|    632|#define VEC8_ROUND(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, B4, C4, \
  |  |   94|    632|                   D4)                                                         \
  |  |   95|    632|    VEC8_ROUND_SEQ(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, B4, C4, \
  |  |   96|    632|                   D4)
  |  |   97|       |
  |  |   98|    632|if (bytes >= 512) {
  |  |  ------------------
  |  |  |  Branch (98:5): [True: 0, False: 632]
  |  |  ------------------
  |  |   99|       |    /* constant for shuffling bytes (replacing multiple-of-8 rotates) */
  |  |  100|      0|    __m256i rot16 =
  |  |  101|      0|        _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2,
  |  |  102|      0|                        13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
  |  |  103|      0|    __m256i rot8 =
  |  |  104|      0|        _mm256_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3,
  |  |  105|      0|                        14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
  |  |  106|      0|    uint32_t in12, in13;
  |  |  107|       |
  |  |  108|       |    /* the naive way seems as fast (if not a bit faster) than the vector way */
  |  |  109|      0|    __m256i x_0  = _mm256_set1_epi32(x[0]);
  |  |  110|      0|    __m256i x_1  = _mm256_set1_epi32(x[1]);
  |  |  111|      0|    __m256i x_2  = _mm256_set1_epi32(x[2]);
  |  |  112|      0|    __m256i x_3  = _mm256_set1_epi32(x[3]);
  |  |  113|      0|    __m256i x_4  = _mm256_set1_epi32(x[4]);
  |  |  114|      0|    __m256i x_5  = _mm256_set1_epi32(x[5]);
  |  |  115|      0|    __m256i x_6  = _mm256_set1_epi32(x[6]);
  |  |  116|      0|    __m256i x_7  = _mm256_set1_epi32(x[7]);
  |  |  117|      0|    __m256i x_8  = _mm256_set1_epi32(x[8]);
  |  |  118|      0|    __m256i x_9  = _mm256_set1_epi32(x[9]);
  |  |  119|      0|    __m256i x_10 = _mm256_set1_epi32(x[10]);
  |  |  120|      0|    __m256i x_11 = _mm256_set1_epi32(x[11]);
  |  |  121|      0|    __m256i x_12;
  |  |  122|      0|    __m256i x_13;
  |  |  123|      0|    __m256i x_14 = _mm256_set1_epi32(x[14]);
  |  |  124|      0|    __m256i x_15 = _mm256_set1_epi32(x[15]);
  |  |  125|       |
  |  |  126|      0|    __m256i orig0  = x_0;
  |  |  127|      0|    __m256i orig1  = x_1;
  |  |  128|      0|    __m256i orig2  = x_2;
  |  |  129|      0|    __m256i orig3  = x_3;
  |  |  130|      0|    __m256i orig4  = x_4;
  |  |  131|      0|    __m256i orig5  = x_5;
  |  |  132|      0|    __m256i orig6  = x_6;
  |  |  133|      0|    __m256i orig7  = x_7;
  |  |  134|      0|    __m256i orig8  = x_8;
  |  |  135|      0|    __m256i orig9  = x_9;
  |  |  136|      0|    __m256i orig10 = x_10;
  |  |  137|      0|    __m256i orig11 = x_11;
  |  |  138|      0|    __m256i orig12;
  |  |  139|      0|    __m256i orig13;
  |  |  140|      0|    __m256i orig14 = x_14;
  |  |  141|      0|    __m256i orig15 = x_15;
  |  |  142|      0|    __m256i t_0, t_1, t_2, t_3, t_4, t_5, t_6, t_7, t_8, t_9, t_10, t_11, t_12,
  |  |  143|      0|        t_13, t_14, t_15;
  |  |  144|       |
  |  |  145|      0|    while (bytes >= 512) {
  |  |  ------------------
  |  |  |  Branch (145:12): [True: 0, False: 0]
  |  |  ------------------
  |  |  146|      0|        const __m256i addv12  = _mm256_set_epi64x(3, 2, 1, 0);
  |  |  147|      0|        const __m256i addv13  = _mm256_set_epi64x(7, 6, 5, 4);
  |  |  148|      0|        const __m256i permute = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
  |  |  149|      0|        __m256i       t12, t13;
  |  |  150|       |
  |  |  151|      0|        uint64_t in1213;
  |  |  152|      0|        int      i;
  |  |  153|       |
  |  |  154|      0|        x_0  = orig0;
  |  |  155|      0|        x_1  = orig1;
  |  |  156|      0|        x_2  = orig2;
  |  |  157|      0|        x_3  = orig3;
  |  |  158|      0|        x_4  = orig4;
  |  |  159|      0|        x_5  = orig5;
  |  |  160|      0|        x_6  = orig6;
  |  |  161|      0|        x_7  = orig7;
  |  |  162|      0|        x_8  = orig8;
  |  |  163|      0|        x_9  = orig9;
  |  |  164|      0|        x_10 = orig10;
  |  |  165|      0|        x_11 = orig11;
  |  |  166|      0|        x_14 = orig14;
  |  |  167|      0|        x_15 = orig15;
  |  |  168|       |
  |  |  169|      0|        in12   = x[12];
  |  |  170|      0|        in13   = x[13];
  |  |  171|      0|        in1213 = ((uint64_t) in12) | (((uint64_t) in13) << 32);
  |  |  172|      0|        x_12 = x_13 = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(in1213));
  |  |  173|       |
  |  |  174|      0|        t12 = _mm256_add_epi64(addv12, x_12);
  |  |  175|      0|        t13 = _mm256_add_epi64(addv13, x_13);
  |  |  176|       |
  |  |  177|      0|        x_12 = _mm256_unpacklo_epi32(t12, t13);
  |  |  178|      0|        x_13 = _mm256_unpackhi_epi32(t12, t13);
  |  |  179|       |
  |  |  180|      0|        t12 = _mm256_unpacklo_epi32(x_12, x_13);
  |  |  181|      0|        t13 = _mm256_unpackhi_epi32(x_12, x_13);
  |  |  182|       |
  |  |  183|       |        /* required because unpack* are intra-lane */
  |  |  184|      0|        x_12 = _mm256_permutevar8x32_epi32(t12, permute);
  |  |  185|      0|        x_13 = _mm256_permutevar8x32_epi32(t13, permute);
  |  |  186|       |
  |  |  187|      0|        orig12 = x_12;
  |  |  188|      0|        orig13 = x_13;
  |  |  189|       |
  |  |  190|      0|        in1213 += 8;
  |  |  191|       |
  |  |  192|      0|        x[12] = in1213 & 0xFFFFFFFF;
  |  |  193|      0|        x[13] = (in1213 >> 32) & 0xFFFFFFFF;
  |  |  194|       |
  |  |  195|      0|        for (i = 0; i < ROUNDS; i += 2) {
  |  |  ------------------
  |  |  |  |   29|      0|# define ROUNDS 20
  |  |  ------------------
  |  |  |  Branch (195:21): [True: 0, False: 0]
  |  |  ------------------
  |  |  196|      0|            VEC8_ROUND(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
  |  |  ------------------
  |  |  |  |   95|      0|    VEC8_ROUND_SEQ(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, B4, C4, \
  |  |  |  |  ------------------
  |  |  |  |  |  |   38|      0|    VEC8_LINE1(A1, B1, C1, D1);                                                \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   24|      0|    x_##A = _mm256_add_epi32(x_##A, x_##B); \
  |  |  |  |  |  |  |  |   25|      0|    x_##D = _mm256_shuffle_epi8(_mm256_xor_si256(x_##D, x_##A), rot16)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   39|      0|    VEC8_LINE1(A2, B2, C2, D2);                                                \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   24|      0|    x_##A = _mm256_add_epi32(x_##A, x_##B); \
  |  |  |  |  |  |  |  |   25|      0|    x_##D = _mm256_shuffle_epi8(_mm256_xor_si256(x_##D, x_##A), rot16)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   40|      0|    VEC8_LINE1(A3, B3, C3, D3);                                                \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   24|      0|    x_##A = _mm256_add_epi32(x_##A, x_##B); \
  |  |  |  |  |  |  |  |   25|      0|    x_##D = _mm256_shuffle_epi8(_mm256_xor_si256(x_##D, x_##A), rot16)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   41|      0|    VEC8_LINE1(A4, B4, C4, D4);                                                \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   24|      0|    x_##A = _mm256_add_epi32(x_##A, x_##B); \
  |  |  |  |  |  |  |  |   25|      0|    x_##D = _mm256_shuffle_epi8(_mm256_xor_si256(x_##D, x_##A), rot16)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   42|      0|    VEC8_LINE2(A1, B1, C1, D1);                                                \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   27|      0|    x_##C = _mm256_add_epi32(x_##C, x_##D); \
  |  |  |  |  |  |  |  |   28|      0|    x_##B = VEC8_ROT(_mm256_xor_si256(x_##B, x_##C), 12)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |    3|      0|    _mm256_or_si256(_mm256_slli_epi32(A, IMM), _mm256_srli_epi32(A, (32 - IMM)))
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   43|      0|    VEC8_LINE2(A2, B2, C2, D2);                                                \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   27|      0|    x_##C = _mm256_add_epi32(x_##C, x_##D); \
  |  |  |  |  |  |  |  |   28|      0|    x_##B = VEC8_ROT(_mm256_xor_si256(x_##B, x_##C), 12)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |    3|      0|    _mm256_or_si256(_mm256_slli_epi32(A, IMM), _mm256_srli_epi32(A, (32 - IMM)))
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   44|      0|    VEC8_LINE2(A3, B3, C3, D3);                                                \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   27|      0|    x_##C = _mm256_add_epi32(x_##C, x_##D); \
  |  |  |  |  |  |  |  |   28|      0|    x_##B = VEC8_ROT(_mm256_xor_si256(x_##B, x_##C), 12)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |    3|      0|    _mm256_or_si256(_mm256_slli_epi32(A, IMM), _mm256_srli_epi32(A, (32 - IMM)))
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   45|      0|    VEC8_LINE2(A4, B4, C4, D4);                                                \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   27|      0|    x_##C = _mm256_add_epi32(x_##C, x_##D); \
  |  |  |  |  |  |  |  |   28|      0|    x_##B = VEC8_ROT(_mm256_xor_si256(x_##B, x_##C), 12)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |    3|      0|    _mm256_or_si256(_mm256_slli_epi32(A, IMM), _mm256_srli_epi32(A, (32 - IMM)))
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   46|      0|    VEC8_LINE3(A1, B1, C1, D1);                                                \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   30|      0|    x_##A = _mm256_add_epi32(x_##A, x_##B); \
  |  |  |  |  |  |  |  |   31|      0|    x_##D = _mm256_shuffle_epi8(_mm256_xor_si256(x_##D, x_##A), rot8)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   47|      0|    VEC8_LINE3(A2, B2, C2, D2);                                                \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   30|      0|    x_##A = _mm256_add_epi32(x_##A, x_##B); \
  |  |  |  |  |  |  |  |   31|      0|    x_##D = _mm256_shuffle_epi8(_mm256_xor_si256(x_##D, x_##A), rot8)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   48|      0|    VEC8_LINE3(A3, B3, C3, D3);                                                \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   30|      0|    x_##A = _mm256_add_epi32(x_##A, x_##B); \
  |  |  |  |  |  |  |  |   31|      0|    x_##D = _mm256_shuffle_epi8(_mm256_xor_si256(x_##D, x_##A), rot8)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   49|      0|    VEC8_LINE3(A4, B4, C4, D4);                                                \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   30|      0|    x_##A = _mm256_add_epi32(x_##A, x_##B); \
  |  |  |  |  |  |  |  |   31|      0|    x_##D = _mm256_shuffle_epi8(_mm256_xor_si256(x_##D, x_##A), rot8)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   50|      0|    VEC8_LINE4(A1, B1, C1, D1);                                                \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   33|      0|    x_##C = _mm256_add_epi32(x_##C, x_##D); \
  |  |  |  |  |  |  |  |   34|      0|    x_##B = VEC8_ROT(_mm256_xor_si256(x_##B, x_##C), 7)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |    3|      0|    _mm256_or_si256(_mm256_slli_epi32(A, IMM), _mm256_srli_epi32(A, (32 - IMM)))
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   51|      0|    VEC8_LINE4(A2, B2, C2, D2);                                                \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   33|      0|    x_##C = _mm256_add_epi32(x_##C, x_##D); \
  |  |  |  |  |  |  |  |   34|      0|    x_##B = VEC8_ROT(_mm256_xor_si256(x_##B, x_##C), 7)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |    3|      0|    _mm256_or_si256(_mm256_slli_epi32(A, IMM), _mm256_srli_epi32(A, (32 - IMM)))
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|    VEC8_LINE4(A3, B3, C3, D3);                                                \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   33|      0|    x_##C = _mm256_add_epi32(x_##C, x_##D); \
  |  |  |  |  |  |  |  |   34|      0|    x_##B = VEC8_ROT(_mm256_xor_si256(x_##B, x_##C), 7)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |    3|      0|    _mm256_or_si256(_mm256_slli_epi32(A, IMM), _mm256_srli_epi32(A, (32 - IMM)))
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   53|      0|    VEC8_LINE4(A4, B4, C4, D4)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   33|      0|    x_##C = _mm256_add_epi32(x_##C, x_##D); \
  |  |  |  |  |  |  |  |   34|      0|    x_##B = VEC8_ROT(_mm256_xor_si256(x_##B, x_##C), 7)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |    3|      0|    _mm256_or_si256(_mm256_slli_epi32(A, IMM), _mm256_srli_epi32(A, (32 - IMM)))
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   96|      0|                   D4)
  |  |  ------------------
  |  |  197|      0|            VEC8_ROUND(0, 5, 10, 15, 1, 6, 11, 12, 2, 7, 8, 13, 3, 4, 9, 14);
  |  |  ------------------
  |  |  |  |   95|      0|    VEC8_ROUND_SEQ(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, B4, C4, \
  |  |  |  |  ------------------
  |  |  |  |  |  |   38|      0|    VEC8_LINE1(A1, B1, C1, D1);                                                \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   24|      0|    x_##A = _mm256_add_epi32(x_##A, x_##B); \
  |  |  |  |  |  |  |  |   25|      0|    x_##D = _mm256_shuffle_epi8(_mm256_xor_si256(x_##D, x_##A), rot16)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   39|      0|    VEC8_LINE1(A2, B2, C2, D2);                                                \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   24|      0|    x_##A = _mm256_add_epi32(x_##A, x_##B); \
  |  |  |  |  |  |  |  |   25|      0|    x_##D = _mm256_shuffle_epi8(_mm256_xor_si256(x_##D, x_##A), rot16)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   40|      0|    VEC8_LINE1(A3, B3, C3, D3);                                                \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   24|      0|    x_##A = _mm256_add_epi32(x_##A, x_##B); \
  |  |  |  |  |  |  |  |   25|      0|    x_##D = _mm256_shuffle_epi8(_mm256_xor_si256(x_##D, x_##A), rot16)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   41|      0|    VEC8_LINE1(A4, B4, C4, D4);                                                \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   24|      0|    x_##A = _mm256_add_epi32(x_##A, x_##B); \
  |  |  |  |  |  |  |  |   25|      0|    x_##D = _mm256_shuffle_epi8(_mm256_xor_si256(x_##D, x_##A), rot16)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   42|      0|    VEC8_LINE2(A1, B1, C1, D1);                                                \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   27|      0|    x_##C = _mm256_add_epi32(x_##C, x_##D); \
  |  |  |  |  |  |  |  |   28|      0|    x_##B = VEC8_ROT(_mm256_xor_si256(x_##B, x_##C), 12)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |    3|      0|    _mm256_or_si256(_mm256_slli_epi32(A, IMM), _mm256_srli_epi32(A, (32 - IMM)))
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   43|      0|    VEC8_LINE2(A2, B2, C2, D2);                                                \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   27|      0|    x_##C = _mm256_add_epi32(x_##C, x_##D); \
  |  |  |  |  |  |  |  |   28|      0|    x_##B = VEC8_ROT(_mm256_xor_si256(x_##B, x_##C), 12)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |    3|      0|    _mm256_or_si256(_mm256_slli_epi32(A, IMM), _mm256_srli_epi32(A, (32 - IMM)))
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   44|      0|    VEC8_LINE2(A3, B3, C3, D3);                                                \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   27|      0|    x_##C = _mm256_add_epi32(x_##C, x_##D); \
  |  |  |  |  |  |  |  |   28|      0|    x_##B = VEC8_ROT(_mm256_xor_si256(x_##B, x_##C), 12)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |    3|      0|    _mm256_or_si256(_mm256_slli_epi32(A, IMM), _mm256_srli_epi32(A, (32 - IMM)))
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   45|      0|    VEC8_LINE2(A4, B4, C4, D4);                                                \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   27|      0|    x_##C = _mm256_add_epi32(x_##C, x_##D); \
  |  |  |  |  |  |  |  |   28|      0|    x_##B = VEC8_ROT(_mm256_xor_si256(x_##B, x_##C), 12)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |    3|      0|    _mm256_or_si256(_mm256_slli_epi32(A, IMM), _mm256_srli_epi32(A, (32 - IMM)))
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   46|      0|    VEC8_LINE3(A1, B1, C1, D1);                                                \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   30|      0|    x_##A = _mm256_add_epi32(x_##A, x_##B); \
  |  |  |  |  |  |  |  |   31|      0|    x_##D = _mm256_shuffle_epi8(_mm256_xor_si256(x_##D, x_##A), rot8)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   47|      0|    VEC8_LINE3(A2, B2, C2, D2);                                                \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   30|      0|    x_##A = _mm256_add_epi32(x_##A, x_##B); \
  |  |  |  |  |  |  |  |   31|      0|    x_##D = _mm256_shuffle_epi8(_mm256_xor_si256(x_##D, x_##A), rot8)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   48|      0|    VEC8_LINE3(A3, B3, C3, D3);                                                \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   30|      0|    x_##A = _mm256_add_epi32(x_##A, x_##B); \
  |  |  |  |  |  |  |  |   31|      0|    x_##D = _mm256_shuffle_epi8(_mm256_xor_si256(x_##D, x_##A), rot8)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   49|      0|    VEC8_LINE3(A4, B4, C4, D4);                                                \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   30|      0|    x_##A = _mm256_add_epi32(x_##A, x_##B); \
  |  |  |  |  |  |  |  |   31|      0|    x_##D = _mm256_shuffle_epi8(_mm256_xor_si256(x_##D, x_##A), rot8)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   50|      0|    VEC8_LINE4(A1, B1, C1, D1);                                                \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   33|      0|    x_##C = _mm256_add_epi32(x_##C, x_##D); \
  |  |  |  |  |  |  |  |   34|      0|    x_##B = VEC8_ROT(_mm256_xor_si256(x_##B, x_##C), 7)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |    3|      0|    _mm256_or_si256(_mm256_slli_epi32(A, IMM), _mm256_srli_epi32(A, (32 - IMM)))
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   51|      0|    VEC8_LINE4(A2, B2, C2, D2);                                                \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   33|      0|    x_##C = _mm256_add_epi32(x_##C, x_##D); \
  |  |  |  |  |  |  |  |   34|      0|    x_##B = VEC8_ROT(_mm256_xor_si256(x_##B, x_##C), 7)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |    3|      0|    _mm256_or_si256(_mm256_slli_epi32(A, IMM), _mm256_srli_epi32(A, (32 - IMM)))
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|    VEC8_LINE4(A3, B3, C3, D3);                                                \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   33|      0|    x_##C = _mm256_add_epi32(x_##C, x_##D); \
  |  |  |  |  |  |  |  |   34|      0|    x_##B = VEC8_ROT(_mm256_xor_si256(x_##B, x_##C), 7)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |    3|      0|    _mm256_or_si256(_mm256_slli_epi32(A, IMM), _mm256_srli_epi32(A, (32 - IMM)))
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   53|      0|    VEC8_LINE4(A4, B4, C4, D4)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   33|      0|    x_##C = _mm256_add_epi32(x_##C, x_##D); \
  |  |  |  |  |  |  |  |   34|      0|    x_##B = VEC8_ROT(_mm256_xor_si256(x_##B, x_##C), 7)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |    3|      0|    _mm256_or_si256(_mm256_slli_epi32(A, IMM), _mm256_srli_epi32(A, (32 - IMM)))
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   96|      0|                   D4)
  |  |  ------------------
  |  |  198|      0|        }
  |  |  199|       |
  |  |  200|      0|#define ONEQUAD_TRANSPOSE(A, B, C, D)                                     \
  |  |  201|      0|    {                                                                     \
  |  |  202|      0|        __m128i t0, t1, t2, t3;                                           \
  |  |  203|      0|        x_##A = _mm256_add_epi32(x_##A, orig##A);                         \
  |  |  204|      0|        x_##B = _mm256_add_epi32(x_##B, orig##B);                         \
  |  |  205|      0|        x_##C = _mm256_add_epi32(x_##C, orig##C);                         \
  |  |  206|      0|        x_##D = _mm256_add_epi32(x_##D, orig##D);                         \
  |  |  207|      0|        t_##A = _mm256_unpacklo_epi32(x_##A, x_##B);                      \
  |  |  208|      0|        t_##B = _mm256_unpacklo_epi32(x_##C, x_##D);                      \
  |  |  209|      0|        t_##C = _mm256_unpackhi_epi32(x_##A, x_##B);                      \
  |  |  210|      0|        t_##D = _mm256_unpackhi_epi32(x_##C, x_##D);                      \
  |  |  211|      0|        x_##A = _mm256_unpacklo_epi64(t_##A, t_##B);                      \
  |  |  212|      0|        x_##B = _mm256_unpackhi_epi64(t_##A, t_##B);                      \
  |  |  213|      0|        x_##C = _mm256_unpacklo_epi64(t_##C, t_##D);                      \
  |  |  214|      0|        x_##D = _mm256_unpackhi_epi64(t_##C, t_##D);                      \
  |  |  215|      0|        t0    = _mm_xor_si128(_mm256_extracti128_si256(x_##A, 0),         \
  |  |  216|      0|                           _mm_loadu_si128((const __m128i*) (m + 0))); \
  |  |  217|      0|        _mm_storeu_si128((__m128i*) (c + 0), t0);                         \
  |  |  218|      0|        t1 = _mm_xor_si128(_mm256_extracti128_si256(x_##B, 0),            \
  |  |  219|      0|                           _mm_loadu_si128((const __m128i*) (m + 64)));   \
  |  |  220|      0|        _mm_storeu_si128((__m128i*) (c + 64), t1);                        \
  |  |  221|      0|        t2 = _mm_xor_si128(_mm256_extracti128_si256(x_##C, 0),            \
  |  |  222|      0|                           _mm_loadu_si128((const __m128i*) (m + 128)));  \
  |  |  223|      0|        _mm_storeu_si128((__m128i*) (c + 128), t2);                       \
  |  |  224|      0|        t3 = _mm_xor_si128(_mm256_extracti128_si256(x_##D, 0),            \
  |  |  225|      0|                           _mm_loadu_si128((const __m128i*) (m + 192)));  \
  |  |  226|      0|        _mm_storeu_si128((__m128i*) (c + 192), t3);                       \
  |  |  227|      0|        t0 = _mm_xor_si128(_mm256_extracti128_si256(x_##A, 1),            \
  |  |  228|      0|                           _mm_loadu_si128((const __m128i*) (m + 256)));  \
  |  |  229|      0|        _mm_storeu_si128((__m128i*) (c + 256), t0);                       \
  |  |  230|      0|        t1 = _mm_xor_si128(_mm256_extracti128_si256(x_##B, 1),            \
  |  |  231|      0|                           _mm_loadu_si128((const __m128i*) (m + 320)));  \
  |  |  232|      0|        _mm_storeu_si128((__m128i*) (c + 320), t1);                       \
  |  |  233|      0|        t2 = _mm_xor_si128(_mm256_extracti128_si256(x_##C, 1),            \
  |  |  234|      0|                           _mm_loadu_si128((const __m128i*) (m + 384)));  \
  |  |  235|      0|        _mm_storeu_si128((__m128i*) (c + 384), t2);                       \
  |  |  236|      0|        t3 = _mm_xor_si128(_mm256_extracti128_si256(x_##D, 1),            \
  |  |  237|      0|                           _mm_loadu_si128((const __m128i*) (m + 448)));  \
  |  |  238|      0|        _mm_storeu_si128((__m128i*) (c + 448), t3);                       \
  |  |  239|      0|    }
  |  |  240|       |
  |  |  241|      0|#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D)
  |  |  242|       |
  |  |  243|      0|#define ONEQUAD_UNPCK(A, B, C, D)                    \
  |  |  244|      0|    {                                                \
  |  |  245|      0|        x_##A = _mm256_add_epi32(x_##A, orig##A);    \
  |  |  246|      0|        x_##B = _mm256_add_epi32(x_##B, orig##B);    \
  |  |  247|      0|        x_##C = _mm256_add_epi32(x_##C, orig##C);    \
  |  |  248|      0|        x_##D = _mm256_add_epi32(x_##D, orig##D);    \
  |  |  249|      0|        t_##A = _mm256_unpacklo_epi32(x_##A, x_##B); \
  |  |  250|      0|        t_##B = _mm256_unpacklo_epi32(x_##C, x_##D); \
  |  |  251|      0|        t_##C = _mm256_unpackhi_epi32(x_##A, x_##B); \
  |  |  252|      0|        t_##D = _mm256_unpackhi_epi32(x_##C, x_##D); \
  |  |  253|      0|        x_##A = _mm256_unpacklo_epi64(t_##A, t_##B); \
  |  |  254|      0|        x_##B = _mm256_unpackhi_epi64(t_##A, t_##B); \
  |  |  255|      0|        x_##C = _mm256_unpacklo_epi64(t_##C, t_##D); \
  |  |  256|      0|        x_##D = _mm256_unpackhi_epi64(t_##C, t_##D); \
  |  |  257|      0|    }
  |  |  258|       |
  |  |  259|      0|#define ONEOCTO(A, B, C, D, A2, B2, C2, D2)                          \
  |  |  260|      0|    {                                                                \
  |  |  261|      0|        ONEQUAD_UNPCK(A, B, C, D);                                   \
  |  |  262|      0|        ONEQUAD_UNPCK(A2, B2, C2, D2);                               \
  |  |  263|      0|        t_##A  = _mm256_permute2x128_si256(x_##A, x_##A2, 0x20);     \
  |  |  264|      0|        t_##A2 = _mm256_permute2x128_si256(x_##A, x_##A2, 0x31);     \
  |  |  265|      0|        t_##B  = _mm256_permute2x128_si256(x_##B, x_##B2, 0x20);     \
  |  |  266|      0|        t_##B2 = _mm256_permute2x128_si256(x_##B, x_##B2, 0x31);     \
  |  |  267|      0|        t_##C  = _mm256_permute2x128_si256(x_##C, x_##C2, 0x20);     \
  |  |  268|      0|        t_##C2 = _mm256_permute2x128_si256(x_##C, x_##C2, 0x31);     \
  |  |  269|      0|        t_##D  = _mm256_permute2x128_si256(x_##D, x_##D2, 0x20);     \
  |  |  270|      0|        t_##D2 = _mm256_permute2x128_si256(x_##D, x_##D2, 0x31);     \
  |  |  271|      0|        t_##A  = _mm256_xor_si256(                                   \
  |  |  272|      0|            t_##A, _mm256_loadu_si256((const __m256i*) (m + 0)));   \
  |  |  273|      0|        t_##B = _mm256_xor_si256(                                    \
  |  |  274|      0|            t_##B, _mm256_loadu_si256((const __m256i*) (m + 64)));   \
  |  |  275|      0|        t_##C = _mm256_xor_si256(                                    \
  |  |  276|      0|            t_##C, _mm256_loadu_si256((const __m256i*) (m + 128)));  \
  |  |  277|      0|        t_##D = _mm256_xor_si256(                                    \
  |  |  278|      0|            t_##D, _mm256_loadu_si256((const __m256i*) (m + 192)));  \
  |  |  279|      0|        t_##A2 = _mm256_xor_si256(                                   \
  |  |  280|      0|            t_##A2, _mm256_loadu_si256((const __m256i*) (m + 256))); \
  |  |  281|      0|        t_##B2 = _mm256_xor_si256(                                   \
  |  |  282|      0|            t_##B2, _mm256_loadu_si256((const __m256i*) (m + 320))); \
  |  |  283|      0|        t_##C2 = _mm256_xor_si256(                                   \
  |  |  284|      0|            t_##C2, _mm256_loadu_si256((const __m256i*) (m + 384))); \
  |  |  285|      0|        t_##D2 = _mm256_xor_si256(                                   \
  |  |  286|      0|            t_##D2, _mm256_loadu_si256((const __m256i*) (m + 448))); \
  |  |  287|      0|        _mm256_storeu_si256((__m256i*) (c + 0), t_##A);              \
  |  |  288|      0|        _mm256_storeu_si256((__m256i*) (c + 64), t_##B);             \
  |  |  289|      0|        _mm256_storeu_si256((__m256i*) (c + 128), t_##C);            \
  |  |  290|      0|        _mm256_storeu_si256((__m256i*) (c + 192), t_##D);            \
  |  |  291|      0|        _mm256_storeu_si256((__m256i*) (c + 256), t_##A2);           \
  |  |  292|      0|        _mm256_storeu_si256((__m256i*) (c + 320), t_##B2);           \
  |  |  293|      0|        _mm256_storeu_si256((__m256i*) (c + 384), t_##C2);           \
  |  |  294|      0|        _mm256_storeu_si256((__m256i*) (c + 448), t_##D2);           \
  |  |  295|      0|    }
  |  |  296|       |
  |  |  297|      0|        ONEOCTO(0, 1, 2, 3, 4, 5, 6, 7);
  |  |  ------------------
  |  |  |  |  260|      0|    {                                                                \
  |  |  |  |  261|      0|        ONEQUAD_UNPCK(A, B, C, D);                                   \
  |  |  |  |  ------------------
  |  |  |  |  |  |  244|      0|    {                                                \
  |  |  |  |  |  |  245|      0|        x_##A = _mm256_add_epi32(x_##A, orig##A);    \
  |  |  |  |  |  |  246|      0|        x_##B = _mm256_add_epi32(x_##B, orig##B);    \
  |  |  |  |  |  |  247|      0|        x_##C = _mm256_add_epi32(x_##C, orig##C);    \
  |  |  |  |  |  |  248|      0|        x_##D = _mm256_add_epi32(x_##D, orig##D);    \
  |  |  |  |  |  |  249|      0|        t_##A = _mm256_unpacklo_epi32(x_##A, x_##B); \
  |  |  |  |  |  |  250|      0|        t_##B = _mm256_unpacklo_epi32(x_##C, x_##D); \
  |  |  |  |  |  |  251|      0|        t_##C = _mm256_unpackhi_epi32(x_##A, x_##B); \
  |  |  |  |  |  |  252|      0|        t_##D = _mm256_unpackhi_epi32(x_##C, x_##D); \
  |  |  |  |  |  |  253|      0|        x_##A = _mm256_unpacklo_epi64(t_##A, t_##B); \
  |  |  |  |  |  |  254|      0|        x_##B = _mm256_unpackhi_epi64(t_##A, t_##B); \
  |  |  |  |  |  |  255|      0|        x_##C = _mm256_unpacklo_epi64(t_##C, t_##D); \
  |  |  |  |  |  |  256|      0|        x_##D = _mm256_unpackhi_epi64(t_##C, t_##D); \
  |  |  |  |  |  |  257|      0|    }
  |  |  |  |  ------------------
  |  |  |  |  262|      0|        ONEQUAD_UNPCK(A2, B2, C2, D2);                               \
  |  |  |  |  ------------------
  |  |  |  |  |  |  244|      0|    {                                                \
  |  |  |  |  |  |  245|      0|        x_##A = _mm256_add_epi32(x_##A, orig##A);    \
  |  |  |  |  |  |  246|      0|        x_##B = _mm256_add_epi32(x_##B, orig##B);    \
  |  |  |  |  |  |  247|      0|        x_##C = _mm256_add_epi32(x_##C, orig##C);    \
  |  |  |  |  |  |  248|      0|        x_##D = _mm256_add_epi32(x_##D, orig##D);    \
  |  |  |  |  |  |  249|      0|        t_##A = _mm256_unpacklo_epi32(x_##A, x_##B); \
  |  |  |  |  |  |  250|      0|        t_##B = _mm256_unpacklo_epi32(x_##C, x_##D); \
  |  |  |  |  |  |  251|      0|        t_##C = _mm256_unpackhi_epi32(x_##A, x_##B); \
  |  |  |  |  |  |  252|      0|        t_##D = _mm256_unpackhi_epi32(x_##C, x_##D); \
  |  |  |  |  |  |  253|      0|        x_##A = _mm256_unpacklo_epi64(t_##A, t_##B); \
  |  |  |  |  |  |  254|      0|        x_##B = _mm256_unpackhi_epi64(t_##A, t_##B); \
  |  |  |  |  |  |  255|      0|        x_##C = _mm256_unpacklo_epi64(t_##C, t_##D); \
  |  |  |  |  |  |  256|      0|        x_##D = _mm256_unpackhi_epi64(t_##C, t_##D); \
  |  |  |  |  |  |  257|      0|    }
  |  |  |  |  ------------------
  |  |  |  |  263|      0|        t_##A  = _mm256_permute2x128_si256(x_##A, x_##A2, 0x20);     \
  |  |  |  |  264|      0|        t_##A2 = _mm256_permute2x128_si256(x_##A, x_##A2, 0x31);     \
  |  |  |  |  265|      0|        t_##B  = _mm256_permute2x128_si256(x_##B, x_##B2, 0x20);     \
  |  |  |  |  266|      0|        t_##B2 = _mm256_permute2x128_si256(x_##B, x_##B2, 0x31);     \
  |  |  |  |  267|      0|        t_##C  = _mm256_permute2x128_si256(x_##C, x_##C2, 0x20);     \
  |  |  |  |  268|      0|        t_##C2 = _mm256_permute2x128_si256(x_##C, x_##C2, 0x31);     \
  |  |  |  |  269|      0|        t_##D  = _mm256_permute2x128_si256(x_##D, x_##D2, 0x20);     \
  |  |  |  |  270|      0|        t_##D2 = _mm256_permute2x128_si256(x_##D, x_##D2, 0x31);     \
  |  |  |  |  271|      0|        t_##A  = _mm256_xor_si256(                                   \
  |  |  |  |  272|      0|            t_##A, _mm256_loadu_si256((const __m256i*) (m + 0)));   \
  |  |  |  |  273|      0|        t_##B = _mm256_xor_si256(                                    \
  |  |  |  |  274|      0|            t_##B, _mm256_loadu_si256((const __m256i*) (m + 64)));   \
  |  |  |  |  275|      0|        t_##C = _mm256_xor_si256(                                    \
  |  |  |  |  276|      0|            t_##C, _mm256_loadu_si256((const __m256i*) (m + 128)));  \
  |  |  |  |  277|      0|        t_##D = _mm256_xor_si256(                                    \
  |  |  |  |  278|      0|            t_##D, _mm256_loadu_si256((const __m256i*) (m + 192)));  \
  |  |  |  |  279|      0|        t_##A2 = _mm256_xor_si256(                                   \
  |  |  |  |  280|      0|            t_##A2, _mm256_loadu_si256((const __m256i*) (m + 256))); \
  |  |  |  |  281|      0|        t_##B2 = _mm256_xor_si256(                                   \
  |  |  |  |  282|      0|            t_##B2, _mm256_loadu_si256((const __m256i*) (m + 320))); \
  |  |  |  |  283|      0|        t_##C2 = _mm256_xor_si256(                                   \
  |  |  |  |  284|      0|            t_##C2, _mm256_loadu_si256((const __m256i*) (m + 384))); \
  |  |  |  |  285|      0|        t_##D2 = _mm256_xor_si256(                                   \
  |  |  |  |  286|      0|            t_##D2, _mm256_loadu_si256((const __m256i*) (m + 448))); \
  |  |  |  |  287|      0|        _mm256_storeu_si256((__m256i*) (c + 0), t_##A);              \
  |  |  |  |  288|      0|        _mm256_storeu_si256((__m256i*) (c + 64), t_##B);             \
  |  |  |  |  289|      0|        _mm256_storeu_si256((__m256i*) (c + 128), t_##C);            \
  |  |  |  |  290|      0|        _mm256_storeu_si256((__m256i*) (c + 192), t_##D);            \
  |  |  |  |  291|      0|        _mm256_storeu_si256((__m256i*) (c + 256), t_##A2);           \
  |  |  |  |  292|      0|        _mm256_storeu_si256((__m256i*) (c + 320), t_##B2);           \
  |  |  |  |  293|      0|        _mm256_storeu_si256((__m256i*) (c + 384), t_##C2);           \
  |  |  |  |  294|      0|        _mm256_storeu_si256((__m256i*) (c + 448), t_##D2);           \
  |  |  |  |  295|      0|    }
  |  |  ------------------
  |  |  298|      0|        m += 32;
  |  |  299|      0|        c += 32;
  |  |  300|      0|        ONEOCTO(8, 9, 10, 11, 12, 13, 14, 15);
  |  |  ------------------
  |  |  |  |  260|      0|    {                                                                \
  |  |  |  |  261|      0|        ONEQUAD_UNPCK(A, B, C, D);                                   \
  |  |  |  |  ------------------
  |  |  |  |  |  |  244|      0|    {                                                \
  |  |  |  |  |  |  245|      0|        x_##A = _mm256_add_epi32(x_##A, orig##A);    \
  |  |  |  |  |  |  246|      0|        x_##B = _mm256_add_epi32(x_##B, orig##B);    \
  |  |  |  |  |  |  247|      0|        x_##C = _mm256_add_epi32(x_##C, orig##C);    \
  |  |  |  |  |  |  248|      0|        x_##D = _mm256_add_epi32(x_##D, orig##D);    \
  |  |  |  |  |  |  249|      0|        t_##A = _mm256_unpacklo_epi32(x_##A, x_##B); \
  |  |  |  |  |  |  250|      0|        t_##B = _mm256_unpacklo_epi32(x_##C, x_##D); \
  |  |  |  |  |  |  251|      0|        t_##C = _mm256_unpackhi_epi32(x_##A, x_##B); \
  |  |  |  |  |  |  252|      0|        t_##D = _mm256_unpackhi_epi32(x_##C, x_##D); \
  |  |  |  |  |  |  253|      0|        x_##A = _mm256_unpacklo_epi64(t_##A, t_##B); \
  |  |  |  |  |  |  254|      0|        x_##B = _mm256_unpackhi_epi64(t_##A, t_##B); \
  |  |  |  |  |  |  255|      0|        x_##C = _mm256_unpacklo_epi64(t_##C, t_##D); \
  |  |  |  |  |  |  256|      0|        x_##D = _mm256_unpackhi_epi64(t_##C, t_##D); \
  |  |  |  |  |  |  257|      0|    }
  |  |  |  |  ------------------
  |  |  |  |  262|      0|        ONEQUAD_UNPCK(A2, B2, C2, D2);                               \
  |  |  |  |  ------------------
  |  |  |  |  |  |  244|      0|    {                                                \
  |  |  |  |  |  |  245|      0|        x_##A = _mm256_add_epi32(x_##A, orig##A);    \
  |  |  |  |  |  |  246|      0|        x_##B = _mm256_add_epi32(x_##B, orig##B);    \
  |  |  |  |  |  |  247|      0|        x_##C = _mm256_add_epi32(x_##C, orig##C);    \
  |  |  |  |  |  |  248|      0|        x_##D = _mm256_add_epi32(x_##D, orig##D);    \
  |  |  |  |  |  |  249|      0|        t_##A = _mm256_unpacklo_epi32(x_##A, x_##B); \
  |  |  |  |  |  |  250|      0|        t_##B = _mm256_unpacklo_epi32(x_##C, x_##D); \
  |  |  |  |  |  |  251|      0|        t_##C = _mm256_unpackhi_epi32(x_##A, x_##B); \
  |  |  |  |  |  |  252|      0|        t_##D = _mm256_unpackhi_epi32(x_##C, x_##D); \
  |  |  |  |  |  |  253|      0|        x_##A = _mm256_unpacklo_epi64(t_##A, t_##B); \
  |  |  |  |  |  |  254|      0|        x_##B = _mm256_unpackhi_epi64(t_##A, t_##B); \
  |  |  |  |  |  |  255|      0|        x_##C = _mm256_unpacklo_epi64(t_##C, t_##D); \
  |  |  |  |  |  |  256|      0|        x_##D = _mm256_unpackhi_epi64(t_##C, t_##D); \
  |  |  |  |  |  |  257|      0|    }
  |  |  |  |  ------------------
  |  |  |  |  263|      0|        t_##A  = _mm256_permute2x128_si256(x_##A, x_##A2, 0x20);     \
  |  |  |  |  264|      0|        t_##A2 = _mm256_permute2x128_si256(x_##A, x_##A2, 0x31);     \
  |  |  |  |  265|      0|        t_##B  = _mm256_permute2x128_si256(x_##B, x_##B2, 0x20);     \
  |  |  |  |  266|      0|        t_##B2 = _mm256_permute2x128_si256(x_##B, x_##B2, 0x31);     \
  |  |  |  |  267|      0|        t_##C  = _mm256_permute2x128_si256(x_##C, x_##C2, 0x20);     \
  |  |  |  |  268|      0|        t_##C2 = _mm256_permute2x128_si256(x_##C, x_##C2, 0x31);     \
  |  |  |  |  269|      0|        t_##D  = _mm256_permute2x128_si256(x_##D, x_##D2, 0x20);     \
  |  |  |  |  270|      0|        t_##D2 = _mm256_permute2x128_si256(x_##D, x_##D2, 0x31);     \
  |  |  |  |  271|      0|        t_##A  = _mm256_xor_si256(                                   \
  |  |  |  |  272|      0|            t_##A, _mm256_loadu_si256((const __m256i*) (m + 0)));   \
  |  |  |  |  273|      0|        t_##B = _mm256_xor_si256(                                    \
  |  |  |  |  274|      0|            t_##B, _mm256_loadu_si256((const __m256i*) (m + 64)));   \
  |  |  |  |  275|      0|        t_##C = _mm256_xor_si256(                                    \
  |  |  |  |  276|      0|            t_##C, _mm256_loadu_si256((const __m256i*) (m + 128)));  \
  |  |  |  |  277|      0|        t_##D = _mm256_xor_si256(                                    \
  |  |  |  |  278|      0|            t_##D, _mm256_loadu_si256((const __m256i*) (m + 192)));  \
  |  |  |  |  279|      0|        t_##A2 = _mm256_xor_si256(                                   \
  |  |  |  |  280|      0|            t_##A2, _mm256_loadu_si256((const __m256i*) (m + 256))); \
  |  |  |  |  281|      0|        t_##B2 = _mm256_xor_si256(                                   \
  |  |  |  |  282|      0|            t_##B2, _mm256_loadu_si256((const __m256i*) (m + 320))); \
  |  |  |  |  283|      0|        t_##C2 = _mm256_xor_si256(                                   \
  |  |  |  |  284|      0|            t_##C2, _mm256_loadu_si256((const __m256i*) (m + 384))); \
  |  |  |  |  285|      0|        t_##D2 = _mm256_xor_si256(                                   \
  |  |  |  |  286|      0|            t_##D2, _mm256_loadu_si256((const __m256i*) (m + 448))); \
  |  |  |  |  287|      0|        _mm256_storeu_si256((__m256i*) (c + 0), t_##A);              \
  |  |  |  |  288|      0|        _mm256_storeu_si256((__m256i*) (c + 64), t_##B);             \
  |  |  |  |  289|      0|        _mm256_storeu_si256((__m256i*) (c + 128), t_##C);            \
  |  |  |  |  290|      0|        _mm256_storeu_si256((__m256i*) (c + 192), t_##D);            \
  |  |  |  |  291|      0|        _mm256_storeu_si256((__m256i*) (c + 256), t_##A2);           \
  |  |  |  |  292|      0|        _mm256_storeu_si256((__m256i*) (c + 320), t_##B2);           \
  |  |  |  |  293|      0|        _mm256_storeu_si256((__m256i*) (c + 384), t_##C2);           \
  |  |  |  |  294|      0|        _mm256_storeu_si256((__m256i*) (c + 448), t_##D2);           \
  |  |  |  |  295|      0|    }
  |  |  ------------------
  |  |  301|      0|        m -= 32;
  |  |  302|      0|        c -= 32;
  |  |  303|       |
  |  |  304|      0|#undef ONEQUAD
  |  |  305|      0|#undef ONEQUAD_TRANSPOSE
  |  |  306|      0|#undef ONEQUAD_UNPCK
  |  |  307|      0|#undef ONEOCTO
  |  |  308|       |
  |  |  309|      0|        bytes -= 512;
  |  |  310|      0|        c += 512;
  |  |  311|      0|        m += 512;
  |  |  312|      0|    }
  |  |  313|      0|}
  |  |  314|    632|#undef VEC8_ROT
  |  |  315|    632|#undef VEC8_QUARTERROUND
  |  |  316|    632|#undef VEC8_QUARTERROUND_NAIVE
  |  |  317|    632|#undef VEC8_QUARTERROUND_SHUFFLE
  |  |  318|    632|#undef VEC8_QUARTERROUND_SHUFFLE2
  |  |  319|    632|#undef VEC8_LINE1
  |  |  320|    632|#undef VEC8_LINE2
  |  |  321|    632|#undef VEC8_LINE3
  |  |  322|    632|#undef VEC8_LINE4
  |  |  323|    632|#undef VEC8_ROUND
  |  |  324|    632|#undef VEC8_ROUND_SEQ
  |  |  325|    632|#undef VEC8_ROUND_HALF
  |  |  326|    632|#undef VEC8_ROUND_HALFANDHALF
  ------------------
   80|    632|# include "u4.h"
  ------------------
  |  |    1|    632|
  |  |    2|    632|#define VEC4_ROT(A, IMM) \
  |  |    3|    632|    _mm_or_si128(_mm_slli_epi32(A, IMM), _mm_srli_epi32(A, (32 - IMM)))
  |  |    4|       |
  |  |    5|       |/* Quarter round in which the rotations by 16 and 8 bits are byte shuffles
  |  |    6|       | * (rot16, rot8), while those by 12 and 7 use shift/shift/or (VEC4_ROT). */
  |  |    7|    632|#define VEC4_QUARTERROUND_SHUFFLE(A, B, C, D) \
  |  |    8|    632|    x_##A = _mm_add_epi32(x_##A, x_##B);      \
  |  |    9|    632|    t_##A = _mm_xor_si128(x_##D, x_##A);      \
  |  |   10|    632|    x_##D = _mm_shuffle_epi8(t_##A, rot16);   \
  |  |   11|    632|    x_##C = _mm_add_epi32(x_##C, x_##D);      \
  |  |   12|    632|    t_##C = _mm_xor_si128(x_##B, x_##C);      \
  |  |   13|    632|    x_##B = VEC4_ROT(t_##C, 12);              \
  |  |   14|    632|    x_##A = _mm_add_epi32(x_##A, x_##B);      \
  |  |   15|    632|    t_##A = _mm_xor_si128(x_##D, x_##A);      \
  |  |   16|    632|    x_##D = _mm_shuffle_epi8(t_##A, rot8);    \
  |  |   17|    632|    x_##C = _mm_add_epi32(x_##C, x_##D);      \
  |  |   18|    632|    t_##C = _mm_xor_si128(x_##B, x_##C);      \
  |  |   19|    632|    x_##B = VEC4_ROT(t_##C, 7)
  |  |   20|       |
  |  |   21|    632|#define VEC4_QUARTERROUND(A, B, C, D) VEC4_QUARTERROUND_SHUFFLE(A, B, C, D)
  |  |   22|       |
  |  |   23|    632|if (bytes >= 256) {
  |  |  ------------------
  |  |  |  Branch (23:5): [True: 0, False: 632]
  |  |  ------------------
  |  |   24|       |    /* constant for shuffling bytes (replacing multiple-of-8 rotates) */
  |  |   25|      0|    __m128i rot16 =
  |  |   26|      0|        _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
  |  |   27|      0|    __m128i rot8 =
  |  |   28|      0|        _mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
  |  |   29|       |
  |  |   30|      0|    __m128i x_0  = _mm_set1_epi32(x[0]);
  |  |   31|      0|    __m128i x_1  = _mm_set1_epi32(x[1]);
  |  |   32|      0|    __m128i x_2  = _mm_set1_epi32(x[2]);
  |  |   33|      0|    __m128i x_3  = _mm_set1_epi32(x[3]);
  |  |   34|      0|    __m128i x_4  = _mm_set1_epi32(x[4]);
  |  |   35|      0|    __m128i x_5  = _mm_set1_epi32(x[5]);
  |  |   36|      0|    __m128i x_6  = _mm_set1_epi32(x[6]);
  |  |   37|      0|    __m128i x_7  = _mm_set1_epi32(x[7]);
  |  |   38|      0|    __m128i x_8  = _mm_set1_epi32(x[8]);
  |  |   39|      0|    __m128i x_9  = _mm_set1_epi32(x[9]);
  |  |   40|      0|    __m128i x_10 = _mm_set1_epi32(x[10]);
  |  |   41|      0|    __m128i x_11 = _mm_set1_epi32(x[11]);
  |  |   42|      0|    __m128i x_12;
  |  |   43|      0|    __m128i x_13;
  |  |   44|      0|    __m128i x_14   = _mm_set1_epi32(x[14]);
  |  |   45|      0|    __m128i x_15   = _mm_set1_epi32(x[15]);
  |  |   46|      0|    __m128i orig0  = x_0;
  |  |   47|      0|    __m128i orig1  = x_1;
  |  |   48|      0|    __m128i orig2  = x_2;
  |  |   49|      0|    __m128i orig3  = x_3;
  |  |   50|      0|    __m128i orig4  = x_4;
  |  |   51|      0|    __m128i orig5  = x_5;
  |  |   52|      0|    __m128i orig6  = x_6;
  |  |   53|      0|    __m128i orig7  = x_7;
  |  |   54|      0|    __m128i orig8  = x_8;
  |  |   55|      0|    __m128i orig9  = x_9;
  |  |   56|      0|    __m128i orig10 = x_10;
  |  |   57|      0|    __m128i orig11 = x_11;
  |  |   58|      0|    __m128i orig12;
  |  |   59|      0|    __m128i orig13;
  |  |   60|      0|    __m128i orig14 = x_14;
  |  |   61|      0|    __m128i orig15 = x_15;
  |  |   62|      0|    __m128i t_0, t_1, t_2, t_3, t_4, t_5, t_6, t_7, t_8, t_9, t_10, t_11, t_12,
  |  |   63|      0|        t_13, t_14, t_15;
  |  |   64|       |
  |  |   65|      0|    uint32_t in12, in13;
  |  |   66|      0|    int      i;
  |  |   67|       |
  |  |   68|      0|    while (bytes >= 256) {
  |  |  ------------------
  |  |  |  Branch (68:12): [True: 0, False: 0]
  |  |  ------------------
  |  |   69|      0|        const __m128i addv12 = _mm_set_epi64x(1, 0);
  |  |   70|      0|        const __m128i addv13 = _mm_set_epi64x(3, 2);
  |  |   71|      0|        __m128i       t12, t13;
  |  |   72|      0|        uint64_t      in1213;
  |  |   73|       |
  |  |   74|      0|        x_0  = orig0;
  |  |   75|      0|        x_1  = orig1;
  |  |   76|      0|        x_2  = orig2;
  |  |   77|      0|        x_3  = orig3;
  |  |   78|      0|        x_4  = orig4;
  |  |   79|      0|        x_5  = orig5;
  |  |   80|      0|        x_6  = orig6;
  |  |   81|      0|        x_7  = orig7;
  |  |   82|      0|        x_8  = orig8;
  |  |   83|      0|        x_9  = orig9;
  |  |   84|      0|        x_10 = orig10;
  |  |   85|      0|        x_11 = orig11;
  |  |   86|      0|        x_14 = orig14;
  |  |   87|      0|        x_15 = orig15;
  |  |   88|       |
  |  |   89|      0|        in12   = x[12];
  |  |   90|      0|        in13   = x[13];
  |  |   91|      0|        in1213 = ((uint64_t) in12) | (((uint64_t) in13) << 32);
  |  |   92|      0|        t12    = _mm_set1_epi64x(in1213);
  |  |   93|      0|        t13    = _mm_set1_epi64x(in1213);
  |  |   94|       |
  |  |   95|      0|        x_12 = _mm_add_epi64(addv12, t12);
  |  |   96|      0|        x_13 = _mm_add_epi64(addv13, t13);
  |  |   97|       |
  |  |   98|      0|        t12 = _mm_unpacklo_epi32(x_12, x_13);
  |  |   99|      0|        t13 = _mm_unpackhi_epi32(x_12, x_13);
  |  |  100|       |
  |  |  101|      0|        x_12 = _mm_unpacklo_epi32(t12, t13);
  |  |  102|      0|        x_13 = _mm_unpackhi_epi32(t12, t13);
  |  |  103|       |
  |  |  104|      0|        orig12 = x_12;
  |  |  105|      0|        orig13 = x_13;
  |  |  106|       |
  |  |  107|      0|        in1213 += 4;
  |  |  108|       |
  |  |  109|      0|        x[12] = in1213 & 0xFFFFFFFF;
  |  |  110|      0|        x[13] = (in1213 >> 32) & 0xFFFFFFFF;
  |  |  111|       |
  |  |  112|      0|        for (i = 0; i < ROUNDS; i += 2) {
  |  |  ------------------
  |  |  |  |   29|      0|# define ROUNDS 20
  |  |  ------------------
  |  |  |  Branch (112:21): [True: 0, False: 0]
  |  |  ------------------
  |  |  113|      0|            VEC4_QUARTERROUND(0, 4, 8, 12);
  |  |  ------------------
  |  |  |  |   21|      0|#define VEC4_QUARTERROUND(A, B, C, D) VEC4_QUARTERROUND_SHUFFLE(A, B, C, D)
  |  |  |  |  ------------------
  |  |  |  |  |  |    8|      0|    x_##A = _mm_add_epi32(x_##A, x_##B);      \
  |  |  |  |  |  |    9|      0|    t_##A = _mm_xor_si128(x_##D, x_##A);      \
  |  |  |  |  |  |   10|      0|    x_##D = _mm_shuffle_epi8(t_##A, rot16);   \
  |  |  |  |  |  |   11|      0|    x_##C = _mm_add_epi32(x_##C, x_##D);      \
  |  |  |  |  |  |   12|      0|    t_##C = _mm_xor_si128(x_##B, x_##C);      \
  |  |  |  |  |  |   13|      0|    x_##B = VEC4_ROT(t_##C, 12);              \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |    3|      0|    _mm_or_si128(_mm_slli_epi32(A, IMM), _mm_srli_epi32(A, (32 - IMM)))
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   14|      0|    x_##A = _mm_add_epi32(x_##A, x_##B);      \
  |  |  |  |  |  |   15|      0|    t_##A = _mm_xor_si128(x_##D, x_##A);      \
  |  |  |  |  |  |   16|      0|    x_##D = _mm_shuffle_epi8(t_##A, rot8);    \
  |  |  |  |  |  |   17|      0|    x_##C = _mm_add_epi32(x_##C, x_##D);      \
  |  |  |  |  |  |   18|      0|    t_##C = _mm_xor_si128(x_##B, x_##C);      \
  |  |  |  |  |  |   19|      0|    x_##B = VEC4_ROT(t_##C, 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |    3|      0|    _mm_or_si128(_mm_slli_epi32(A, IMM), _mm_srli_epi32(A, (32 - IMM)))
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  114|      0|            VEC4_QUARTERROUND(1, 5, 9, 13);
  |  |  ------------------
  |  |  |  |   21|      0|#define VEC4_QUARTERROUND(A, B, C, D) VEC4_QUARTERROUND_SHUFFLE(A, B, C, D)
  |  |  |  |  ------------------
  |  |  |  |  |  |    8|      0|    x_##A = _mm_add_epi32(x_##A, x_##B);      \
  |  |  |  |  |  |    9|      0|    t_##A = _mm_xor_si128(x_##D, x_##A);      \
  |  |  |  |  |  |   10|      0|    x_##D = _mm_shuffle_epi8(t_##A, rot16);   \
  |  |  |  |  |  |   11|      0|    x_##C = _mm_add_epi32(x_##C, x_##D);      \
  |  |  |  |  |  |   12|      0|    t_##C = _mm_xor_si128(x_##B, x_##C);      \
  |  |  |  |  |  |   13|      0|    x_##B = VEC4_ROT(t_##C, 12);              \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |    3|      0|    _mm_or_si128(_mm_slli_epi32(A, IMM), _mm_srli_epi32(A, (32 - IMM)))
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   14|      0|    x_##A = _mm_add_epi32(x_##A, x_##B);      \
  |  |  |  |  |  |   15|      0|    t_##A = _mm_xor_si128(x_##D, x_##A);      \
  |  |  |  |  |  |   16|      0|    x_##D = _mm_shuffle_epi8(t_##A, rot8);    \
  |  |  |  |  |  |   17|      0|    x_##C = _mm_add_epi32(x_##C, x_##D);      \
  |  |  |  |  |  |   18|      0|    t_##C = _mm_xor_si128(x_##B, x_##C);      \
  |  |  |  |  |  |   19|      0|    x_##B = VEC4_ROT(t_##C, 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |    3|      0|    _mm_or_si128(_mm_slli_epi32(A, IMM), _mm_srli_epi32(A, (32 - IMM)))
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  115|      0|            VEC4_QUARTERROUND(2, 6, 10, 14);
  |  |  ------------------
  |  |  |  |   21|      0|#define VEC4_QUARTERROUND(A, B, C, D) VEC4_QUARTERROUND_SHUFFLE(A, B, C, D)
  |  |  |  |  ------------------
  |  |  |  |  |  |    8|      0|    x_##A = _mm_add_epi32(x_##A, x_##B);      \
  |  |  |  |  |  |    9|      0|    t_##A = _mm_xor_si128(x_##D, x_##A);      \
  |  |  |  |  |  |   10|      0|    x_##D = _mm_shuffle_epi8(t_##A, rot16);   \
  |  |  |  |  |  |   11|      0|    x_##C = _mm_add_epi32(x_##C, x_##D);      \
  |  |  |  |  |  |   12|      0|    t_##C = _mm_xor_si128(x_##B, x_##C);      \
  |  |  |  |  |  |   13|      0|    x_##B = VEC4_ROT(t_##C, 12);              \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |    3|      0|    _mm_or_si128(_mm_slli_epi32(A, IMM), _mm_srli_epi32(A, (32 - IMM)))
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   14|      0|    x_##A = _mm_add_epi32(x_##A, x_##B);      \
  |  |  |  |  |  |   15|      0|    t_##A = _mm_xor_si128(x_##D, x_##A);      \
  |  |  |  |  |  |   16|      0|    x_##D = _mm_shuffle_epi8(t_##A, rot8);    \
  |  |  |  |  |  |   17|      0|    x_##C = _mm_add_epi32(x_##C, x_##D);      \
  |  |  |  |  |  |   18|      0|    t_##C = _mm_xor_si128(x_##B, x_##C);      \
  |  |  |  |  |  |   19|      0|    x_##B = VEC4_ROT(t_##C, 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |    3|      0|    _mm_or_si128(_mm_slli_epi32(A, IMM), _mm_srli_epi32(A, (32 - IMM)))
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  116|      0|            VEC4_QUARTERROUND(3, 7, 11, 15);
  |  |  ------------------
  |  |  |  |   21|      0|#define VEC4_QUARTERROUND(A, B, C, D) VEC4_QUARTERROUND_SHUFFLE(A, B, C, D)
  |  |  |  |  ------------------
  |  |  |  |  |  |    8|      0|    x_##A = _mm_add_epi32(x_##A, x_##B);      \
  |  |  |  |  |  |    9|      0|    t_##A = _mm_xor_si128(x_##D, x_##A);      \
  |  |  |  |  |  |   10|      0|    x_##D = _mm_shuffle_epi8(t_##A, rot16);   \
  |  |  |  |  |  |   11|      0|    x_##C = _mm_add_epi32(x_##C, x_##D);      \
  |  |  |  |  |  |   12|      0|    t_##C = _mm_xor_si128(x_##B, x_##C);      \
  |  |  |  |  |  |   13|      0|    x_##B = VEC4_ROT(t_##C, 12);              \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |    3|      0|    _mm_or_si128(_mm_slli_epi32(A, IMM), _mm_srli_epi32(A, (32 - IMM)))
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   14|      0|    x_##A = _mm_add_epi32(x_##A, x_##B);      \
  |  |  |  |  |  |   15|      0|    t_##A = _mm_xor_si128(x_##D, x_##A);      \
  |  |  |  |  |  |   16|      0|    x_##D = _mm_shuffle_epi8(t_##A, rot8);    \
  |  |  |  |  |  |   17|      0|    x_##C = _mm_add_epi32(x_##C, x_##D);      \
  |  |  |  |  |  |   18|      0|    t_##C = _mm_xor_si128(x_##B, x_##C);      \
  |  |  |  |  |  |   19|      0|    x_##B = VEC4_ROT(t_##C, 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |    3|      0|    _mm_or_si128(_mm_slli_epi32(A, IMM), _mm_srli_epi32(A, (32 - IMM)))
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  117|      0|            VEC4_QUARTERROUND(0, 5, 10, 15);
  |  |  ------------------
  |  |  |  |   21|      0|#define VEC4_QUARTERROUND(A, B, C, D) VEC4_QUARTERROUND_SHUFFLE(A, B, C, D)
  |  |  |  |  ------------------
  |  |  |  |  |  |    8|      0|    x_##A = _mm_add_epi32(x_##A, x_##B);      \
  |  |  |  |  |  |    9|      0|    t_##A = _mm_xor_si128(x_##D, x_##A);      \
  |  |  |  |  |  |   10|      0|    x_##D = _mm_shuffle_epi8(t_##A, rot16);   \
  |  |  |  |  |  |   11|      0|    x_##C = _mm_add_epi32(x_##C, x_##D);      \
  |  |  |  |  |  |   12|      0|    t_##C = _mm_xor_si128(x_##B, x_##C);      \
  |  |  |  |  |  |   13|      0|    x_##B = VEC4_ROT(t_##C, 12);              \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |    3|      0|    _mm_or_si128(_mm_slli_epi32(A, IMM), _mm_srli_epi32(A, (32 - IMM)))
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   14|      0|    x_##A = _mm_add_epi32(x_##A, x_##B);      \
  |  |  |  |  |  |   15|      0|    t_##A = _mm_xor_si128(x_##D, x_##A);      \
  |  |  |  |  |  |   16|      0|    x_##D = _mm_shuffle_epi8(t_##A, rot8);    \
  |  |  |  |  |  |   17|      0|    x_##C = _mm_add_epi32(x_##C, x_##D);      \
  |  |  |  |  |  |   18|      0|    t_##C = _mm_xor_si128(x_##B, x_##C);      \
  |  |  |  |  |  |   19|      0|    x_##B = VEC4_ROT(t_##C, 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |    3|      0|    _mm_or_si128(_mm_slli_epi32(A, IMM), _mm_srli_epi32(A, (32 - IMM)))
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  118|      0|            VEC4_QUARTERROUND(1, 6, 11, 12);
  |  |  ------------------
  |  |  |  |   21|      0|#define VEC4_QUARTERROUND(A, B, C, D) VEC4_QUARTERROUND_SHUFFLE(A, B, C, D)
  |  |  |  |  ------------------
  |  |  |  |  |  |    8|      0|    x_##A = _mm_add_epi32(x_##A, x_##B);      \
  |  |  |  |  |  |    9|      0|    t_##A = _mm_xor_si128(x_##D, x_##A);      \
  |  |  |  |  |  |   10|      0|    x_##D = _mm_shuffle_epi8(t_##A, rot16);   \
  |  |  |  |  |  |   11|      0|    x_##C = _mm_add_epi32(x_##C, x_##D);      \
  |  |  |  |  |  |   12|      0|    t_##C = _mm_xor_si128(x_##B, x_##C);      \
  |  |  |  |  |  |   13|      0|    x_##B = VEC4_ROT(t_##C, 12);              \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |    3|      0|    _mm_or_si128(_mm_slli_epi32(A, IMM), _mm_srli_epi32(A, (32 - IMM)))
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   14|      0|    x_##A = _mm_add_epi32(x_##A, x_##B);      \
  |  |  |  |  |  |   15|      0|    t_##A = _mm_xor_si128(x_##D, x_##A);      \
  |  |  |  |  |  |   16|      0|    x_##D = _mm_shuffle_epi8(t_##A, rot8);    \
  |  |  |  |  |  |   17|      0|    x_##C = _mm_add_epi32(x_##C, x_##D);      \
  |  |  |  |  |  |   18|      0|    t_##C = _mm_xor_si128(x_##B, x_##C);      \
  |  |  |  |  |  |   19|      0|    x_##B = VEC4_ROT(t_##C, 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |    3|      0|    _mm_or_si128(_mm_slli_epi32(A, IMM), _mm_srli_epi32(A, (32 - IMM)))
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  119|      0|            VEC4_QUARTERROUND(2, 7, 8, 13);
  |  |  ------------------
  |  |  |  |   21|      0|#define VEC4_QUARTERROUND(A, B, C, D) VEC4_QUARTERROUND_SHUFFLE(A, B, C, D)
  |  |  |  |  ------------------
  |  |  |  |  |  |    8|      0|    x_##A = _mm_add_epi32(x_##A, x_##B);      \
  |  |  |  |  |  |    9|      0|    t_##A = _mm_xor_si128(x_##D, x_##A);      \
  |  |  |  |  |  |   10|      0|    x_##D = _mm_shuffle_epi8(t_##A, rot16);   \
  |  |  |  |  |  |   11|      0|    x_##C = _mm_add_epi32(x_##C, x_##D);      \
  |  |  |  |  |  |   12|      0|    t_##C = _mm_xor_si128(x_##B, x_##C);      \
  |  |  |  |  |  |   13|      0|    x_##B = VEC4_ROT(t_##C, 12);              \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |    3|      0|    _mm_or_si128(_mm_slli_epi32(A, IMM), _mm_srli_epi32(A, (32 - IMM)))
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   14|      0|    x_##A = _mm_add_epi32(x_##A, x_##B);      \
  |  |  |  |  |  |   15|      0|    t_##A = _mm_xor_si128(x_##D, x_##A);      \
  |  |  |  |  |  |   16|      0|    x_##D = _mm_shuffle_epi8(t_##A, rot8);    \
  |  |  |  |  |  |   17|      0|    x_##C = _mm_add_epi32(x_##C, x_##D);      \
  |  |  |  |  |  |   18|      0|    t_##C = _mm_xor_si128(x_##B, x_##C);      \
  |  |  |  |  |  |   19|      0|    x_##B = VEC4_ROT(t_##C, 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |    3|      0|    _mm_or_si128(_mm_slli_epi32(A, IMM), _mm_srli_epi32(A, (32 - IMM)))
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  120|      0|            VEC4_QUARTERROUND(3, 4, 9, 14);
  |  |  ------------------
  |  |  |  |   21|      0|#define VEC4_QUARTERROUND(A, B, C, D) VEC4_QUARTERROUND_SHUFFLE(A, B, C, D)
  |  |  |  |  ------------------
  |  |  |  |  |  |    8|      0|    x_##A = _mm_add_epi32(x_##A, x_##B);      \
  |  |  |  |  |  |    9|      0|    t_##A = _mm_xor_si128(x_##D, x_##A);      \
  |  |  |  |  |  |   10|      0|    x_##D = _mm_shuffle_epi8(t_##A, rot16);   \
  |  |  |  |  |  |   11|      0|    x_##C = _mm_add_epi32(x_##C, x_##D);      \
  |  |  |  |  |  |   12|      0|    t_##C = _mm_xor_si128(x_##B, x_##C);      \
  |  |  |  |  |  |   13|      0|    x_##B = VEC4_ROT(t_##C, 12);              \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |    3|      0|    _mm_or_si128(_mm_slli_epi32(A, IMM), _mm_srli_epi32(A, (32 - IMM)))
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   14|      0|    x_##A = _mm_add_epi32(x_##A, x_##B);      \
  |  |  |  |  |  |   15|      0|    t_##A = _mm_xor_si128(x_##D, x_##A);      \
  |  |  |  |  |  |   16|      0|    x_##D = _mm_shuffle_epi8(t_##A, rot8);    \
  |  |  |  |  |  |   17|      0|    x_##C = _mm_add_epi32(x_##C, x_##D);      \
  |  |  |  |  |  |   18|      0|    t_##C = _mm_xor_si128(x_##B, x_##C);      \
  |  |  |  |  |  |   19|      0|    x_##B = VEC4_ROT(t_##C, 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |    3|      0|    _mm_or_si128(_mm_slli_epi32(A, IMM), _mm_srli_epi32(A, (32 - IMM)))
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  121|      0|        }
  |  |  122|       |
  |  |  123|      0|#define ONEQUAD_TRANSPOSE(A, B, C, D)                                          \
  |  |  124|      0|    {                                                                          \
  |  |  125|      0|        __m128i t0, t1, t2, t3;                                                \
  |  |  126|      0|                                                                               \
  |  |  127|      0|        x_##A = _mm_add_epi32(x_##A, orig##A);                                 \
  |  |  128|      0|        x_##B = _mm_add_epi32(x_##B, orig##B);                                 \
  |  |  129|      0|        x_##C = _mm_add_epi32(x_##C, orig##C);                                 \
  |  |  130|      0|        x_##D = _mm_add_epi32(x_##D, orig##D);                                 \
  |  |  131|      0|        t_##A = _mm_unpacklo_epi32(x_##A, x_##B);                              \
  |  |  132|      0|        t_##B = _mm_unpacklo_epi32(x_##C, x_##D);                              \
  |  |  133|      0|        t_##C = _mm_unpackhi_epi32(x_##A, x_##B);                              \
  |  |  134|      0|        t_##D = _mm_unpackhi_epi32(x_##C, x_##D);                              \
  |  |  135|      0|        x_##A = _mm_unpacklo_epi64(t_##A, t_##B);                              \
  |  |  136|      0|        x_##B = _mm_unpackhi_epi64(t_##A, t_##B);                              \
  |  |  137|      0|        x_##C = _mm_unpacklo_epi64(t_##C, t_##D);                              \
  |  |  138|      0|        x_##D = _mm_unpackhi_epi64(t_##C, t_##D);                              \
  |  |  139|      0|                                                                               \
  |  |  140|      0|        t0 = _mm_xor_si128(x_##A, _mm_loadu_si128((const __m128i*) (m + 0)));  \
  |  |  141|      0|        _mm_storeu_si128((__m128i*) (c + 0), t0);                              \
  |  |  142|      0|        t1 = _mm_xor_si128(x_##B, _mm_loadu_si128((const __m128i*) (m + 64))); \
  |  |  143|      0|        _mm_storeu_si128((__m128i*) (c + 64), t1);                             \
  |  |  144|      0|        t2 =                                                                   \
  |  |  145|      0|            _mm_xor_si128(x_##C, _mm_loadu_si128((const __m128i*) (m + 128))); \
  |  |  146|      0|        _mm_storeu_si128((__m128i*) (c + 128), t2);                            \
  |  |  147|      0|        t3 =                                                                   \
  |  |  148|      0|            _mm_xor_si128(x_##D, _mm_loadu_si128((const __m128i*) (m + 192))); \
  |  |  149|      0|        _mm_storeu_si128((__m128i*) (c + 192), t3);                            \
  |  |  150|      0|    }
  |  |  151|       |
  |  |  152|      0|#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D)
  |  |  153|       |
  |  |  154|      0|        ONEQUAD(0, 1, 2, 3);
  |  |  ------------------
  |  |  |  |  152|      0|#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D)
  |  |  |  |  ------------------
  |  |  |  |  |  |  124|      0|    {                                                                          \
  |  |  |  |  |  |  125|      0|        __m128i t0, t1, t2, t3;                                                \
  |  |  |  |  |  |  126|      0|                                                                               \
  |  |  |  |  |  |  127|      0|        x_##A = _mm_add_epi32(x_##A, orig##A);                                 \
  |  |  |  |  |  |  128|      0|        x_##B = _mm_add_epi32(x_##B, orig##B);                                 \
  |  |  |  |  |  |  129|      0|        x_##C = _mm_add_epi32(x_##C, orig##C);                                 \
  |  |  |  |  |  |  130|      0|        x_##D = _mm_add_epi32(x_##D, orig##D);                                 \
  |  |  |  |  |  |  131|      0|        t_##A = _mm_unpacklo_epi32(x_##A, x_##B);                              \
  |  |  |  |  |  |  132|      0|        t_##B = _mm_unpacklo_epi32(x_##C, x_##D);                              \
  |  |  |  |  |  |  133|      0|        t_##C = _mm_unpackhi_epi32(x_##A, x_##B);                              \
  |  |  |  |  |  |  134|      0|        t_##D = _mm_unpackhi_epi32(x_##C, x_##D);                              \
  |  |  |  |  |  |  135|      0|        x_##A = _mm_unpacklo_epi64(t_##A, t_##B);                              \
  |  |  |  |  |  |  136|      0|        x_##B = _mm_unpackhi_epi64(t_##A, t_##B);                              \
  |  |  |  |  |  |  137|      0|        x_##C = _mm_unpacklo_epi64(t_##C, t_##D);                              \
  |  |  |  |  |  |  138|      0|        x_##D = _mm_unpackhi_epi64(t_##C, t_##D);                              \
  |  |  |  |  |  |  139|      0|                                                                               \
  |  |  |  |  |  |  140|      0|        t0 = _mm_xor_si128(x_##A, _mm_loadu_si128((const __m128i*) (m + 0)));  \
  |  |  |  |  |  |  141|      0|        _mm_storeu_si128((__m128i*) (c + 0), t0);                              \
  |  |  |  |  |  |  142|      0|        t1 = _mm_xor_si128(x_##B, _mm_loadu_si128((const __m128i*) (m + 64))); \
  |  |  |  |  |  |  143|      0|        _mm_storeu_si128((__m128i*) (c + 64), t1);                             \
  |  |  |  |  |  |  144|      0|        t2 =                                                                   \
  |  |  |  |  |  |  145|      0|            _mm_xor_si128(x_##C, _mm_loadu_si128((const __m128i*) (m + 128))); \
  |  |  |  |  |  |  146|      0|        _mm_storeu_si128((__m128i*) (c + 128), t2);                            \
  |  |  |  |  |  |  147|      0|        t3 =                                                                   \
  |  |  |  |  |  |  148|      0|            _mm_xor_si128(x_##D, _mm_loadu_si128((const __m128i*) (m + 192))); \
  |  |  |  |  |  |  149|      0|        _mm_storeu_si128((__m128i*) (c + 192), t3);                            \
  |  |  |  |  |  |  150|      0|    }
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  155|      0|        m += 16;
  |  |  156|      0|        c += 16;
  |  |  157|      0|        ONEQUAD(4, 5, 6, 7);
  |  |  ------------------
  |  |  |  |  152|      0|#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D)
  |  |  |  |  ------------------
  |  |  |  |  |  |  124|      0|    {                                                                          \
  |  |  |  |  |  |  125|      0|        __m128i t0, t1, t2, t3;                                                \
  |  |  |  |  |  |  126|      0|                                                                               \
  |  |  |  |  |  |  127|      0|        x_##A = _mm_add_epi32(x_##A, orig##A);                                 \
  |  |  |  |  |  |  128|      0|        x_##B = _mm_add_epi32(x_##B, orig##B);                                 \
  |  |  |  |  |  |  129|      0|        x_##C = _mm_add_epi32(x_##C, orig##C);                                 \
  |  |  |  |  |  |  130|      0|        x_##D = _mm_add_epi32(x_##D, orig##D);                                 \
  |  |  |  |  |  |  131|      0|        t_##A = _mm_unpacklo_epi32(x_##A, x_##B);                              \
  |  |  |  |  |  |  132|      0|        t_##B = _mm_unpacklo_epi32(x_##C, x_##D);                              \
  |  |  |  |  |  |  133|      0|        t_##C = _mm_unpackhi_epi32(x_##A, x_##B);                              \
  |  |  |  |  |  |  134|      0|        t_##D = _mm_unpackhi_epi32(x_##C, x_##D);                              \
  |  |  |  |  |  |  135|      0|        x_##A = _mm_unpacklo_epi64(t_##A, t_##B);                              \
  |  |  |  |  |  |  136|      0|        x_##B = _mm_unpackhi_epi64(t_##A, t_##B);                              \
  |  |  |  |  |  |  137|      0|        x_##C = _mm_unpacklo_epi64(t_##C, t_##D);                              \
  |  |  |  |  |  |  138|      0|        x_##D = _mm_unpackhi_epi64(t_##C, t_##D);                              \
  |  |  |  |  |  |  139|      0|                                                                               \
  |  |  |  |  |  |  140|      0|        t0 = _mm_xor_si128(x_##A, _mm_loadu_si128((const __m128i*) (m + 0)));  \
  |  |  |  |  |  |  141|      0|        _mm_storeu_si128((__m128i*) (c + 0), t0);                              \
  |  |  |  |  |  |  142|      0|        t1 = _mm_xor_si128(x_##B, _mm_loadu_si128((const __m128i*) (m + 64))); \
  |  |  |  |  |  |  143|      0|        _mm_storeu_si128((__m128i*) (c + 64), t1);                             \
  |  |  |  |  |  |  144|      0|        t2 =                                                                   \
  |  |  |  |  |  |  145|      0|            _mm_xor_si128(x_##C, _mm_loadu_si128((const __m128i*) (m + 128))); \
  |  |  |  |  |  |  146|      0|        _mm_storeu_si128((__m128i*) (c + 128), t2);                            \
  |  |  |  |  |  |  147|      0|        t3 =                                                                   \
  |  |  |  |  |  |  148|      0|            _mm_xor_si128(x_##D, _mm_loadu_si128((const __m128i*) (m + 192))); \
  |  |  |  |  |  |  149|      0|        _mm_storeu_si128((__m128i*) (c + 192), t3);                            \
  |  |  |  |  |  |  150|      0|    }
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  158|      0|        m += 16;
  |  |  159|      0|        c += 16;
  |  |  160|      0|        ONEQUAD(8, 9, 10, 11);
  |  |  ------------------
  |  |  |  |  152|      0|#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D)
  |  |  |  |  ------------------
  |  |  |  |  |  |  124|      0|    {                                                                          \
  |  |  |  |  |  |  125|      0|        __m128i t0, t1, t2, t3;                                                \
  |  |  |  |  |  |  126|      0|                                                                               \
  |  |  |  |  |  |  127|      0|        x_##A = _mm_add_epi32(x_##A, orig##A);                                 \
  |  |  |  |  |  |  128|      0|        x_##B = _mm_add_epi32(x_##B, orig##B);                                 \
  |  |  |  |  |  |  129|      0|        x_##C = _mm_add_epi32(x_##C, orig##C);                                 \
  |  |  |  |  |  |  130|      0|        x_##D = _mm_add_epi32(x_##D, orig##D);                                 \
  |  |  |  |  |  |  131|      0|        t_##A = _mm_unpacklo_epi32(x_##A, x_##B);                              \
  |  |  |  |  |  |  132|      0|        t_##B = _mm_unpacklo_epi32(x_##C, x_##D);                              \
  |  |  |  |  |  |  133|      0|        t_##C = _mm_unpackhi_epi32(x_##A, x_##B);                              \
  |  |  |  |  |  |  134|      0|        t_##D = _mm_unpackhi_epi32(x_##C, x_##D);                              \
  |  |  |  |  |  |  135|      0|        x_##A = _mm_unpacklo_epi64(t_##A, t_##B);                              \
  |  |  |  |  |  |  136|      0|        x_##B = _mm_unpackhi_epi64(t_##A, t_##B);                              \
  |  |  |  |  |  |  137|      0|        x_##C = _mm_unpacklo_epi64(t_##C, t_##D);                              \
  |  |  |  |  |  |  138|      0|        x_##D = _mm_unpackhi_epi64(t_##C, t_##D);                              \
  |  |  |  |  |  |  139|      0|                                                                               \
  |  |  |  |  |  |  140|      0|        t0 = _mm_xor_si128(x_##A, _mm_loadu_si128((const __m128i*) (m + 0)));  \
  |  |  |  |  |  |  141|      0|        _mm_storeu_si128((__m128i*) (c + 0), t0);                              \
  |  |  |  |  |  |  142|      0|        t1 = _mm_xor_si128(x_##B, _mm_loadu_si128((const __m128i*) (m + 64))); \
  |  |  |  |  |  |  143|      0|        _mm_storeu_si128((__m128i*) (c + 64), t1);                             \
  |  |  |  |  |  |  144|      0|        t2 =                                                                   \
  |  |  |  |  |  |  145|      0|            _mm_xor_si128(x_##C, _mm_loadu_si128((const __m128i*) (m + 128))); \
  |  |  |  |  |  |  146|      0|        _mm_storeu_si128((__m128i*) (c + 128), t2);                            \
  |  |  |  |  |  |  147|      0|        t3 =                                                                   \
  |  |  |  |  |  |  148|      0|            _mm_xor_si128(x_##D, _mm_loadu_si128((const __m128i*) (m + 192))); \
  |  |  |  |  |  |  149|      0|        _mm_storeu_si128((__m128i*) (c + 192), t3);                            \
  |  |  |  |  |  |  150|      0|    }
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  161|      0|        m += 16;
  |  |  162|      0|        c += 16;
  |  |  163|      0|        ONEQUAD(12, 13, 14, 15);
  |  |  ------------------
  |  |  |  |  152|      0|#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D)
  |  |  |  |  ------------------
  |  |  |  |  |  |  124|      0|    {                                                                          \
  |  |  |  |  |  |  125|      0|        __m128i t0, t1, t2, t3;                                                \
  |  |  |  |  |  |  126|      0|                                                                               \
  |  |  |  |  |  |  127|      0|        x_##A = _mm_add_epi32(x_##A, orig##A);                                 \
  |  |  |  |  |  |  128|      0|        x_##B = _mm_add_epi32(x_##B, orig##B);                                 \
  |  |  |  |  |  |  129|      0|        x_##C = _mm_add_epi32(x_##C, orig##C);                                 \
  |  |  |  |  |  |  130|      0|        x_##D = _mm_add_epi32(x_##D, orig##D);                                 \
  |  |  |  |  |  |  131|      0|        t_##A = _mm_unpacklo_epi32(x_##A, x_##B);                              \
  |  |  |  |  |  |  132|      0|        t_##B = _mm_unpacklo_epi32(x_##C, x_##D);                              \
  |  |  |  |  |  |  133|      0|        t_##C = _mm_unpackhi_epi32(x_##A, x_##B);                              \
  |  |  |  |  |  |  134|      0|        t_##D = _mm_unpackhi_epi32(x_##C, x_##D);                              \
  |  |  |  |  |  |  135|      0|        x_##A = _mm_unpacklo_epi64(t_##A, t_##B);                              \
  |  |  |  |  |  |  136|      0|        x_##B = _mm_unpackhi_epi64(t_##A, t_##B);                              \
  |  |  |  |  |  |  137|      0|        x_##C = _mm_unpacklo_epi64(t_##C, t_##D);                              \
  |  |  |  |  |  |  138|      0|        x_##D = _mm_unpackhi_epi64(t_##C, t_##D);                              \
  |  |  |  |  |  |  139|      0|                                                                               \
  |  |  |  |  |  |  140|      0|        t0 = _mm_xor_si128(x_##A, _mm_loadu_si128((const __m128i*) (m + 0)));  \
  |  |  |  |  |  |  141|      0|        _mm_storeu_si128((__m128i*) (c + 0), t0);                              \
  |  |  |  |  |  |  142|      0|        t1 = _mm_xor_si128(x_##B, _mm_loadu_si128((const __m128i*) (m + 64))); \
  |  |  |  |  |  |  143|      0|        _mm_storeu_si128((__m128i*) (c + 64), t1);                             \
  |  |  |  |  |  |  144|      0|        t2 =                                                                   \
  |  |  |  |  |  |  145|      0|            _mm_xor_si128(x_##C, _mm_loadu_si128((const __m128i*) (m + 128))); \
  |  |  |  |  |  |  146|      0|        _mm_storeu_si128((__m128i*) (c + 128), t2);                            \
  |  |  |  |  |  |  147|      0|        t3 =                                                                   \
  |  |  |  |  |  |  148|      0|            _mm_xor_si128(x_##D, _mm_loadu_si128((const __m128i*) (m + 192))); \
  |  |  |  |  |  |  149|      0|        _mm_storeu_si128((__m128i*) (c + 192), t3);                            \
  |  |  |  |  |  |  150|      0|    }
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  164|      0|        m -= 48;
  |  |  165|      0|        c -= 48;
  |  |  166|       |
  |  |  167|      0|#undef ONEQUAD
  |  |  168|      0|#undef ONEQUAD_TRANSPOSE
  |  |  169|       |
  |  |  170|      0|        bytes -= 256;
  |  |  171|      0|        c += 256;
  |  |  172|      0|        m += 256;
  |  |  173|      0|    }
  |  |  174|      0|}
  |  |  175|    632|#undef VEC4_ROT
  |  |  176|    632|#undef VEC4_QUARTERROUND
  |  |  177|    632|#undef VEC4_QUARTERROUND_SHUFFLE
  ------------------
   81|    632|# include "u1.h"
  ------------------
  |  |    1|    632|while (bytes >= 64) {
  |  |  ------------------
  |  |  |  Branch (1:8): [True: 0, False: 632]
  |  |  ------------------
  |  |    2|      0|    __m128i       x_0, x_1, x_2, x_3;
  |  |    3|      0|    __m128i       t_1;
  |  |    4|      0|    const __m128i rot16 =
  |  |    5|      0|        _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
  |  |    6|      0|    const __m128i rot8 =
  |  |    7|      0|        _mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
  |  |    8|       |
  |  |    9|      0|    uint32_t in12;
  |  |   10|      0|    uint32_t in13;
  |  |   11|      0|    int      i;
  |  |   12|       |
  |  |   13|      0|    x_0 = _mm_loadu_si128((const __m128i*) (x + 0));
  |  |   14|      0|    x_1 = _mm_loadu_si128((const __m128i*) (x + 4));
  |  |   15|      0|    x_2 = _mm_loadu_si128((const __m128i*) (x + 8));
  |  |   16|      0|    x_3 = _mm_loadu_si128((const __m128i*) (x + 12));
  |  |   17|       |
  |  |   18|      0|    for (i = 0; i < ROUNDS; i += 2) {
  |  |  ------------------
  |  |  |  |   29|      0|# define ROUNDS 20
  |  |  ------------------
  |  |  |  Branch (18:17): [True: 0, False: 0]
  |  |  ------------------
  |  |   19|      0|        x_0 = _mm_add_epi32(x_0, x_1);
  |  |   20|      0|        x_3 = _mm_xor_si128(x_3, x_0);
  |  |   21|      0|        x_3 = _mm_shuffle_epi8(x_3, rot16);
  |  |   22|       |
  |  |   23|      0|        x_2 = _mm_add_epi32(x_2, x_3);
  |  |   24|      0|        x_1 = _mm_xor_si128(x_1, x_2);
  |  |   25|       |
  |  |   26|      0|        t_1 = x_1;
  |  |   27|      0|        x_1 = _mm_slli_epi32(x_1, 12);
  |  |   28|      0|        t_1 = _mm_srli_epi32(t_1, 20);
  |  |   29|      0|        x_1 = _mm_xor_si128(x_1, t_1);
  |  |   30|       |
  |  |   31|      0|        x_0 = _mm_add_epi32(x_0, x_1);
  |  |   32|      0|        x_3 = _mm_xor_si128(x_3, x_0);
  |  |   33|      0|        x_0 = _mm_shuffle_epi32(x_0, 0x93);
  |  |   34|      0|        x_3 = _mm_shuffle_epi8(x_3, rot8);
  |  |   35|       |
  |  |   36|      0|        x_2 = _mm_add_epi32(x_2, x_3);
  |  |   37|      0|        x_3 = _mm_shuffle_epi32(x_3, 0x4e);
  |  |   38|      0|        x_1 = _mm_xor_si128(x_1, x_2);
  |  |   39|      0|        x_2 = _mm_shuffle_epi32(x_2, 0x39);
  |  |   40|       |
  |  |   41|      0|        t_1 = x_1;
  |  |   42|      0|        x_1 = _mm_slli_epi32(x_1, 7);
  |  |   43|      0|        t_1 = _mm_srli_epi32(t_1, 25);
  |  |   44|      0|        x_1 = _mm_xor_si128(x_1, t_1);
  |  |   45|       |
  |  |   46|      0|        x_0 = _mm_add_epi32(x_0, x_1);
  |  |   47|      0|        x_3 = _mm_xor_si128(x_3, x_0);
  |  |   48|      0|        x_3 = _mm_shuffle_epi8(x_3, rot16);
  |  |   49|       |
  |  |   50|      0|        x_2 = _mm_add_epi32(x_2, x_3);
  |  |   51|      0|        x_1 = _mm_xor_si128(x_1, x_2);
  |  |   52|       |
  |  |   53|      0|        t_1 = x_1;
  |  |   54|      0|        x_1 = _mm_slli_epi32(x_1, 12);
  |  |   55|      0|        t_1 = _mm_srli_epi32(t_1, 20);
  |  |   56|      0|        x_1 = _mm_xor_si128(x_1, t_1);
  |  |   57|       |
  |  |   58|      0|        x_0 = _mm_add_epi32(x_0, x_1);
  |  |   59|      0|        x_3 = _mm_xor_si128(x_3, x_0);
  |  |   60|      0|        x_0 = _mm_shuffle_epi32(x_0, 0x39);
  |  |   61|      0|        x_3 = _mm_shuffle_epi8(x_3, rot8);
  |  |   62|       |
  |  |   63|      0|        x_2 = _mm_add_epi32(x_2, x_3);
  |  |   64|      0|        x_3 = _mm_shuffle_epi32(x_3, 0x4e);
  |  |   65|      0|        x_1 = _mm_xor_si128(x_1, x_2);
  |  |   66|      0|        x_2 = _mm_shuffle_epi32(x_2, 0x93);
  |  |   67|       |
  |  |   68|      0|        t_1 = x_1;
  |  |   69|      0|        x_1 = _mm_slli_epi32(x_1, 7);
  |  |   70|      0|        t_1 = _mm_srli_epi32(t_1, 25);
  |  |   71|      0|        x_1 = _mm_xor_si128(x_1, t_1);
  |  |   72|      0|    }
  |  |   73|      0|    x_0 = _mm_add_epi32(x_0, _mm_loadu_si128((const __m128i*) (x + 0)));
  |  |   74|      0|    x_1 = _mm_add_epi32(x_1, _mm_loadu_si128((const __m128i*) (x + 4)));
  |  |   75|      0|    x_2 = _mm_add_epi32(x_2, _mm_loadu_si128((const __m128i*) (x + 8)));
  |  |   76|      0|    x_3 = _mm_add_epi32(x_3, _mm_loadu_si128((const __m128i*) (x + 12)));
  |  |   77|      0|    x_0 = _mm_xor_si128(x_0, _mm_loadu_si128((const __m128i*) (m + 0)));
  |  |   78|      0|    x_1 = _mm_xor_si128(x_1, _mm_loadu_si128((const __m128i*) (m + 16)));
  |  |   79|      0|    x_2 = _mm_xor_si128(x_2, _mm_loadu_si128((const __m128i*) (m + 32)));
  |  |   80|      0|    x_3 = _mm_xor_si128(x_3, _mm_loadu_si128((const __m128i*) (m + 48)));
  |  |   81|      0|    _mm_storeu_si128((__m128i*) (c + 0), x_0);
  |  |   82|      0|    _mm_storeu_si128((__m128i*) (c + 16), x_1);
  |  |   83|      0|    _mm_storeu_si128((__m128i*) (c + 32), x_2);
  |  |   84|      0|    _mm_storeu_si128((__m128i*) (c + 48), x_3);
  |  |   85|       |
  |  |   86|      0|    in12 = x[12];
  |  |   87|      0|    in13 = x[13];
  |  |   88|      0|    in12++;
  |  |   89|      0|    if (in12 == 0) {
  |  |  ------------------
  |  |  |  Branch (89:9): [True: 0, False: 0]
  |  |  ------------------
  |  |   90|      0|        in13++;
  |  |   91|      0|    }
  |  |   92|      0|    x[12] = in12;
  |  |   93|      0|    x[13] = in13;
  |  |   94|       |
  |  |   95|      0|    bytes -= 64;
  |  |   96|      0|    c += 64;
  |  |   97|      0|    m += 64;
  |  |   98|      0|}
  ------------------
   82|    632|# include "u0.h"
  ------------------
  |  |    1|    632|if (bytes > 0) {
  |  |  ------------------
  |  |  |  Branch (1:5): [True: 632, False: 0]
  |  |  ------------------
  |  |    2|    632|    __m128i       x_0, x_1, x_2, x_3;
  |  |    3|    632|    __m128i       t_1;
  |  |    4|    632|    const __m128i rot16 =
  |  |    5|    632|        _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
  |  |    6|    632|    const __m128i rot8 =
  |  |    7|    632|        _mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
  |  |    8|    632|    uint8_t partialblock[64];
  |  |    9|       |
  |  |   10|    632|    unsigned int i;
  |  |   11|       |
  |  |   12|    632|    x_0 = _mm_loadu_si128((const __m128i*) (x + 0));
  |  |   13|    632|    x_1 = _mm_loadu_si128((const __m128i*) (x + 4));
  |  |   14|    632|    x_2 = _mm_loadu_si128((const __m128i*) (x + 8));
  |  |   15|    632|    x_3 = _mm_loadu_si128((const __m128i*) (x + 12));
  |  |   16|       |
  |  |   17|  6.95k|    for (i = 0; i < ROUNDS; i += 2) {
  |  |  ------------------
  |  |  |  |   29|  6.95k|# define ROUNDS 20
  |  |  ------------------
  |  |  |  Branch (17:17): [True: 6.32k, False: 632]
  |  |  ------------------
  |  |   18|  6.32k|        x_0 = _mm_add_epi32(x_0, x_1);
  |  |   19|  6.32k|        x_3 = _mm_xor_si128(x_3, x_0);
  |  |   20|  6.32k|        x_3 = _mm_shuffle_epi8(x_3, rot16);
  |  |   21|       |
  |  |   22|  6.32k|        x_2 = _mm_add_epi32(x_2, x_3);
  |  |   23|  6.32k|        x_1 = _mm_xor_si128(x_1, x_2);
  |  |   24|       |
  |  |   25|  6.32k|        t_1 = x_1;
  |  |   26|  6.32k|        x_1 = _mm_slli_epi32(x_1, 12);
  |  |   27|  6.32k|        t_1 = _mm_srli_epi32(t_1, 20);
  |  |   28|  6.32k|        x_1 = _mm_xor_si128(x_1, t_1);
  |  |   29|       |
  |  |   30|  6.32k|        x_0 = _mm_add_epi32(x_0, x_1);
  |  |   31|  6.32k|        x_3 = _mm_xor_si128(x_3, x_0);
  |  |   32|  6.32k|        x_0 = _mm_shuffle_epi32(x_0, 0x93);
  |  |   33|  6.32k|        x_3 = _mm_shuffle_epi8(x_3, rot8);
  |  |   34|       |
  |  |   35|  6.32k|        x_2 = _mm_add_epi32(x_2, x_3);
  |  |   36|  6.32k|        x_3 = _mm_shuffle_epi32(x_3, 0x4e);
  |  |   37|  6.32k|        x_1 = _mm_xor_si128(x_1, x_2);
  |  |   38|  6.32k|        x_2 = _mm_shuffle_epi32(x_2, 0x39);
  |  |   39|       |
  |  |   40|  6.32k|        t_1 = x_1;
  |  |   41|  6.32k|        x_1 = _mm_slli_epi32(x_1, 7);
  |  |   42|  6.32k|        t_1 = _mm_srli_epi32(t_1, 25);
  |  |   43|  6.32k|        x_1 = _mm_xor_si128(x_1, t_1);
  |  |   44|       |
  |  |   45|  6.32k|        x_0 = _mm_add_epi32(x_0, x_1);
  |  |   46|  6.32k|        x_3 = _mm_xor_si128(x_3, x_0);
  |  |   47|  6.32k|        x_3 = _mm_shuffle_epi8(x_3, rot16);
  |  |   48|       |
  |  |   49|  6.32k|        x_2 = _mm_add_epi32(x_2, x_3);
  |  |   50|  6.32k|        x_1 = _mm_xor_si128(x_1, x_2);
  |  |   51|       |
  |  |   52|  6.32k|        t_1 = x_1;
  |  |   53|  6.32k|        x_1 = _mm_slli_epi32(x_1, 12);
  |  |   54|  6.32k|        t_1 = _mm_srli_epi32(t_1, 20);
  |  |   55|  6.32k|        x_1 = _mm_xor_si128(x_1, t_1);
  |  |   56|       |
  |  |   57|  6.32k|        x_0 = _mm_add_epi32(x_0, x_1);
  |  |   58|  6.32k|        x_3 = _mm_xor_si128(x_3, x_0);
  |  |   59|  6.32k|        x_0 = _mm_shuffle_epi32(x_0, 0x39);
  |  |   60|  6.32k|        x_3 = _mm_shuffle_epi8(x_3, rot8);
  |  |   61|       |
  |  |   62|  6.32k|        x_2 = _mm_add_epi32(x_2, x_3);
  |  |   63|  6.32k|        x_3 = _mm_shuffle_epi32(x_3, 0x4e);
  |  |   64|  6.32k|        x_1 = _mm_xor_si128(x_1, x_2);
  |  |   65|  6.32k|        x_2 = _mm_shuffle_epi32(x_2, 0x93);
  |  |   66|       |
  |  |   67|  6.32k|        t_1 = x_1;
  |  |   68|  6.32k|        x_1 = _mm_slli_epi32(x_1, 7);
  |  |   69|  6.32k|        t_1 = _mm_srli_epi32(t_1, 25);
  |  |   70|  6.32k|        x_1 = _mm_xor_si128(x_1, t_1);
  |  |   71|  6.32k|    }
  |  |   72|    632|    x_0 = _mm_add_epi32(x_0, _mm_loadu_si128((const __m128i*) (x + 0)));
  |  |   73|    632|    x_1 = _mm_add_epi32(x_1, _mm_loadu_si128((const __m128i*) (x + 4)));
  |  |   74|    632|    x_2 = _mm_add_epi32(x_2, _mm_loadu_si128((const __m128i*) (x + 8)));
  |  |   75|    632|    x_3 = _mm_add_epi32(x_3, _mm_loadu_si128((const __m128i*) (x + 12)));
  |  |   76|    632|    _mm_storeu_si128((__m128i*) (partialblock + 0), x_0);
  |  |   77|    632|    _mm_storeu_si128((__m128i*) (partialblock + 16), x_1);
  |  |   78|    632|    _mm_storeu_si128((__m128i*) (partialblock + 32), x_2);
  |  |   79|    632|    _mm_storeu_si128((__m128i*) (partialblock + 48), x_3);
  |  |   80|       |
  |  |   81|  18.3k|    for (i = 0; i < bytes; i++) {
  |  |  ------------------
  |  |  |  Branch (81:17): [True: 17.6k, False: 632]
  |  |  ------------------
  |  |   82|  17.6k|        c[i] = m[i] ^ partialblock[i];
  |  |   83|  17.6k|    }
  |  |   84|       |
  |  |   85|    632|    sodium_memzero(partialblock, sizeof partialblock);
  |  |   86|    632|}
  ------------------
   83|    632|}
chacha20_dolbeau-avx2.c:stream_ietf_ext_ref:
  107|    632|{
  108|    632|    struct chacha_ctx ctx;
  109|       |
  110|    632|    if (!clen) {
  ------------------
  |  Branch (110:9): [True: 0, False: 632]
  ------------------
  111|      0|        return 0;
  112|      0|    }
  113|    632|    COMPILER_ASSERT(crypto_stream_chacha20_KEYBYTES == 256 / 8);
  ------------------
  |  |   23|    632|#define COMPILER_ASSERT(X) (void) sizeof(char[(X) ? 1 : -1])
  ------------------
  114|    632|    chacha_keysetup(&ctx, k);
  115|    632|    chacha_ietf_ivsetup(&ctx, n, NULL);
  116|    632|    memset(c, 0, clen);
  117|    632|    chacha20_encrypt_bytes(&ctx, c, c, clen);
  118|    632|    sodium_memzero(&ctx, sizeof ctx);
  119|       |
  120|    632|    return 0;
  121|    632|}
chacha20_dolbeau-avx2.c:chacha_ietf_ivsetup:
   63|    632|{
   64|    632|    ctx->input[12] = counter == NULL ? 0 : LOAD32_LE(counter);
  ------------------
  |  |  111|    632|#define LOAD32_LE(SRC) load32_le(SRC)
  ------------------
  |  Branch (64:22): [True: 632, False: 0]
  ------------------
   65|    632|    ctx->input[13] = LOAD32_LE(iv + 0);
  ------------------
  |  |  111|    632|#define LOAD32_LE(SRC) load32_le(SRC)
  ------------------
   66|    632|    ctx->input[14] = LOAD32_LE(iv + 4);
  ------------------
  |  |  111|    632|#define LOAD32_LE(SRC) load32_le(SRC)
  ------------------
   67|    632|    ctx->input[15] = LOAD32_LE(iv + 8);
  ------------------
  |  |  111|    632|#define LOAD32_LE(SRC) load32_le(SRC)
  ------------------
   68|    632|}

crypto_stream_chacha20_ietf_ext:
   90|    632|{
   91|    632|    if (clen > crypto_stream_chacha20_MESSAGEBYTES_MAX) {
  ------------------
  |  |   31|    632|#define crypto_stream_chacha20_MESSAGEBYTES_MAX SODIUM_SIZE_MAX
  |  |  ------------------
  |  |  |  |   55|    632|#define SODIUM_SIZE_MAX SODIUM_MIN(UINT64_MAX, SIZE_MAX)
  |  |  |  |  ------------------
  |  |  |  |  |  |   54|    632|#define SODIUM_MIN(A, B) ((A) < (B) ? (A) : (B))
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (54:27): [Folded - Ignored]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  |  Branch (91:9): [True: 0, False: 632]
  ------------------
   92|      0|        sodium_misuse();
   93|      0|    }
   94|    632|    return implementation->stream_ietf_ext(c, clen, n, k);
   95|    632|}
crypto_stream_chacha20_ietf:
  123|    632|{
  124|    632|    if (clen > crypto_stream_chacha20_ietf_MESSAGEBYTES_MAX) {
  ------------------
  |  |   70|    632|    SODIUM_MIN(SODIUM_SIZE_MAX, 64ULL * (1ULL << 32))
  |  |  ------------------
  |  |  |  |   54|  1.26k|#define SODIUM_MIN(A, B) ((A) < (B) ? (A) : (B))
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (54:27): [Folded - Ignored]
  |  |  |  |  |  Branch (54:28): [Folded - Ignored]
  |  |  |  |  |  Branch (54:40): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  |  Branch (124:9): [True: 0, False: 632]
  ------------------
  125|      0|        sodium_misuse();
  126|      0|    }
  127|    632|    return crypto_stream_chacha20_ietf_ext(c, clen, n, k);
  128|    632|}
_crypto_stream_chacha20_pick_best_implementation:
  168|      1|{
  169|      1|    implementation = &crypto_stream_chacha20_ref_implementation;
  170|      1|#if defined(HAVE_AVX2INTRIN_H) && defined(HAVE_EMMINTRIN_H) && \
  171|      1|    defined(HAVE_TMMINTRIN_H) && defined(HAVE_SMMINTRIN_H)
  172|      1|    if (sodium_runtime_has_avx2()) {
  ------------------
  |  Branch (172:9): [True: 1, False: 0]
  ------------------
  173|      1|        implementation = &crypto_stream_chacha20_dolbeau_avx2_implementation;
  174|      1|        return 0;
  175|      1|    }
  176|      0|#endif
  177|      0|#if defined(HAVE_EMMINTRIN_H) && defined(HAVE_TMMINTRIN_H)
  178|      0|    if (sodium_runtime_has_ssse3()) {
  ------------------
  |  Branch (178:9): [True: 0, False: 0]
  ------------------
  179|      0|        implementation = &crypto_stream_chacha20_dolbeau_ssse3_implementation;
  180|      0|        return 0;
  181|      0|    }
  182|      0|#endif
  183|      0|    return 0;
  184|      0|}

crypto_stream_salsa20:
   50|    316|{
   51|    316|    return implementation->stream(c, clen, n, k);
   52|    316|}
crypto_stream_salsa20_xor_ic:
   59|    484|{
   60|    484|    return implementation->stream_xor_ic(c, m, mlen, n, ic, k);
   61|    484|}
crypto_stream_salsa20_xor:
   67|    632|{
   68|    632|    return implementation->stream_xor_ic(c, m, mlen, n, 0U, k);
   69|    632|}
_crypto_stream_salsa20_pick_best_implementation:
   79|      1|{
   80|      1|#ifdef HAVE_AMD64_ASM
   81|      1|    implementation = &crypto_stream_salsa20_xmm6_implementation;
   82|       |#else
   83|       |    implementation = &crypto_stream_salsa20_ref_implementation;
   84|       |#endif
   85|       |
   86|      1|#if defined(HAVE_AVX2INTRIN_H) && defined(HAVE_EMMINTRIN_H) && \
   87|      1|    defined(HAVE_TMMINTRIN_H) && defined(HAVE_SMMINTRIN_H)
   88|      1|    if (sodium_runtime_has_avx2()) {
  ------------------
  |  Branch (88:9): [True: 1, False: 0]
  ------------------
   89|      1|        implementation = &crypto_stream_salsa20_xmm6int_avx2_implementation;
   90|      1|        return 0;
   91|      1|    }
   92|      0|#endif
   93|       |#if !defined(HAVE_AMD64_ASM) && defined(HAVE_EMMINTRIN_H)
   94|       |    if (sodium_runtime_has_sse2()) {
   95|       |        implementation = &crypto_stream_salsa20_xmm6int_sse2_implementation;
   96|       |        return 0;
   97|       |    }
   98|       |#endif
   99|      0|    return 0; /* LCOV_EXCL_LINE */
  100|      1|}

salsa20_xmm6int-avx2.c:stream_avx2:
   83|    316|{
   84|    316|    struct salsa_ctx ctx;
   85|       |
   86|    316|    if (!clen) {
  ------------------
  |  Branch (86:9): [True: 0, False: 316]
  ------------------
   87|      0|        return 0;
   88|      0|    }
   89|    316|    COMPILER_ASSERT(crypto_stream_salsa20_KEYBYTES == 256 / 8);
  ------------------
  |  |   23|    316|#define COMPILER_ASSERT(X) (void) sizeof(char[(X) ? 1 : -1])
  ------------------
   90|    316|    salsa_keysetup(&ctx, k);
   91|    316|    salsa_ivsetup(&ctx, n, NULL);
   92|    316|    memset(c, 0, clen);
   93|    316|    salsa20_encrypt_bytes(&ctx, c, c, clen);
   94|    316|    sodium_memzero(&ctx, sizeof ctx);
   95|       |
   96|    316|    return 0;
   97|    316|}
salsa20_xmm6int-avx2.c:salsa_keysetup:
   40|  1.43k|{
   41|  1.43k|    ctx->input[TR[1]]  = LOAD32_LE(k + 0);
  ------------------
  |  |  111|  1.43k|#define LOAD32_LE(SRC) load32_le(SRC)
  ------------------
   42|  1.43k|    ctx->input[TR[2]]  = LOAD32_LE(k + 4);
  ------------------
  |  |  111|  1.43k|#define LOAD32_LE(SRC) load32_le(SRC)
  ------------------
   43|  1.43k|    ctx->input[TR[3]]  = LOAD32_LE(k + 8);
  ------------------
  |  |  111|  1.43k|#define LOAD32_LE(SRC) load32_le(SRC)
  ------------------
   44|  1.43k|    ctx->input[TR[4]]  = LOAD32_LE(k + 12);
  ------------------
  |  |  111|  1.43k|#define LOAD32_LE(SRC) load32_le(SRC)
  ------------------
   45|  1.43k|    ctx->input[TR[11]] = LOAD32_LE(k + 16);
  ------------------
  |  |  111|  1.43k|#define LOAD32_LE(SRC) load32_le(SRC)
  ------------------
   46|  1.43k|    ctx->input[TR[12]] = LOAD32_LE(k + 20);
  ------------------
  |  |  111|  1.43k|#define LOAD32_LE(SRC) load32_le(SRC)
  ------------------
   47|  1.43k|    ctx->input[TR[13]] = LOAD32_LE(k + 24);
  ------------------
  |  |  111|  1.43k|#define LOAD32_LE(SRC) load32_le(SRC)
  ------------------
   48|  1.43k|    ctx->input[TR[14]] = LOAD32_LE(k + 28);
  ------------------
  |  |  111|  1.43k|#define LOAD32_LE(SRC) load32_le(SRC)
  ------------------
   49|  1.43k|    ctx->input[TR[0]]  = 0x61707865;
   50|  1.43k|    ctx->input[TR[5]]  = 0x3320646e;
   51|  1.43k|    ctx->input[TR[10]] = 0x79622d32;
   52|  1.43k|    ctx->input[TR[15]] = 0x6b206574;
   53|  1.43k|}
salsa20_xmm6int-avx2.c:salsa_ivsetup:
   57|  1.43k|{
   58|  1.43k|    ctx->input[TR[6]] = LOAD32_LE(iv + 0);
  ------------------
  |  |  111|  1.43k|#define LOAD32_LE(SRC) load32_le(SRC)
  ------------------
   59|  1.43k|    ctx->input[TR[7]] = LOAD32_LE(iv + 4);
  ------------------
  |  |  111|  1.43k|#define LOAD32_LE(SRC) load32_le(SRC)
  ------------------
   60|  1.43k|    ctx->input[TR[8]] = counter == NULL ? 0 : LOAD32_LE(counter + 0);
  ------------------
  |  |  111|  2.54k|#define LOAD32_LE(SRC) load32_le(SRC)
  ------------------
  |  Branch (60:25): [True: 316, False: 1.11k]
  ------------------
   61|  1.43k|    ctx->input[TR[9]] = counter == NULL ? 0 : LOAD32_LE(counter + 4);
  ------------------
  |  |  111|  2.54k|#define LOAD32_LE(SRC) load32_le(SRC)
  ------------------
  |  Branch (61:25): [True: 316, False: 1.11k]
  ------------------
   62|  1.43k|}
salsa20_xmm6int-avx2.c:salsa20_encrypt_bytes:
   67|  1.43k|{
   68|  1.43k|    uint32_t * const x = &ctx->input[0];
   69|       |
   70|  1.43k|    if (!bytes) {
  ------------------
  |  Branch (70:9): [True: 0, False: 1.43k]
  ------------------
   71|      0|        return; /* LCOV_EXCL_LINE */
   72|      0|    }
   73|       |
   74|  1.43k|#include "u8.h"
  ------------------
  |  |    1|  1.43k|if (bytes >= 512) {
  |  |  ------------------
  |  |  |  Branch (1:5): [True: 368, False: 1.06k]
  |  |  ------------------
  |  |    2|    368|    __m256i y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14,
  |  |    3|    368|        y15;
  |  |    4|       |
  |  |    5|       |    /* the naive way seems as fast (if not a bit faster) than the vector way */
  |  |    6|    368|    __m256i z0  = _mm256_set1_epi32(x[0]);
  |  |    7|    368|    __m256i z5  = _mm256_set1_epi32(x[1]);
  |  |    8|    368|    __m256i z10 = _mm256_set1_epi32(x[2]);
  |  |    9|    368|    __m256i z15 = _mm256_set1_epi32(x[3]);
  |  |   10|    368|    __m256i z12 = _mm256_set1_epi32(x[4]);
  |  |   11|    368|    __m256i z1  = _mm256_set1_epi32(x[5]);
  |  |   12|    368|    __m256i z6  = _mm256_set1_epi32(x[6]);
  |  |   13|    368|    __m256i z11 = _mm256_set1_epi32(x[7]);
  |  |   14|    368|    __m256i z8; /* useless */
  |  |   15|    368|    __m256i z13 = _mm256_set1_epi32(x[9]);
  |  |   16|    368|    __m256i z2  = _mm256_set1_epi32(x[10]);
  |  |   17|    368|    __m256i z7  = _mm256_set1_epi32(x[11]);
  |  |   18|    368|    __m256i z4  = _mm256_set1_epi32(x[12]);
  |  |   19|    368|    __m256i z9; /* useless */
  |  |   20|    368|    __m256i z14 = _mm256_set1_epi32(x[14]);
  |  |   21|    368|    __m256i z3  = _mm256_set1_epi32(x[15]);
  |  |   22|       |
  |  |   23|    368|    __m256i orig0 = z0;
  |  |   24|    368|    __m256i orig1 = z1;
  |  |   25|    368|    __m256i orig2 = z2;
  |  |   26|    368|    __m256i orig3 = z3;
  |  |   27|    368|    __m256i orig4 = z4;
  |  |   28|    368|    __m256i orig5 = z5;
  |  |   29|    368|    __m256i orig6 = z6;
  |  |   30|    368|    __m256i orig7 = z7;
  |  |   31|    368|    __m256i orig8;
  |  |   32|    368|    __m256i orig9;
  |  |   33|    368|    __m256i orig10 = z10;
  |  |   34|    368|    __m256i orig11 = z11;
  |  |   35|    368|    __m256i orig12 = z12;
  |  |   36|    368|    __m256i orig13 = z13;
  |  |   37|    368|    __m256i orig14 = z14;
  |  |   38|    368|    __m256i orig15 = z15;
  |  |   39|       |
  |  |   40|    368|    uint32_t in8;
  |  |   41|    368|    uint32_t in9;
  |  |   42|    368|    int      i;
  |  |   43|       |
  |  |   44|   178k|    while (bytes >= 512) {
  |  |  ------------------
  |  |  |  Branch (44:12): [True: 178k, False: 368]
  |  |  ------------------
  |  |   45|       |        /* vector implementation for z8 and z9 */
  |  |   46|       |        /* faster than the naive version for 8 blocks */
  |  |   47|   178k|        const __m256i addv8   = _mm256_set_epi64x(3, 2, 1, 0);
  |  |   48|   178k|        const __m256i addv9   = _mm256_set_epi64x(7, 6, 5, 4);
  |  |   49|   178k|        const __m256i permute = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
  |  |   50|       |
  |  |   51|   178k|        __m256i  t8, t9;
  |  |   52|   178k|        uint64_t in89;
  |  |   53|       |
  |  |   54|   178k|        in8  = x[8];
  |  |   55|   178k|        in9  = x[13]; /* see arrays above for the address translation */
  |  |   56|   178k|        in89 = ((uint64_t) in8) | (((uint64_t) in9) << 32);
  |  |   57|       |
  |  |   58|   178k|        z8 = z9 = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(in89));
  |  |   59|       |
  |  |   60|   178k|        t8 = _mm256_add_epi64(addv8, z8);
  |  |   61|   178k|        t9 = _mm256_add_epi64(addv9, z9);
  |  |   62|       |
  |  |   63|   178k|        z8 = _mm256_unpacklo_epi32(t8, t9);
  |  |   64|   178k|        z9 = _mm256_unpackhi_epi32(t8, t9);
  |  |   65|       |
  |  |   66|   178k|        t8 = _mm256_unpacklo_epi32(z8, z9);
  |  |   67|   178k|        t9 = _mm256_unpackhi_epi32(z8, z9);
  |  |   68|       |
  |  |   69|       |        /* required because unpack* are intra-lane */
  |  |   70|   178k|        z8 = _mm256_permutevar8x32_epi32(t8, permute);
  |  |   71|   178k|        z9 = _mm256_permutevar8x32_epi32(t9, permute);
  |  |   72|       |
  |  |   73|   178k|        orig8 = z8;
  |  |   74|   178k|        orig9 = z9;
  |  |   75|       |
  |  |   76|   178k|        in89 += 8;
  |  |   77|       |
  |  |   78|   178k|        x[8]  = in89 & 0xFFFFFFFF;
  |  |   79|   178k|        x[13] = (in89 >> 32) & 0xFFFFFFFF;
  |  |   80|       |
  |  |   81|   178k|        z5  = orig5;
  |  |   82|   178k|        z10 = orig10;
  |  |   83|   178k|        z15 = orig15;
  |  |   84|   178k|        z14 = orig14;
  |  |   85|   178k|        z3  = orig3;
  |  |   86|   178k|        z6  = orig6;
  |  |   87|   178k|        z11 = orig11;
  |  |   88|   178k|        z1  = orig1;
  |  |   89|       |
  |  |   90|   178k|        z7  = orig7;
  |  |   91|   178k|        z13 = orig13;
  |  |   92|   178k|        z2  = orig2;
  |  |   93|   178k|        z9  = orig9;
  |  |   94|   178k|        z0  = orig0;
  |  |   95|   178k|        z12 = orig12;
  |  |   96|   178k|        z4  = orig4;
  |  |   97|   178k|        z8  = orig8;
  |  |   98|       |
  |  |   99|  1.95M|        for (i = 0; i < ROUNDS; i += 2) {
  |  |  ------------------
  |  |  |  |   28|  1.95M|# define ROUNDS 20
  |  |  ------------------
  |  |  |  Branch (99:21): [True: 1.78M, False: 178k]
  |  |  ------------------
  |  |  100|       |            /* the inner loop is a direct translation (regexp search/replace)
  |  |  101|       |             * from the amd64-xmm6 ASM */
  |  |  102|  1.78M|            __m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13,
  |  |  103|  1.78M|                r14, r15;
  |  |  104|       |
  |  |  105|  1.78M|            y4 = z12;
  |  |  106|  1.78M|            y4 = _mm256_add_epi32(y4, z0);
  |  |  107|  1.78M|            r4 = y4;
  |  |  108|  1.78M|            y4 = _mm256_slli_epi32(y4, 7);
  |  |  109|  1.78M|            z4 = _mm256_xor_si256(z4, y4);
  |  |  110|  1.78M|            r4 = _mm256_srli_epi32(r4, 25);
  |  |  111|  1.78M|            z4 = _mm256_xor_si256(z4, r4);
  |  |  112|       |
  |  |  113|  1.78M|            y9 = z1;
  |  |  114|  1.78M|            y9 = _mm256_add_epi32(y9, z5);
  |  |  115|  1.78M|            r9 = y9;
  |  |  116|  1.78M|            y9 = _mm256_slli_epi32(y9, 7);
  |  |  117|  1.78M|            z9 = _mm256_xor_si256(z9, y9);
  |  |  118|  1.78M|            r9 = _mm256_srli_epi32(r9, 25);
  |  |  119|  1.78M|            z9 = _mm256_xor_si256(z9, r9);
  |  |  120|       |
  |  |  121|  1.78M|            y8 = z0;
  |  |  122|  1.78M|            y8 = _mm256_add_epi32(y8, z4);
  |  |  123|  1.78M|            r8 = y8;
  |  |  124|  1.78M|            y8 = _mm256_slli_epi32(y8, 9);
  |  |  125|  1.78M|            z8 = _mm256_xor_si256(z8, y8);
  |  |  126|  1.78M|            r8 = _mm256_srli_epi32(r8, 23);
  |  |  127|  1.78M|            z8 = _mm256_xor_si256(z8, r8);
  |  |  128|       |
  |  |  129|  1.78M|            y13 = z5;
  |  |  130|  1.78M|            y13 = _mm256_add_epi32(y13, z9);
  |  |  131|  1.78M|            r13 = y13;
  |  |  132|  1.78M|            y13 = _mm256_slli_epi32(y13, 9);
  |  |  133|  1.78M|            z13 = _mm256_xor_si256(z13, y13);
  |  |  134|  1.78M|            r13 = _mm256_srli_epi32(r13, 23);
  |  |  135|  1.78M|            z13 = _mm256_xor_si256(z13, r13);
  |  |  136|       |
  |  |  137|  1.78M|            y12 = z4;
  |  |  138|  1.78M|            y12 = _mm256_add_epi32(y12, z8);
  |  |  139|  1.78M|            r12 = y12;
  |  |  140|  1.78M|            y12 = _mm256_slli_epi32(y12, 13);
  |  |  141|  1.78M|            z12 = _mm256_xor_si256(z12, y12);
  |  |  142|  1.78M|            r12 = _mm256_srli_epi32(r12, 19);
  |  |  143|  1.78M|            z12 = _mm256_xor_si256(z12, r12);
  |  |  144|       |
  |  |  145|  1.78M|            y1 = z9;
  |  |  146|  1.78M|            y1 = _mm256_add_epi32(y1, z13);
  |  |  147|  1.78M|            r1 = y1;
  |  |  148|  1.78M|            y1 = _mm256_slli_epi32(y1, 13);
  |  |  149|  1.78M|            z1 = _mm256_xor_si256(z1, y1);
  |  |  150|  1.78M|            r1 = _mm256_srli_epi32(r1, 19);
  |  |  151|  1.78M|            z1 = _mm256_xor_si256(z1, r1);
  |  |  152|       |
  |  |  153|  1.78M|            y0 = z8;
  |  |  154|  1.78M|            y0 = _mm256_add_epi32(y0, z12);
  |  |  155|  1.78M|            r0 = y0;
  |  |  156|  1.78M|            y0 = _mm256_slli_epi32(y0, 18);
  |  |  157|  1.78M|            z0 = _mm256_xor_si256(z0, y0);
  |  |  158|  1.78M|            r0 = _mm256_srli_epi32(r0, 14);
  |  |  159|  1.78M|            z0 = _mm256_xor_si256(z0, r0);
  |  |  160|       |
  |  |  161|  1.78M|            y5 = z13;
  |  |  162|  1.78M|            y5 = _mm256_add_epi32(y5, z1);
  |  |  163|  1.78M|            r5 = y5;
  |  |  164|  1.78M|            y5 = _mm256_slli_epi32(y5, 18);
  |  |  165|  1.78M|            z5 = _mm256_xor_si256(z5, y5);
  |  |  166|  1.78M|            r5 = _mm256_srli_epi32(r5, 14);
  |  |  167|  1.78M|            z5 = _mm256_xor_si256(z5, r5);
  |  |  168|       |
  |  |  169|  1.78M|            y14 = z6;
  |  |  170|  1.78M|            y14 = _mm256_add_epi32(y14, z10);
  |  |  171|  1.78M|            r14 = y14;
  |  |  172|  1.78M|            y14 = _mm256_slli_epi32(y14, 7);
  |  |  173|  1.78M|            z14 = _mm256_xor_si256(z14, y14);
  |  |  174|  1.78M|            r14 = _mm256_srli_epi32(r14, 25);
  |  |  175|  1.78M|            z14 = _mm256_xor_si256(z14, r14);
  |  |  176|       |
  |  |  177|  1.78M|            y3 = z11;
  |  |  178|  1.78M|            y3 = _mm256_add_epi32(y3, z15);
  |  |  179|  1.78M|            r3 = y3;
  |  |  180|  1.78M|            y3 = _mm256_slli_epi32(y3, 7);
  |  |  181|  1.78M|            z3 = _mm256_xor_si256(z3, y3);
  |  |  182|  1.78M|            r3 = _mm256_srli_epi32(r3, 25);
  |  |  183|  1.78M|            z3 = _mm256_xor_si256(z3, r3);
  |  |  184|       |
  |  |  185|  1.78M|            y2 = z10;
  |  |  186|  1.78M|            y2 = _mm256_add_epi32(y2, z14);
  |  |  187|  1.78M|            r2 = y2;
  |  |  188|  1.78M|            y2 = _mm256_slli_epi32(y2, 9);
  |  |  189|  1.78M|            z2 = _mm256_xor_si256(z2, y2);
  |  |  190|  1.78M|            r2 = _mm256_srli_epi32(r2, 23);
  |  |  191|  1.78M|            z2 = _mm256_xor_si256(z2, r2);
  |  |  192|       |
  |  |  193|  1.78M|            y7 = z15;
  |  |  194|  1.78M|            y7 = _mm256_add_epi32(y7, z3);
  |  |  195|  1.78M|            r7 = y7;
  |  |  196|  1.78M|            y7 = _mm256_slli_epi32(y7, 9);
  |  |  197|  1.78M|            z7 = _mm256_xor_si256(z7, y7);
  |  |  198|  1.78M|            r7 = _mm256_srli_epi32(r7, 23);
  |  |  199|  1.78M|            z7 = _mm256_xor_si256(z7, r7);
  |  |  200|       |
  |  |  201|  1.78M|            y6 = z14;
  |  |  202|  1.78M|            y6 = _mm256_add_epi32(y6, z2);
  |  |  203|  1.78M|            r6 = y6;
  |  |  204|  1.78M|            y6 = _mm256_slli_epi32(y6, 13);
  |  |  205|  1.78M|            z6 = _mm256_xor_si256(z6, y6);
  |  |  206|  1.78M|            r6 = _mm256_srli_epi32(r6, 19);
  |  |  207|  1.78M|            z6 = _mm256_xor_si256(z6, r6);
  |  |  208|       |
  |  |  209|  1.78M|            y11 = z3;
  |  |  210|  1.78M|            y11 = _mm256_add_epi32(y11, z7);
  |  |  211|  1.78M|            r11 = y11;
  |  |  212|  1.78M|            y11 = _mm256_slli_epi32(y11, 13);
  |  |  213|  1.78M|            z11 = _mm256_xor_si256(z11, y11);
  |  |  214|  1.78M|            r11 = _mm256_srli_epi32(r11, 19);
  |  |  215|  1.78M|            z11 = _mm256_xor_si256(z11, r11);
  |  |  216|       |
  |  |  217|  1.78M|            y10 = z2;
  |  |  218|  1.78M|            y10 = _mm256_add_epi32(y10, z6);
  |  |  219|  1.78M|            r10 = y10;
  |  |  220|  1.78M|            y10 = _mm256_slli_epi32(y10, 18);
  |  |  221|  1.78M|            z10 = _mm256_xor_si256(z10, y10);
  |  |  222|  1.78M|            r10 = _mm256_srli_epi32(r10, 14);
  |  |  223|  1.78M|            z10 = _mm256_xor_si256(z10, r10);
  |  |  224|       |
  |  |  225|  1.78M|            y1 = z3;
  |  |  226|  1.78M|            y1 = _mm256_add_epi32(y1, z0);
  |  |  227|  1.78M|            r1 = y1;
  |  |  228|  1.78M|            y1 = _mm256_slli_epi32(y1, 7);
  |  |  229|  1.78M|            z1 = _mm256_xor_si256(z1, y1);
  |  |  230|  1.78M|            r1 = _mm256_srli_epi32(r1, 25);
  |  |  231|  1.78M|            z1 = _mm256_xor_si256(z1, r1);
  |  |  232|       |
  |  |  233|  1.78M|            y15 = z7;
  |  |  234|  1.78M|            y15 = _mm256_add_epi32(y15, z11);
  |  |  235|  1.78M|            r15 = y15;
  |  |  236|  1.78M|            y15 = _mm256_slli_epi32(y15, 18);
  |  |  237|  1.78M|            z15 = _mm256_xor_si256(z15, y15);
  |  |  238|  1.78M|            r15 = _mm256_srli_epi32(r15, 14);
  |  |  239|  1.78M|            z15 = _mm256_xor_si256(z15, r15);
  |  |  240|       |
  |  |  241|  1.78M|            y6 = z4;
  |  |  242|  1.78M|            y6 = _mm256_add_epi32(y6, z5);
  |  |  243|  1.78M|            r6 = y6;
  |  |  244|  1.78M|            y6 = _mm256_slli_epi32(y6, 7);
  |  |  245|  1.78M|            z6 = _mm256_xor_si256(z6, y6);
  |  |  246|  1.78M|            r6 = _mm256_srli_epi32(r6, 25);
  |  |  247|  1.78M|            z6 = _mm256_xor_si256(z6, r6);
  |  |  248|       |
  |  |  249|  1.78M|            y2 = z0;
  |  |  250|  1.78M|            y2 = _mm256_add_epi32(y2, z1);
  |  |  251|  1.78M|            r2 = y2;
  |  |  252|  1.78M|            y2 = _mm256_slli_epi32(y2, 9);
  |  |  253|  1.78M|            z2 = _mm256_xor_si256(z2, y2);
  |  |  254|  1.78M|            r2 = _mm256_srli_epi32(r2, 23);
  |  |  255|  1.78M|            z2 = _mm256_xor_si256(z2, r2);
  |  |  256|       |
  |  |  257|  1.78M|            y7 = z5;
  |  |  258|  1.78M|            y7 = _mm256_add_epi32(y7, z6);
  |  |  259|  1.78M|            r7 = y7;
  |  |  260|  1.78M|            y7 = _mm256_slli_epi32(y7, 9);
  |  |  261|  1.78M|            z7 = _mm256_xor_si256(z7, y7);
  |  |  262|  1.78M|            r7 = _mm256_srli_epi32(r7, 23);
  |  |  263|  1.78M|            z7 = _mm256_xor_si256(z7, r7);
  |  |  264|       |
  |  |  265|  1.78M|            y3 = z1;
  |  |  266|  1.78M|            y3 = _mm256_add_epi32(y3, z2);
  |  |  267|  1.78M|            r3 = y3;
  |  |  268|  1.78M|            y3 = _mm256_slli_epi32(y3, 13);
  |  |  269|  1.78M|            z3 = _mm256_xor_si256(z3, y3);
  |  |  270|  1.78M|            r3 = _mm256_srli_epi32(r3, 19);
  |  |  271|  1.78M|            z3 = _mm256_xor_si256(z3, r3);
  |  |  272|       |
  |  |  273|  1.78M|            y4 = z6;
  |  |  274|  1.78M|            y4 = _mm256_add_epi32(y4, z7);
  |  |  275|  1.78M|            r4 = y4;
  |  |  276|  1.78M|            y4 = _mm256_slli_epi32(y4, 13);
  |  |  277|  1.78M|            z4 = _mm256_xor_si256(z4, y4);
  |  |  278|  1.78M|            r4 = _mm256_srli_epi32(r4, 19);
  |  |  279|  1.78M|            z4 = _mm256_xor_si256(z4, r4);
  |  |  280|       |
  |  |  281|  1.78M|            y0 = z2;
  |  |  282|  1.78M|            y0 = _mm256_add_epi32(y0, z3);
  |  |  283|  1.78M|            r0 = y0;
  |  |  284|  1.78M|            y0 = _mm256_slli_epi32(y0, 18);
  |  |  285|  1.78M|            z0 = _mm256_xor_si256(z0, y0);
  |  |  286|  1.78M|            r0 = _mm256_srli_epi32(r0, 14);
  |  |  287|  1.78M|            z0 = _mm256_xor_si256(z0, r0);
  |  |  288|       |
  |  |  289|  1.78M|            y5 = z7;
  |  |  290|  1.78M|            y5 = _mm256_add_epi32(y5, z4);
  |  |  291|  1.78M|            r5 = y5;
  |  |  292|  1.78M|            y5 = _mm256_slli_epi32(y5, 18);
  |  |  293|  1.78M|            z5 = _mm256_xor_si256(z5, y5);
  |  |  294|  1.78M|            r5 = _mm256_srli_epi32(r5, 14);
  |  |  295|  1.78M|            z5 = _mm256_xor_si256(z5, r5);
  |  |  296|       |
  |  |  297|  1.78M|            y11 = z9;
  |  |  298|  1.78M|            y11 = _mm256_add_epi32(y11, z10);
  |  |  299|  1.78M|            r11 = y11;
  |  |  300|  1.78M|            y11 = _mm256_slli_epi32(y11, 7);
  |  |  301|  1.78M|            z11 = _mm256_xor_si256(z11, y11);
  |  |  302|  1.78M|            r11 = _mm256_srli_epi32(r11, 25);
  |  |  303|  1.78M|            z11 = _mm256_xor_si256(z11, r11);
  |  |  304|       |
  |  |  305|  1.78M|            y12 = z14;
  |  |  306|  1.78M|            y12 = _mm256_add_epi32(y12, z15);
  |  |  307|  1.78M|            r12 = y12;
  |  |  308|  1.78M|            y12 = _mm256_slli_epi32(y12, 7);
  |  |  309|  1.78M|            z12 = _mm256_xor_si256(z12, y12);
  |  |  310|  1.78M|            r12 = _mm256_srli_epi32(r12, 25);
  |  |  311|  1.78M|            z12 = _mm256_xor_si256(z12, r12);
  |  |  312|       |
  |  |  313|  1.78M|            y8 = z10;
  |  |  314|  1.78M|            y8 = _mm256_add_epi32(y8, z11);
  |  |  315|  1.78M|            r8 = y8;
  |  |  316|  1.78M|            y8 = _mm256_slli_epi32(y8, 9);
  |  |  317|  1.78M|            z8 = _mm256_xor_si256(z8, y8);
  |  |  318|  1.78M|            r8 = _mm256_srli_epi32(r8, 23);
  |  |  319|  1.78M|            z8 = _mm256_xor_si256(z8, r8);
  |  |  320|       |
  |  |  321|  1.78M|            y13 = z15;
  |  |  322|  1.78M|            y13 = _mm256_add_epi32(y13, z12);
  |  |  323|  1.78M|            r13 = y13;
  |  |  324|  1.78M|            y13 = _mm256_slli_epi32(y13, 9);
  |  |  325|  1.78M|            z13 = _mm256_xor_si256(z13, y13);
  |  |  326|  1.78M|            r13 = _mm256_srli_epi32(r13, 23);
  |  |  327|  1.78M|            z13 = _mm256_xor_si256(z13, r13);
  |  |  328|       |
  |  |  329|  1.78M|            y9 = z11;
  |  |  330|  1.78M|            y9 = _mm256_add_epi32(y9, z8);
  |  |  331|  1.78M|            r9 = y9;
  |  |  332|  1.78M|            y9 = _mm256_slli_epi32(y9, 13);
  |  |  333|  1.78M|            z9 = _mm256_xor_si256(z9, y9);
  |  |  334|  1.78M|            r9 = _mm256_srli_epi32(r9, 19);
  |  |  335|  1.78M|            z9 = _mm256_xor_si256(z9, r9);
  |  |  336|       |
  |  |  337|  1.78M|            y14 = z12;
  |  |  338|  1.78M|            y14 = _mm256_add_epi32(y14, z13);
  |  |  339|  1.78M|            r14 = y14;
  |  |  340|  1.78M|            y14 = _mm256_slli_epi32(y14, 13);
  |  |  341|  1.78M|            z14 = _mm256_xor_si256(z14, y14);
  |  |  342|  1.78M|            r14 = _mm256_srli_epi32(r14, 19);
  |  |  343|  1.78M|            z14 = _mm256_xor_si256(z14, r14);
  |  |  344|       |
  |  |  345|  1.78M|            y10 = z8;
  |  |  346|  1.78M|            y10 = _mm256_add_epi32(y10, z9);
  |  |  347|  1.78M|            r10 = y10;
  |  |  348|  1.78M|            y10 = _mm256_slli_epi32(y10, 18);
  |  |  349|  1.78M|            z10 = _mm256_xor_si256(z10, y10);
  |  |  350|  1.78M|            r10 = _mm256_srli_epi32(r10, 14);
  |  |  351|  1.78M|            z10 = _mm256_xor_si256(z10, r10);
  |  |  352|       |
  |  |  353|  1.78M|            y15 = z13;
  |  |  354|  1.78M|            y15 = _mm256_add_epi32(y15, z14);
  |  |  355|  1.78M|            r15 = y15;
  |  |  356|  1.78M|            y15 = _mm256_slli_epi32(y15, 18);
  |  |  357|  1.78M|            z15 = _mm256_xor_si256(z15, y15);
  |  |  358|  1.78M|            r15 = _mm256_srli_epi32(r15, 14);
  |  |  359|  1.78M|            z15 = _mm256_xor_si256(z15, r15);
  |  |  360|  1.78M|        }
  |  |  361|       |
  |  |  362|       |/* store data ; this macro first transpose data in-registers, and then store
  |  |  363|       | * them in memory. much faster with icc. */
  |  |  364|   178k|#define ONEQUAD_TRANSPOSE(A, B, C, D)                                    \
  |  |  365|   178k|    {                                                                    \
  |  |  366|   178k|        __m128i t0, t1, t2, t3;                                          \
  |  |  367|   178k|        z##A = _mm256_add_epi32(z##A, orig##A);                          \
  |  |  368|   178k|        z##B = _mm256_add_epi32(z##B, orig##B);                          \
  |  |  369|   178k|        z##C = _mm256_add_epi32(z##C, orig##C);                          \
  |  |  370|   178k|        z##D = _mm256_add_epi32(z##D, orig##D);                          \
  |  |  371|   178k|        y##A = _mm256_unpacklo_epi32(z##A, z##B);                        \
  |  |  372|   178k|        y##B = _mm256_unpacklo_epi32(z##C, z##D);                        \
  |  |  373|   178k|        y##C = _mm256_unpackhi_epi32(z##A, z##B);                        \
  |  |  374|   178k|        y##D = _mm256_unpackhi_epi32(z##C, z##D);                        \
  |  |  375|   178k|        z##A = _mm256_unpacklo_epi64(y##A, y##B);                        \
  |  |  376|   178k|        z##B = _mm256_unpackhi_epi64(y##A, y##B);                        \
  |  |  377|   178k|        z##C = _mm256_unpacklo_epi64(y##C, y##D);                        \
  |  |  378|   178k|        z##D = _mm256_unpackhi_epi64(y##C, y##D);                        \
  |  |  379|   178k|        t0   = _mm_xor_si128(_mm256_extracti128_si256(z##A, 0),          \
  |  |  380|   178k|                           _mm_loadu_si128((const __m128i*) (m + 0))); \
  |  |  381|   178k|        _mm_storeu_si128((__m128i*) (c + 0), t0);                        \
  |  |  382|   178k|        t1 = _mm_xor_si128(_mm256_extracti128_si256(z##B, 0),            \
  |  |  383|   178k|                           _mm_loadu_si128((const __m128i*) (m + 64)));  \
  |  |  384|   178k|        _mm_storeu_si128((__m128i*) (c + 64), t1);                       \
  |  |  385|   178k|        t2 = _mm_xor_si128(_mm256_extracti128_si256(z##C, 0),            \
  |  |  386|   178k|                           _mm_loadu_si128((const __m128i*) (m + 128))); \
  |  |  387|   178k|        _mm_storeu_si128((__m128i*) (c + 128), t2);                      \
  |  |  388|   178k|        t3 = _mm_xor_si128(_mm256_extracti128_si256(z##D, 0),            \
  |  |  389|   178k|                           _mm_loadu_si128((const __m128i*) (m + 192))); \
  |  |  390|   178k|        _mm_storeu_si128((__m128i*) (c + 192), t3);                      \
  |  |  391|   178k|        t0 = _mm_xor_si128(_mm256_extracti128_si256(z##A, 1),            \
  |  |  392|   178k|                           _mm_loadu_si128((const __m128i*) (m + 256))); \
  |  |  393|   178k|        _mm_storeu_si128((__m128i*) (c + 256), t0);                      \
  |  |  394|   178k|        t1 = _mm_xor_si128(_mm256_extracti128_si256(z##B, 1),            \
  |  |  395|   178k|                           _mm_loadu_si128((const __m128i*) (m + 320))); \
  |  |  396|   178k|        _mm_storeu_si128((__m128i*) (c + 320), t1);                      \
  |  |  397|   178k|        t2 = _mm_xor_si128(_mm256_extracti128_si256(z##C, 1),            \
  |  |  398|   178k|                           _mm_loadu_si128((const __m128i*) (m + 384))); \
  |  |  399|   178k|        _mm_storeu_si128((__m128i*) (c + 384), t2);                      \
  |  |  400|   178k|        t3 = _mm_xor_si128(_mm256_extracti128_si256(z##D, 1),            \
  |  |  401|   178k|                           _mm_loadu_si128((const __m128i*) (m + 448))); \
  |  |  402|   178k|        _mm_storeu_si128((__m128i*) (c + 448), t3);                      \
  |  |  403|   178k|    }
  |  |  404|       |
  |  |  405|   178k|#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D)
  |  |  406|       |
  |  |  407|   178k|#define ONEQUAD_UNPCK(A, B, C, D)                 \
  |  |  408|   178k|    {                                             \
  |  |  409|   178k|        z##A = _mm256_add_epi32(z##A, orig##A);   \
  |  |  410|   178k|        z##B = _mm256_add_epi32(z##B, orig##B);   \
  |  |  411|   178k|        z##C = _mm256_add_epi32(z##C, orig##C);   \
  |  |  412|   178k|        z##D = _mm256_add_epi32(z##D, orig##D);   \
  |  |  413|   178k|        y##A = _mm256_unpacklo_epi32(z##A, z##B); \
  |  |  414|   178k|        y##B = _mm256_unpacklo_epi32(z##C, z##D); \
  |  |  415|   178k|        y##C = _mm256_unpackhi_epi32(z##A, z##B); \
  |  |  416|   178k|        y##D = _mm256_unpackhi_epi32(z##C, z##D); \
  |  |  417|   178k|        z##A = _mm256_unpacklo_epi64(y##A, y##B); \
  |  |  418|   178k|        z##B = _mm256_unpackhi_epi64(y##A, y##B); \
  |  |  419|   178k|        z##C = _mm256_unpacklo_epi64(y##C, y##D); \
  |  |  420|   178k|        z##D = _mm256_unpackhi_epi64(y##C, y##D); \
  |  |  421|   178k|    }
  |  |  422|       |
  |  |  423|   178k|#define ONEOCTO(A, B, C, D, A2, B2, C2, D2)                                    \
  |  |  424|   178k|    {                                                                          \
  |  |  425|   178k|        ONEQUAD_UNPCK(A, B, C, D);                                             \
  |  |  426|   178k|        ONEQUAD_UNPCK(A2, B2, C2, D2);                                         \
  |  |  427|   178k|        y##A  = _mm256_permute2x128_si256(z##A, z##A2, 0x20);                  \
  |  |  428|   178k|        y##A2 = _mm256_permute2x128_si256(z##A, z##A2, 0x31);                  \
  |  |  429|   178k|        y##B  = _mm256_permute2x128_si256(z##B, z##B2, 0x20);                  \
  |  |  430|   178k|        y##B2 = _mm256_permute2x128_si256(z##B, z##B2, 0x31);                  \
  |  |  431|   178k|        y##C  = _mm256_permute2x128_si256(z##C, z##C2, 0x20);                  \
  |  |  432|   178k|        y##C2 = _mm256_permute2x128_si256(z##C, z##C2, 0x31);                  \
  |  |  433|   178k|        y##D  = _mm256_permute2x128_si256(z##D, z##D2, 0x20);                  \
  |  |  434|   178k|        y##D2 = _mm256_permute2x128_si256(z##D, z##D2, 0x31);                  \
  |  |  435|   178k|        y##A  = _mm256_xor_si256(y##A,                                         \
  |  |  436|   178k|                                _mm256_loadu_si256((const __m256i*) (m + 0))); \
  |  |  437|   178k|        y##B  = _mm256_xor_si256(                                              \
  |  |  438|   178k|            y##B, _mm256_loadu_si256((const __m256i*) (m + 64)));              \
  |  |  439|   178k|        y##C = _mm256_xor_si256(                                               \
  |  |  440|   178k|            y##C, _mm256_loadu_si256((const __m256i*) (m + 128)));             \
  |  |  441|   178k|        y##D = _mm256_xor_si256(                                               \
  |  |  442|   178k|            y##D, _mm256_loadu_si256((const __m256i*) (m + 192)));             \
  |  |  443|   178k|        y##A2 = _mm256_xor_si256(                                              \
  |  |  444|   178k|            y##A2, _mm256_loadu_si256((const __m256i*) (m + 256)));            \
  |  |  445|   178k|        y##B2 = _mm256_xor_si256(                                              \
  |  |  446|   178k|            y##B2, _mm256_loadu_si256((const __m256i*) (m + 320)));            \
  |  |  447|   178k|        y##C2 = _mm256_xor_si256(                                              \
  |  |  448|   178k|            y##C2, _mm256_loadu_si256((const __m256i*) (m + 384)));            \
  |  |  449|   178k|        y##D2 = _mm256_xor_si256(                                              \
  |  |  450|   178k|            y##D2, _mm256_loadu_si256((const __m256i*) (m + 448)));            \
  |  |  451|   178k|        _mm256_storeu_si256((__m256i*) (c + 0), y##A);                         \
  |  |  452|   178k|        _mm256_storeu_si256((__m256i*) (c + 64), y##B);                        \
  |  |  453|   178k|        _mm256_storeu_si256((__m256i*) (c + 128), y##C);                       \
  |  |  454|   178k|        _mm256_storeu_si256((__m256i*) (c + 192), y##D);                       \
  |  |  455|   178k|        _mm256_storeu_si256((__m256i*) (c + 256), y##A2);                      \
  |  |  456|   178k|        _mm256_storeu_si256((__m256i*) (c + 320), y##B2);                      \
  |  |  457|   178k|        _mm256_storeu_si256((__m256i*) (c + 384), y##C2);                      \
  |  |  458|   178k|        _mm256_storeu_si256((__m256i*) (c + 448), y##D2);                      \
  |  |  459|   178k|    }
  |  |  460|       |
  |  |  461|   178k|        ONEOCTO(0, 1, 2, 3, 4, 5, 6, 7);
  |  |  ------------------
  |  |  |  |  424|   178k|    {                                                                          \
  |  |  |  |  425|   178k|        ONEQUAD_UNPCK(A, B, C, D);                                             \
  |  |  |  |  ------------------
  |  |  |  |  |  |  408|   178k|    {                                             \
  |  |  |  |  |  |  409|   178k|        z##A = _mm256_add_epi32(z##A, orig##A);   \
  |  |  |  |  |  |  410|   178k|        z##B = _mm256_add_epi32(z##B, orig##B);   \
  |  |  |  |  |  |  411|   178k|        z##C = _mm256_add_epi32(z##C, orig##C);   \
  |  |  |  |  |  |  412|   178k|        z##D = _mm256_add_epi32(z##D, orig##D);   \
  |  |  |  |  |  |  413|   178k|        y##A = _mm256_unpacklo_epi32(z##A, z##B); \
  |  |  |  |  |  |  414|   178k|        y##B = _mm256_unpacklo_epi32(z##C, z##D); \
  |  |  |  |  |  |  415|   178k|        y##C = _mm256_unpackhi_epi32(z##A, z##B); \
  |  |  |  |  |  |  416|   178k|        y##D = _mm256_unpackhi_epi32(z##C, z##D); \
  |  |  |  |  |  |  417|   178k|        z##A = _mm256_unpacklo_epi64(y##A, y##B); \
  |  |  |  |  |  |  418|   178k|        z##B = _mm256_unpackhi_epi64(y##A, y##B); \
  |  |  |  |  |  |  419|   178k|        z##C = _mm256_unpacklo_epi64(y##C, y##D); \
  |  |  |  |  |  |  420|   178k|        z##D = _mm256_unpackhi_epi64(y##C, y##D); \
  |  |  |  |  |  |  421|   178k|    }
  |  |  |  |  ------------------
  |  |  |  |  426|   178k|        ONEQUAD_UNPCK(A2, B2, C2, D2);                                         \
  |  |  |  |  ------------------
  |  |  |  |  |  |  408|   178k|    {                                             \
  |  |  |  |  |  |  409|   178k|        z##A = _mm256_add_epi32(z##A, orig##A);   \
  |  |  |  |  |  |  410|   178k|        z##B = _mm256_add_epi32(z##B, orig##B);   \
  |  |  |  |  |  |  411|   178k|        z##C = _mm256_add_epi32(z##C, orig##C);   \
  |  |  |  |  |  |  412|   178k|        z##D = _mm256_add_epi32(z##D, orig##D);   \
  |  |  |  |  |  |  413|   178k|        y##A = _mm256_unpacklo_epi32(z##A, z##B); \
  |  |  |  |  |  |  414|   178k|        y##B = _mm256_unpacklo_epi32(z##C, z##D); \
  |  |  |  |  |  |  415|   178k|        y##C = _mm256_unpackhi_epi32(z##A, z##B); \
  |  |  |  |  |  |  416|   178k|        y##D = _mm256_unpackhi_epi32(z##C, z##D); \
  |  |  |  |  |  |  417|   178k|        z##A = _mm256_unpacklo_epi64(y##A, y##B); \
  |  |  |  |  |  |  418|   178k|        z##B = _mm256_unpackhi_epi64(y##A, y##B); \
  |  |  |  |  |  |  419|   178k|        z##C = _mm256_unpacklo_epi64(y##C, y##D); \
  |  |  |  |  |  |  420|   178k|        z##D = _mm256_unpackhi_epi64(y##C, y##D); \
  |  |  |  |  |  |  421|   178k|    }
  |  |  |  |  ------------------
  |  |  |  |  427|   178k|        y##A  = _mm256_permute2x128_si256(z##A, z##A2, 0x20);                  \
  |  |  |  |  428|   178k|        y##A2 = _mm256_permute2x128_si256(z##A, z##A2, 0x31);                  \
  |  |  |  |  429|   178k|        y##B  = _mm256_permute2x128_si256(z##B, z##B2, 0x20);                  \
  |  |  |  |  430|   178k|        y##B2 = _mm256_permute2x128_si256(z##B, z##B2, 0x31);                  \
  |  |  |  |  431|   178k|        y##C  = _mm256_permute2x128_si256(z##C, z##C2, 0x20);                  \
  |  |  |  |  432|   178k|        y##C2 = _mm256_permute2x128_si256(z##C, z##C2, 0x31);                  \
  |  |  |  |  433|   178k|        y##D  = _mm256_permute2x128_si256(z##D, z##D2, 0x20);                  \
  |  |  |  |  434|   178k|        y##D2 = _mm256_permute2x128_si256(z##D, z##D2, 0x31);                  \
  |  |  |  |  435|   178k|        y##A  = _mm256_xor_si256(y##A,                                         \
  |  |  |  |  436|   178k|                                _mm256_loadu_si256((const __m256i*) (m + 0))); \
  |  |  |  |  437|   178k|        y##B  = _mm256_xor_si256(                                              \
  |  |  |  |  438|   178k|            y##B, _mm256_loadu_si256((const __m256i*) (m + 64)));              \
  |  |  |  |  439|   178k|        y##C = _mm256_xor_si256(                                               \
  |  |  |  |  440|   178k|            y##C, _mm256_loadu_si256((const __m256i*) (m + 128)));             \
  |  |  |  |  441|   178k|        y##D = _mm256_xor_si256(                                               \
  |  |  |  |  442|   178k|            y##D, _mm256_loadu_si256((const __m256i*) (m + 192)));             \
  |  |  |  |  443|   178k|        y##A2 = _mm256_xor_si256(                                              \
  |  |  |  |  444|   178k|            y##A2, _mm256_loadu_si256((const __m256i*) (m + 256)));            \
  |  |  |  |  445|   178k|        y##B2 = _mm256_xor_si256(                                              \
  |  |  |  |  446|   178k|            y##B2, _mm256_loadu_si256((const __m256i*) (m + 320)));            \
  |  |  |  |  447|   178k|        y##C2 = _mm256_xor_si256(                                              \
  |  |  |  |  448|   178k|            y##C2, _mm256_loadu_si256((const __m256i*) (m + 384)));            \
  |  |  |  |  449|   178k|        y##D2 = _mm256_xor_si256(                                              \
  |  |  |  |  450|   178k|            y##D2, _mm256_loadu_si256((const __m256i*) (m + 448)));            \
  |  |  |  |  451|   178k|        _mm256_storeu_si256((__m256i*) (c + 0), y##A);                         \
  |  |  |  |  452|   178k|        _mm256_storeu_si256((__m256i*) (c + 64), y##B);                        \
  |  |  |  |  453|   178k|        _mm256_storeu_si256((__m256i*) (c + 128), y##C);                       \
  |  |  |  |  454|   178k|        _mm256_storeu_si256((__m256i*) (c + 192), y##D);                       \
  |  |  |  |  455|   178k|        _mm256_storeu_si256((__m256i*) (c + 256), y##A2);                      \
  |  |  |  |  456|   178k|        _mm256_storeu_si256((__m256i*) (c + 320), y##B2);                      \
  |  |  |  |  457|   178k|        _mm256_storeu_si256((__m256i*) (c + 384), y##C2);                      \
  |  |  |  |  458|   178k|        _mm256_storeu_si256((__m256i*) (c + 448), y##D2);                      \
  |  |  |  |  459|   178k|    }
  |  |  ------------------
  |  |  462|   178k|        m += 32;
  |  |  463|   178k|        c += 32;
  |  |  464|   178k|        ONEOCTO(8, 9, 10, 11, 12, 13, 14, 15);
  |  |  ------------------
  |  |  |  |  424|   178k|    {                                                                          \
  |  |  |  |  425|   178k|        ONEQUAD_UNPCK(A, B, C, D);                                             \
  |  |  |  |  ------------------
  |  |  |  |  |  |  408|   178k|    {                                             \
  |  |  |  |  |  |  409|   178k|        z##A = _mm256_add_epi32(z##A, orig##A);   \
  |  |  |  |  |  |  410|   178k|        z##B = _mm256_add_epi32(z##B, orig##B);   \
  |  |  |  |  |  |  411|   178k|        z##C = _mm256_add_epi32(z##C, orig##C);   \
  |  |  |  |  |  |  412|   178k|        z##D = _mm256_add_epi32(z##D, orig##D);   \
  |  |  |  |  |  |  413|   178k|        y##A = _mm256_unpacklo_epi32(z##A, z##B); \
  |  |  |  |  |  |  414|   178k|        y##B = _mm256_unpacklo_epi32(z##C, z##D); \
  |  |  |  |  |  |  415|   178k|        y##C = _mm256_unpackhi_epi32(z##A, z##B); \
  |  |  |  |  |  |  416|   178k|        y##D = _mm256_unpackhi_epi32(z##C, z##D); \
  |  |  |  |  |  |  417|   178k|        z##A = _mm256_unpacklo_epi64(y##A, y##B); \
  |  |  |  |  |  |  418|   178k|        z##B = _mm256_unpackhi_epi64(y##A, y##B); \
  |  |  |  |  |  |  419|   178k|        z##C = _mm256_unpacklo_epi64(y##C, y##D); \
  |  |  |  |  |  |  420|   178k|        z##D = _mm256_unpackhi_epi64(y##C, y##D); \
  |  |  |  |  |  |  421|   178k|    }
  |  |  |  |  ------------------
  |  |  |  |  426|   178k|        ONEQUAD_UNPCK(A2, B2, C2, D2);                                         \
  |  |  |  |  ------------------
  |  |  |  |  |  |  408|   178k|    {                                             \
  |  |  |  |  |  |  409|   178k|        z##A = _mm256_add_epi32(z##A, orig##A);   \
  |  |  |  |  |  |  410|   178k|        z##B = _mm256_add_epi32(z##B, orig##B);   \
  |  |  |  |  |  |  411|   178k|        z##C = _mm256_add_epi32(z##C, orig##C);   \
  |  |  |  |  |  |  412|   178k|        z##D = _mm256_add_epi32(z##D, orig##D);   \
  |  |  |  |  |  |  413|   178k|        y##A = _mm256_unpacklo_epi32(z##A, z##B); \
  |  |  |  |  |  |  414|   178k|        y##B = _mm256_unpacklo_epi32(z##C, z##D); \
  |  |  |  |  |  |  415|   178k|        y##C = _mm256_unpackhi_epi32(z##A, z##B); \
  |  |  |  |  |  |  416|   178k|        y##D = _mm256_unpackhi_epi32(z##C, z##D); \
  |  |  |  |  |  |  417|   178k|        z##A = _mm256_unpacklo_epi64(y##A, y##B); \
  |  |  |  |  |  |  418|   178k|        z##B = _mm256_unpackhi_epi64(y##A, y##B); \
  |  |  |  |  |  |  419|   178k|        z##C = _mm256_unpacklo_epi64(y##C, y##D); \
  |  |  |  |  |  |  420|   178k|        z##D = _mm256_unpackhi_epi64(y##C, y##D); \
  |  |  |  |  |  |  421|   178k|    }
  |  |  |  |  ------------------
  |  |  |  |  427|   178k|        y##A  = _mm256_permute2x128_si256(z##A, z##A2, 0x20);                  \
  |  |  |  |  428|   178k|        y##A2 = _mm256_permute2x128_si256(z##A, z##A2, 0x31);                  \
  |  |  |  |  429|   178k|        y##B  = _mm256_permute2x128_si256(z##B, z##B2, 0x20);                  \
  |  |  |  |  430|   178k|        y##B2 = _mm256_permute2x128_si256(z##B, z##B2, 0x31);                  \
  |  |  |  |  431|   178k|        y##C  = _mm256_permute2x128_si256(z##C, z##C2, 0x20);                  \
  |  |  |  |  432|   178k|        y##C2 = _mm256_permute2x128_si256(z##C, z##C2, 0x31);                  \
  |  |  |  |  433|   178k|        y##D  = _mm256_permute2x128_si256(z##D, z##D2, 0x20);                  \
  |  |  |  |  434|   178k|        y##D2 = _mm256_permute2x128_si256(z##D, z##D2, 0x31);                  \
  |  |  |  |  435|   178k|        y##A  = _mm256_xor_si256(y##A,                                         \
  |  |  |  |  436|   178k|                                _mm256_loadu_si256((const __m256i*) (m + 0))); \
  |  |  |  |  437|   178k|        y##B  = _mm256_xor_si256(                                              \
  |  |  |  |  438|   178k|            y##B, _mm256_loadu_si256((const __m256i*) (m + 64)));              \
  |  |  |  |  439|   178k|        y##C = _mm256_xor_si256(                                               \
  |  |  |  |  440|   178k|            y##C, _mm256_loadu_si256((const __m256i*) (m + 128)));             \
  |  |  |  |  441|   178k|        y##D = _mm256_xor_si256(                                               \
  |  |  |  |  442|   178k|            y##D, _mm256_loadu_si256((const __m256i*) (m + 192)));             \
  |  |  |  |  443|   178k|        y##A2 = _mm256_xor_si256(                                              \
  |  |  |  |  444|   178k|            y##A2, _mm256_loadu_si256((const __m256i*) (m + 256)));            \
  |  |  |  |  445|   178k|        y##B2 = _mm256_xor_si256(                                              \
  |  |  |  |  446|   178k|            y##B2, _mm256_loadu_si256((const __m256i*) (m + 320)));            \
  |  |  |  |  447|   178k|        y##C2 = _mm256_xor_si256(                                              \
  |  |  |  |  448|   178k|            y##C2, _mm256_loadu_si256((const __m256i*) (m + 384)));            \
  |  |  |  |  449|   178k|        y##D2 = _mm256_xor_si256(                                              \
  |  |  |  |  450|   178k|            y##D2, _mm256_loadu_si256((const __m256i*) (m + 448)));            \
  |  |  |  |  451|   178k|        _mm256_storeu_si256((__m256i*) (c + 0), y##A);                         \
  |  |  |  |  452|   178k|        _mm256_storeu_si256((__m256i*) (c + 64), y##B);                        \
  |  |  |  |  453|   178k|        _mm256_storeu_si256((__m256i*) (c + 128), y##C);                       \
  |  |  |  |  454|   178k|        _mm256_storeu_si256((__m256i*) (c + 192), y##D);                       \
  |  |  |  |  455|   178k|        _mm256_storeu_si256((__m256i*) (c + 256), y##A2);                      \
  |  |  |  |  456|   178k|        _mm256_storeu_si256((__m256i*) (c + 320), y##B2);                      \
  |  |  |  |  457|   178k|        _mm256_storeu_si256((__m256i*) (c + 384), y##C2);                      \
  |  |  |  |  458|   178k|        _mm256_storeu_si256((__m256i*) (c + 448), y##D2);                      \
  |  |  |  |  459|   178k|    }
  |  |  ------------------
  |  |  465|   178k|        m -= 32;
  |  |  466|   178k|        c -= 32;
  |  |  467|       |
  |  |  468|   178k|#undef ONEQUAD
  |  |  469|   178k|#undef ONEQUAD_TRANSPOSE
  |  |  470|   178k|#undef ONEQUAD_UNPCK
  |  |  471|   178k|#undef ONEOCTO
  |  |  472|       |
  |  |  473|   178k|        bytes -= 512;
  |  |  474|   178k|        c += 512;
  |  |  475|   178k|        m += 512;
  |  |  476|   178k|    }
  |  |  477|    368|}
  ------------------
   75|  1.43k|#include "u4.h"
  ------------------
  |  |    1|  1.43k|if (bytes >= 256) {
  |  |  ------------------
  |  |  |  Branch (1:5): [True: 342, False: 1.09k]
  |  |  ------------------
  |  |    2|    342|    __m128i y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14,
  |  |    3|    342|        y15;
  |  |    4|    342|    __m128i z0, z1, z2, z3, z4, z5, z6, z7, z8, z9, z10, z11, z12, z13, z14,
  |  |    5|    342|        z15;
  |  |    6|    342|    __m128i orig0, orig1, orig2, orig3, orig4, orig5, orig6, orig7, orig8,
  |  |    7|    342|        orig9, orig10, orig11, orig12, orig13, orig14, orig15;
  |  |    8|       |
  |  |    9|    342|    uint32_t in8;
  |  |   10|    342|    uint32_t in9;
  |  |   11|    342|    int      i;
  |  |   12|       |
  |  |   13|       |    /* the element-broadcast immediates for _mm_shuffle_epi32 are, in order:
  |  |   14|       |       0x00, 0x55, 0xaa, 0xff */
  |  |   15|    342|    z0  = _mm_loadu_si128((const __m128i *) (x + 0));
  |  |   16|    342|    z5  = _mm_shuffle_epi32(z0, 0x55);
  |  |   17|    342|    z10 = _mm_shuffle_epi32(z0, 0xaa);
  |  |   18|    342|    z15 = _mm_shuffle_epi32(z0, 0xff);
  |  |   19|    342|    z0  = _mm_shuffle_epi32(z0, 0x00);
  |  |   20|    342|    z1  = _mm_loadu_si128((const __m128i *) (x + 4));
  |  |   21|    342|    z6  = _mm_shuffle_epi32(z1, 0xaa);
  |  |   22|    342|    z11 = _mm_shuffle_epi32(z1, 0xff);
  |  |   23|    342|    z12 = _mm_shuffle_epi32(z1, 0x00);
  |  |   24|    342|    z1  = _mm_shuffle_epi32(z1, 0x55);
  |  |   25|    342|    z2  = _mm_loadu_si128((const __m128i *) (x + 8));
  |  |   26|    342|    z7  = _mm_shuffle_epi32(z2, 0xff);
  |  |   27|    342|    z13 = _mm_shuffle_epi32(z2, 0x55);
  |  |   28|    342|    z2  = _mm_shuffle_epi32(z2, 0xaa);
  |  |   29|       |    /* no z8 -> first half of the nonce, will fill later */
  |  |   30|    342|    z3  = _mm_loadu_si128((const __m128i *) (x + 12));
  |  |   31|    342|    z4  = _mm_shuffle_epi32(z3, 0x00);
  |  |   32|    342|    z14 = _mm_shuffle_epi32(z3, 0xaa);
  |  |   33|    342|    z3  = _mm_shuffle_epi32(z3, 0xff);
  |  |   34|       |    /* no z9 -> second half of the nonce, will fill later */
  |  |   35|    342|    orig0  = z0;
  |  |   36|    342|    orig1  = z1;
  |  |   37|    342|    orig2  = z2;
  |  |   38|    342|    orig3  = z3;
  |  |   39|    342|    orig4  = z4;
  |  |   40|    342|    orig5  = z5;
  |  |   41|    342|    orig6  = z6;
  |  |   42|    342|    orig7  = z7;
  |  |   43|    342|    orig10 = z10;
  |  |   44|    342|    orig11 = z11;
  |  |   45|    342|    orig12 = z12;
  |  |   46|    342|    orig13 = z13;
  |  |   47|    342|    orig14 = z14;
  |  |   48|    342|    orig15 = z15;
  |  |   49|       |
  |  |   50|    684|    while (bytes >= 256) {
  |  |  ------------------
  |  |  |  Branch (50:12): [True: 342, False: 342]
  |  |  ------------------
  |  |   51|       |        /* vector implementation for z8 and z9 */
  |  |   52|       |        /* not sure if it helps for only 4 blocks */
  |  |   53|    342|        const __m128i addv8 = _mm_set_epi64x(1, 0);
  |  |   54|    342|        const __m128i addv9 = _mm_set_epi64x(3, 2);
  |  |   55|    342|        __m128i       t8, t9;
  |  |   56|    342|        uint64_t      in89;
  |  |   57|       |
  |  |   58|    342|        in8  = x[8];
  |  |   59|    342|        in9  = x[13];
  |  |   60|    342|        in89 = ((uint64_t) in8) | (((uint64_t) in9) << 32);
  |  |   61|    342|        t8   = _mm_set1_epi64x(in89);
  |  |   62|    342|        t9   = _mm_set1_epi64x(in89);
  |  |   63|       |
  |  |   64|    342|        z8 = _mm_add_epi64(addv8, t8);
  |  |   65|    342|        z9 = _mm_add_epi64(addv9, t9);
  |  |   66|       |
  |  |   67|    342|        t8 = _mm_unpacklo_epi32(z8, z9);
  |  |   68|    342|        t9 = _mm_unpackhi_epi32(z8, z9);
  |  |   69|       |
  |  |   70|    342|        z8 = _mm_unpacklo_epi32(t8, t9);
  |  |   71|    342|        z9 = _mm_unpackhi_epi32(t8, t9);
  |  |   72|       |
  |  |   73|    342|        orig8 = z8;
  |  |   74|    342|        orig9 = z9;
  |  |   75|       |
  |  |   76|    342|        in89 += 4;
  |  |   77|       |
  |  |   78|    342|        x[8]  = in89 & 0xFFFFFFFF;
  |  |   79|    342|        x[13] = (in89 >> 32) & 0xFFFFFFFF;
  |  |   80|       |
  |  |   81|    342|        z5  = orig5;
  |  |   82|    342|        z10 = orig10;
  |  |   83|    342|        z15 = orig15;
  |  |   84|    342|        z14 = orig14;
  |  |   85|    342|        z3  = orig3;
  |  |   86|    342|        z6  = orig6;
  |  |   87|    342|        z11 = orig11;
  |  |   88|    342|        z1  = orig1;
  |  |   89|       |
  |  |   90|    342|        z7  = orig7;
  |  |   91|    342|        z13 = orig13;
  |  |   92|    342|        z2  = orig2;
  |  |   93|    342|        z9  = orig9;
  |  |   94|    342|        z0  = orig0;
  |  |   95|    342|        z12 = orig12;
  |  |   96|    342|        z4  = orig4;
  |  |   97|    342|        z8  = orig8;
  |  |   98|       |
  |  |   99|  3.76k|        for (i = 0; i < ROUNDS; i += 2) {
  |  |  ------------------
  |  |  |  |   28|  3.76k|# define ROUNDS 20
  |  |  ------------------
  |  |  |  Branch (99:21): [True: 3.42k, False: 342]
  |  |  ------------------
  |  |  100|       |            /* the inner loop is a direct translation (regexp search/replace)
  |  |  101|       |             * from the amd64-xmm6 ASM */
  |  |  102|  3.42k|            __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13,
  |  |  103|  3.42k|                r14, r15;
  |  |  104|       |
  |  |  105|  3.42k|            y4 = z12;
  |  |  106|  3.42k|            y4 = _mm_add_epi32(y4, z0);
  |  |  107|  3.42k|            r4 = y4;
  |  |  108|  3.42k|            y4 = _mm_slli_epi32(y4, 7);
  |  |  109|  3.42k|            z4 = _mm_xor_si128(z4, y4);
  |  |  110|  3.42k|            r4 = _mm_srli_epi32(r4, 25);
  |  |  111|  3.42k|            z4 = _mm_xor_si128(z4, r4);
  |  |  112|       |
  |  |  113|  3.42k|            y9 = z1;
  |  |  114|  3.42k|            y9 = _mm_add_epi32(y9, z5);
  |  |  115|  3.42k|            r9 = y9;
  |  |  116|  3.42k|            y9 = _mm_slli_epi32(y9, 7);
  |  |  117|  3.42k|            z9 = _mm_xor_si128(z9, y9);
  |  |  118|  3.42k|            r9 = _mm_srli_epi32(r9, 25);
  |  |  119|  3.42k|            z9 = _mm_xor_si128(z9, r9);
  |  |  120|       |
  |  |  121|  3.42k|            y8 = z0;
  |  |  122|  3.42k|            y8 = _mm_add_epi32(y8, z4);
  |  |  123|  3.42k|            r8 = y8;
  |  |  124|  3.42k|            y8 = _mm_slli_epi32(y8, 9);
  |  |  125|  3.42k|            z8 = _mm_xor_si128(z8, y8);
  |  |  126|  3.42k|            r8 = _mm_srli_epi32(r8, 23);
  |  |  127|  3.42k|            z8 = _mm_xor_si128(z8, r8);
  |  |  128|       |
  |  |  129|  3.42k|            y13 = z5;
  |  |  130|  3.42k|            y13 = _mm_add_epi32(y13, z9);
  |  |  131|  3.42k|            r13 = y13;
  |  |  132|  3.42k|            y13 = _mm_slli_epi32(y13, 9);
  |  |  133|  3.42k|            z13 = _mm_xor_si128(z13, y13);
  |  |  134|  3.42k|            r13 = _mm_srli_epi32(r13, 23);
  |  |  135|  3.42k|            z13 = _mm_xor_si128(z13, r13);
  |  |  136|       |
  |  |  137|  3.42k|            y12 = z4;
  |  |  138|  3.42k|            y12 = _mm_add_epi32(y12, z8);
  |  |  139|  3.42k|            r12 = y12;
  |  |  140|  3.42k|            y12 = _mm_slli_epi32(y12, 13);
  |  |  141|  3.42k|            z12 = _mm_xor_si128(z12, y12);
  |  |  142|  3.42k|            r12 = _mm_srli_epi32(r12, 19);
  |  |  143|  3.42k|            z12 = _mm_xor_si128(z12, r12);
  |  |  144|       |
  |  |  145|  3.42k|            y1 = z9;
  |  |  146|  3.42k|            y1 = _mm_add_epi32(y1, z13);
  |  |  147|  3.42k|            r1 = y1;
  |  |  148|  3.42k|            y1 = _mm_slli_epi32(y1, 13);
  |  |  149|  3.42k|            z1 = _mm_xor_si128(z1, y1);
  |  |  150|  3.42k|            r1 = _mm_srli_epi32(r1, 19);
  |  |  151|  3.42k|            z1 = _mm_xor_si128(z1, r1);
  |  |  152|       |
  |  |  153|  3.42k|            y0 = z8;
  |  |  154|  3.42k|            y0 = _mm_add_epi32(y0, z12);
  |  |  155|  3.42k|            r0 = y0;
  |  |  156|  3.42k|            y0 = _mm_slli_epi32(y0, 18);
  |  |  157|  3.42k|            z0 = _mm_xor_si128(z0, y0);
  |  |  158|  3.42k|            r0 = _mm_srli_epi32(r0, 14);
  |  |  159|  3.42k|            z0 = _mm_xor_si128(z0, r0);
  |  |  160|       |
  |  |  161|  3.42k|            y5 = z13;
  |  |  162|  3.42k|            y5 = _mm_add_epi32(y5, z1);
  |  |  163|  3.42k|            r5 = y5;
  |  |  164|  3.42k|            y5 = _mm_slli_epi32(y5, 18);
  |  |  165|  3.42k|            z5 = _mm_xor_si128(z5, y5);
  |  |  166|  3.42k|            r5 = _mm_srli_epi32(r5, 14);
  |  |  167|  3.42k|            z5 = _mm_xor_si128(z5, r5);
  |  |  168|       |
  |  |  169|  3.42k|            y14 = z6;
  |  |  170|  3.42k|            y14 = _mm_add_epi32(y14, z10);
  |  |  171|  3.42k|            r14 = y14;
  |  |  172|  3.42k|            y14 = _mm_slli_epi32(y14, 7);
  |  |  173|  3.42k|            z14 = _mm_xor_si128(z14, y14);
  |  |  174|  3.42k|            r14 = _mm_srli_epi32(r14, 25);
  |  |  175|  3.42k|            z14 = _mm_xor_si128(z14, r14);
  |  |  176|       |
  |  |  177|  3.42k|            y3 = z11;
  |  |  178|  3.42k|            y3 = _mm_add_epi32(y3, z15);
  |  |  179|  3.42k|            r3 = y3;
  |  |  180|  3.42k|            y3 = _mm_slli_epi32(y3, 7);
  |  |  181|  3.42k|            z3 = _mm_xor_si128(z3, y3);
  |  |  182|  3.42k|            r3 = _mm_srli_epi32(r3, 25);
  |  |  183|  3.42k|            z3 = _mm_xor_si128(z3, r3);
  |  |  184|       |
  |  |  185|  3.42k|            y2 = z10;
  |  |  186|  3.42k|            y2 = _mm_add_epi32(y2, z14);
  |  |  187|  3.42k|            r2 = y2;
  |  |  188|  3.42k|            y2 = _mm_slli_epi32(y2, 9);
  |  |  189|  3.42k|            z2 = _mm_xor_si128(z2, y2);
  |  |  190|  3.42k|            r2 = _mm_srli_epi32(r2, 23);
  |  |  191|  3.42k|            z2 = _mm_xor_si128(z2, r2);
  |  |  192|       |
  |  |  193|  3.42k|            y7 = z15;
  |  |  194|  3.42k|            y7 = _mm_add_epi32(y7, z3);
  |  |  195|  3.42k|            r7 = y7;
  |  |  196|  3.42k|            y7 = _mm_slli_epi32(y7, 9);
  |  |  197|  3.42k|            z7 = _mm_xor_si128(z7, y7);
  |  |  198|  3.42k|            r7 = _mm_srli_epi32(r7, 23);
  |  |  199|  3.42k|            z7 = _mm_xor_si128(z7, r7);
  |  |  200|       |
  |  |  201|  3.42k|            y6 = z14;
  |  |  202|  3.42k|            y6 = _mm_add_epi32(y6, z2);
  |  |  203|  3.42k|            r6 = y6;
  |  |  204|  3.42k|            y6 = _mm_slli_epi32(y6, 13);
  |  |  205|  3.42k|            z6 = _mm_xor_si128(z6, y6);
  |  |  206|  3.42k|            r6 = _mm_srli_epi32(r6, 19);
  |  |  207|  3.42k|            z6 = _mm_xor_si128(z6, r6);
  |  |  208|       |
  |  |  209|  3.42k|            y11 = z3;
  |  |  210|  3.42k|            y11 = _mm_add_epi32(y11, z7);
  |  |  211|  3.42k|            r11 = y11;
  |  |  212|  3.42k|            y11 = _mm_slli_epi32(y11, 13);
  |  |  213|  3.42k|            z11 = _mm_xor_si128(z11, y11);
  |  |  214|  3.42k|            r11 = _mm_srli_epi32(r11, 19);
  |  |  215|  3.42k|            z11 = _mm_xor_si128(z11, r11);
  |  |  216|       |
  |  |  217|  3.42k|            y10 = z2;
  |  |  218|  3.42k|            y10 = _mm_add_epi32(y10, z6);
  |  |  219|  3.42k|            r10 = y10;
  |  |  220|  3.42k|            y10 = _mm_slli_epi32(y10, 18);
  |  |  221|  3.42k|            z10 = _mm_xor_si128(z10, y10);
  |  |  222|  3.42k|            r10 = _mm_srli_epi32(r10, 14);
  |  |  223|  3.42k|            z10 = _mm_xor_si128(z10, r10);
  |  |  224|       |
  |  |  225|  3.42k|            y1 = z3;
  |  |  226|  3.42k|            y1 = _mm_add_epi32(y1, z0);
  |  |  227|  3.42k|            r1 = y1;
  |  |  228|  3.42k|            y1 = _mm_slli_epi32(y1, 7);
  |  |  229|  3.42k|            z1 = _mm_xor_si128(z1, y1);
  |  |  230|  3.42k|            r1 = _mm_srli_epi32(r1, 25);
  |  |  231|  3.42k|            z1 = _mm_xor_si128(z1, r1);
  |  |  232|       |
  |  |  233|  3.42k|            y15 = z7;
  |  |  234|  3.42k|            y15 = _mm_add_epi32(y15, z11);
  |  |  235|  3.42k|            r15 = y15;
  |  |  236|  3.42k|            y15 = _mm_slli_epi32(y15, 18);
  |  |  237|  3.42k|            z15 = _mm_xor_si128(z15, y15);
  |  |  238|  3.42k|            r15 = _mm_srli_epi32(r15, 14);
  |  |  239|  3.42k|            z15 = _mm_xor_si128(z15, r15);
  |  |  240|       |
  |  |  241|  3.42k|            y6 = z4;
  |  |  242|  3.42k|            y6 = _mm_add_epi32(y6, z5);
  |  |  243|  3.42k|            r6 = y6;
  |  |  244|  3.42k|            y6 = _mm_slli_epi32(y6, 7);
  |  |  245|  3.42k|            z6 = _mm_xor_si128(z6, y6);
  |  |  246|  3.42k|            r6 = _mm_srli_epi32(r6, 25);
  |  |  247|  3.42k|            z6 = _mm_xor_si128(z6, r6);
  |  |  248|       |
  |  |  249|  3.42k|            y2 = z0;
  |  |  250|  3.42k|            y2 = _mm_add_epi32(y2, z1);
  |  |  251|  3.42k|            r2 = y2;
  |  |  252|  3.42k|            y2 = _mm_slli_epi32(y2, 9);
  |  |  253|  3.42k|            z2 = _mm_xor_si128(z2, y2);
  |  |  254|  3.42k|            r2 = _mm_srli_epi32(r2, 23);
  |  |  255|  3.42k|            z2 = _mm_xor_si128(z2, r2);
  |  |  256|       |
  |  |  257|  3.42k|            y7 = z5;
  |  |  258|  3.42k|            y7 = _mm_add_epi32(y7, z6);
  |  |  259|  3.42k|            r7 = y7;
  |  |  260|  3.42k|            y7 = _mm_slli_epi32(y7, 9);
  |  |  261|  3.42k|            z7 = _mm_xor_si128(z7, y7);
  |  |  262|  3.42k|            r7 = _mm_srli_epi32(r7, 23);
  |  |  263|  3.42k|            z7 = _mm_xor_si128(z7, r7);
  |  |  264|       |
  |  |  265|  3.42k|            y3 = z1;
  |  |  266|  3.42k|            y3 = _mm_add_epi32(y3, z2);
  |  |  267|  3.42k|            r3 = y3;
  |  |  268|  3.42k|            y3 = _mm_slli_epi32(y3, 13);
  |  |  269|  3.42k|            z3 = _mm_xor_si128(z3, y3);
  |  |  270|  3.42k|            r3 = _mm_srli_epi32(r3, 19);
  |  |  271|  3.42k|            z3 = _mm_xor_si128(z3, r3);
  |  |  272|       |
  |  |  273|  3.42k|            y4 = z6;
  |  |  274|  3.42k|            y4 = _mm_add_epi32(y4, z7);
  |  |  275|  3.42k|            r4 = y4;
  |  |  276|  3.42k|            y4 = _mm_slli_epi32(y4, 13);
  |  |  277|  3.42k|            z4 = _mm_xor_si128(z4, y4);
  |  |  278|  3.42k|            r4 = _mm_srli_epi32(r4, 19);
  |  |  279|  3.42k|            z4 = _mm_xor_si128(z4, r4);
  |  |  280|       |
  |  |  281|  3.42k|            y0 = z2;
  |  |  282|  3.42k|            y0 = _mm_add_epi32(y0, z3);
  |  |  283|  3.42k|            r0 = y0;
  |  |  284|  3.42k|            y0 = _mm_slli_epi32(y0, 18);
  |  |  285|  3.42k|            z0 = _mm_xor_si128(z0, y0);
  |  |  286|  3.42k|            r0 = _mm_srli_epi32(r0, 14);
  |  |  287|  3.42k|            z0 = _mm_xor_si128(z0, r0);
  |  |  288|       |
  |  |  289|  3.42k|            y5 = z7;
  |  |  290|  3.42k|            y5 = _mm_add_epi32(y5, z4);
  |  |  291|  3.42k|            r5 = y5;
  |  |  292|  3.42k|            y5 = _mm_slli_epi32(y5, 18);
  |  |  293|  3.42k|            z5 = _mm_xor_si128(z5, y5);
  |  |  294|  3.42k|            r5 = _mm_srli_epi32(r5, 14);
  |  |  295|  3.42k|            z5 = _mm_xor_si128(z5, r5);
  |  |  296|       |
  |  |  297|  3.42k|            y11 = z9;
  |  |  298|  3.42k|            y11 = _mm_add_epi32(y11, z10);
  |  |  299|  3.42k|            r11 = y11;
  |  |  300|  3.42k|            y11 = _mm_slli_epi32(y11, 7);
  |  |  301|  3.42k|            z11 = _mm_xor_si128(z11, y11);
  |  |  302|  3.42k|            r11 = _mm_srli_epi32(r11, 25);
  |  |  303|  3.42k|            z11 = _mm_xor_si128(z11, r11);
  |  |  304|       |
  |  |  305|  3.42k|            y12 = z14;
  |  |  306|  3.42k|            y12 = _mm_add_epi32(y12, z15);
  |  |  307|  3.42k|            r12 = y12;
  |  |  308|  3.42k|            y12 = _mm_slli_epi32(y12, 7);
  |  |  309|  3.42k|            z12 = _mm_xor_si128(z12, y12);
  |  |  310|  3.42k|            r12 = _mm_srli_epi32(r12, 25);
  |  |  311|  3.42k|            z12 = _mm_xor_si128(z12, r12);
  |  |  312|       |
  |  |  313|  3.42k|            y8 = z10;
  |  |  314|  3.42k|            y8 = _mm_add_epi32(y8, z11);
  |  |  315|  3.42k|            r8 = y8;
  |  |  316|  3.42k|            y8 = _mm_slli_epi32(y8, 9);
  |  |  317|  3.42k|            z8 = _mm_xor_si128(z8, y8);
  |  |  318|  3.42k|            r8 = _mm_srli_epi32(r8, 23);
  |  |  319|  3.42k|            z8 = _mm_xor_si128(z8, r8);
  |  |  320|       |
  |  |  321|  3.42k|            y13 = z15;
  |  |  322|  3.42k|            y13 = _mm_add_epi32(y13, z12);
  |  |  323|  3.42k|            r13 = y13;
  |  |  324|  3.42k|            y13 = _mm_slli_epi32(y13, 9);
  |  |  325|  3.42k|            z13 = _mm_xor_si128(z13, y13);
  |  |  326|  3.42k|            r13 = _mm_srli_epi32(r13, 23);
  |  |  327|  3.42k|            z13 = _mm_xor_si128(z13, r13);
  |  |  328|       |
  |  |  329|  3.42k|            y9 = z11;
  |  |  330|  3.42k|            y9 = _mm_add_epi32(y9, z8);
  |  |  331|  3.42k|            r9 = y9;
  |  |  332|  3.42k|            y9 = _mm_slli_epi32(y9, 13);
  |  |  333|  3.42k|            z9 = _mm_xor_si128(z9, y9);
  |  |  334|  3.42k|            r9 = _mm_srli_epi32(r9, 19);
  |  |  335|  3.42k|            z9 = _mm_xor_si128(z9, r9);
  |  |  336|       |
  |  |  337|  3.42k|            y14 = z12;
  |  |  338|  3.42k|            y14 = _mm_add_epi32(y14, z13);
  |  |  339|  3.42k|            r14 = y14;
  |  |  340|  3.42k|            y14 = _mm_slli_epi32(y14, 13);
  |  |  341|  3.42k|            z14 = _mm_xor_si128(z14, y14);
  |  |  342|  3.42k|            r14 = _mm_srli_epi32(r14, 19);
  |  |  343|  3.42k|            z14 = _mm_xor_si128(z14, r14);
  |  |  344|       |
  |  |  345|  3.42k|            y10 = z8;
  |  |  346|  3.42k|            y10 = _mm_add_epi32(y10, z9);
  |  |  347|  3.42k|            r10 = y10;
  |  |  348|  3.42k|            y10 = _mm_slli_epi32(y10, 18);
  |  |  349|  3.42k|            z10 = _mm_xor_si128(z10, y10);
  |  |  350|  3.42k|            r10 = _mm_srli_epi32(r10, 14);
  |  |  351|  3.42k|            z10 = _mm_xor_si128(z10, r10);
  |  |  352|       |
  |  |  353|  3.42k|            y15 = z13;
  |  |  354|  3.42k|            y15 = _mm_add_epi32(y15, z14);
  |  |  355|  3.42k|            r15 = y15;
  |  |  356|  3.42k|            y15 = _mm_slli_epi32(y15, 18);
  |  |  357|  3.42k|            z15 = _mm_xor_si128(z15, y15);
  |  |  358|  3.42k|            r15 = _mm_srli_epi32(r15, 14);
  |  |  359|  3.42k|            z15 = _mm_xor_si128(z15, r15);
  |  |  360|  3.42k|        }
  |  |  361|       |
  |  |  362|       |/* store data ; this macro replicates the original amd64-xmm6 code */
  |  |  363|    342|#define ONEQUAD_SHUFFLE(A, B, C, D)        \
  |  |  364|    342|    z##A  = _mm_add_epi32(z##A, orig##A);  \
  |  |  365|    342|    z##B  = _mm_add_epi32(z##B, orig##B);  \
  |  |  366|    342|    z##C  = _mm_add_epi32(z##C, orig##C);  \
  |  |  367|    342|    z##D  = _mm_add_epi32(z##D, orig##D);  \
  |  |  368|    342|    in##A = _mm_cvtsi128_si32(z##A);       \
  |  |  369|    342|    in##B = _mm_cvtsi128_si32(z##B);       \
  |  |  370|    342|    in##C = _mm_cvtsi128_si32(z##C);       \
  |  |  371|    342|    in##D = _mm_cvtsi128_si32(z##D);       \
  |  |  372|    342|    z##A  = _mm_shuffle_epi32(z##A, 0x39); \
  |  |  373|    342|    z##B  = _mm_shuffle_epi32(z##B, 0x39); \
  |  |  374|    342|    z##C  = _mm_shuffle_epi32(z##C, 0x39); \
  |  |  375|    342|    z##D  = _mm_shuffle_epi32(z##D, 0x39); \
  |  |  376|    342|                                           \
  |  |  377|    342|    in##A ^= *(uint32_t *) (m + 0);        \
  |  |  378|    342|    in##B ^= *(uint32_t *) (m + 4);        \
  |  |  379|    342|    in##C ^= *(uint32_t *) (m + 8);        \
  |  |  380|    342|    in##D ^= *(uint32_t *) (m + 12);       \
  |  |  381|    342|                                           \
  |  |  382|    342|    *(uint32_t *) (c + 0)  = in##A;        \
  |  |  383|    342|    *(uint32_t *) (c + 4)  = in##B;        \
  |  |  384|    342|    *(uint32_t *) (c + 8)  = in##C;        \
  |  |  385|    342|    *(uint32_t *) (c + 12) = in##D;        \
  |  |  386|    342|                                           \
  |  |  387|    342|    in##A = _mm_cvtsi128_si32(z##A);       \
  |  |  388|    342|    in##B = _mm_cvtsi128_si32(z##B);       \
  |  |  389|    342|    in##C = _mm_cvtsi128_si32(z##C);       \
  |  |  390|    342|    in##D = _mm_cvtsi128_si32(z##D);       \
  |  |  391|    342|    z##A  = _mm_shuffle_epi32(z##A, 0x39); \
  |  |  392|    342|    z##B  = _mm_shuffle_epi32(z##B, 0x39); \
  |  |  393|    342|    z##C  = _mm_shuffle_epi32(z##C, 0x39); \
  |  |  394|    342|    z##D  = _mm_shuffle_epi32(z##D, 0x39); \
  |  |  395|    342|                                           \
  |  |  396|    342|    in##A ^= *(uint32_t *) (m + 64);       \
  |  |  397|    342|    in##B ^= *(uint32_t *) (m + 68);       \
  |  |  398|    342|    in##C ^= *(uint32_t *) (m + 72);       \
  |  |  399|    342|    in##D ^= *(uint32_t *) (m + 76);       \
  |  |  400|    342|    *(uint32_t *) (c + 64) = in##A;        \
  |  |  401|    342|    *(uint32_t *) (c + 68) = in##B;        \
  |  |  402|    342|    *(uint32_t *) (c + 72) = in##C;        \
  |  |  403|    342|    *(uint32_t *) (c + 76) = in##D;        \
  |  |  404|    342|                                           \
  |  |  405|    342|    in##A = _mm_cvtsi128_si32(z##A);       \
  |  |  406|    342|    in##B = _mm_cvtsi128_si32(z##B);       \
  |  |  407|    342|    in##C = _mm_cvtsi128_si32(z##C);       \
  |  |  408|    342|    in##D = _mm_cvtsi128_si32(z##D);       \
  |  |  409|    342|    z##A  = _mm_shuffle_epi32(z##A, 0x39); \
  |  |  410|    342|    z##B  = _mm_shuffle_epi32(z##B, 0x39); \
  |  |  411|    342|    z##C  = _mm_shuffle_epi32(z##C, 0x39); \
  |  |  412|    342|    z##D  = _mm_shuffle_epi32(z##D, 0x39); \
  |  |  413|    342|                                           \
  |  |  414|    342|    in##A ^= *(uint32_t *) (m + 128);      \
  |  |  415|    342|    in##B ^= *(uint32_t *) (m + 132);      \
  |  |  416|    342|    in##C ^= *(uint32_t *) (m + 136);      \
  |  |  417|    342|    in##D ^= *(uint32_t *) (m + 140);      \
  |  |  418|    342|    *(uint32_t *) (c + 128) = in##A;       \
  |  |  419|    342|    *(uint32_t *) (c + 132) = in##B;       \
  |  |  420|    342|    *(uint32_t *) (c + 136) = in##C;       \
  |  |  421|    342|    *(uint32_t *) (c + 140) = in##D;       \
  |  |  422|    342|                                           \
  |  |  423|    342|    in##A = _mm_cvtsi128_si32(z##A);       \
  |  |  424|    342|    in##B = _mm_cvtsi128_si32(z##B);       \
  |  |  425|    342|    in##C = _mm_cvtsi128_si32(z##C);       \
  |  |  426|    342|    in##D = _mm_cvtsi128_si32(z##D);       \
  |  |  427|    342|                                           \
  |  |  428|    342|    in##A ^= *(uint32_t *) (m + 192);      \
  |  |  429|    342|    in##B ^= *(uint32_t *) (m + 196);      \
  |  |  430|    342|    in##C ^= *(uint32_t *) (m + 200);      \
  |  |  431|    342|    in##D ^= *(uint32_t *) (m + 204);      \
  |  |  432|    342|    *(uint32_t *) (c + 192) = in##A;       \
  |  |  433|    342|    *(uint32_t *) (c + 196) = in##B;       \
  |  |  434|    342|    *(uint32_t *) (c + 200) = in##C;       \
  |  |  435|    342|    *(uint32_t *) (c + 204) = in##D
  |  |  436|       |
  |  |  437|       |/* store data ; this macro replaces shuffle+mov by a direct extract; not much
  |  |  438|       | * difference */
  |  |  439|    342|#define ONEQUAD_EXTRACT(A, B, C, D)       \
  |  |  440|    342|    z##A  = _mm_add_epi32(z##A, orig##A); \
  |  |  441|    342|    z##B  = _mm_add_epi32(z##B, orig##B); \
  |  |  442|    342|    z##C  = _mm_add_epi32(z##C, orig##C); \
  |  |  443|    342|    z##D  = _mm_add_epi32(z##D, orig##D); \
  |  |  444|    342|    in##A = _mm_cvtsi128_si32(z##A);      \
  |  |  445|    342|    in##B = _mm_cvtsi128_si32(z##B);      \
  |  |  446|    342|    in##C = _mm_cvtsi128_si32(z##C);      \
  |  |  447|    342|    in##D = _mm_cvtsi128_si32(z##D);      \
  |  |  448|    342|    in##A ^= *(uint32_t *) (m + 0);       \
  |  |  449|    342|    in##B ^= *(uint32_t *) (m + 4);       \
  |  |  450|    342|    in##C ^= *(uint32_t *) (m + 8);       \
  |  |  451|    342|    in##D ^= *(uint32_t *) (m + 12);      \
  |  |  452|    342|    *(uint32_t *) (c + 0)  = in##A;       \
  |  |  453|    342|    *(uint32_t *) (c + 4)  = in##B;       \
  |  |  454|    342|    *(uint32_t *) (c + 8)  = in##C;       \
  |  |  455|    342|    *(uint32_t *) (c + 12) = in##D;       \
  |  |  456|    342|                                          \
  |  |  457|    342|    in##A = _mm_extract_epi32(z##A, 1);   \
  |  |  458|    342|    in##B = _mm_extract_epi32(z##B, 1);   \
  |  |  459|    342|    in##C = _mm_extract_epi32(z##C, 1);   \
  |  |  460|    342|    in##D = _mm_extract_epi32(z##D, 1);   \
  |  |  461|    342|                                          \
  |  |  462|    342|    in##A ^= *(uint32_t *) (m + 64);      \
  |  |  463|    342|    in##B ^= *(uint32_t *) (m + 68);      \
  |  |  464|    342|    in##C ^= *(uint32_t *) (m + 72);      \
  |  |  465|    342|    in##D ^= *(uint32_t *) (m + 76);      \
  |  |  466|    342|    *(uint32_t *) (c + 64) = in##A;       \
  |  |  467|    342|    *(uint32_t *) (c + 68) = in##B;       \
  |  |  468|    342|    *(uint32_t *) (c + 72) = in##C;       \
  |  |  469|    342|    *(uint32_t *) (c + 76) = in##D;       \
  |  |  470|    342|                                          \
  |  |  471|    342|    in##A = _mm_extract_epi32(z##A, 2);   \
  |  |  472|    342|    in##B = _mm_extract_epi32(z##B, 2);   \
  |  |  473|    342|    in##C = _mm_extract_epi32(z##C, 2);   \
  |  |  474|    342|    in##D = _mm_extract_epi32(z##D, 2);   \
  |  |  475|    342|                                          \
  |  |  476|    342|    in##A ^= *(uint32_t *) (m + 128);     \
  |  |  477|    342|    in##B ^= *(uint32_t *) (m + 132);     \
  |  |  478|    342|    in##C ^= *(uint32_t *) (m + 136);     \
  |  |  479|    342|    in##D ^= *(uint32_t *) (m + 140);     \
  |  |  480|    342|    *(uint32_t *) (c + 128) = in##A;      \
  |  |  481|    342|    *(uint32_t *) (c + 132) = in##B;      \
  |  |  482|    342|    *(uint32_t *) (c + 136) = in##C;      \
  |  |  483|    342|    *(uint32_t *) (c + 140) = in##D;      \
  |  |  484|    342|                                          \
  |  |  485|    342|    in##A = _mm_extract_epi32(z##A, 3);   \
  |  |  486|    342|    in##B = _mm_extract_epi32(z##B, 3);   \
  |  |  487|    342|    in##C = _mm_extract_epi32(z##C, 3);   \
  |  |  488|    342|    in##D = _mm_extract_epi32(z##D, 3);   \
  |  |  489|    342|                                          \
  |  |  490|    342|    in##A ^= *(uint32_t *) (m + 192);     \
  |  |  491|    342|    in##B ^= *(uint32_t *) (m + 196);     \
  |  |  492|    342|    in##C ^= *(uint32_t *) (m + 200);     \
  |  |  493|    342|    in##D ^= *(uint32_t *) (m + 204);     \
  |  |  494|    342|    *(uint32_t *) (c + 192) = in##A;      \
  |  |  495|    342|    *(uint32_t *) (c + 196) = in##B;      \
  |  |  496|    342|    *(uint32_t *) (c + 200) = in##C;      \
  |  |  497|    342|    *(uint32_t *) (c + 204) = in##D
  |  |  498|       |
  |  |  499|       |/* store data ; this macro first transposes the data in registers, and then
  |  |  500|       | * stores it to memory. much faster with icc. */
  |  |  501|    342|#define ONEQUAD_TRANSPOSE(A, B, C, D)                                         \
  |  |  502|    342|    z##A = _mm_add_epi32(z##A, orig##A);                                      \
  |  |  503|    342|    z##B = _mm_add_epi32(z##B, orig##B);                                      \
  |  |  504|    342|    z##C = _mm_add_epi32(z##C, orig##C);                                      \
  |  |  505|    342|    z##D = _mm_add_epi32(z##D, orig##D);                                      \
  |  |  506|    342|    y##A = _mm_unpacklo_epi32(z##A, z##B);                                    \
  |  |  507|    342|    y##B = _mm_unpacklo_epi32(z##C, z##D);                                    \
  |  |  508|    342|    y##C = _mm_unpackhi_epi32(z##A, z##B);                                    \
  |  |  509|    342|    y##D = _mm_unpackhi_epi32(z##C, z##D);                                    \
  |  |  510|    342|    z##A = _mm_unpacklo_epi64(y##A, y##B);                                    \
  |  |  511|    342|    z##B = _mm_unpackhi_epi64(y##A, y##B);                                    \
  |  |  512|    342|    z##C = _mm_unpacklo_epi64(y##C, y##D);                                    \
  |  |  513|    342|    z##D = _mm_unpackhi_epi64(y##C, y##D);                                    \
  |  |  514|    342|    y##A = _mm_xor_si128(z##A, _mm_loadu_si128((const __m128i *) (m + 0)));   \
  |  |  515|    342|    _mm_storeu_si128((__m128i *) (c + 0), y##A);                              \
  |  |  516|    342|    y##B = _mm_xor_si128(z##B, _mm_loadu_si128((const __m128i *) (m + 64)));  \
  |  |  517|    342|    _mm_storeu_si128((__m128i *) (c + 64), y##B);                             \
  |  |  518|    342|    y##C = _mm_xor_si128(z##C, _mm_loadu_si128((const __m128i *) (m + 128))); \
  |  |  519|    342|    _mm_storeu_si128((__m128i *) (c + 128), y##C);                            \
  |  |  520|    342|    y##D = _mm_xor_si128(z##D, _mm_loadu_si128((const __m128i *) (m + 192))); \
  |  |  521|    342|    _mm_storeu_si128((__m128i *) (c + 192), y##D)
  |  |  522|       |
  |  |  523|    342|#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D)
  |  |  524|       |
  |  |  525|    342|        ONEQUAD(0, 1, 2, 3);
  |  |  ------------------
  |  |  |  |  523|    342|#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D)
  |  |  |  |  ------------------
  |  |  |  |  |  |  502|    342|    z##A = _mm_add_epi32(z##A, orig##A);                                      \
  |  |  |  |  |  |  503|    342|    z##B = _mm_add_epi32(z##B, orig##B);                                      \
  |  |  |  |  |  |  504|    342|    z##C = _mm_add_epi32(z##C, orig##C);                                      \
  |  |  |  |  |  |  505|    342|    z##D = _mm_add_epi32(z##D, orig##D);                                      \
  |  |  |  |  |  |  506|    342|    y##A = _mm_unpacklo_epi32(z##A, z##B);                                    \
  |  |  |  |  |  |  507|    342|    y##B = _mm_unpacklo_epi32(z##C, z##D);                                    \
  |  |  |  |  |  |  508|    342|    y##C = _mm_unpackhi_epi32(z##A, z##B);                                    \
  |  |  |  |  |  |  509|    342|    y##D = _mm_unpackhi_epi32(z##C, z##D);                                    \
  |  |  |  |  |  |  510|    342|    z##A = _mm_unpacklo_epi64(y##A, y##B);                                    \
  |  |  |  |  |  |  511|    342|    z##B = _mm_unpackhi_epi64(y##A, y##B);                                    \
  |  |  |  |  |  |  512|    342|    z##C = _mm_unpacklo_epi64(y##C, y##D);                                    \
  |  |  |  |  |  |  513|    342|    z##D = _mm_unpackhi_epi64(y##C, y##D);                                    \
  |  |  |  |  |  |  514|    342|    y##A = _mm_xor_si128(z##A, _mm_loadu_si128((const __m128i *) (m + 0)));   \
  |  |  |  |  |  |  515|    342|    _mm_storeu_si128((__m128i *) (c + 0), y##A);                              \
  |  |  |  |  |  |  516|    342|    y##B = _mm_xor_si128(z##B, _mm_loadu_si128((const __m128i *) (m + 64)));  \
  |  |  |  |  |  |  517|    342|    _mm_storeu_si128((__m128i *) (c + 64), y##B);                             \
  |  |  |  |  |  |  518|    342|    y##C = _mm_xor_si128(z##C, _mm_loadu_si128((const __m128i *) (m + 128))); \
  |  |  |  |  |  |  519|    342|    _mm_storeu_si128((__m128i *) (c + 128), y##C);                            \
  |  |  |  |  |  |  520|    342|    y##D = _mm_xor_si128(z##D, _mm_loadu_si128((const __m128i *) (m + 192))); \
  |  |  |  |  |  |  521|    342|    _mm_storeu_si128((__m128i *) (c + 192), y##D)
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  526|    342|        m += 16;
  |  |  527|    342|        c += 16;
  |  |  528|    342|        ONEQUAD(4, 5, 6, 7);
  |  |  ------------------
  |  |  |  |  523|    342|#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D)
  |  |  |  |  ------------------
  |  |  |  |  |  |  502|    342|    z##A = _mm_add_epi32(z##A, orig##A);                                      \
  |  |  |  |  |  |  503|    342|    z##B = _mm_add_epi32(z##B, orig##B);                                      \
  |  |  |  |  |  |  504|    342|    z##C = _mm_add_epi32(z##C, orig##C);                                      \
  |  |  |  |  |  |  505|    342|    z##D = _mm_add_epi32(z##D, orig##D);                                      \
  |  |  |  |  |  |  506|    342|    y##A = _mm_unpacklo_epi32(z##A, z##B);                                    \
  |  |  |  |  |  |  507|    342|    y##B = _mm_unpacklo_epi32(z##C, z##D);                                    \
  |  |  |  |  |  |  508|    342|    y##C = _mm_unpackhi_epi32(z##A, z##B);                                    \
  |  |  |  |  |  |  509|    342|    y##D = _mm_unpackhi_epi32(z##C, z##D);                                    \
  |  |  |  |  |  |  510|    342|    z##A = _mm_unpacklo_epi64(y##A, y##B);                                    \
  |  |  |  |  |  |  511|    342|    z##B = _mm_unpackhi_epi64(y##A, y##B);                                    \
  |  |  |  |  |  |  512|    342|    z##C = _mm_unpacklo_epi64(y##C, y##D);                                    \
  |  |  |  |  |  |  513|    342|    z##D = _mm_unpackhi_epi64(y##C, y##D);                                    \
  |  |  |  |  |  |  514|    342|    y##A = _mm_xor_si128(z##A, _mm_loadu_si128((const __m128i *) (m + 0)));   \
  |  |  |  |  |  |  515|    342|    _mm_storeu_si128((__m128i *) (c + 0), y##A);                              \
  |  |  |  |  |  |  516|    342|    y##B = _mm_xor_si128(z##B, _mm_loadu_si128((const __m128i *) (m + 64)));  \
  |  |  |  |  |  |  517|    342|    _mm_storeu_si128((__m128i *) (c + 64), y##B);                             \
  |  |  |  |  |  |  518|    342|    y##C = _mm_xor_si128(z##C, _mm_loadu_si128((const __m128i *) (m + 128))); \
  |  |  |  |  |  |  519|    342|    _mm_storeu_si128((__m128i *) (c + 128), y##C);                            \
  |  |  |  |  |  |  520|    342|    y##D = _mm_xor_si128(z##D, _mm_loadu_si128((const __m128i *) (m + 192))); \
  |  |  |  |  |  |  521|    342|    _mm_storeu_si128((__m128i *) (c + 192), y##D)
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  529|    342|        m += 16;
  |  |  530|    342|        c += 16;
  |  |  531|    342|        ONEQUAD(8, 9, 10, 11);
  |  |  ------------------
  |  |  |  |  523|    342|#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D)
  |  |  |  |  ------------------
  |  |  |  |  |  |  502|    342|    z##A = _mm_add_epi32(z##A, orig##A);                                      \
  |  |  |  |  |  |  503|    342|    z##B = _mm_add_epi32(z##B, orig##B);                                      \
  |  |  |  |  |  |  504|    342|    z##C = _mm_add_epi32(z##C, orig##C);                                      \
  |  |  |  |  |  |  505|    342|    z##D = _mm_add_epi32(z##D, orig##D);                                      \
  |  |  |  |  |  |  506|    342|    y##A = _mm_unpacklo_epi32(z##A, z##B);                                    \
  |  |  |  |  |  |  507|    342|    y##B = _mm_unpacklo_epi32(z##C, z##D);                                    \
  |  |  |  |  |  |  508|    342|    y##C = _mm_unpackhi_epi32(z##A, z##B);                                    \
  |  |  |  |  |  |  509|    342|    y##D = _mm_unpackhi_epi32(z##C, z##D);                                    \
  |  |  |  |  |  |  510|    342|    z##A = _mm_unpacklo_epi64(y##A, y##B);                                    \
  |  |  |  |  |  |  511|    342|    z##B = _mm_unpackhi_epi64(y##A, y##B);                                    \
  |  |  |  |  |  |  512|    342|    z##C = _mm_unpacklo_epi64(y##C, y##D);                                    \
  |  |  |  |  |  |  513|    342|    z##D = _mm_unpackhi_epi64(y##C, y##D);                                    \
  |  |  |  |  |  |  514|    342|    y##A = _mm_xor_si128(z##A, _mm_loadu_si128((const __m128i *) (m + 0)));   \
  |  |  |  |  |  |  515|    342|    _mm_storeu_si128((__m128i *) (c + 0), y##A);                              \
  |  |  |  |  |  |  516|    342|    y##B = _mm_xor_si128(z##B, _mm_loadu_si128((const __m128i *) (m + 64)));  \
  |  |  |  |  |  |  517|    342|    _mm_storeu_si128((__m128i *) (c + 64), y##B);                             \
  |  |  |  |  |  |  518|    342|    y##C = _mm_xor_si128(z##C, _mm_loadu_si128((const __m128i *) (m + 128))); \
  |  |  |  |  |  |  519|    342|    _mm_storeu_si128((__m128i *) (c + 128), y##C);                            \
  |  |  |  |  |  |  520|    342|    y##D = _mm_xor_si128(z##D, _mm_loadu_si128((const __m128i *) (m + 192))); \
  |  |  |  |  |  |  521|    342|    _mm_storeu_si128((__m128i *) (c + 192), y##D)
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  532|    342|        m += 16;
  |  |  533|    342|        c += 16;
  |  |  534|    342|        ONEQUAD(12, 13, 14, 15);
  |  |  ------------------
  |  |  |  |  523|    342|#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D)
  |  |  |  |  ------------------
  |  |  |  |  |  |  502|    342|    z##A = _mm_add_epi32(z##A, orig##A);                                      \
  |  |  |  |  |  |  503|    342|    z##B = _mm_add_epi32(z##B, orig##B);                                      \
  |  |  |  |  |  |  504|    342|    z##C = _mm_add_epi32(z##C, orig##C);                                      \
  |  |  |  |  |  |  505|    342|    z##D = _mm_add_epi32(z##D, orig##D);                                      \
  |  |  |  |  |  |  506|    342|    y##A = _mm_unpacklo_epi32(z##A, z##B);                                    \
  |  |  |  |  |  |  507|    342|    y##B = _mm_unpacklo_epi32(z##C, z##D);                                    \
  |  |  |  |  |  |  508|    342|    y##C = _mm_unpackhi_epi32(z##A, z##B);                                    \
  |  |  |  |  |  |  509|    342|    y##D = _mm_unpackhi_epi32(z##C, z##D);                                    \
  |  |  |  |  |  |  510|    342|    z##A = _mm_unpacklo_epi64(y##A, y##B);                                    \
  |  |  |  |  |  |  511|    342|    z##B = _mm_unpackhi_epi64(y##A, y##B);                                    \
  |  |  |  |  |  |  512|    342|    z##C = _mm_unpacklo_epi64(y##C, y##D);                                    \
  |  |  |  |  |  |  513|    342|    z##D = _mm_unpackhi_epi64(y##C, y##D);                                    \
  |  |  |  |  |  |  514|    342|    y##A = _mm_xor_si128(z##A, _mm_loadu_si128((const __m128i *) (m + 0)));   \
  |  |  |  |  |  |  515|    342|    _mm_storeu_si128((__m128i *) (c + 0), y##A);                              \
  |  |  |  |  |  |  516|    342|    y##B = _mm_xor_si128(z##B, _mm_loadu_si128((const __m128i *) (m + 64)));  \
  |  |  |  |  |  |  517|    342|    _mm_storeu_si128((__m128i *) (c + 64), y##B);                             \
  |  |  |  |  |  |  518|    342|    y##C = _mm_xor_si128(z##C, _mm_loadu_si128((const __m128i *) (m + 128))); \
  |  |  |  |  |  |  519|    342|    _mm_storeu_si128((__m128i *) (c + 128), y##C);                            \
  |  |  |  |  |  |  520|    342|    y##D = _mm_xor_si128(z##D, _mm_loadu_si128((const __m128i *) (m + 192))); \
  |  |  |  |  |  |  521|    342|    _mm_storeu_si128((__m128i *) (c + 192), y##D)
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  535|    342|        m -= 48;
  |  |  536|    342|        c -= 48;
  |  |  537|       |
  |  |  538|    342|#undef ONEQUAD
  |  |  539|    342|#undef ONEQUAD_TRANSPOSE
  |  |  540|    342|#undef ONEQUAD_EXTRACT
  |  |  541|    342|#undef ONEQUAD_SHUFFLE
  |  |  542|       |
  |  |  543|    342|        bytes -= 256;
  |  |  544|    342|        c += 256;
  |  |  545|    342|        m += 256;
  |  |  546|    342|    }
  |  |  547|    342|}
  ------------------
   76|  1.43k|#include "u1.h"
  ------------------
  |  |    1|  2.90k|while (bytes >= 64) {
  |  |  ------------------
  |  |  |  Branch (1:8): [True: 1.47k, False: 1.43k]
  |  |  ------------------
  |  |    2|  1.47k|    __m128i diag0 = _mm_loadu_si128((const __m128i *) (x + 0));
  |  |    3|  1.47k|    __m128i diag1 = _mm_loadu_si128((const __m128i *) (x + 4));
  |  |    4|  1.47k|    __m128i diag2 = _mm_loadu_si128((const __m128i *) (x + 8));
  |  |    5|  1.47k|    __m128i diag3 = _mm_loadu_si128((const __m128i *) (x + 12));
  |  |    6|  1.47k|    __m128i a0, a1, a2, a3, a4, a5, a6, a7;
  |  |    7|  1.47k|    __m128i b0, b1, b2, b3, b4, b5, b6, b7;
  |  |    8|       |
  |  |    9|  1.47k|    uint32_t in8;
  |  |   10|  1.47k|    uint32_t in9;
  |  |   11|  1.47k|    int      i;
  |  |   12|       |
  |  |   13|  1.47k|    a0 = diag1;
  |  |   14|  8.82k|    for (i = 0; i < ROUNDS; i += 4) {
  |  |  ------------------
  |  |  |  |   28|  8.82k|# define ROUNDS 20
  |  |  ------------------
  |  |  |  Branch (14:17): [True: 7.35k, False: 1.47k]
  |  |  ------------------
  |  |   15|  7.35k|        a0    = _mm_add_epi32(a0, diag0);
  |  |   16|  7.35k|        a1    = diag0;
  |  |   17|  7.35k|        b0    = a0;
  |  |   18|  7.35k|        a0    = _mm_slli_epi32(a0, 7);
  |  |   19|  7.35k|        b0    = _mm_srli_epi32(b0, 25);
  |  |   20|  7.35k|        diag3 = _mm_xor_si128(diag3, a0);
  |  |   21|       |
  |  |   22|  7.35k|        diag3 = _mm_xor_si128(diag3, b0);
  |  |   23|       |
  |  |   24|  7.35k|        a1    = _mm_add_epi32(a1, diag3);
  |  |   25|  7.35k|        a2    = diag3;
  |  |   26|  7.35k|        b1    = a1;
  |  |   27|  7.35k|        a1    = _mm_slli_epi32(a1, 9);
  |  |   28|  7.35k|        b1    = _mm_srli_epi32(b1, 23);
  |  |   29|  7.35k|        diag2 = _mm_xor_si128(diag2, a1);
  |  |   30|  7.35k|        diag3 = _mm_shuffle_epi32(diag3, 0x93);
  |  |   31|  7.35k|        diag2 = _mm_xor_si128(diag2, b1);
  |  |   32|       |
  |  |   33|  7.35k|        a2    = _mm_add_epi32(a2, diag2);
  |  |   34|  7.35k|        a3    = diag2;
  |  |   35|  7.35k|        b2    = a2;
  |  |   36|  7.35k|        a2    = _mm_slli_epi32(a2, 13);
  |  |   37|  7.35k|        b2    = _mm_srli_epi32(b2, 19);
  |  |   38|  7.35k|        diag1 = _mm_xor_si128(diag1, a2);
  |  |   39|  7.35k|        diag2 = _mm_shuffle_epi32(diag2, 0x4e);
  |  |   40|  7.35k|        diag1 = _mm_xor_si128(diag1, b2);
  |  |   41|       |
  |  |   42|  7.35k|        a3    = _mm_add_epi32(a3, diag1);
  |  |   43|  7.35k|        a4    = diag3;
  |  |   44|  7.35k|        b3    = a3;
  |  |   45|  7.35k|        a3    = _mm_slli_epi32(a3, 18);
  |  |   46|  7.35k|        b3    = _mm_srli_epi32(b3, 14);
  |  |   47|  7.35k|        diag0 = _mm_xor_si128(diag0, a3);
  |  |   48|  7.35k|        diag1 = _mm_shuffle_epi32(diag1, 0x39);
  |  |   49|  7.35k|        diag0 = _mm_xor_si128(diag0, b3);
  |  |   50|       |
  |  |   51|  7.35k|        a4    = _mm_add_epi32(a4, diag0);
  |  |   52|  7.35k|        a5    = diag0;
  |  |   53|  7.35k|        b4    = a4;
  |  |   54|  7.35k|        a4    = _mm_slli_epi32(a4, 7);
  |  |   55|  7.35k|        b4    = _mm_srli_epi32(b4, 25);
  |  |   56|  7.35k|        diag1 = _mm_xor_si128(diag1, a4);
  |  |   57|       |
  |  |   58|  7.35k|        diag1 = _mm_xor_si128(diag1, b4);
  |  |   59|       |
  |  |   60|  7.35k|        a5    = _mm_add_epi32(a5, diag1);
  |  |   61|  7.35k|        a6    = diag1;
  |  |   62|  7.35k|        b5    = a5;
  |  |   63|  7.35k|        a5    = _mm_slli_epi32(a5, 9);
  |  |   64|  7.35k|        b5    = _mm_srli_epi32(b5, 23);
  |  |   65|  7.35k|        diag2 = _mm_xor_si128(diag2, a5);
  |  |   66|  7.35k|        diag1 = _mm_shuffle_epi32(diag1, 0x93);
  |  |   67|  7.35k|        diag2 = _mm_xor_si128(diag2, b5);
  |  |   68|       |
  |  |   69|  7.35k|        a6    = _mm_add_epi32(a6, diag2);
  |  |   70|  7.35k|        a7    = diag2;
  |  |   71|  7.35k|        b6    = a6;
  |  |   72|  7.35k|        a6    = _mm_slli_epi32(a6, 13);
  |  |   73|  7.35k|        b6    = _mm_srli_epi32(b6, 19);
  |  |   74|  7.35k|        diag3 = _mm_xor_si128(diag3, a6);
  |  |   75|  7.35k|        diag2 = _mm_shuffle_epi32(diag2, 0x4e);
  |  |   76|  7.35k|        diag3 = _mm_xor_si128(diag3, b6);
  |  |   77|       |
  |  |   78|  7.35k|        a7    = _mm_add_epi32(a7, diag3);
  |  |   79|  7.35k|        a0    = diag1;
  |  |   80|  7.35k|        b7    = a7;
  |  |   81|  7.35k|        a7    = _mm_slli_epi32(a7, 18);
  |  |   82|  7.35k|        b7    = _mm_srli_epi32(b7, 14);
  |  |   83|  7.35k|        diag0 = _mm_xor_si128(diag0, a7);
  |  |   84|  7.35k|        diag3 = _mm_shuffle_epi32(diag3, 0x39);
  |  |   85|  7.35k|        diag0 = _mm_xor_si128(diag0, b7);
  |  |   86|       |
  |  |   87|  7.35k|        a0    = _mm_add_epi32(a0, diag0);
  |  |   88|  7.35k|        a1    = diag0;
  |  |   89|  7.35k|        b0    = a0;
  |  |   90|  7.35k|        a0    = _mm_slli_epi32(a0, 7);
  |  |   91|  7.35k|        b0    = _mm_srli_epi32(b0, 25);
  |  |   92|  7.35k|        diag3 = _mm_xor_si128(diag3, a0);
  |  |   93|       |
  |  |   94|  7.35k|        diag3 = _mm_xor_si128(diag3, b0);
  |  |   95|       |
  |  |   96|  7.35k|        a1    = _mm_add_epi32(a1, diag3);
  |  |   97|  7.35k|        a2    = diag3;
  |  |   98|  7.35k|        b1    = a1;
  |  |   99|  7.35k|        a1    = _mm_slli_epi32(a1, 9);
  |  |  100|  7.35k|        b1    = _mm_srli_epi32(b1, 23);
  |  |  101|  7.35k|        diag2 = _mm_xor_si128(diag2, a1);
  |  |  102|  7.35k|        diag3 = _mm_shuffle_epi32(diag3, 0x93);
  |  |  103|  7.35k|        diag2 = _mm_xor_si128(diag2, b1);
  |  |  104|       |
  |  |  105|  7.35k|        a2    = _mm_add_epi32(a2, diag2);
  |  |  106|  7.35k|        a3    = diag2;
  |  |  107|  7.35k|        b2    = a2;
  |  |  108|  7.35k|        a2    = _mm_slli_epi32(a2, 13);
  |  |  109|  7.35k|        b2    = _mm_srli_epi32(b2, 19);
  |  |  110|  7.35k|        diag1 = _mm_xor_si128(diag1, a2);
  |  |  111|  7.35k|        diag2 = _mm_shuffle_epi32(diag2, 0x4e);
  |  |  112|  7.35k|        diag1 = _mm_xor_si128(diag1, b2);
  |  |  113|       |
  |  |  114|  7.35k|        a3    = _mm_add_epi32(a3, diag1);
  |  |  115|  7.35k|        a4    = diag3;
  |  |  116|  7.35k|        b3    = a3;
  |  |  117|  7.35k|        a3    = _mm_slli_epi32(a3, 18);
  |  |  118|  7.35k|        b3    = _mm_srli_epi32(b3, 14);
  |  |  119|  7.35k|        diag0 = _mm_xor_si128(diag0, a3);
  |  |  120|  7.35k|        diag1 = _mm_shuffle_epi32(diag1, 0x39);
  |  |  121|  7.35k|        diag0 = _mm_xor_si128(diag0, b3);
  |  |  122|       |
  |  |  123|  7.35k|        a4    = _mm_add_epi32(a4, diag0);
  |  |  124|  7.35k|        a5    = diag0;
  |  |  125|  7.35k|        b4    = a4;
  |  |  126|  7.35k|        a4    = _mm_slli_epi32(a4, 7);
  |  |  127|  7.35k|        b4    = _mm_srli_epi32(b4, 25);
  |  |  128|  7.35k|        diag1 = _mm_xor_si128(diag1, a4);
  |  |  129|       |
  |  |  130|  7.35k|        diag1 = _mm_xor_si128(diag1, b4);
  |  |  131|       |
  |  |  132|  7.35k|        a5    = _mm_add_epi32(a5, diag1);
  |  |  133|  7.35k|        a6    = diag1;
  |  |  134|  7.35k|        b5    = a5;
  |  |  135|  7.35k|        a5    = _mm_slli_epi32(a5, 9);
  |  |  136|  7.35k|        b5    = _mm_srli_epi32(b5, 23);
  |  |  137|  7.35k|        diag2 = _mm_xor_si128(diag2, a5);
  |  |  138|  7.35k|        diag1 = _mm_shuffle_epi32(diag1, 0x93);
  |  |  139|  7.35k|        diag2 = _mm_xor_si128(diag2, b5);
  |  |  140|       |
  |  |  141|  7.35k|        a6    = _mm_add_epi32(a6, diag2);
  |  |  142|  7.35k|        a7    = diag2;
  |  |  143|  7.35k|        b6    = a6;
  |  |  144|  7.35k|        a6    = _mm_slli_epi32(a6, 13);
  |  |  145|  7.35k|        b6    = _mm_srli_epi32(b6, 19);
  |  |  146|  7.35k|        diag3 = _mm_xor_si128(diag3, a6);
  |  |  147|  7.35k|        diag2 = _mm_shuffle_epi32(diag2, 0x4e);
  |  |  148|  7.35k|        diag3 = _mm_xor_si128(diag3, b6);
  |  |  149|       |
  |  |  150|  7.35k|        a7    = _mm_add_epi32(a7, diag3);
  |  |  151|  7.35k|        a0    = diag1;
  |  |  152|  7.35k|        b7    = a7;
  |  |  153|  7.35k|        a7    = _mm_slli_epi32(a7, 18);
  |  |  154|  7.35k|        b7    = _mm_srli_epi32(b7, 14);
  |  |  155|  7.35k|        diag0 = _mm_xor_si128(diag0, a7);
  |  |  156|  7.35k|        diag3 = _mm_shuffle_epi32(diag3, 0x39);
  |  |  157|  7.35k|        diag0 = _mm_xor_si128(diag0, b7);
  |  |  158|  7.35k|    }
  |  |  159|       |
  |  |  160|  1.47k|    diag0 = _mm_add_epi32(diag0, _mm_loadu_si128((const __m128i *) (x + 0)));
  |  |  161|  1.47k|    diag1 = _mm_add_epi32(diag1, _mm_loadu_si128((const __m128i *) (x + 4)));
  |  |  162|  1.47k|    diag2 = _mm_add_epi32(diag2, _mm_loadu_si128((const __m128i *) (x + 8)));
  |  |  163|  1.47k|    diag3 = _mm_add_epi32(diag3, _mm_loadu_si128((const __m128i *) (x + 12)));
  |  |  164|       |
  |  |  165|  1.47k|#define ONEQUAD_SHUFFLE(A, B, C, D)                      \
  |  |  166|  1.47k|    do {                                                 \
  |  |  167|  1.47k|        uint32_t in##A = _mm_cvtsi128_si32(diag0);       \
  |  |  168|  1.47k|        uint32_t in##B = _mm_cvtsi128_si32(diag1);       \
  |  |  169|  1.47k|        uint32_t in##C = _mm_cvtsi128_si32(diag2);       \
  |  |  170|  1.47k|        uint32_t in##D = _mm_cvtsi128_si32(diag3);       \
  |  |  171|  1.47k|        diag0          = _mm_shuffle_epi32(diag0, 0x39); \
  |  |  172|  1.47k|        diag1          = _mm_shuffle_epi32(diag1, 0x39); \
  |  |  173|  1.47k|        diag2          = _mm_shuffle_epi32(diag2, 0x39); \
  |  |  174|  1.47k|        diag3          = _mm_shuffle_epi32(diag3, 0x39); \
  |  |  175|  1.47k|        in##A ^= *(const uint32_t *) (m + (A * 4));      \
  |  |  176|  1.47k|        in##B ^= *(const uint32_t *) (m + (B * 4));      \
  |  |  177|  1.47k|        in##C ^= *(const uint32_t *) (m + (C * 4));      \
  |  |  178|  1.47k|        in##D ^= *(const uint32_t *) (m + (D * 4));      \
  |  |  179|  1.47k|        *(uint32_t *) (c + (A * 4)) = in##A;             \
  |  |  180|  1.47k|        *(uint32_t *) (c + (B * 4)) = in##B;             \
  |  |  181|  1.47k|        *(uint32_t *) (c + (C * 4)) = in##C;             \
  |  |  182|  1.47k|        *(uint32_t *) (c + (D * 4)) = in##D;             \
  |  |  183|  1.47k|    } while (0)
  |  |  184|       |
  |  |  185|  1.47k|#define ONEQUAD(A, B, C, D) ONEQUAD_SHUFFLE(A, B, C, D)
  |  |  186|       |
  |  |  187|  1.47k|    ONEQUAD(0, 12, 8, 4);
  |  |  ------------------
  |  |  |  |  185|  1.47k|#define ONEQUAD(A, B, C, D) ONEQUAD_SHUFFLE(A, B, C, D)
  |  |  |  |  ------------------
  |  |  |  |  |  |  166|  1.47k|    do {                                                 \
  |  |  |  |  |  |  167|  1.47k|        uint32_t in##A = _mm_cvtsi128_si32(diag0);       \
  |  |  |  |  |  |  168|  1.47k|        uint32_t in##B = _mm_cvtsi128_si32(diag1);       \
  |  |  |  |  |  |  169|  1.47k|        uint32_t in##C = _mm_cvtsi128_si32(diag2);       \
  |  |  |  |  |  |  170|  1.47k|        uint32_t in##D = _mm_cvtsi128_si32(diag3);       \
  |  |  |  |  |  |  171|  1.47k|        diag0          = _mm_shuffle_epi32(diag0, 0x39); \
  |  |  |  |  |  |  172|  1.47k|        diag1          = _mm_shuffle_epi32(diag1, 0x39); \
  |  |  |  |  |  |  173|  1.47k|        diag2          = _mm_shuffle_epi32(diag2, 0x39); \
  |  |  |  |  |  |  174|  1.47k|        diag3          = _mm_shuffle_epi32(diag3, 0x39); \
  |  |  |  |  |  |  175|  1.47k|        in##A ^= *(const uint32_t *) (m + (A * 4));      \
  |  |  |  |  |  |  176|  1.47k|        in##B ^= *(const uint32_t *) (m + (B * 4));      \
  |  |  |  |  |  |  177|  1.47k|        in##C ^= *(const uint32_t *) (m + (C * 4));      \
  |  |  |  |  |  |  178|  1.47k|        in##D ^= *(const uint32_t *) (m + (D * 4));      \
  |  |  |  |  |  |  179|  1.47k|        *(uint32_t *) (c + (A * 4)) = in##A;             \
  |  |  |  |  |  |  180|  1.47k|        *(uint32_t *) (c + (B * 4)) = in##B;             \
  |  |  |  |  |  |  181|  1.47k|        *(uint32_t *) (c + (C * 4)) = in##C;             \
  |  |  |  |  |  |  182|  1.47k|        *(uint32_t *) (c + (D * 4)) = in##D;             \
  |  |  |  |  |  |  183|  1.47k|    } while (0)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (183:14): [Folded - Ignored]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  188|  1.47k|    ONEQUAD(5, 1, 13, 9);
  |  |  ------------------
  |  |  |  |  185|  1.47k|#define ONEQUAD(A, B, C, D) ONEQUAD_SHUFFLE(A, B, C, D)
  |  |  |  |  ------------------
  |  |  |  |  |  |  166|  1.47k|    do {                                                 \
  |  |  |  |  |  |  167|  1.47k|        uint32_t in##A = _mm_cvtsi128_si32(diag0);       \
  |  |  |  |  |  |  168|  1.47k|        uint32_t in##B = _mm_cvtsi128_si32(diag1);       \
  |  |  |  |  |  |  169|  1.47k|        uint32_t in##C = _mm_cvtsi128_si32(diag2);       \
  |  |  |  |  |  |  170|  1.47k|        uint32_t in##D = _mm_cvtsi128_si32(diag3);       \
  |  |  |  |  |  |  171|  1.47k|        diag0          = _mm_shuffle_epi32(diag0, 0x39); \
  |  |  |  |  |  |  172|  1.47k|        diag1          = _mm_shuffle_epi32(diag1, 0x39); \
  |  |  |  |  |  |  173|  1.47k|        diag2          = _mm_shuffle_epi32(diag2, 0x39); \
  |  |  |  |  |  |  174|  1.47k|        diag3          = _mm_shuffle_epi32(diag3, 0x39); \
  |  |  |  |  |  |  175|  1.47k|        in##A ^= *(const uint32_t *) (m + (A * 4));      \
  |  |  |  |  |  |  176|  1.47k|        in##B ^= *(const uint32_t *) (m + (B * 4));      \
  |  |  |  |  |  |  177|  1.47k|        in##C ^= *(const uint32_t *) (m + (C * 4));      \
  |  |  |  |  |  |  178|  1.47k|        in##D ^= *(const uint32_t *) (m + (D * 4));      \
  |  |  |  |  |  |  179|  1.47k|        *(uint32_t *) (c + (A * 4)) = in##A;             \
  |  |  |  |  |  |  180|  1.47k|        *(uint32_t *) (c + (B * 4)) = in##B;             \
  |  |  |  |  |  |  181|  1.47k|        *(uint32_t *) (c + (C * 4)) = in##C;             \
  |  |  |  |  |  |  182|  1.47k|        *(uint32_t *) (c + (D * 4)) = in##D;             \
  |  |  |  |  |  |  183|  1.47k|    } while (0)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (183:14): [Folded - Ignored]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  189|  1.47k|    ONEQUAD(10, 6, 2, 14);
  |  |  ------------------
  |  |  |  |  185|  1.47k|#define ONEQUAD(A, B, C, D) ONEQUAD_SHUFFLE(A, B, C, D)
  |  |  |  |  ------------------
  |  |  |  |  |  |  166|  1.47k|    do {                                                 \
  |  |  |  |  |  |  167|  1.47k|        uint32_t in##A = _mm_cvtsi128_si32(diag0);       \
  |  |  |  |  |  |  168|  1.47k|        uint32_t in##B = _mm_cvtsi128_si32(diag1);       \
  |  |  |  |  |  |  169|  1.47k|        uint32_t in##C = _mm_cvtsi128_si32(diag2);       \
  |  |  |  |  |  |  170|  1.47k|        uint32_t in##D = _mm_cvtsi128_si32(diag3);       \
  |  |  |  |  |  |  171|  1.47k|        diag0          = _mm_shuffle_epi32(diag0, 0x39); \
  |  |  |  |  |  |  172|  1.47k|        diag1          = _mm_shuffle_epi32(diag1, 0x39); \
  |  |  |  |  |  |  173|  1.47k|        diag2          = _mm_shuffle_epi32(diag2, 0x39); \
  |  |  |  |  |  |  174|  1.47k|        diag3          = _mm_shuffle_epi32(diag3, 0x39); \
  |  |  |  |  |  |  175|  1.47k|        in##A ^= *(const uint32_t *) (m + (A * 4));      \
  |  |  |  |  |  |  176|  1.47k|        in##B ^= *(const uint32_t *) (m + (B * 4));      \
  |  |  |  |  |  |  177|  1.47k|        in##C ^= *(const uint32_t *) (m + (C * 4));      \
  |  |  |  |  |  |  178|  1.47k|        in##D ^= *(const uint32_t *) (m + (D * 4));      \
  |  |  |  |  |  |  179|  1.47k|        *(uint32_t *) (c + (A * 4)) = in##A;             \
  |  |  |  |  |  |  180|  1.47k|        *(uint32_t *) (c + (B * 4)) = in##B;             \
  |  |  |  |  |  |  181|  1.47k|        *(uint32_t *) (c + (C * 4)) = in##C;             \
  |  |  |  |  |  |  182|  1.47k|        *(uint32_t *) (c + (D * 4)) = in##D;             \
  |  |  |  |  |  |  183|  1.47k|    } while (0)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (183:14): [Folded - Ignored]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  190|  1.47k|    ONEQUAD(15, 11, 7, 3);
  |  |  ------------------
  |  |  |  |  185|  1.47k|#define ONEQUAD(A, B, C, D) ONEQUAD_SHUFFLE(A, B, C, D)
  |  |  |  |  ------------------
  |  |  |  |  |  |  166|  1.47k|    do {                                                 \
  |  |  |  |  |  |  167|  1.47k|        uint32_t in##A = _mm_cvtsi128_si32(diag0);       \
  |  |  |  |  |  |  168|  1.47k|        uint32_t in##B = _mm_cvtsi128_si32(diag1);       \
  |  |  |  |  |  |  169|  1.47k|        uint32_t in##C = _mm_cvtsi128_si32(diag2);       \
  |  |  |  |  |  |  170|  1.47k|        uint32_t in##D = _mm_cvtsi128_si32(diag3);       \
  |  |  |  |  |  |  171|  1.47k|        diag0          = _mm_shuffle_epi32(diag0, 0x39); \
  |  |  |  |  |  |  172|  1.47k|        diag1          = _mm_shuffle_epi32(diag1, 0x39); \
  |  |  |  |  |  |  173|  1.47k|        diag2          = _mm_shuffle_epi32(diag2, 0x39); \
  |  |  |  |  |  |  174|  1.47k|        diag3          = _mm_shuffle_epi32(diag3, 0x39); \
  |  |  |  |  |  |  175|  1.47k|        in##A ^= *(const uint32_t *) (m + (A * 4));      \
  |  |  |  |  |  |  176|  1.47k|        in##B ^= *(const uint32_t *) (m + (B * 4));      \
  |  |  |  |  |  |  177|  1.47k|        in##C ^= *(const uint32_t *) (m + (C * 4));      \
  |  |  |  |  |  |  178|  1.47k|        in##D ^= *(const uint32_t *) (m + (D * 4));      \
  |  |  |  |  |  |  179|  1.47k|        *(uint32_t *) (c + (A * 4)) = in##A;             \
  |  |  |  |  |  |  180|  1.47k|        *(uint32_t *) (c + (B * 4)) = in##B;             \
  |  |  |  |  |  |  181|  1.47k|        *(uint32_t *) (c + (C * 4)) = in##C;             \
  |  |  |  |  |  |  182|  1.47k|        *(uint32_t *) (c + (D * 4)) = in##D;             \
  |  |  |  |  |  |  183|  1.47k|    } while (0)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (183:14): [Folded - Ignored]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  191|       |
  |  |  192|  1.47k|#undef ONEQUAD
  |  |  193|  1.47k|#undef ONEQUAD_SHUFFLE
  |  |  194|       |
  |  |  195|  1.47k|    in8 = x[8];
  |  |  196|  1.47k|    in9 = x[13];
  |  |  197|  1.47k|    in8++;
  |  |  198|  1.47k|    if (in8 == 0) {
  |  |  ------------------
  |  |  |  Branch (198:9): [True: 0, False: 1.47k]
  |  |  ------------------
  |  |  199|      0|        in9++;
  |  |  200|      0|    }
  |  |  201|  1.47k|    x[8]  = in8;
  |  |  202|  1.47k|    x[13] = in9;
  |  |  203|       |
  |  |  204|  1.47k|    c += 64;
  |  |  205|  1.47k|    m += 64;
  |  |  206|  1.47k|    bytes -= 64;
  |  |  207|  1.47k|}
  ------------------
   77|  1.43k|#include "u0.h"
  ------------------
  |  |    1|  1.43k|if (bytes > 0) {
  |  |  ------------------
  |  |  |  Branch (1:5): [True: 832, False: 600]
  |  |  ------------------
  |  |    2|    832|    __m128i diag0 = _mm_loadu_si128((const __m128i *) (x + 0));
  |  |    3|    832|    __m128i diag1 = _mm_loadu_si128((const __m128i *) (x + 4));
  |  |    4|    832|    __m128i diag2 = _mm_loadu_si128((const __m128i *) (x + 8));
  |  |    5|    832|    __m128i diag3 = _mm_loadu_si128((const __m128i *) (x + 12));
  |  |    6|    832|    __m128i a0, a1, a2, a3, a4, a5, a6, a7;
  |  |    7|    832|    __m128i b0, b1, b2, b3, b4, b5, b6, b7;
  |  |    8|    832|    uint8_t partialblock[64] = { 0 };
  |  |    9|       |
  |  |   10|    832|    unsigned int i;
  |  |   11|       |
  |  |   12|    832|    a0 = diag1;
  |  |   13|  4.99k|    for (i = 0; i < ROUNDS; i += 4) {
  |  |  ------------------
  |  |  |  |   28|  4.99k|# define ROUNDS 20
  |  |  ------------------
  |  |  |  Branch (13:17): [True: 4.16k, False: 832]
  |  |  ------------------
  |  |   14|  4.16k|        a0    = _mm_add_epi32(a0, diag0);
  |  |   15|  4.16k|        a1    = diag0;
  |  |   16|  4.16k|        b0    = a0;
  |  |   17|  4.16k|        a0    = _mm_slli_epi32(a0, 7);
  |  |   18|  4.16k|        b0    = _mm_srli_epi32(b0, 25);
  |  |   19|  4.16k|        diag3 = _mm_xor_si128(diag3, a0);
  |  |   20|       |
  |  |   21|  4.16k|        diag3 = _mm_xor_si128(diag3, b0);
  |  |   22|       |
  |  |   23|  4.16k|        a1    = _mm_add_epi32(a1, diag3);
  |  |   24|  4.16k|        a2    = diag3;
  |  |   25|  4.16k|        b1    = a1;
  |  |   26|  4.16k|        a1    = _mm_slli_epi32(a1, 9);
  |  |   27|  4.16k|        b1    = _mm_srli_epi32(b1, 23);
  |  |   28|  4.16k|        diag2 = _mm_xor_si128(diag2, a1);
  |  |   29|  4.16k|        diag3 = _mm_shuffle_epi32(diag3, 0x93);
  |  |   30|  4.16k|        diag2 = _mm_xor_si128(diag2, b1);
  |  |   31|       |
  |  |   32|  4.16k|        a2    = _mm_add_epi32(a2, diag2);
  |  |   33|  4.16k|        a3    = diag2;
  |  |   34|  4.16k|        b2    = a2;
  |  |   35|  4.16k|        a2    = _mm_slli_epi32(a2, 13);
  |  |   36|  4.16k|        b2    = _mm_srli_epi32(b2, 19);
  |  |   37|  4.16k|        diag1 = _mm_xor_si128(diag1, a2);
  |  |   38|  4.16k|        diag2 = _mm_shuffle_epi32(diag2, 0x4e);
  |  |   39|  4.16k|        diag1 = _mm_xor_si128(diag1, b2);
  |  |   40|       |
  |  |   41|  4.16k|        a3    = _mm_add_epi32(a3, diag1);
  |  |   42|  4.16k|        a4    = diag3;
  |  |   43|  4.16k|        b3    = a3;
  |  |   44|  4.16k|        a3    = _mm_slli_epi32(a3, 18);
  |  |   45|  4.16k|        b3    = _mm_srli_epi32(b3, 14);
  |  |   46|  4.16k|        diag0 = _mm_xor_si128(diag0, a3);
  |  |   47|  4.16k|        diag1 = _mm_shuffle_epi32(diag1, 0x39);
  |  |   48|  4.16k|        diag0 = _mm_xor_si128(diag0, b3);
  |  |   49|       |
  |  |   50|  4.16k|        a4    = _mm_add_epi32(a4, diag0);
  |  |   51|  4.16k|        a5    = diag0;
  |  |   52|  4.16k|        b4    = a4;
  |  |   53|  4.16k|        a4    = _mm_slli_epi32(a4, 7);
  |  |   54|  4.16k|        b4    = _mm_srli_epi32(b4, 25);
  |  |   55|  4.16k|        diag1 = _mm_xor_si128(diag1, a4);
  |  |   56|       |
  |  |   57|  4.16k|        diag1 = _mm_xor_si128(diag1, b4);
  |  |   58|       |
  |  |   59|  4.16k|        a5    = _mm_add_epi32(a5, diag1);
  |  |   60|  4.16k|        a6    = diag1;
  |  |   61|  4.16k|        b5    = a5;
  |  |   62|  4.16k|        a5    = _mm_slli_epi32(a5, 9);
  |  |   63|  4.16k|        b5    = _mm_srli_epi32(b5, 23);
  |  |   64|  4.16k|        diag2 = _mm_xor_si128(diag2, a5);
  |  |   65|  4.16k|        diag1 = _mm_shuffle_epi32(diag1, 0x93);
  |  |   66|  4.16k|        diag2 = _mm_xor_si128(diag2, b5);
  |  |   67|       |
  |  |   68|  4.16k|        a6    = _mm_add_epi32(a6, diag2);
  |  |   69|  4.16k|        a7    = diag2;
  |  |   70|  4.16k|        b6    = a6;
  |  |   71|  4.16k|        a6    = _mm_slli_epi32(a6, 13);
  |  |   72|  4.16k|        b6    = _mm_srli_epi32(b6, 19);
  |  |   73|  4.16k|        diag3 = _mm_xor_si128(diag3, a6);
  |  |   74|  4.16k|        diag2 = _mm_shuffle_epi32(diag2, 0x4e);
  |  |   75|  4.16k|        diag3 = _mm_xor_si128(diag3, b6);
  |  |   76|       |
  |  |   77|  4.16k|        a7    = _mm_add_epi32(a7, diag3);
  |  |   78|  4.16k|        a0    = diag1;
  |  |   79|  4.16k|        b7    = a7;
  |  |   80|  4.16k|        a7    = _mm_slli_epi32(a7, 18);
  |  |   81|  4.16k|        b7    = _mm_srli_epi32(b7, 14);
  |  |   82|  4.16k|        diag0 = _mm_xor_si128(diag0, a7);
  |  |   83|  4.16k|        diag3 = _mm_shuffle_epi32(diag3, 0x39);
  |  |   84|  4.16k|        diag0 = _mm_xor_si128(diag0, b7);
  |  |   85|       |
  |  |   86|  4.16k|        a0    = _mm_add_epi32(a0, diag0);
  |  |   87|  4.16k|        a1    = diag0;
  |  |   88|  4.16k|        b0    = a0;
  |  |   89|  4.16k|        a0    = _mm_slli_epi32(a0, 7);
  |  |   90|  4.16k|        b0    = _mm_srli_epi32(b0, 25);
  |  |   91|  4.16k|        diag3 = _mm_xor_si128(diag3, a0);
  |  |   92|       |
  |  |   93|  4.16k|        diag3 = _mm_xor_si128(diag3, b0);
  |  |   94|       |
  |  |   95|  4.16k|        a1    = _mm_add_epi32(a1, diag3);
  |  |   96|  4.16k|        a2    = diag3;
  |  |   97|  4.16k|        b1    = a1;
  |  |   98|  4.16k|        a1    = _mm_slli_epi32(a1, 9);
  |  |   99|  4.16k|        b1    = _mm_srli_epi32(b1, 23);
  |  |  100|  4.16k|        diag2 = _mm_xor_si128(diag2, a1);
  |  |  101|  4.16k|        diag3 = _mm_shuffle_epi32(diag3, 0x93);
  |  |  102|  4.16k|        diag2 = _mm_xor_si128(diag2, b1);
  |  |  103|       |
  |  |  104|  4.16k|        a2    = _mm_add_epi32(a2, diag2);
  |  |  105|  4.16k|        a3    = diag2;
  |  |  106|  4.16k|        b2    = a2;
  |  |  107|  4.16k|        a2    = _mm_slli_epi32(a2, 13);
  |  |  108|  4.16k|        b2    = _mm_srli_epi32(b2, 19);
  |  |  109|  4.16k|        diag1 = _mm_xor_si128(diag1, a2);
  |  |  110|  4.16k|        diag2 = _mm_shuffle_epi32(diag2, 0x4e);
  |  |  111|  4.16k|        diag1 = _mm_xor_si128(diag1, b2);
  |  |  112|       |
  |  |  113|  4.16k|        a3    = _mm_add_epi32(a3, diag1);
  |  |  114|  4.16k|        a4    = diag3;
  |  |  115|  4.16k|        b3    = a3;
  |  |  116|  4.16k|        a3    = _mm_slli_epi32(a3, 18);
  |  |  117|  4.16k|        b3    = _mm_srli_epi32(b3, 14);
  |  |  118|  4.16k|        diag0 = _mm_xor_si128(diag0, a3);
  |  |  119|  4.16k|        diag1 = _mm_shuffle_epi32(diag1, 0x39);
  |  |  120|  4.16k|        diag0 = _mm_xor_si128(diag0, b3);
  |  |  121|       |
  |  |  122|  4.16k|        a4    = _mm_add_epi32(a4, diag0);
  |  |  123|  4.16k|        a5    = diag0;
  |  |  124|  4.16k|        b4    = a4;
  |  |  125|  4.16k|        a4    = _mm_slli_epi32(a4, 7);
  |  |  126|  4.16k|        b4    = _mm_srli_epi32(b4, 25);
  |  |  127|  4.16k|        diag1 = _mm_xor_si128(diag1, a4);
  |  |  128|       |
  |  |  129|  4.16k|        diag1 = _mm_xor_si128(diag1, b4);
  |  |  130|       |
  |  |  131|  4.16k|        a5    = _mm_add_epi32(a5, diag1);
  |  |  132|  4.16k|        a6    = diag1;
  |  |  133|  4.16k|        b5    = a5;
  |  |  134|  4.16k|        a5    = _mm_slli_epi32(a5, 9);
  |  |  135|  4.16k|        b5    = _mm_srli_epi32(b5, 23);
  |  |  136|  4.16k|        diag2 = _mm_xor_si128(diag2, a5);
  |  |  137|  4.16k|        diag1 = _mm_shuffle_epi32(diag1, 0x93);
  |  |  138|  4.16k|        diag2 = _mm_xor_si128(diag2, b5);
  |  |  139|       |
  |  |  140|  4.16k|        a6    = _mm_add_epi32(a6, diag2);
  |  |  141|  4.16k|        a7    = diag2;
  |  |  142|  4.16k|        b6    = a6;
  |  |  143|  4.16k|        a6    = _mm_slli_epi32(a6, 13);
  |  |  144|  4.16k|        b6    = _mm_srli_epi32(b6, 19);
  |  |  145|  4.16k|        diag3 = _mm_xor_si128(diag3, a6);
  |  |  146|  4.16k|        diag2 = _mm_shuffle_epi32(diag2, 0x4e);
  |  |  147|  4.16k|        diag3 = _mm_xor_si128(diag3, b6);
  |  |  148|       |
  |  |  149|  4.16k|        a7    = _mm_add_epi32(a7, diag3);
  |  |  150|  4.16k|        a0    = diag1;
  |  |  151|  4.16k|        b7    = a7;
  |  |  152|  4.16k|        a7    = _mm_slli_epi32(a7, 18);
  |  |  153|  4.16k|        b7    = _mm_srli_epi32(b7, 14);
  |  |  154|  4.16k|        diag0 = _mm_xor_si128(diag0, a7);
  |  |  155|  4.16k|        diag3 = _mm_shuffle_epi32(diag3, 0x39);
  |  |  156|  4.16k|        diag0 = _mm_xor_si128(diag0, b7);
  |  |  157|  4.16k|    }
  |  |  158|       |
  |  |  159|    832|    diag0 = _mm_add_epi32(diag0, _mm_loadu_si128((const __m128i *) (x + 0)));
  |  |  160|    832|    diag1 = _mm_add_epi32(diag1, _mm_loadu_si128((const __m128i *) (x + 4)));
  |  |  161|    832|    diag2 = _mm_add_epi32(diag2, _mm_loadu_si128((const __m128i *) (x + 8)));
  |  |  162|    832|    diag3 = _mm_add_epi32(diag3, _mm_loadu_si128((const __m128i *) (x + 12)));
  |  |  163|       |
  |  |  164|    832|#define ONEQUAD_SHUFFLE(A, B, C, D)                                              \
  |  |  165|    832|    do {                                                                         \
  |  |  166|    832|        uint32_t in##A                         = _mm_cvtsi128_si32(diag0);       \
  |  |  167|    832|        uint32_t in##B                         = _mm_cvtsi128_si32(diag1);       \
  |  |  168|    832|        uint32_t in##C                         = _mm_cvtsi128_si32(diag2);       \
  |  |  169|    832|        uint32_t in##D                         = _mm_cvtsi128_si32(diag3);       \
  |  |  170|    832|        diag0                                  = _mm_shuffle_epi32(diag0, 0x39); \
  |  |  171|    832|        diag1                                  = _mm_shuffle_epi32(diag1, 0x39); \
  |  |  172|    832|        diag2                                  = _mm_shuffle_epi32(diag2, 0x39); \
  |  |  173|    832|        diag3                                  = _mm_shuffle_epi32(diag3, 0x39); \
  |  |  174|    832|        *(uint32_t *) (partialblock + (A * 4)) = in##A;                          \
  |  |  175|    832|        *(uint32_t *) (partialblock + (B * 4)) = in##B;                          \
  |  |  176|    832|        *(uint32_t *) (partialblock + (C * 4)) = in##C;                          \
  |  |  177|    832|        *(uint32_t *) (partialblock + (D * 4)) = in##D;                          \
  |  |  178|    832|    } while (0)
  |  |  179|       |
  |  |  180|    832|#define ONEQUAD(A, B, C, D) ONEQUAD_SHUFFLE(A, B, C, D)
  |  |  181|       |
  |  |  182|    832|    ONEQUAD(0, 12, 8, 4);
  |  |  ------------------
  |  |  |  |  180|    832|#define ONEQUAD(A, B, C, D) ONEQUAD_SHUFFLE(A, B, C, D)
  |  |  |  |  ------------------
  |  |  |  |  |  |  165|    832|    do {                                                                         \
  |  |  |  |  |  |  166|    832|        uint32_t in##A                         = _mm_cvtsi128_si32(diag0);       \
  |  |  |  |  |  |  167|    832|        uint32_t in##B                         = _mm_cvtsi128_si32(diag1);       \
  |  |  |  |  |  |  168|    832|        uint32_t in##C                         = _mm_cvtsi128_si32(diag2);       \
  |  |  |  |  |  |  169|    832|        uint32_t in##D                         = _mm_cvtsi128_si32(diag3);       \
  |  |  |  |  |  |  170|    832|        diag0                                  = _mm_shuffle_epi32(diag0, 0x39); \
  |  |  |  |  |  |  171|    832|        diag1                                  = _mm_shuffle_epi32(diag1, 0x39); \
  |  |  |  |  |  |  172|    832|        diag2                                  = _mm_shuffle_epi32(diag2, 0x39); \
  |  |  |  |  |  |  173|    832|        diag3                                  = _mm_shuffle_epi32(diag3, 0x39); \
  |  |  |  |  |  |  174|    832|        *(uint32_t *) (partialblock + (A * 4)) = in##A;                          \
  |  |  |  |  |  |  175|    832|        *(uint32_t *) (partialblock + (B * 4)) = in##B;                          \
  |  |  |  |  |  |  176|    832|        *(uint32_t *) (partialblock + (C * 4)) = in##C;                          \
  |  |  |  |  |  |  177|    832|        *(uint32_t *) (partialblock + (D * 4)) = in##D;                          \
  |  |  |  |  |  |  178|    832|    } while (0)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (178:14): [Folded - Ignored]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  183|    832|    ONEQUAD(5, 1, 13, 9);
  |  |  ------------------
  |  |  |  |  180|    832|#define ONEQUAD(A, B, C, D) ONEQUAD_SHUFFLE(A, B, C, D)
  |  |  |  |  ------------------
  |  |  |  |  |  |  165|    832|    do {                                                                         \
  |  |  |  |  |  |  166|    832|        uint32_t in##A                         = _mm_cvtsi128_si32(diag0);       \
  |  |  |  |  |  |  167|    832|        uint32_t in##B                         = _mm_cvtsi128_si32(diag1);       \
  |  |  |  |  |  |  168|    832|        uint32_t in##C                         = _mm_cvtsi128_si32(diag2);       \
  |  |  |  |  |  |  169|    832|        uint32_t in##D                         = _mm_cvtsi128_si32(diag3);       \
  |  |  |  |  |  |  170|    832|        diag0                                  = _mm_shuffle_epi32(diag0, 0x39); \
  |  |  |  |  |  |  171|    832|        diag1                                  = _mm_shuffle_epi32(diag1, 0x39); \
  |  |  |  |  |  |  172|    832|        diag2                                  = _mm_shuffle_epi32(diag2, 0x39); \
  |  |  |  |  |  |  173|    832|        diag3                                  = _mm_shuffle_epi32(diag3, 0x39); \
  |  |  |  |  |  |  174|    832|        *(uint32_t *) (partialblock + (A * 4)) = in##A;                          \
  |  |  |  |  |  |  175|    832|        *(uint32_t *) (partialblock + (B * 4)) = in##B;                          \
  |  |  |  |  |  |  176|    832|        *(uint32_t *) (partialblock + (C * 4)) = in##C;                          \
  |  |  |  |  |  |  177|    832|        *(uint32_t *) (partialblock + (D * 4)) = in##D;                          \
  |  |  |  |  |  |  178|    832|    } while (0)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (178:14): [Folded - Ignored]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  184|    832|    ONEQUAD(10, 6, 2, 14);
  |  |  ------------------
  |  |  |  |  180|    832|#define ONEQUAD(A, B, C, D) ONEQUAD_SHUFFLE(A, B, C, D)
  |  |  |  |  ------------------
  |  |  |  |  |  |  165|    832|    do {                                                                         \
  |  |  |  |  |  |  166|    832|        uint32_t in##A                         = _mm_cvtsi128_si32(diag0);       \
  |  |  |  |  |  |  167|    832|        uint32_t in##B                         = _mm_cvtsi128_si32(diag1);       \
  |  |  |  |  |  |  168|    832|        uint32_t in##C                         = _mm_cvtsi128_si32(diag2);       \
  |  |  |  |  |  |  169|    832|        uint32_t in##D                         = _mm_cvtsi128_si32(diag3);       \
  |  |  |  |  |  |  170|    832|        diag0                                  = _mm_shuffle_epi32(diag0, 0x39); \
  |  |  |  |  |  |  171|    832|        diag1                                  = _mm_shuffle_epi32(diag1, 0x39); \
  |  |  |  |  |  |  172|    832|        diag2                                  = _mm_shuffle_epi32(diag2, 0x39); \
  |  |  |  |  |  |  173|    832|        diag3                                  = _mm_shuffle_epi32(diag3, 0x39); \
  |  |  |  |  |  |  174|    832|        *(uint32_t *) (partialblock + (A * 4)) = in##A;                          \
  |  |  |  |  |  |  175|    832|        *(uint32_t *) (partialblock + (B * 4)) = in##B;                          \
  |  |  |  |  |  |  176|    832|        *(uint32_t *) (partialblock + (C * 4)) = in##C;                          \
  |  |  |  |  |  |  177|    832|        *(uint32_t *) (partialblock + (D * 4)) = in##D;                          \
  |  |  |  |  |  |  178|    832|    } while (0)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (178:14): [Folded - Ignored]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  185|    832|    ONEQUAD(15, 11, 7, 3);
  |  |  ------------------
  |  |  |  |  180|    832|#define ONEQUAD(A, B, C, D) ONEQUAD_SHUFFLE(A, B, C, D)
  |  |  |  |  ------------------
  |  |  |  |  |  |  165|    832|    do {                                                                         \
  |  |  |  |  |  |  166|    832|        uint32_t in##A                         = _mm_cvtsi128_si32(diag0);       \
  |  |  |  |  |  |  167|    832|        uint32_t in##B                         = _mm_cvtsi128_si32(diag1);       \
  |  |  |  |  |  |  168|    832|        uint32_t in##C                         = _mm_cvtsi128_si32(diag2);       \
  |  |  |  |  |  |  169|    832|        uint32_t in##D                         = _mm_cvtsi128_si32(diag3);       \
  |  |  |  |  |  |  170|    832|        diag0                                  = _mm_shuffle_epi32(diag0, 0x39); \
  |  |  |  |  |  |  171|    832|        diag1                                  = _mm_shuffle_epi32(diag1, 0x39); \
  |  |  |  |  |  |  172|    832|        diag2                                  = _mm_shuffle_epi32(diag2, 0x39); \
  |  |  |  |  |  |  173|    832|        diag3                                  = _mm_shuffle_epi32(diag3, 0x39); \
  |  |  |  |  |  |  174|    832|        *(uint32_t *) (partialblock + (A * 4)) = in##A;                          \
  |  |  |  |  |  |  175|    832|        *(uint32_t *) (partialblock + (B * 4)) = in##B;                          \
  |  |  |  |  |  |  176|    832|        *(uint32_t *) (partialblock + (C * 4)) = in##C;                          \
  |  |  |  |  |  |  177|    832|        *(uint32_t *) (partialblock + (D * 4)) = in##D;                          \
  |  |  |  |  |  |  178|    832|    } while (0)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (178:14): [Folded - Ignored]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  186|       |
  |  |  187|    832|#undef ONEQUAD
  |  |  188|    832|#undef ONEQUAD_SHUFFLE
  |  |  189|       |
  |  |  190|  30.6k|    for (i = 0; i < bytes; i++) {
  |  |  ------------------
  |  |  |  Branch (190:17): [True: 29.8k, False: 832]
  |  |  ------------------
  |  |  191|  29.8k|        c[i] = m[i] ^ partialblock[i];
  |  |  192|  29.8k|    }
  |  |  193|       |
  |  |  194|    832|    sodium_memzero(partialblock, sizeof partialblock);
  |  |  195|    832|}
  ------------------
   78|  1.43k|}
salsa20_xmm6int-avx2.c:stream_avx2_xor_ic:
  103|  1.11k|{
  104|  1.11k|    struct salsa_ctx ctx;
  105|  1.11k|    uint8_t          ic_bytes[8];
  106|  1.11k|    uint32_t         ic_high;
  107|  1.11k|    uint32_t         ic_low;
  108|       |
  109|  1.11k|    if (!mlen) {
  ------------------
  |  Branch (109:9): [True: 0, False: 1.11k]
  ------------------
  110|      0|        return 0;
  111|      0|    }
  112|  1.11k|    ic_high = (uint32_t) (ic >> 32);
  113|  1.11k|    ic_low  = (uint32_t) ic;
  114|  1.11k|    STORE32_LE(&ic_bytes[0], ic_low);
  ------------------
  |  |  128|  1.11k|#define STORE32_LE(DST, W) store32_le((DST), (W))
  ------------------
  115|  1.11k|    STORE32_LE(&ic_bytes[4], ic_high);
  ------------------
  |  |  128|  1.11k|#define STORE32_LE(DST, W) store32_le((DST), (W))
  ------------------
  116|  1.11k|    salsa_keysetup(&ctx, k);
  117|  1.11k|    salsa_ivsetup(&ctx, n, ic_bytes);
  118|  1.11k|    salsa20_encrypt_bytes(&ctx, m, c, mlen);
  119|  1.11k|    sodium_memzero(&ctx, sizeof ctx);
  120|       |
  121|  1.11k|    return 0;
  122|  1.11k|}

crypto_verify_16:
   89|    316|{
   90|    316|    return crypto_verify_n(x, y, crypto_verify_16_BYTES);
  ------------------
  |  |   11|    316|#define crypto_verify_16_BYTES 16U
  ------------------
   91|    316|}
verify.c:crypto_verify_n:
   34|    316|{
   35|    316|    const    __m128i zero = _mm_setzero_si128();
   36|    316|    volatile __m128i v1, v2, z;
   37|    316|    volatile int     m;
   38|    316|    int              i;
   39|       |
   40|    316|    const volatile __m128i *volatile x =
   41|    316|        (const volatile __m128i *volatile) (const void *) x_;
   42|    316|    const volatile __m128i *volatile y =
   43|    316|        (const volatile __m128i *volatile) (const void *) y_;
   44|    316|    v1 = _mm_loadu_si128((const __m128i *) &x[0]);
   45|    316|    v2 = _mm_loadu_si128((const __m128i *) &y[0]);
   46|    316|    z = _mm_xor_si128(v1, v2);
   47|    316|    for (i = 1; i < n / 16; i++) {
  ------------------
  |  Branch (47:17): [True: 0, False: 316]
  ------------------
   48|      0|        v1 = _mm_loadu_si128((const __m128i *) &x[i]);
   49|      0|        v2 = _mm_loadu_si128((const __m128i *) &y[i]);
   50|      0|        z = _mm_or_si128(z, _mm_xor_si128(v1, v2));
   51|      0|    }
   52|    316|    m = _mm_movemask_epi8(_mm_cmpeq_epi32(z, zero));
   53|    316|    v1 = zero; v2 = zero; z = zero;
   54|       |
   55|    316|    return (int) (((uint32_t) m + 1U) >> 16) - 1;
   56|    316|}

salsa20_xmm6int-avx2.c:load32_le:
  114|  16.5k|{
  115|  16.5k|#ifdef NATIVE_LITTLE_ENDIAN
  116|  16.5k|    uint32_t w;
  117|  16.5k|    memcpy(&w, src, sizeof w);
  118|  16.5k|    return w;
  119|       |#else
  120|       |    uint32_t w = (uint32_t) src[0];
  121|       |    w |= (uint32_t) src[1] <<  8;
  122|       |    w |= (uint32_t) src[2] << 16;
  123|       |    w |= (uint32_t) src[3] << 24;
  124|       |    return w;
  125|       |#endif
  126|  16.5k|}
salsa20_xmm6int-avx2.c:store32_le:
  131|  2.23k|{
  132|  2.23k|#ifdef NATIVE_LITTLE_ENDIAN
  133|  2.23k|    memcpy(dst, &w, sizeof w);
  134|       |#else
  135|       |    dst[0] = (uint8_t) w; w >>= 8;
  136|       |    dst[1] = (uint8_t) w; w >>= 8;
  137|       |    dst[2] = (uint8_t) w; w >>= 8;
  138|       |    dst[3] = (uint8_t) w;
  139|       |#endif
  140|  2.23k|}
core_hsalsa20_ref2.c:load32_le:
  114|  7.58k|{
  115|  7.58k|#ifdef NATIVE_LITTLE_ENDIAN
  116|  7.58k|    uint32_t w;
  117|  7.58k|    memcpy(&w, src, sizeof w);
  118|  7.58k|    return w;
  119|       |#else
  120|       |    uint32_t w = (uint32_t) src[0];
  121|       |    w |= (uint32_t) src[1] <<  8;
  122|       |    w |= (uint32_t) src[2] << 16;
  123|       |    w |= (uint32_t) src[3] << 24;
  124|       |    return w;
  125|       |#endif
  126|  7.58k|}
core_hsalsa20_ref2.c:rotl32:
   45|   202k|{
   46|   202k|    return (x << b) | (x >> (32 - b));
   47|   202k|}
core_hsalsa20_ref2.c:store32_le:
  131|  5.05k|{
  132|  5.05k|#ifdef NATIVE_LITTLE_ENDIAN
  133|  5.05k|    memcpy(dst, &w, sizeof w);
  134|       |#else
  135|       |    dst[0] = (uint8_t) w; w >>= 8;
  136|       |    dst[1] = (uint8_t) w; w >>= 8;
  137|       |    dst[2] = (uint8_t) w; w >>= 8;
  138|       |    dst[3] = (uint8_t) w;
  139|       |#endif
  140|  5.05k|}
chacha20_dolbeau-avx2.c:load32_le:
  114|  6.95k|{
  115|  6.95k|#ifdef NATIVE_LITTLE_ENDIAN
  116|  6.95k|    uint32_t w;
  117|  6.95k|    memcpy(&w, src, sizeof w);
  118|  6.95k|    return w;
  119|       |#else
  120|       |    uint32_t w = (uint32_t) src[0];
  121|       |    w |= (uint32_t) src[1] <<  8;
  122|       |    w |= (uint32_t) src[2] << 16;
  123|       |    w |= (uint32_t) src[3] << 24;
  124|       |    return w;
  125|       |#endif
  126|  6.95k|}

randombytes_set_implementation:
   48|    316|{
   49|    316|    implementation = impl;
   50|       |
   51|    316|    return 0;
   52|    316|}
randombytes_implementation_name:
   56|    316|{
   57|    316|#ifndef __EMSCRIPTEN__
   58|    316|    randombytes_init_if_needed();
   59|    316|    return implementation->implementation_name();
   60|       |#else
   61|       |    return "js";
   62|       |#endif
   63|    316|}
randombytes_stir:
   80|      2|{
   81|      2|#ifndef __EMSCRIPTEN__
   82|      2|    randombytes_init_if_needed();
   83|      2|    if (implementation->stir != NULL) {
  ------------------
  |  Branch (83:9): [True: 2, False: 0]
  ------------------
   84|      2|        implementation->stir();
   85|      2|    }
   86|       |#else
   87|       |    EM_ASM({
   88|       |        if (Module.getRandomValue === undefined) {
   89|       |            try {
   90|       |                var window_ = 'object' === typeof window ? window : self;
   91|       |                var crypto_ = typeof window_.crypto !== 'undefined' ? window_.crypto : window_.msCrypto;
   92|       |                crypto_ = (crypto_ === undefined) ? crypto : crypto_;
   93|       |                var randomValuesStandard = function() {
   94|       |                    var buf = new Uint32Array(1);
   95|       |                    crypto_.getRandomValues(buf);
   96|       |                    return buf[0] >>> 0;
   97|       |                };
   98|       |                randomValuesStandard();
   99|       |                Module.getRandomValue = randomValuesStandard;
  100|       |            } catch (e) {
  101|       |                try {
  102|       |                    var crypto = require('crypto');
  103|       |                    var randomValueNodeJS = function() {
  104|       |                        var buf = crypto['randomBytes'](4);
  105|       |                        return (buf[0] << 24 | buf[1] << 16 | buf[2] << 8 | buf[3]) >>> 0;
  106|       |                    };
  107|       |                    randomValueNodeJS();
  108|       |                    Module.getRandomValue = randomValueNodeJS;
  109|       |                } catch (e) {
  110|       |                    throw 'No secure random number generator found';
  111|       |                }
  112|       |            }
  113|       |        }
  114|       |    });
  115|       |#endif
  116|      2|}
randombytes_buf:
  145|    633|{
  146|    633|#ifndef __EMSCRIPTEN__
  147|    633|    randombytes_init_if_needed();
  148|    633|    if (size > (size_t) 0U) {
  ------------------
  |  Branch (148:9): [True: 633, False: 0]
  ------------------
  149|    633|        implementation->buf(buf, size);
  150|    633|    }
  151|       |#else
  152|       |    unsigned char *p = (unsigned char *) buf;
  153|       |    size_t         i;
  154|       |
  155|       |    for (i = (size_t) 0U; i < size; i++) {
  156|       |        p[i] = (unsigned char) randombytes_random();
  157|       |    }
  158|       |#endif
  159|    633|}
randombytes_buf_deterministic:
  164|    632|{
  165|    632|    static const unsigned char nonce[crypto_stream_chacha20_ietf_NONCEBYTES] = {
  166|    632|        'L', 'i', 'b', 's', 'o', 'd', 'i', 'u', 'm', 'D', 'R', 'G'
  167|    632|    };
  168|       |
  169|    632|    COMPILER_ASSERT(randombytes_SEEDBYTES == crypto_stream_chacha20_ietf_KEYBYTES);
  ------------------
  |  |   23|    632|#define COMPILER_ASSERT(X) (void) sizeof(char[(X) ? 1 : -1])
  ------------------
  170|    632|#if SIZE_MAX > 0x4000000000ULL
  171|    632|    COMPILER_ASSERT(randombytes_BYTES_MAX <= 0x4000000000ULL);
  ------------------
  |  |   23|    632|#define COMPILER_ASSERT(X) (void) sizeof(char[(X) ? 1 : -1])
  ------------------
  172|    632|    if (size > 0x4000000000ULL) {
  ------------------
  |  Branch (172:9): [True: 0, False: 632]
  ------------------
  173|      0|        sodium_misuse();
  174|      0|    }
  175|    632|#endif
  176|    632|    crypto_stream_chacha20_ietf((unsigned char *) buf, (unsigned long long) size,
  177|    632|                                nonce, seed);
  178|    632|}
randombytes.c:randombytes_init_if_needed:
   39|    951|{
   40|    951|    if (implementation == NULL) {
  ------------------
  |  Branch (40:9): [True: 1, False: 950]
  ------------------
   41|      1|        implementation = RANDOMBYTES_DEFAULT_IMPLEMENTATION;
  ------------------
  |  |   33|      1|#  define RANDOMBYTES_DEFAULT_IMPLEMENTATION &randombytes_sysrandom_implementation;
  ------------------
   42|      1|        randombytes_stir();
   43|      1|    }
   44|    951|}

randombytes_sysrandom.c:randombytes_sysrandom_stir:
  296|      2|{
  297|      2|    if (stream.initialized == 0) {
  ------------------
  |  Branch (297:9): [True: 1, False: 1]
  ------------------
  298|      1|        randombytes_sysrandom_init();
  299|      1|        stream.initialized = 1;
  300|      1|    }
  301|      2|}
randombytes_sysrandom.c:randombytes_sysrandom_init:
  263|      1|{
  264|      1|    const int     errno_save = errno;
  265|       |
  266|      1|#  ifdef HAVE_LINUX_COMPATIBLE_GETRANDOM
  267|      1|    {
  268|      1|        unsigned char fodder[16];
  269|       |
  270|      1|        if (randombytes_linux_getrandom(fodder, sizeof fodder) == 0) {
  ------------------
  |  Branch (270:13): [True: 1, False: 0]
  ------------------
  271|      1|            stream.getrandom_available = 1;
  272|      1|            errno = errno_save;
  273|      1|            return;
  274|      1|        }
  275|      0|        stream.getrandom_available = 0;
  276|      0|    }
  277|      0|#  endif
  278|       |
  279|      0|    if ((stream.random_data_source_fd =
  ------------------
  |  Branch (279:9): [True: 0, False: 0]
  ------------------
  280|      0|         randombytes_sysrandom_random_dev_open()) == -1) {
  281|      0|        sodium_misuse(); /* LCOV_EXCL_LINE */
  282|      0|    }
  283|      0|    errno = errno_save;
  284|      0|}
randombytes_sysrandom.c:randombytes_linux_getrandom:
  241|      2|{
  242|      2|    unsigned char *buf = (unsigned char *) buf_;
  243|      2|    size_t         chunk_size = 256U;
  244|       |
  245|      2|    do {
  246|      2|        if (size < chunk_size) {
  ------------------
  |  Branch (246:13): [True: 2, False: 0]
  ------------------
  247|      2|            chunk_size = size;
  248|      2|            assert(chunk_size > (size_t) 0U);
  249|      2|        }
  250|      2|        if (_randombytes_linux_getrandom(buf, chunk_size) != 0) {
  ------------------
  |  Branch (250:13): [True: 0, False: 2]
  ------------------
  251|      0|            return -1;
  252|      0|        }
  253|      2|        size -= chunk_size;
  254|      2|        buf += chunk_size;
  255|      2|    } while (size > (size_t) 0U);
  ------------------
  |  Branch (255:14): [True: 0, False: 2]
  ------------------
  256|       |
  257|      2|    return 0;
  258|      2|}
randombytes_sysrandom.c:_randombytes_linux_getrandom:
  228|      2|{
  229|      2|    int readnb;
  230|       |
  231|      2|    assert(size <= 256U);
  232|      2|    do {
  233|      2|        readnb = getrandom(buf, size, 0);
  234|      2|    } while (readnb < 0 && (errno == EINTR || errno == EAGAIN));
  ------------------
  |  Branch (234:14): [True: 0, False: 2]
  |  Branch (234:29): [True: 0, False: 0]
  |  Branch (234:47): [True: 0, False: 0]
  ------------------
  235|       |
  236|      2|    return (readnb == (int) size) - 1;
  237|      2|}
randombytes_sysrandom.c:randombytes_sysrandom_buf:
  339|      1|{
  340|      1|    randombytes_sysrandom_stir_if_needed();
  341|      1|# if defined(ULLONG_MAX) && defined(SIZE_MAX)
  342|       |#  if SIZE_MAX > ULLONG_MAX
  343|       |    /* coverity[result_independent_of_operands] */
  344|       |    assert(size <= ULLONG_MAX);
  345|       |#  endif
  346|      1|# endif
  347|      1|# ifndef _WIN32
  348|      1|#  ifdef HAVE_LINUX_COMPATIBLE_GETRANDOM
  349|      1|    if (stream.getrandom_available != 0) {
  ------------------
  |  Branch (349:9): [True: 1, False: 0]
  ------------------
  350|      1|        if (randombytes_linux_getrandom(buf, size) != 0) {
  ------------------
  |  Branch (350:13): [True: 0, False: 1]
  ------------------
  351|      0|            sodium_misuse(); /* LCOV_EXCL_LINE */
  352|      0|        }
  353|      1|        return;
  354|      1|    }
  355|      0|#  endif
  356|      0|    if (stream.random_data_source_fd == -1 ||
  ------------------
  |  Branch (356:9): [True: 0, False: 0]
  ------------------
  357|      0|        safe_read(stream.random_data_source_fd, buf, size) != (ssize_t) size) {
  ------------------
  |  Branch (357:9): [True: 0, False: 0]
  ------------------
  358|      0|        sodium_misuse(); /* LCOV_EXCL_LINE */
  359|      0|    }
  360|       |# else /* _WIN32 */
  361|       |    COMPILER_ASSERT(randombytes_BYTES_MAX <= 0xffffffffUL);
  362|       |    if (size > (size_t) 0xffffffffUL) {
  363|       |        sodium_misuse(); /* LCOV_EXCL_LINE */
  364|       |    }
  365|       |    if (! RtlGenRandom((PVOID) buf, (ULONG) size)) {
  366|       |        sodium_misuse(); /* LCOV_EXCL_LINE */
  367|       |    }
  368|       |# endif /* _WIN32 */
  369|      0|}
randombytes_sysrandom.c:randombytes_sysrandom_stir_if_needed:
  305|      1|{
  306|      1|    if (stream.initialized == 0) {
  ------------------
  |  Branch (306:9): [True: 0, False: 1]
  ------------------
  307|      0|        randombytes_sysrandom_stir();
  308|      0|    }
  309|      1|}

sodium_init:
   29|    632|{
   30|    632|    if (sodium_crit_enter() != 0) {
  ------------------
  |  Branch (30:9): [True: 0, False: 632]
  ------------------
   31|      0|        return -1; /* LCOV_EXCL_LINE */
   32|      0|    }
   33|    632|    if (initialized != 0) {
  ------------------
  |  Branch (33:9): [True: 631, False: 1]
  ------------------
   34|    631|        if (sodium_crit_leave() != 0) {
  ------------------
  |  Branch (34:13): [True: 0, False: 631]
  ------------------
   35|      0|            return -1; /* LCOV_EXCL_LINE */
   36|      0|        }
   37|    631|        return 1;
   38|    631|    }
   39|      1|    _sodium_runtime_get_cpu_features();
   40|      1|    randombytes_stir();
   41|      1|    _sodium_alloc_init();
   42|      1|    _crypto_pwhash_argon2_pick_best_implementation();
   43|      1|    _crypto_generichash_blake2b_pick_best_implementation();
   44|      1|    _crypto_onetimeauth_poly1305_pick_best_implementation();
   45|      1|    _crypto_scalarmult_curve25519_pick_best_implementation();
   46|      1|    _crypto_stream_chacha20_pick_best_implementation();
   47|      1|    _crypto_stream_salsa20_pick_best_implementation();
   48|      1|    _crypto_aead_aegis128l_pick_best_implementation();
   49|      1|    _crypto_aead_aegis256_pick_best_implementation();
   50|      1|    initialized = 1;
   51|      1|    if (sodium_crit_leave() != 0) {
  ------------------
  |  Branch (51:9): [True: 0, False: 1]
  ------------------
   52|      0|        return -1; /* LCOV_EXCL_LINE */
   53|      0|    }
   54|      1|    return 0;
   55|      1|}
sodium_crit_enter:
  117|    632|{
  118|    632|    int ret;
  119|       |
  120|    632|    if ((ret = pthread_mutex_lock(&_sodium_lock)) == 0) {
  ------------------
  |  Branch (120:9): [True: 632, False: 0]
  ------------------
  121|    632|        assert(locked == 0);
  122|    632|        locked = 1;
  123|    632|    }
  124|    632|    return ret;
  125|    632|}
sodium_crit_leave:
  129|    632|{
  130|    632|    if (locked == 0) {
  ------------------
  |  Branch (130:9): [True: 0, False: 632]
  ------------------
  131|       |# ifdef EPERM
  132|       |        errno = EPERM;
  133|       |# endif
  134|      0|        return -1;
  135|      0|    }
  136|    632|    locked = 0;
  137|       |
  138|    632|    return pthread_mutex_unlock(&_sodium_lock);
  139|    632|}

_sodium_runtime_get_cpu_features:
  311|      1|{
  312|      1|    int ret = -1;
  313|       |
  314|      1|    ret &= _sodium_runtime_arm_cpu_features(&_cpu_features);
  315|      1|    ret &= _sodium_runtime_intel_cpu_features(&_cpu_features);
  316|      1|    _cpu_features.initialized = 1;
  317|       |
  318|      1|    return ret;
  319|      1|}
sodium_runtime_has_sse2:
  335|      1|{
  336|      1|    return _cpu_features.has_sse2;
  337|      1|}
sodium_runtime_has_avx:
  359|      3|{
  360|      3|    return _cpu_features.has_avx;
  361|      3|}
sodium_runtime_has_avx2:
  365|      4|{
  366|      4|    return _cpu_features.has_avx2;
  367|      4|}
sodium_runtime_has_avx512f:
  371|      1|{
  372|      1|    return _cpu_features.has_avx512f;
  373|      1|}
sodium_runtime_has_aesni:
  383|      2|{
  384|      2|    return _cpu_features.has_aesni;
  385|      2|}
runtime.c:_sodium_runtime_arm_cpu_features:
   59|      1|{
   60|      1|    cpu_features->has_neon = 0;
   61|      1|    cpu_features->has_armcrypto = 0;
   62|       |
   63|      1|#ifndef __ARM_ARCH
   64|      1|    return -1; /* LCOV_EXCL_LINE */
   65|      0|#endif
   66|       |
   67|       |#if defined(__ARM_NEON) || defined(__aarch64__) || defined(_M_ARM64)
   68|       |    cpu_features->has_neon = 1;
   69|       |#elif defined(HAVE_ANDROID_GETCPUFEATURES)
   70|       |    cpu_features->has_neon =
   71|       |        (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_ASIMD) != 0x0;
   72|       |#elif (defined(__aarch64__) || defined(_M_ARM64)) && defined(AT_HWCAP)
   73|       |# ifdef HAVE_GETAUXVAL
   74|       |    cpu_features->has_neon = (getauxval(AT_HWCAP) & (1L << 1)) != 0;
   75|       |# elif defined(HAVE_ELF_AUX_INFO)
   76|       |    {
   77|       |        unsigned long buf;
   78|       |        if (elf_aux_info(AT_HWCAP, (void *) &buf, (int) sizeof buf) == 0) {
   79|       |            cpu_features->has_neon = (buf & (1L << 1)) != 0;
   80|       |        }
   81|       |    }
   82|       |# endif
   83|       |#elif defined(__arm__) && defined(AT_HWCAP)
   84|       |# ifdef HAVE_GETAUXVAL
   85|       |    cpu_features->has_neon = (getauxval(AT_HWCAP) & (1L << 12)) != 0;
   86|       |# elif defined(HAVE_ELF_AUX_INFO)
   87|       |    {
   88|       |        unsigned long buf;
   89|       |        if (elf_aux_info(AT_HWCAP, (void *) &buf, (int) sizeof buf) == 0) {
   90|       |            cpu_features->has_neon = (buf & (1L << 12)) != 0;
   91|       |        }
   92|       |    }
   93|       |# endif
   94|       |#endif
   95|       |
   96|      0|    if (cpu_features->has_neon == 0) {
  ------------------
  |  Branch (96:9): [True: 0, False: 0]
  ------------------
   97|      0|        return 0;
   98|      0|    }
   99|       |
  100|       |#if defined(__ARM_FEATURE_CRYPTO) && defined(__ARM_FEATURE_AES)
  101|       |    cpu_features->has_armcrypto = 1;
  102|       |#elif defined(_M_ARM64)
  103|       |    cpu_features->has_armcrypto = 1; /* assuming all CPUs supported by ARM Windows have the crypto extensions */
  104|       |#elif defined(__APPLE__) && defined(CPU_TYPE_ARM64) && defined(CPU_SUBTYPE_ARM64E)
  105|       |    {
  106|       |        cpu_type_t    cpu_type;
  107|       |        cpu_subtype_t cpu_subtype;
  108|       |        size_t        cpu_type_len = sizeof cpu_type;
  109|       |        size_t        cpu_subtype_len = sizeof cpu_subtype;
  110|       |
  111|       |        if (sysctlbyname("hw.cputype", &cpu_type, &cpu_type_len,
  112|       |                         NULL, 0) == 0 && cpu_type == CPU_TYPE_ARM64 &&
  113|       |            sysctlbyname("hw.cpusubtype", &cpu_subtype, &cpu_subtype_len,
  114|       |                         NULL, 0) == 0 &&
  115|       |            (cpu_subtype == CPU_SUBTYPE_ARM64E ||
  116|       |                cpu_subtype == CPU_SUBTYPE_ARM64_V8)) {
  117|       |            cpu_features->has_armcrypto = 1;
  118|       |        }
  119|       |    }
  120|       |#elif defined(HAVE_ANDROID_GETCPUFEATURES)
  121|       |    cpu_features->has_armcrypto =
  122|       |        (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_AES) != 0x0;
  123|       |#elif (defined(__aarch64__) || defined(_M_ARM64)) && defined(AT_HWCAP)
  124|       |# ifdef HAVE_GETAUXVAL
  125|       |    cpu_features->has_armcrypto = (getauxval(AT_HWCAP) & (1L << 3)) != 0;
  126|       |# elif defined(HAVE_ELF_AUX_INFO)
  127|       |    {
  128|       |        unsigned long buf;
  129|       |        if (elf_aux_info(AT_HWCAP, (void *) &buf, (int) sizeof buf) == 0) {
  130|       |            cpu_features->has_armcrypto = (buf & (1L << 3)) != 0;
  131|       |        }
  132|       |    }
  133|       |# endif
  134|       |#elif defined(__arm__) && defined(AT_HWCAP2)
  135|       |# ifdef HAVE_GETAUXVAL
  136|       |    cpu_features->has_armcrypto = (getauxval(AT_HWCAP2) & (1L << 0)) != 0;
  137|       |# elif defined(HAVE_ELF_AUX_INFO)
  138|       |    {
  139|       |        unsigned long buf;
  140|       |        if (elf_aux_info(AT_HWCAP2, (void *) &buf, (int) sizeof buf) == 0) {
  141|       |            cpu_features->has_armcrypto = (buf & (1L << 0)) != 0;
  142|       |        }
  143|       |    }
  144|       |# endif
  145|       |#endif
  146|       |
  147|      0|    return 0;
  148|      0|}
runtime.c:_sodium_runtime_intel_cpu_features:
  194|      1|{
  195|      1|    unsigned int cpu_info[4];
  196|      1|    uint32_t     xcr0 = 0U;
  197|       |
  198|      1|    _cpuid(cpu_info, 0x0);
  199|      1|    if (cpu_info[0] == 0U) {
  ------------------
  |  Branch (199:9): [True: 0, False: 1]
  ------------------
  200|      0|        return -1; /* LCOV_EXCL_LINE */
  201|      0|    }
  202|      1|    _cpuid(cpu_info, 0x00000001);
  203|      1|#ifdef HAVE_EMMINTRIN_H
  204|      1|    cpu_features->has_sse2 = ((cpu_info[3] & CPUID_EDX_SSE2) != 0x0);
  ------------------
  |  |   49|      1|#define CPUID_EDX_SSE2    0x04000000
  ------------------
  205|       |#else
  206|       |    cpu_features->has_sse2   = 0;
  207|       |#endif
  208|       |
  209|      1|#ifdef HAVE_PMMINTRIN_H
  210|      1|    cpu_features->has_sse3 = ((cpu_info[2] & CPUID_ECX_SSE3) != 0x0);
  ------------------
  |  |   39|      1|#define CPUID_ECX_SSE3    0x00000001
  ------------------
  211|       |#else
  212|       |    cpu_features->has_sse3   = 0;
  213|       |#endif
  214|       |
  215|      1|#ifdef HAVE_TMMINTRIN_H
  216|      1|    cpu_features->has_ssse3 = ((cpu_info[2] & CPUID_ECX_SSSE3) != 0x0);
  ------------------
  |  |   41|      1|#define CPUID_ECX_SSSE3   0x00000200
  ------------------
  217|       |#else
  218|       |    cpu_features->has_ssse3  = 0;
  219|       |#endif
  220|       |
  221|      1|#ifdef HAVE_SMMINTRIN_H
  222|      1|    cpu_features->has_sse41 = ((cpu_info[2] & CPUID_ECX_SSE41) != 0x0);
  ------------------
  |  |   42|      1|#define CPUID_ECX_SSE41   0x00080000
  ------------------
  223|       |#else
  224|       |    cpu_features->has_sse41  = 0;
  225|       |#endif
  226|       |
  227|      1|    cpu_features->has_avx = 0;
  228|       |
  229|      1|    (void) xcr0;
  230|      1|#ifdef HAVE_AVXINTRIN_H
  231|      1|    if ((cpu_info[2] & (CPUID_ECX_AVX | CPUID_ECX_XSAVE | CPUID_ECX_OSXSAVE)) ==
  ------------------
  |  |   46|      1|#define CPUID_ECX_AVX     0x10000000
  ------------------
                  if ((cpu_info[2] & (CPUID_ECX_AVX | CPUID_ECX_XSAVE | CPUID_ECX_OSXSAVE)) ==
  ------------------
  |  |   44|      1|#define CPUID_ECX_XSAVE   0x04000000
  ------------------
                  if ((cpu_info[2] & (CPUID_ECX_AVX | CPUID_ECX_XSAVE | CPUID_ECX_OSXSAVE)) ==
  ------------------
  |  |   45|      1|#define CPUID_ECX_OSXSAVE 0x08000000
  ------------------
  |  Branch (231:9): [True: 1, False: 0]
  ------------------
  232|      1|        (CPUID_ECX_AVX | CPUID_ECX_XSAVE | CPUID_ECX_OSXSAVE)) {
  ------------------
  |  |   46|      1|#define CPUID_ECX_AVX     0x10000000
  ------------------
                      (CPUID_ECX_AVX | CPUID_ECX_XSAVE | CPUID_ECX_OSXSAVE)) {
  ------------------
  |  |   44|      1|#define CPUID_ECX_XSAVE   0x04000000
  ------------------
                      (CPUID_ECX_AVX | CPUID_ECX_XSAVE | CPUID_ECX_OSXSAVE)) {
  ------------------
  |  |   45|      1|#define CPUID_ECX_OSXSAVE 0x08000000
  ------------------
  233|      1|        xcr0 = 0U;
  234|       |# if defined(HAVE__XGETBV) || \
  235|       |        (defined(_MSC_VER) && defined(_XCR_XFEATURE_ENABLED_MASK) && _MSC_FULL_VER >= 160040219)
  236|       |        xcr0 = (uint32_t) _xgetbv(0);
  237|       |# elif defined(_MSC_VER) && defined(_M_IX86)
  238|       |        /*
  239|       |         * Visual Studio documentation states that eax/ecx/edx don't need to
  240|       |         * be preserved in inline assembly code. But that doesn't seem to
  241|       |         * always hold true on Visual Studio 2010.
  242|       |         */
  243|       |        __asm {
  244|       |            push eax
  245|       |            push ecx
  246|       |            push edx
  247|       |            xor ecx, ecx
  248|       |            _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0
  249|       |            mov xcr0, eax
  250|       |            pop edx
  251|       |            pop ecx
  252|       |            pop eax
  253|       |        }
  254|       |# elif defined(HAVE_AVX_ASM)
  255|       |        __asm__ __volatile__(".byte 0x0f, 0x01, 0xd0" /* XGETBV */
  256|      1|                             : "=a"(xcr0)
  257|      1|                             : "c"((uint32_t) 0U)
  258|      1|                             : "%edx");
  259|      1|# endif
  260|      1|        if ((xcr0 & (XCR0_SSE | XCR0_AVX)) == (XCR0_SSE | XCR0_AVX)) {
  ------------------
  |  |   51|      1|#define XCR0_SSE       0x00000002
  ------------------
                      if ((xcr0 & (XCR0_SSE | XCR0_AVX)) == (XCR0_SSE | XCR0_AVX)) {
  ------------------
  |  |   52|      1|#define XCR0_AVX       0x00000004
  ------------------
                      if ((xcr0 & (XCR0_SSE | XCR0_AVX)) == (XCR0_SSE | XCR0_AVX)) {
  ------------------
  |  |   51|      1|#define XCR0_SSE       0x00000002
  ------------------
                      if ((xcr0 & (XCR0_SSE | XCR0_AVX)) == (XCR0_SSE | XCR0_AVX)) {
  ------------------
  |  |   52|      1|#define XCR0_AVX       0x00000004
  ------------------
  |  Branch (260:13): [True: 1, False: 0]
  ------------------
  261|      1|            cpu_features->has_avx = 1;
  262|      1|        }
  263|      1|    }
  264|      1|#endif
  265|       |
  266|      1|    cpu_features->has_avx2 = 0;
  267|      1|#ifdef HAVE_AVX2INTRIN_H
  268|      1|    if (cpu_features->has_avx) {
  ------------------
  |  Branch (268:9): [True: 1, False: 0]
  ------------------
  269|      1|        unsigned int cpu_info7[4];
  270|       |
  271|      1|        _cpuid(cpu_info7, 0x00000007);
  272|      1|        cpu_features->has_avx2 = ((cpu_info7[1] & CPUID_EBX_AVX2) != 0x0);
  ------------------
  |  |   36|      1|#define CPUID_EBX_AVX2    0x00000020
  ------------------
  273|      1|    }
  274|      1|#endif
  275|       |
  276|      1|    cpu_features->has_avx512f = 0;
  277|      1|#ifdef HAVE_AVX512FINTRIN_H
  278|      1|    if (cpu_features->has_avx2) {
  ------------------
  |  Branch (278:9): [True: 1, False: 0]
  ------------------
  279|      1|        unsigned int cpu_info7[4];
  280|       |
  281|      1|        _cpuid(cpu_info7, 0x00000007);
  282|       |        /* LCOV_EXCL_START */
  283|      1|        if ((cpu_info7[1] & CPUID_EBX_AVX512F) == CPUID_EBX_AVX512F &&
  ------------------
  |  |   37|      1|#define CPUID_EBX_AVX512F 0x00010000
  ------------------
                      if ((cpu_info7[1] & CPUID_EBX_AVX512F) == CPUID_EBX_AVX512F &&
  ------------------
  |  |   37|      2|#define CPUID_EBX_AVX512F 0x00010000
  ------------------
  |  Branch (283:13): [True: 0, False: 1]
  ------------------
  284|      1|            (xcr0 & (XCR0_OPMASK | XCR0_ZMM_HI256 | XCR0_HI16_ZMM))
  ------------------
  |  |   53|      0|#define XCR0_OPMASK    0x00000020
  ------------------
                          (xcr0 & (XCR0_OPMASK | XCR0_ZMM_HI256 | XCR0_HI16_ZMM))
  ------------------
  |  |   54|      0|#define XCR0_ZMM_HI256 0x00000040
  ------------------
                          (xcr0 & (XCR0_OPMASK | XCR0_ZMM_HI256 | XCR0_HI16_ZMM))
  ------------------
  |  |   55|      0|#define XCR0_HI16_ZMM  0x00000080
  ------------------
  |  Branch (284:13): [True: 0, False: 0]
  ------------------
  285|      0|            == (XCR0_OPMASK | XCR0_ZMM_HI256 | XCR0_HI16_ZMM)) {
  ------------------
  |  |   53|      0|#define XCR0_OPMASK    0x00000020
  ------------------
                          == (XCR0_OPMASK | XCR0_ZMM_HI256 | XCR0_HI16_ZMM)) {
  ------------------
  |  |   54|      0|#define XCR0_ZMM_HI256 0x00000040
  ------------------
                          == (XCR0_OPMASK | XCR0_ZMM_HI256 | XCR0_HI16_ZMM)) {
  ------------------
  |  |   55|      0|#define XCR0_HI16_ZMM  0x00000080
  ------------------
  286|      0|            cpu_features->has_avx512f = 1;
  287|      0|        }
  288|       |        /* LCOV_EXCL_STOP */
  289|      1|    }
  290|      1|#endif
  291|       |
  292|      1|#ifdef HAVE_WMMINTRIN_H
  293|      1|    cpu_features->has_pclmul = ((cpu_info[2] & CPUID_ECX_PCLMUL) != 0x0);
  ------------------
  |  |   40|      1|#define CPUID_ECX_PCLMUL  0x00000002
  ------------------
  294|      1|    cpu_features->has_aesni  = ((cpu_info[2] & CPUID_ECX_AESNI) != 0x0);
  ------------------
  |  |   43|      1|#define CPUID_ECX_AESNI   0x02000000
  ------------------
  295|       |#else
  296|       |    cpu_features->has_pclmul = 0;
  297|       |    cpu_features->has_aesni  = 0;
  298|       |#endif
  299|       |
  300|      1|#ifdef HAVE_RDRAND
  301|      1|    cpu_features->has_rdrand = ((cpu_info[2] & CPUID_ECX_RDRAND) != 0x0);
  ------------------
  |  |   47|      1|#define CPUID_ECX_RDRAND  0x40000000
  ------------------
  302|       |#else
  303|       |    cpu_features->has_rdrand = 0;
  304|       |#endif
  305|       |
  306|      1|    return 0;
  307|      1|}
runtime.c:_cpuid:
  152|      4|{
  153|       |#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
  154|       |    __cpuid((int *) cpu_info, cpu_info_type);
  155|       |#elif defined(HAVE_CPUID)
  156|       |    cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0;
  157|       |# ifdef __i386__
  158|       |    __asm__ __volatile__(
  159|       |        "pushfl; pushfl; "
  160|       |        "popl %0; "
  161|       |        "movl %0, %1; xorl %2, %0; "
  162|       |        "pushl %0; "
  163|       |        "popfl; pushfl; popl %0; popfl"
  164|       |        : "=&r"(cpu_info[0]), "=&r"(cpu_info[1])
  165|       |        : "i"(0x200000));
  166|       |    if (((cpu_info[0] ^ cpu_info[1]) & 0x200000) == 0x0) {
  167|       |        return; /* LCOV_EXCL_LINE */
  168|       |    }
  169|       |# endif
  170|       |# ifdef __i386__
  171|       |    __asm__ __volatile__("xchgl %%ebx, %k1; cpuid; xchgl %%ebx, %k1"
  172|       |                         : "=a"(cpu_info[0]), "=&r"(cpu_info[1]),
  173|       |                           "=c"(cpu_info[2]), "=d"(cpu_info[3])
  174|       |                         : "0"(cpu_info_type), "2"(0U));
  175|       |# elif defined(__x86_64__)
  176|       |    __asm__ __volatile__("xchgq %%rbx, %q1; cpuid; xchgq %%rbx, %q1"
  177|      4|                         : "=a"(cpu_info[0]), "=&r"(cpu_info[1]),
  178|      4|                           "=c"(cpu_info[2]), "=d"(cpu_info[3])
  179|      4|                         : "0"(cpu_info_type), "2"(0U));
  180|       |# else
  181|       |    __asm__ __volatile__("cpuid"
  182|       |                         : "=a"(cpu_info[0]), "=b"(cpu_info[1]),
  183|       |                           "=c"(cpu_info[2]), "=d"(cpu_info[3])
  184|       |                         : "0"(cpu_info_type), "2"(0U));
  185|       |# endif
  186|       |#else
  187|       |    (void) cpu_info_type;
  188|       |    cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0;
  189|       |#endif
  190|      4|}

sodium_memzero:
  124|  5.74k|{
  125|       |#if defined(_WIN32) && !defined(__CRT_INLINE)
  126|       |    SecureZeroMemory(pnt, len);
  127|       |#elif defined(HAVE_MEMSET_S)
  128|       |    if (len > 0U && memset_s(pnt, (rsize_t) len, 0, (rsize_t) len) != 0) {
  129|       |        sodium_misuse(); /* LCOV_EXCL_LINE */
  130|       |    }
  131|       |#elif defined(HAVE_EXPLICIT_BZERO)
  132|       |    explicit_bzero(pnt, len);
  133|       |#elif defined(HAVE_MEMSET_EXPLICIT)
  134|       |    memset_explicit(pnt, 0, len);
  135|       |#elif defined(HAVE_EXPLICIT_MEMSET)
  136|       |    explicit_memset(pnt, 0, len);
  137|       |#elif HAVE_WEAK_SYMBOLS
  138|       |    if (len > 0U) {
  139|       |        memset(pnt, 0, len);
  140|       |        _sodium_dummy_symbol_to_prevent_memzero_lto(pnt, len);
  141|       |    }
  142|       |# ifdef HAVE_INLINE_ASM
  143|       |    __asm__ __volatile__ ("" : : "r"(pnt) : "memory");
  144|       |# endif
  145|       |#else
  146|       |    volatile unsigned char *volatile pnt_ =
  147|       |        (volatile unsigned char *volatile) pnt;
  148|       |    size_t i = (size_t) 0U;
  149|       |
  150|       |    while (i < len) {
  151|       |        pnt_[i++] = 0U;
  152|       |    }
  153|       |#endif
  154|  5.74k|}
_sodium_alloc_init:
  407|      1|{
  408|      1|#ifdef HAVE_ALIGNED_MALLOC
  409|      1|# if defined(_SC_PAGESIZE) && defined(HAVE_SYSCONF)
  410|      1|    long page_size_ = sysconf(_SC_PAGESIZE);
  411|      1|    if (page_size_ > 0L) {
  ------------------
  |  Branch (411:9): [True: 1, False: 0]
  ------------------
  412|      1|        page_size = (size_t) page_size_;
  413|      1|    }
  414|       |# elif defined(WINAPI_DESKTOP)
  415|       |    SYSTEM_INFO si;
  416|       |    GetSystemInfo(&si);
  417|       |    page_size = (size_t) si.dwPageSize;
  418|       |# elif !defined(PAGE_SIZE)
  419|       |#  warning Unknown page size
  420|       |# endif
  421|      1|    if (page_size < CANARY_SIZE || page_size < sizeof(size_t)) {
  ------------------
  |  |   65|      2|#define CANARY_SIZE 16U
  ------------------
  |  Branch (421:9): [True: 0, False: 1]
  |  Branch (421:36): [True: 0, False: 1]
  ------------------
  422|      0|        sodium_misuse(); /* LCOV_EXCL_LINE */
  423|      0|    }
  424|      1|#endif
  425|      1|    randombytes_buf(canary, sizeof canary);
  426|       |
  427|      1|    return 0;
  428|      1|}

LLVMFuzzerTestOneInput:
   21|    316|extern "C" int LLVMFuzzerTestOneInput(const unsigned char *data, size_t size) {
   22|    316|  int initialized = sodium_init();
   23|    316|  assert(initialized >= 0);
   24|       |
   25|    316|  setup_fake_random(data, size);
   26|       |
   27|    316|  unsigned char key[crypto_secretbox_KEYBYTES];
   28|    316|  unsigned char nonce[crypto_secretbox_NONCEBYTES];
   29|       |
   30|       |  // these use a deterministic generator
   31|    316|  crypto_secretbox_keygen(key);
   32|    316|  randombytes_buf(nonce, sizeof nonce);
   33|       |
   34|    316|  size_t ciphertext_len = crypto_secretbox_MACBYTES + size;
  ------------------
  |  |   24|    316|#define crypto_secretbox_MACBYTES crypto_secretbox_xsalsa20poly1305_MACBYTES
  |  |  ------------------
  |  |  |  |   23|    316|#define crypto_secretbox_xsalsa20poly1305_MACBYTES 16U
  |  |  ------------------
  ------------------
   35|    316|  unsigned char *ciphertext = (unsigned char *) malloc(ciphertext_len);
   36|       |
   37|    316|  crypto_secretbox_easy(ciphertext, data, size, nonce, key);
   38|       |
   39|    316|  unsigned char *decrypted = (unsigned char *) malloc(size);
   40|    316|  int err = crypto_secretbox_open_easy(decrypted, ciphertext, ciphertext_len, nonce, key);
   41|    316|  assert(err == 0);
   42|       |
   43|    316|  free((void *) ciphertext);
   44|    316|  free((void *) decrypted);
   45|       |
   46|    316|  return 0;
   47|    316|}

