/src/nss/lib/freebl/sha256-x86.c
| Line | Count | Source (jump to first uncovered line) | 
| 1 |  | /* This Source Code Form is subject to the terms of the Mozilla Public | 
| 2 |  |  * License, v. 2.0. If a copy of the MPL was not distributed with this | 
| 3 |  |  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ | 
| 4 |  |  | 
| 5 |  | #ifdef USE_HW_SHA2 | 
| 6 |  |  | 
| 7 |  | #include <immintrin.h> | 
| 8 |  |  | 
| 9 |  | #ifdef FREEBL_NO_DEPEND | 
| 10 |  | #include "stubs.h" | 
| 11 |  | #endif | 
| 12 |  |  | 
| 13 |  | #include "blapii.h" | 
| 14 |  | #include "prcpucfg.h" | 
| 15 |  | #include "prtypes.h" /* for PRUintXX */ | 
| 16 |  | #include "prlong.h" | 
| 17 |  | #include "blapi.h" | 
| 18 |  | #include "sha256.h" | 
| 19 |  |  | 
| 20 |  | /* SHA-256 round constants, K256[0..63]. The functions in this file read the table 16 bytes at a time with _mm_load_si128, so the pre_align/post_align qualifiers presumably provide 16-byte alignment (their definitions are not in this file; confirm). */ | 
| 21 |  | pre_align static const PRUint32 K256[64] post_align = { | 
| 22 |  |     0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, | 
| 23 |  |     0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, | 
| 24 |  |     0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, | 
| 25 |  |     0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, | 
| 26 |  |     0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, | 
| 27 |  |     0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, | 
| 28 |  |     0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, | 
| 29 |  |     0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, | 
| 30 |  |     0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, | 
| 31 |  |     0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, | 
| 32 |  |     0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, | 
| 33 |  |     0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, | 
| 34 |  |     0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, | 
| 35 |  |     0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, | 
| 36 |  |     0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, | 
| 37 |  |     0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 | 
| 38 |  | }; | 
| 39 |  | /* ROUND(n, a, b, c, d): adds the constant vector k##n to a, feeds the sum to two _mm_sha256rnds2_epu32 steps that update w1 and then w0, and, for n < 12, overwrites a with the next message-schedule words built from a, b, c and d. Relies on w0, w1 and k<n> being declared in the enclosing scope. Expands to a bare { } block; the call sites in this file omit the trailing semicolon. */ | 
| 40 |  | #define ROUND(n, a, b, c, d)                                \ | 
| 41 | 0 |     {                                                       \ | 
| 42 | 0 |         __m128i t = _mm_add_epi32(a, k##n);                 \ | 
| 43 | 0 |         w1 = _mm_sha256rnds2_epu32(w1, w0, t);              \ | 
| 44 | 0 |         t = _mm_shuffle_epi32(t, 0x0e);                     \ | 
| 45 | 0 |         w0 = _mm_sha256rnds2_epu32(w0, w1, t);              \ | 
| 46 | 0 |         if (n < 12) {                                       \ | 
| 47 | 0 |             a = _mm_sha256msg1_epu32(a, b);                 \ | 
| 48 | 0 |             a = _mm_add_epi32(a, _mm_alignr_epi8(d, c, 4)); \ | 
| 49 | 0 |             a = _mm_sha256msg2_epu32(a, d);                 \ | 
| 50 | 0 |         }                                                   \ | 
| 51 | 0 |     } | 
| 52 |  | /* SHA256_Compress_Native: runs the sixteen ROUND steps over the four 16-byte vectors read from ctx->u.b, adds the result to the state loaded from ctx->h, and stores the new state back to ctx->h. The state is reordered into the H0145:H2367 layout before the rounds and restored to H0123:4567 afterwards. */ | 
| 53 |  | void | 
| 54 |  | SHA256_Compress_Native(SHA256Context *ctx) | 
| 55 | 0 | { | 
| 56 | 0 |     __m128i h0, h1, th; | 
| 57 | 0 |     __m128i a, b, c, d; | 
| 58 | 0 |     __m128i w0, w1; | 
| 59 | 0 |     const __m128i shuffle = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3); /* byte-shuffle mask: reverses the bytes within each 32-bit lane */ | 
| 60 |  | 
 | 
| 61 | 0 |     const __m128i *K = (__m128i *)K256; | 
| 62 | 0 |     const __m128i k0 = _mm_load_si128(K); | 
| 63 | 0 |     const __m128i k1 = _mm_load_si128(K + 1); | 
| 64 | 0 |     const __m128i k2 = _mm_load_si128(K + 2); | 
| 65 | 0 |     const __m128i k3 = _mm_load_si128(K + 3); | 
| 66 | 0 |     const __m128i k4 = _mm_load_si128(K + 4); | 
| 67 | 0 |     const __m128i k5 = _mm_load_si128(K + 5); | 
| 68 | 0 |     const __m128i k6 = _mm_load_si128(K + 6); | 
| 69 | 0 |     const __m128i k7 = _mm_load_si128(K + 7); | 
| 70 | 0 |     const __m128i k8 = _mm_load_si128(K + 8); | 
| 71 | 0 |     const __m128i k9 = _mm_load_si128(K + 9); | 
| 72 | 0 |     const __m128i k10 = _mm_load_si128(K + 10); | 
| 73 | 0 |     const __m128i k11 = _mm_load_si128(K + 11); | 
| 74 | 0 |     const __m128i k12 = _mm_load_si128(K + 12); | 
| 75 | 0 |     const __m128i k13 = _mm_load_si128(K + 13); | 
| 76 | 0 |     const __m128i k14 = _mm_load_si128(K + 14); | 
| 77 | 0 |     const __m128i k15 = _mm_load_si128(K + 15); | 
| 78 |  | 
 | 
| 79 | 0 |     const __m128i *input = (__m128i *)ctx->u.b; | 
| 80 |  | 
 | 
| 81 | 0 |     h0 = _mm_loadu_si128((__m128i *)(ctx->h)); | 
| 82 | 0 |     h1 = _mm_loadu_si128((__m128i *)(ctx->h + 4)); | 
| 83 |  |  | 
| 84 |  |     /* H0123:4567 -> H0145:H2367 */ | 
| 85 | 0 |     th = _mm_shuffle_epi32(h0, 0xb1); | 
| 86 | 0 |     h1 = _mm_shuffle_epi32(h1, 0x1b); | 
| 87 | 0 |     h0 = _mm_alignr_epi8(th, h1, 8); | 
| 88 | 0 |     h1 = _mm_blend_epi16(h1, th, 0xf0); | 
| 89 |  | 
 | 
| 90 | 0 |     a = _mm_shuffle_epi8(_mm_loadu_si128(input), shuffle); | 
| 91 | 0 |     b = _mm_shuffle_epi8(_mm_loadu_si128(input + 1), shuffle); | 
| 92 | 0 |     c = _mm_shuffle_epi8(_mm_loadu_si128(input + 2), shuffle); | 
| 93 | 0 |     d = _mm_shuffle_epi8(_mm_loadu_si128(input + 3), shuffle); | 
| 94 |  | 
 | 
| 95 | 0 |     w0 = h0; | 
| 96 | 0 |     w1 = h1; | 
| 97 |  | 
 | 
| 98 | 0 |     ROUND(0, a, b, c, d) | 
| 99 | 0 |     ROUND(1, b, c, d, a) | 
| 100 | 0 |     ROUND(2, c, d, a, b) | 
| 101 | 0 |     ROUND(3, d, a, b, c) | 
| 102 | 0 |     ROUND(4, a, b, c, d) | 
| 103 | 0 |     ROUND(5, b, c, d, a) | 
| 104 | 0 |     ROUND(6, c, d, a, b) | 
| 105 | 0 |     ROUND(7, d, a, b, c) | 
| 106 | 0 |     ROUND(8, a, b, c, d) | 
| 107 | 0 |     ROUND(9, b, c, d, a) | 
| 108 | 0 |     ROUND(10, c, d, a, b) | 
| 109 | 0 |     ROUND(11, d, a, b, c) | 
| 110 | 0 |     ROUND(12, a, b, c, d) | 
| 111 | 0 |     ROUND(13, b, c, d, a) | 
| 112 | 0 |     ROUND(14, c, d, a, b) | 
| 113 | 0 |     ROUND(15, d, a, b, c) | 
| 114 |  | 
 | 
| 115 | 0 |     h0 = _mm_add_epi32(h0, w0); | 
| 116 | 0 |     h1 = _mm_add_epi32(h1, w1); | 
| 117 |  |  | 
| 118 |  |     /* H0145:2367 -> H0123:4567 */ | 
| 119 | 0 |     th = _mm_shuffle_epi32(h0, 0x1b); | 
| 120 | 0 |     h1 = _mm_shuffle_epi32(h1, 0xb1); | 
| 121 | 0 |     h0 = _mm_blend_epi16(th, h1, 0xf0); | 
| 122 | 0 |     h1 = _mm_alignr_epi8(h1, th, 8); | 
| 123 |  | 
 | 
| 124 | 0 |     _mm_storeu_si128((__m128i *)ctx->h, h0); | 
| 125 | 0 |     _mm_storeu_si128((__m128i *)(ctx->h + 4), h1); | 
| 126 | 0 | } | 
| 127 |  | /* SHA256_Update_Native: absorbs inputLen bytes from input into ctx. Returns at once when inputLen is 0. Otherwise it advances the sizeLo/sizeHi byte count, tops up any partial block already held in ctx->u.b (calling SHA256_Compress_Native when that block becomes full), runs the rounds directly on each whole SHA256_BLOCK_LENGTH-byte block of input, and copies any remaining tail bytes to the start of ctx->u.b. */ | 
| 128 |  | void | 
| 129 |  | SHA256_Update_Native(SHA256Context *ctx, const unsigned char *input, | 
| 130 |  |                      unsigned int inputLen) | 
| 131 | 0 | { | 
| 132 | 0 |     __m128i h0, h1, th; | 
| 133 | 0 |     const __m128i shuffle = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3); /* byte-shuffle mask: reverses the bytes within each 32-bit lane */ | 
| 134 |  | 
 | 
| 135 | 0 |     const __m128i *K = (__m128i *)K256; | 
| 136 | 0 |     const __m128i k0 = _mm_load_si128(K); | 
| 137 | 0 |     const __m128i k1 = _mm_load_si128(K + 1); | 
| 138 | 0 |     const __m128i k2 = _mm_load_si128(K + 2); | 
| 139 | 0 |     const __m128i k3 = _mm_load_si128(K + 3); | 
| 140 | 0 |     const __m128i k4 = _mm_load_si128(K + 4); | 
| 141 | 0 |     const __m128i k5 = _mm_load_si128(K + 5); | 
| 142 | 0 |     const __m128i k6 = _mm_load_si128(K + 6); | 
| 143 | 0 |     const __m128i k7 = _mm_load_si128(K + 7); | 
| 144 | 0 |     const __m128i k8 = _mm_load_si128(K + 8); | 
| 145 | 0 |     const __m128i k9 = _mm_load_si128(K + 9); | 
| 146 | 0 |     const __m128i k10 = _mm_load_si128(K + 10); | 
| 147 | 0 |     const __m128i k11 = _mm_load_si128(K + 11); | 
| 148 | 0 |     const __m128i k12 = _mm_load_si128(K + 12); | 
| 149 | 0 |     const __m128i k13 = _mm_load_si128(K + 13); | 
| 150 | 0 |     const __m128i k14 = _mm_load_si128(K + 14); | 
| 151 | 0 |     const __m128i k15 = _mm_load_si128(K + 15); | 
| 152 |  | 
 | 
| 153 | 0 |     unsigned int inBuf = ctx->sizeLo & 0x3f; /* low six bits of the byte count, sampled before sizeLo is advanced; used below as the fill offset into ctx->u.b */ | 
| 154 | 0 |     if (!inputLen) { | 
| 155 | 0 |         return; | 
| 156 | 0 |     } | 
| 157 |  |  | 
| 158 |  |     /* Add inputLen into the count of bytes processed, before processing */ | 
| 159 | 0 |     if ((ctx->sizeLo += inputLen) < inputLen) { | 
| 160 | 0 |         ctx->sizeHi++; | 
| 161 | 0 |     } | 
| 162 |  |  | 
| 163 |  |     /* if data already in buffer, attempt to fill rest of buffer */ | 
| 164 | 0 |     if (inBuf) { | 
| 165 | 0 |         unsigned int todo = SHA256_BLOCK_LENGTH - inBuf; | 
| 166 | 0 |         if (inputLen < todo) { | 
| 167 | 0 |             todo = inputLen; | 
| 168 | 0 |         } | 
| 169 | 0 |         memcpy(ctx->u.b + inBuf, input, todo); | 
| 170 | 0 |         input += todo; | 
| 171 | 0 |         inputLen -= todo; | 
| 172 | 0 |         if (inBuf + todo == SHA256_BLOCK_LENGTH) { | 
| 173 | 0 |             SHA256_Compress_Native(ctx); | 
| 174 | 0 |         } | 
| 175 | 0 |     } | 
| 176 |  | 
 | 
| 177 | 0 |     h0 = _mm_loadu_si128((__m128i *)(ctx->h)); | 
| 178 | 0 |     h1 = _mm_loadu_si128((__m128i *)(ctx->h + 4)); | 
| 179 |  |  | 
| 180 |  |     /* H0123:4567 -> H0145:H2367 */ | 
| 181 | 0 |     th = _mm_shuffle_epi32(h0, 0xb1); | 
| 182 | 0 |     h1 = _mm_shuffle_epi32(h1, 0x1b); | 
| 183 | 0 |     h0 = _mm_alignr_epi8(th, h1, 8); | 
| 184 | 0 |     h1 = _mm_blend_epi16(h1, th, 0xf0); | 
| 185 |  |  | 
| 186 |  |     /* if enough data to fill one or more whole buffers, process them. */ | 
| 187 | 0 |     while (inputLen >= SHA256_BLOCK_LENGTH) { | 
| 188 | 0 |         __m128i a, b, c, d; | 
| 189 | 0 |         __m128i w0, w1; | 
| 190 | 0 |         a = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)input), shuffle); | 
| 191 | 0 |         b = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(input + 16)), shuffle); | 
| 192 | 0 |         c = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(input + 32)), shuffle); | 
| 193 | 0 |         d = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(input + 48)), shuffle); | 
| 194 | 0 |         input += SHA256_BLOCK_LENGTH; | 
| 195 | 0 |         inputLen -= SHA256_BLOCK_LENGTH; | 
| 196 |  | 
 | 
| 197 | 0 |         w0 = h0; | 
| 198 | 0 |         w1 = h1; | 
| 199 |  | 
 | 
| 200 | 0 |         ROUND(0, a, b, c, d) | 
| 201 | 0 |         ROUND(1, b, c, d, a) | 
| 202 | 0 |         ROUND(2, c, d, a, b) | 
| 203 | 0 |         ROUND(3, d, a, b, c) | 
| 204 | 0 |         ROUND(4, a, b, c, d) | 
| 205 | 0 |         ROUND(5, b, c, d, a) | 
| 206 | 0 |         ROUND(6, c, d, a, b) | 
| 207 | 0 |         ROUND(7, d, a, b, c) | 
| 208 | 0 |         ROUND(8, a, b, c, d) | 
| 209 | 0 |         ROUND(9, b, c, d, a) | 
| 210 | 0 |         ROUND(10, c, d, a, b) | 
| 211 | 0 |         ROUND(11, d, a, b, c) | 
| 212 | 0 |         ROUND(12, a, b, c, d) | 
| 213 | 0 |         ROUND(13, b, c, d, a) | 
| 214 | 0 |         ROUND(14, c, d, a, b) | 
| 215 | 0 |         ROUND(15, d, a, b, c) | 
| 216 |  | 
 | 
| 217 | 0 |         h0 = _mm_add_epi32(h0, w0); | 
| 218 | 0 |         h1 = _mm_add_epi32(h1, w1); | 
| 219 | 0 |     } | 
| 220 |  |  | 
| 221 |  |     /* H0145:2367 -> H0123:4567 */ | 
| 222 | 0 |     th = _mm_shuffle_epi32(h0, 0x1b); | 
| 223 | 0 |     h1 = _mm_shuffle_epi32(h1, 0xb1); | 
| 224 | 0 |     h0 = _mm_blend_epi16(th, h1, 0xf0); | 
| 225 | 0 |     h1 = _mm_alignr_epi8(h1, th, 8); | 
| 226 |  | 
 | 
| 227 | 0 |     _mm_storeu_si128((__m128i *)ctx->h, h0); | 
| 228 | 0 |     _mm_storeu_si128((__m128i *)(ctx->h + 4), h1); | 
| 229 |  |  | 
| 230 |  |     /* if data left over, fill it into buffer */ | 
| 231 | 0 |     if (inputLen) { | 
| 232 | 0 |         memcpy(ctx->u.b, input, inputLen); | 
| 233 | 0 |     } | 
| 234 | 0 | } | 
| 235 |  |  | 
| 236 |  | #endif /* USE_HW_SHA2 */ |