Coverage Report

Created: 2024-09-19 09:45

/proc/self/cwd/external/utf8_range/utf8_validity.cc
Line
Count
Source
1
// Copyright 2022 Google LLC
2
//
3
// Use of this source code is governed by an MIT-style
4
// license that can be found in the LICENSE file or at
5
// https://opensource.org/licenses/MIT.
6
7
/* This is a wrapper for the Google range-sse.cc algorithm which checks whether a
8
 * sequence of bytes is a valid UTF-8 sequence and finds the longest valid prefix of
9
 * the UTF-8 sequence.
10
 *
11
 * The key difference is that it checks for as much ASCII symbols as possible
12
 * and then falls back to the range-sse.cc algorithm. The changes to the
13
 * algorithm are cosmetic, mostly to trick the clang compiler to produce optimal
14
 * code.
15
 *
16
 * For API see the utf8_validity.h header.
17
 */
18
#include "utf8_validity.h"
19
20
#include <cstddef>
21
#include <cstdint>
22
23
#include "absl/strings/ascii.h"
24
#include "absl/strings/string_view.h"
25
26
#ifdef __SSE4_1__
27
#include <emmintrin.h>
28
#include <smmintrin.h>
29
#include <tmmintrin.h>
30
#endif
31
32
namespace utf8_range {
33
namespace {
34
35
49.3M
inline uint64_t UNALIGNED_LOAD64(const void* p) {
36
49.3M
  uint64_t t;
37
49.3M
  memcpy(&t, p, sizeof t);
38
49.3M
  return t;
39
49.3M
}
40
41
7.06M
inline bool TrailByteOk(const char c) {
42
7.06M
  return static_cast<int8_t>(c) <= static_cast<int8_t>(0xBF);
43
7.06M
}
44
45
/* If ReturnPosition is false then it returns 1 if |data| is a valid utf8
46
 * sequence, otherwise returns 0.
47
 * If ReturnPosition is set to true, returns the length in bytes of the prefix
48
   of |data| that is all structurally valid UTF-8.
49
 */
50
template <bool ReturnPosition>
51
11.6M
size_t ValidUTF8Span(const char* data, const char* end) {
52
  /* We return err_pos in the loop which is always 0 if !ReturnPosition */
53
11.6M
  size_t err_pos = 0;
54
11.6M
  size_t codepoint_bytes = 0;
55
  /* The early check is done because of early continue's on codepoints of all
56
   * sizes, i.e. we first check for ascii and if it is, we call continue, then
57
   * for 2 byte codepoints, etc. This is done in order to reduce indentation and
58
   * improve readability of the codepoint validity check.
59
   */
60
90.0M
  while (data + codepoint_bytes < end) {
61
78.3M
    if (ReturnPosition) {
62
481k
      err_pos += codepoint_bytes;
63
481k
    }
64
78.3M
    data += codepoint_bytes;
65
78.3M
    const size_t len = end - data;
66
78.3M
    const unsigned char byte1 = data[0];
67
68
    /* We do not skip many ascii bytes at the same time as this function is
69
       used for tail checking (< 16 bytes) and for non x86 platforms. We also
70
       don't think that cases where non-ASCII codepoints are followed by ascii
71
       happen often. For small strings it also introduces some penalty. For
72
       purely ascii UTF8 strings (which is the overwhelming case) we call
73
       SkipAscii function which is multiplatform and extremely fast.
74
     */
75
    /* [00..7F] ASCII -> 1 byte */
76
78.3M
    if (absl::ascii_isascii(byte1)) {
77
75.9M
      codepoint_bytes = 1;
78
75.9M
      continue;
79
75.9M
    }
80
    /* [C2..DF], [80..BF] -> 2 bytes */
81
2.44M
    if (len >= 2 && byte1 >= 0xC2 && byte1 <= 0xDF && TrailByteOk(data[1])) {
82
79.1k
      codepoint_bytes = 2;
83
79.1k
      continue;
84
79.1k
    }
85
2.36M
    if (len >= 3) {
86
2.35M
      const unsigned char byte2 = data[1];
87
2.35M
      const unsigned char byte3 = data[2];
88
89
      /* Is byte2, byte3 between [0x80, 0xBF]
90
       * Check for 0x80 was done above.
91
       */
92
2.35M
      if (!TrailByteOk(byte2) || !TrailByteOk(byte3)) {
93
15.0k
        return err_pos;
94
15.0k
      }
95
96
2.34M
      if (/* E0, A0..BF, 80..BF */
97
2.34M
          ((byte1 == 0xE0 && byte2 >= 0xA0) ||
98
           /* E1..EC, 80..BF, 80..BF */
99
2.34M
           (byte1 >= 0xE1 && byte1 <= 0xEC) ||
100
           /* ED, 80..9F, 80..BF */
101
2.34M
           (byte1 == 0xED && byte2 <= 0x9F) ||
102
           /* EE..EF, 80..BF, 80..BF */
103
2.34M
           (byte1 >= 0xEE && byte1 <= 0xEF))) {
104
61.4k
        codepoint_bytes = 3;
105
61.4k
        continue;
106
61.4k
      }
107
2.28M
      if (len >= 4) {
108
2.27M
        const unsigned char byte4 = data[3];
109
        /* Is byte4 between 0x80 ~ 0xBF */
110
2.27M
        if (!TrailByteOk(byte4)) {
111
3.26k
          return err_pos;
112
3.26k
        }
113
114
2.27M
        if (/* F0, 90..BF, 80..BF, 80..BF */
115
2.27M
            ((byte1 == 0xF0 && byte2 >= 0x90) ||
116
             /* F1..F3, 80..BF, 80..BF, 80..BF */
117
2.27M
             (byte1 >= 0xF1 && byte1 <= 0xF3) ||
118
             /* F4, 80..8F, 80..BF, 80..BF */
119
2.27M
             (byte1 == 0xF4 && byte2 <= 0x8F))) {
120
2.25M
          codepoint_bytes = 4;
121
2.25M
          continue;
122
2.25M
        }
123
2.27M
      }
124
2.28M
    }
125
27.4k
    return err_pos;
126
2.36M
  }
127
11.6M
  if (ReturnPosition) {
128
49.0k
    err_pos += codepoint_bytes;
129
49.0k
  }
130
  /* if ReturnPosition is false, this returns 1.
131
   * if ReturnPosition is true, this returns err_pos.
132
   */
133
11.6M
  return err_pos + (1 - ReturnPosition);
134
11.6M
}
utf8_validity.cc:unsigned long utf8_range::(anonymous namespace)::ValidUTF8Span<false>(char const*, char const*)
Line
Count
Source
51
11.6M
size_t ValidUTF8Span(const char* data, const char* end) {
52
  /* We return err_pos in the loop which is always 0 if !ReturnPosition */
53
11.6M
  size_t err_pos = 0;
54
11.6M
  size_t codepoint_bytes = 0;
55
  /* The early check is done because of early continue's on codepoints of all
56
   * sizes, i.e. we first check for ascii and if it is, we call continue, then
57
   * for 2 byte codepoints, etc. This is done in order to reduce indentation and
58
   * improve readability of the codepoint validity check.
59
   */
60
89.4M
  while (data + codepoint_bytes < end) {
61
77.9M
    if (ReturnPosition) {
62
0
      err_pos += codepoint_bytes;
63
0
    }
64
77.9M
    data += codepoint_bytes;
65
77.9M
    const size_t len = end - data;
66
77.9M
    const unsigned char byte1 = data[0];
67
68
    /* We do not skip many ascii bytes at the same time as this function is
69
       used for tail checking (< 16 bytes) and for non x86 platforms. We also
70
       don't think that cases where non-ASCII codepoints are followed by ascii
71
       happen often. For small strings it also introduces some penalty. For
72
       purely ascii UTF8 strings (which is the overwhelming case) we call
73
       SkipAscii function which is multiplatform and extremely fast.
74
     */
75
    /* [00..7F] ASCII -> 1 byte */
76
77.9M
    if (absl::ascii_isascii(byte1)) {
77
75.4M
      codepoint_bytes = 1;
78
75.4M
      continue;
79
75.4M
    }
80
    /* [C2..DF], [80..BF] -> 2 bytes */
81
2.43M
    if (len >= 2 && byte1 >= 0xC2 && byte1 <= 0xDF && TrailByteOk(data[1])) {
82
78.0k
      codepoint_bytes = 2;
83
78.0k
      continue;
84
78.0k
    }
85
2.35M
    if (len >= 3) {
86
2.34M
      const unsigned char byte2 = data[1];
87
2.34M
      const unsigned char byte3 = data[2];
88
89
      /* Is byte2, byte3 between [0x80, 0xBF]
90
       * Check for 0x80 was done above.
91
       */
92
2.34M
      if (!TrailByteOk(byte2) || !TrailByteOk(byte3)) {
93
12.8k
        return err_pos;
94
12.8k
      }
95
96
2.33M
      if (/* E0, A0..BF, 80..BF */
97
2.33M
          ((byte1 == 0xE0 && byte2 >= 0xA0) ||
98
           /* E1..EC, 80..BF, 80..BF */
99
2.33M
           (byte1 >= 0xE1 && byte1 <= 0xEC) ||
100
           /* ED, 80..9F, 80..BF */
101
2.33M
           (byte1 == 0xED && byte2 <= 0x9F) ||
102
           /* EE..EF, 80..BF, 80..BF */
103
2.33M
           (byte1 >= 0xEE && byte1 <= 0xEF))) {
104
58.3k
        codepoint_bytes = 3;
105
58.3k
        continue;
106
58.3k
      }
107
2.27M
      if (len >= 4) {
108
2.27M
        const unsigned char byte4 = data[3];
109
        /* Is byte4 between 0x80 ~ 0xBF */
110
2.27M
        if (!TrailByteOk(byte4)) {
111
2.60k
          return err_pos;
112
2.60k
        }
113
114
2.26M
        if (/* F0, 90..BF, 80..BF, 80..BF */
115
2.26M
            ((byte1 == 0xF0 && byte2 >= 0x90) ||
116
             /* F1..F3, 80..BF, 80..BF, 80..BF */
117
2.26M
             (byte1 >= 0xF1 && byte1 <= 0xF3) ||
118
             /* F4, 80..8F, 80..BF, 80..BF */
119
2.26M
             (byte1 == 0xF4 && byte2 <= 0x8F))) {
120
2.25M
          codepoint_bytes = 4;
121
2.25M
          continue;
122
2.25M
        }
123
2.26M
      }
124
2.27M
    }
125
26.6k
    return err_pos;
126
2.35M
  }
127
11.5M
  if (ReturnPosition) {
128
0
    err_pos += codepoint_bytes;
129
0
  }
130
  /* if ReturnPosition is false, this returns 1.
131
   * if ReturnPosition is true, this returns err_pos.
132
   */
133
11.5M
  return err_pos + (1 - ReturnPosition);
134
11.6M
}
utf8_validity.cc:unsigned long utf8_range::(anonymous namespace)::ValidUTF8Span<true>(char const*, char const*)
Line
Count
Source
51
52.6k
size_t ValidUTF8Span(const char* data, const char* end) {
52
  /* We return err_pos in the loop which is always 0 if !ReturnPosition */
53
52.6k
  size_t err_pos = 0;
54
52.6k
  size_t codepoint_bytes = 0;
55
  /* The early check is done because of early continue's on codepoints of all
56
   * sizes, i.e. we first check for ascii and if it is, we call continue, then
57
   * for 2 byte codepoints, etc. This is done in order to reduce indentation and
58
   * improve readability of the codepoint validity check.
59
   */
60
530k
  while (data + codepoint_bytes < end) {
61
481k
    if (ReturnPosition) {
62
481k
      err_pos += codepoint_bytes;
63
481k
    }
64
481k
    data += codepoint_bytes;
65
481k
    const size_t len = end - data;
66
481k
    const unsigned char byte1 = data[0];
67
68
    /* We do not skip many ascii bytes at the same time as this function is
69
       used for tail checking (< 16 bytes) and for non x86 platforms. We also
70
       don't think that cases where non-ASCII codepoints are followed by ascii
71
       happen often. For small strings it also introduces some penalty. For
72
       purely ascii UTF8 strings (which is the overwhelming case) we call
73
       SkipAscii function which is multiplatform and extremely fast.
74
     */
75
    /* [00..7F] ASCII -> 1 byte */
76
481k
    if (absl::ascii_isascii(byte1)) {
77
469k
      codepoint_bytes = 1;
78
469k
      continue;
79
469k
    }
80
    /* [C2..DF], [80..BF] -> 2 bytes */
81
11.9k
    if (len >= 2 && byte1 >= 0xC2 && byte1 <= 0xDF && TrailByteOk(data[1])) {
82
1.02k
      codepoint_bytes = 2;
83
1.02k
      continue;
84
1.02k
    }
85
10.9k
    if (len >= 3) {
86
10.8k
      const unsigned char byte2 = data[1];
87
10.8k
      const unsigned char byte3 = data[2];
88
89
      /* Is byte2, byte3 between [0x80, 0xBF]
90
       * Check for 0x80 was done above.
91
       */
92
10.8k
      if (!TrailByteOk(byte2) || !TrailByteOk(byte3)) {
93
2.19k
        return err_pos;
94
2.19k
      }
95
96
8.60k
      if (/* E0, A0..BF, 80..BF */
97
8.60k
          ((byte1 == 0xE0 && byte2 >= 0xA0) ||
98
           /* E1..EC, 80..BF, 80..BF */
99
8.60k
           (byte1 >= 0xE1 && byte1 <= 0xEC) ||
100
           /* ED, 80..9F, 80..BF */
101
8.60k
           (byte1 == 0xED && byte2 <= 0x9F) ||
102
           /* EE..EF, 80..BF, 80..BF */
103
8.60k
           (byte1 >= 0xEE && byte1 <= 0xEF))) {
104
3.09k
        codepoint_bytes = 3;
105
3.09k
        continue;
106
3.09k
      }
107
5.51k
      if (len >= 4) {
108
5.47k
        const unsigned char byte4 = data[3];
109
        /* Is byte4 between 0x80 ~ 0xBF */
110
5.47k
        if (!TrailByteOk(byte4)) {
111
655
          return err_pos;
112
655
        }
113
114
4.82k
        if (/* F0, 90..BF, 80..BF, 80..BF */
115
4.82k
            ((byte1 == 0xF0 && byte2 >= 0x90) ||
116
             /* F1..F3, 80..BF, 80..BF, 80..BF */
117
4.82k
             (byte1 >= 0xF1 && byte1 <= 0xF3) ||
118
             /* F4, 80..8F, 80..BF, 80..BF */
119
4.82k
             (byte1 == 0xF4 && byte2 <= 0x8F))) {
120
4.23k
          codepoint_bytes = 4;
121
4.23k
          continue;
122
4.23k
        }
123
4.82k
      }
124
5.51k
    }
125
771
    return err_pos;
126
10.9k
  }
127
49.0k
  if (ReturnPosition) {
128
49.0k
    err_pos += codepoint_bytes;
129
49.0k
  }
130
  /* if ReturnPosition is false, this returns 1.
131
   * if ReturnPosition is true, this returns err_pos.
132
   */
133
49.0k
  return err_pos + (1 - ReturnPosition);
134
52.6k
}
135
136
#ifdef __SSE4_1__
137
/* Returns the number of bytes needed to skip backwards to get to the first
138
   byte of codepoint.
139
 */
140
inline int CodepointSkipBackwards(int32_t codepoint_word) {
141
  const int8_t* const codepoint =
142
      reinterpret_cast<const int8_t*>(&codepoint_word);
143
  if (!TrailByteOk(codepoint[3])) {
144
    return 1;
145
  } else if (!TrailByteOk(codepoint[2])) {
146
    return 2;
147
  } else if (!TrailByteOk(codepoint[1])) {
148
    return 3;
149
  }
150
  return 0;
151
}
152
#endif  // __SSE4_1__
153
154
/* Skipping over ASCII as much as possible, per 8 bytes. It is intentional
155
   as most strings to check for validity consist only of 1 byte codepoints.
156
 */
157
11.6M
inline const char* SkipAscii(const char* data, const char* end) {
158
60.9M
  while (8 <= end - data &&
159
60.9M
         (UNALIGNED_LOAD64(data) & 0x8080808080808080) == 0) {
160
49.2M
    data += 8;
161
49.2M
  }
162
41.8M
  while (data < end && absl::ascii_isascii(*data)) {
163
30.2M
    ++data;
164
30.2M
  }
165
11.6M
  return data;
166
11.6M
}
167
168
template <bool ReturnPosition>
169
12.0M
size_t ValidUTF8(const char* data, size_t len) {
170
12.0M
  if (len == 0) return 1 - ReturnPosition;
171
11.6M
  const char* const end = data + len;
172
11.6M
  data = SkipAscii(data, end);
173
  /* SIMD algorithm always outperforms the naive version for any data of
174
     length >=16.
175
   */
176
11.6M
  if (end - data < 16) {
177
11.6M
    return (ReturnPosition ? (data - (end - len)) : 0) +
178
11.6M
           ValidUTF8Span<ReturnPosition>(data, end);
179
11.6M
  }
180
40.5k
#ifndef __SSE4_1__
181
40.5k
  return (ReturnPosition ? (data - (end - len)) : 0) +
182
40.5k
         ValidUTF8Span<ReturnPosition>(data, end);
183
#else
184
  /* This code checks that utf-8 ranges are structurally valid 16 bytes at once
185
   * using superscalar instructions.
186
   * The mapping between ranges of codepoint and their corresponding utf-8
187
   * sequences is below.
188
   */
189
190
  /*
191
   * U+0000...U+007F     00...7F
192
   * U+0080...U+07FF     C2...DF 80...BF
193
   * U+0800...U+0FFF     E0      A0...BF 80...BF
194
   * U+1000...U+CFFF     E1...EC 80...BF 80...BF
195
   * U+D000...U+D7FF     ED      80...9F 80...BF
196
   * U+E000...U+FFFF     EE...EF 80...BF 80...BF
197
   * U+10000...U+3FFFF   F0      90...BF 80...BF 80...BF
198
   * U+40000...U+FFFFF   F1...F3 80...BF 80...BF 80...BF
199
   * U+100000...U+10FFFF F4      80...8F 80...BF 80...BF
200
   */
201
202
  /* First we compute the type for each byte, as given by the table below.
203
   * This type will be used as an index later on.
204
   */
205
206
  /*
207
   * Index  Min Max Byte Type
208
   *  0     00  7F  Single byte sequence
209
   *  1,2,3 80  BF  Second, third and fourth byte for many of the sequences.
210
   *  4     A0  BF  Second byte after E0
211
   *  5     80  9F  Second byte after ED
212
   *  6     90  BF  Second byte after F0
213
   *  7     80  8F  Second byte after F4
214
   *  8     C2  F4  First non ASCII byte
215
   *  9..15 7F  80  Invalid byte
216
   */
217
218
  /* After the first step we compute the index for all bytes, then we permute
219
     the bytes according to their indices to check the ranges from the range
220
     table.
221
   * The range for a given type can be found in the range_min_table and
222
     range_max_table, the range for type/index X is in range_min_table[X] ...
223
     range_max_table[X].
224
   */
225
226
  /* Algorithm:
227
   * Put index zero to all bytes.
228
   * Find all non ASCII characters, give them index 8.
229
   * For each tail byte in a codepoint sequence, give it an index corresponding
230
     to the 1 based index from the end.
231
   * If the first byte of the codepoint is in the [C0...DF] range, we write
232
     index 1 in the following byte.
233
   * If the first byte of the codepoint is in the range [E0...EF], we write
234
     indices 2 and 1 in the next two bytes.
235
   * If the first byte of the codepoint is in the range [F0...FF] we write
236
     indices 3,2,1 into the next three bytes.
237
   * For finding the number of bytes we need to look at high nibbles (4 bits)
238
     and do the lookup from the table, it can be done with shift by 4 + shuffle
239
     instructions. We call it `first_len`.
240
   * Then we shift first_len by 8 bits to get the indices of the 2nd bytes.
241
   * Saturating sub 1 and shift by 8 bits to get the indices of the 3rd bytes.
242
   * Again to get the indices of the 4th bytes.
243
   * Take OR of all that 4 values and check within range.
244
   */
245
  /* For example:
246
   * input       C3 80 68 E2 80 20 A6 F0 A0 80 AC 20 F0 93 80 80
247
   * first_len   1  0  0  2  0  0  0  3  0  0  0  0  3  0  0  0
248
   * 1st byte    8  0  0  8  0  0  0  8  0  0  0  0  8  0  0  0
249
   * 2nd byte    0  1  0  0  2  0  0  0  3  0  0  0  0  3  0  0 // Shift + sub
250
   * 3rd byte    0  0  0  0  0  1  0  0  0  2  0  0  0  0  2  0 // Shift + sub
251
   * 4th byte    0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  1 // Shift + sub
252
   * Index       8  1  0  8  2  1  0  8  3  2  1  0  8  3  2  1 // OR of results
253
   */
254
255
  /* Checking for errors:
256
   * Error checking is done by looking up the high nibble (4 bits) of each byte
257
     against an error checking table.
258
   * Because the lookup value for the second byte depends of the value of the
259
     first byte in codepoint, we use saturated operations to adjust the index.
260
   * Specifically we need to add 2 for E0, 3 for ED, 3 for F0 and 4 for F4 to
261
     match the correct index.
262
       * If we subtract from all bytes EF then EO -> 241, ED -> 254, F0 -> 1,
263
         F4 -> 5
264
       * Do saturating sub 240, then E0 -> 1, ED -> 14 and we can do lookup to
265
         match the adjustment
266
       * Add saturating 112, then F0 -> 113, F4 -> 117, all that were > 16 will
267
         be more 128 and lookup in ef_fe_table will return 0 but for F0
268
         and F4 it will be 4 and 5 accordingly
269
   */
270
  /*
271
   * Then just check the appropriate ranges with greater/smaller equal
272
     instructions. Check tail with a naive algorithm.
273
   * To save from previous 16 byte checks we just align previous_first_len to
274
     get correct continuations of the codepoints.
275
   */
276
277
  /*
278
   * Map high nibble of "First Byte" to legal character length minus 1
279
   * 0x00 ~ 0xBF --> 0
280
   * 0xC0 ~ 0xDF --> 1
281
   * 0xE0 ~ 0xEF --> 2
282
   * 0xF0 ~ 0xFF --> 3
283
   */
284
  const __m128i first_len_table =
285
      _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 3);
286
287
  /* Map "First Byte" to 8-th item of range table (0xC2 ~ 0xF4) */
288
  const __m128i first_range_table =
289
      _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8);
290
291
  /*
292
   * Range table, map range index to min and max values
293
   */
294
  const __m128i range_min_table =
295
      _mm_setr_epi8(0x00, 0x80, 0x80, 0x80, 0xA0, 0x80, 0x90, 0x80, 0xC2, 0x7F,
296
                    0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F);
297
298
  const __m128i range_max_table =
299
      _mm_setr_epi8(0x7F, 0xBF, 0xBF, 0xBF, 0xBF, 0x9F, 0xBF, 0x8F, 0xF4, 0x80,
300
                    0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
301
302
  /*
303
   * Tables for fast handling of four special First Bytes(E0,ED,F0,F4), after
304
   * which the Second Byte are not 80~BF. It contains "range index adjustment".
305
   * +------------+---------------+------------------+----------------+
306
   * | First Byte | original range| range adjustment | adjusted range |
307
   * +------------+---------------+------------------+----------------+
308
   * | E0         | 2             | 2                | 4              |
309
   * +------------+---------------+------------------+----------------+
310
   * | ED         | 2             | 3                | 5              |
311
   * +------------+---------------+------------------+----------------+
312
   * | F0         | 3             | 3                | 6              |
313
   * +------------+---------------+------------------+----------------+
314
   * | F4         | 4             | 4                | 8              |
315
   * +------------+---------------+------------------+----------------+
316
   */
317
318
  /* df_ee_table[1] -> E0, df_ee_table[14] -> ED as ED - E0 = 13 */
319
  // The values represent the adjustment in the Range Index table for a correct
320
  // index.
321
  const __m128i df_ee_table =
322
      _mm_setr_epi8(0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0);
323
324
  /* ef_fe_table[1] -> F0, ef_fe_table[5] -> F4, F4 - F0 = 4 */
325
  // The values represent the adjustment in the Range Index table for a correct
326
  // index.
327
  const __m128i ef_fe_table =
328
      _mm_setr_epi8(0, 3, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
329
330
  __m128i prev_input = _mm_set1_epi8(0);
331
  __m128i prev_first_len = _mm_set1_epi8(0);
332
  __m128i error = _mm_set1_epi8(0);
333
  while (end - data >= 16) {
334
    const __m128i input =
335
        _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));
336
337
    /* high_nibbles = input >> 4 */
338
    const __m128i high_nibbles =
339
        _mm_and_si128(_mm_srli_epi16(input, 4), _mm_set1_epi8(0x0F));
340
341
    /* first_len = legal character length minus 1 */
342
    /* 0 for 00~7F, 1 for C0~DF, 2 for E0~EF, 3 for F0~FF */
343
    /* first_len = first_len_table[high_nibbles] */
344
    __m128i first_len = _mm_shuffle_epi8(first_len_table, high_nibbles);
345
346
    /* First Byte: set range index to 8 for bytes within 0xC0 ~ 0xFF */
347
    /* range = first_range_table[high_nibbles] */
348
    __m128i range = _mm_shuffle_epi8(first_range_table, high_nibbles);
349
350
    /* Second Byte: set range index to first_len */
351
    /* 0 for 00~7F, 1 for C0~DF, 2 for E0~EF, 3 for F0~FF */
352
    /* range |= (first_len, prev_first_len) << 1 byte */
353
    range = _mm_or_si128(range, _mm_alignr_epi8(first_len, prev_first_len, 15));
354
355
    /* Third Byte: set range index to saturate_sub(first_len, 1) */
356
    /* 0 for 00~7F, 0 for C0~DF, 1 for E0~EF, 2 for F0~FF */
357
    __m128i tmp1;
358
    __m128i tmp2;
359
    /* tmp1 = saturate_sub(first_len, 1) */
360
    tmp1 = _mm_subs_epu8(first_len, _mm_set1_epi8(1));
361
    /* tmp2 = saturate_sub(prev_first_len, 1) */
362
    tmp2 = _mm_subs_epu8(prev_first_len, _mm_set1_epi8(1));
363
    /* range |= (tmp1, tmp2) << 2 bytes */
364
    range = _mm_or_si128(range, _mm_alignr_epi8(tmp1, tmp2, 14));
365
366
    /* Fourth Byte: set range index to saturate_sub(first_len, 2) */
367
    /* 0 for 00~7F, 0 for C0~DF, 0 for E0~EF, 1 for F0~FF */
368
    /* tmp1 = saturate_sub(first_len, 2) */
369
    tmp1 = _mm_subs_epu8(first_len, _mm_set1_epi8(2));
370
    /* tmp2 = saturate_sub(prev_first_len, 2) */
371
    tmp2 = _mm_subs_epu8(prev_first_len, _mm_set1_epi8(2));
372
    /* range |= (tmp1, tmp2) << 3 bytes */
373
    range = _mm_or_si128(range, _mm_alignr_epi8(tmp1, tmp2, 13));
374
375
    /*
376
     * Now we have below range indices calculated
377
     * Correct cases:
378
     * - 8 for C0~FF
379
     * - 3 for 1st byte after F0~FF
380
     * - 2 for 1st byte after E0~EF or 2nd byte after F0~FF
381
     * - 1 for 1st byte after C0~DF or 2nd byte after E0~EF or
382
     *         3rd byte after F0~FF
383
     * - 0 for others
384
     * Error cases:
385
     *   >9 for non ascii First Byte overlapping
386
     *   E.g., F1 80 C2 90 --> 8 3 10 2, where 10 indicates error
387
     */
388
389
    /* Adjust Second Byte range for special First Bytes(E0,ED,F0,F4) */
390
    /* Overlaps lead to index 9~15, which are illegal in range table */
391
    __m128i shift1;
392
    __m128i pos;
393
    __m128i range2;
394
    /* shift1 = (input, prev_input) << 1 byte */
395
    shift1 = _mm_alignr_epi8(input, prev_input, 15);
396
    pos = _mm_sub_epi8(shift1, _mm_set1_epi8(0xEF));
397
    /*
398
     * shift1:  | EF  F0 ... FE | FF  00  ... ...  DE | DF  E0 ... EE |
399
     * pos:     | 0   1      15 | 16  17           239| 240 241    255|
400
     * pos-240: | 0   0      0  | 0   0            0  | 0   1      15 |
401
     * pos+112: | 112 113    127|       >= 128        |     >= 128    |
402
     */
403
    tmp1 = _mm_subs_epu8(pos, _mm_set1_epi8(-16));
404
    range2 = _mm_shuffle_epi8(df_ee_table, tmp1);
405
    tmp2 = _mm_adds_epu8(pos, _mm_set1_epi8(112));
406
    range2 = _mm_add_epi8(range2, _mm_shuffle_epi8(ef_fe_table, tmp2));
407
408
    range = _mm_add_epi8(range, range2);
409
410
    /* Load min and max values per calculated range index */
411
    __m128i min_range = _mm_shuffle_epi8(range_min_table, range);
412
    __m128i max_range = _mm_shuffle_epi8(range_max_table, range);
413
414
    /* Check value range */
415
    if (ReturnPosition) {
416
      error = _mm_cmplt_epi8(input, min_range);
417
      error = _mm_or_si128(error, _mm_cmpgt_epi8(input, max_range));
418
      /* 5% performance drop from this conditional branch */
419
      if (!_mm_testz_si128(error, error)) {
420
        break;
421
      }
422
    } else {
423
      error = _mm_or_si128(error, _mm_cmplt_epi8(input, min_range));
424
      error = _mm_or_si128(error, _mm_cmpgt_epi8(input, max_range));
425
    }
426
427
    prev_input = input;
428
    prev_first_len = first_len;
429
430
    data += 16;
431
  }
432
  /* If we got to the end, we don't need to skip any bytes backwards */
433
  if (ReturnPosition && (data - (end - len)) == 0) {
434
    return ValidUTF8Span<true>(data, end);
435
  }
436
  /* Find previous codepoint (not 80~BF) */
437
  data -= CodepointSkipBackwards(_mm_extract_epi32(prev_input, 3));
438
  if (ReturnPosition) {
439
    return (data - (end - len)) + ValidUTF8Span<true>(data, end);
440
  }
441
  /* Test if there was any error */
442
  if (!_mm_testz_si128(error, error)) {
443
    return 0;
444
  }
445
  /* Check the tail */
446
  return ValidUTF8Span<false>(data, end);
447
#endif
448
11.6M
}
utf8_validity.cc:unsigned long utf8_range::(anonymous namespace)::ValidUTF8<false>(char const*, unsigned long)
Line
Count
Source
169
12.0M
size_t ValidUTF8(const char* data, size_t len) {
170
12.0M
  if (len == 0) return 1 - ReturnPosition;
171
11.6M
  const char* const end = data + len;
172
11.6M
  data = SkipAscii(data, end);
173
  /* SIMD algorithm always outperforms the naive version for any data of
174
     length >=16.
175
   */
176
11.6M
  if (end - data < 16) {
177
11.5M
    return (ReturnPosition ? (data - (end - len)) : 0) +
178
11.5M
           ValidUTF8Span<ReturnPosition>(data, end);
179
11.5M
  }
180
37.2k
#ifndef __SSE4_1__
181
37.2k
  return (ReturnPosition ? (data - (end - len)) : 0) +
182
37.2k
         ValidUTF8Span<ReturnPosition>(data, end);
183
#else
184
  /* This code checks that utf-8 ranges are structurally valid 16 bytes at once
185
   * using superscalar instructions.
186
   * The mapping between ranges of codepoint and their corresponding utf-8
187
   * sequences is below.
188
   */
189
190
  /*
191
   * U+0000...U+007F     00...7F
192
   * U+0080...U+07FF     C2...DF 80...BF
193
   * U+0800...U+0FFF     E0      A0...BF 80...BF
194
   * U+1000...U+CFFF     E1...EC 80...BF 80...BF
195
   * U+D000...U+D7FF     ED      80...9F 80...BF
196
   * U+E000...U+FFFF     EE...EF 80...BF 80...BF
197
   * U+10000...U+3FFFF   F0      90...BF 80...BF 80...BF
198
   * U+40000...U+FFFFF   F1...F3 80...BF 80...BF 80...BF
199
   * U+100000...U+10FFFF F4      80...8F 80...BF 80...BF
200
   */
201
202
  /* First we compute the type for each byte, as given by the table below.
203
   * This type will be used as an index later on.
204
   */
205
206
  /*
207
   * Index  Min Max Byte Type
208
   *  0     00  7F  Single byte sequence
209
   *  1,2,3 80  BF  Second, third and fourth byte for many of the sequences.
210
   *  4     A0  BF  Second byte after E0
211
   *  5     80  9F  Second byte after ED
212
   *  6     90  BF  Second byte after F0
213
   *  7     80  8F  Second byte after F4
214
   *  8     C2  F4  First non ASCII byte
215
   *  9..15 7F  80  Invalid byte
216
   */
217
218
  /* After the first step we compute the index for all bytes, then we permute
219
     the bytes according to their indices to check the ranges from the range
220
     table.
221
   * The range for a given type can be found in the range_min_table and
222
     range_max_table, the range for type/index X is in range_min_table[X] ...
223
     range_max_table[X].
224
   */
225
226
  /* Algorithm:
227
   * Put index zero to all bytes.
228
   * Find all non ASCII characters, give them index 8.
229
   * For each tail byte in a codepoint sequence, give it an index corresponding
230
     to the 1 based index from the end.
231
   * If the first byte of the codepoint is in the [C0...DF] range, we write
232
     index 1 in the following byte.
233
   * If the first byte of the codepoint is in the range [E0...EF], we write
234
     indices 2 and 1 in the next two bytes.
235
   * If the first byte of the codepoint is in the range [F0...FF] we write
236
     indices 3,2,1 into the next three bytes.
237
   * For finding the number of bytes we need to look at high nibbles (4 bits)
238
     and do the lookup from the table, it can be done with shift by 4 + shuffle
239
     instructions. We call it `first_len`.
240
   * Then we shift first_len by 8 bits to get the indices of the 2nd bytes.
241
   * Saturating sub 1 and shift by 8 bits to get the indices of the 3rd bytes.
242
   * Again to get the indices of the 4th bytes.
243
   * Take OR of all that 4 values and check within range.
244
   */
245
  /* For example:
246
   * input       C3 80 68 E2 80 20 A6 F0 A0 80 AC 20 F0 93 80 80
247
   * first_len   1  0  0  2  0  0  0  3  0  0  0  0  3  0  0  0
248
   * 1st byte    8  0  0  8  0  0  0  8  0  0  0  0  8  0  0  0
249
   * 2nd byte    0  1  0  0  2  0  0  0  3  0  0  0  0  3  0  0 // Shift + sub
250
   * 3rd byte    0  0  0  0  0  1  0  0  0  2  0  0  0  0  2  0 // Shift + sub
251
   * 4th byte    0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  1 // Shift + sub
252
   * Index       8  1  0  8  2  1  0  8  3  2  1  0  8  3  2  1 // OR of results
253
   */
254
255
  /* Checking for errors:
256
   * Error checking is done by looking up the high nibble (4 bits) of each byte
257
     against an error checking table.
258
   * Because the lookup value for the second byte depends of the value of the
259
     first byte in codepoint, we use saturated operations to adjust the index.
260
   * Specifically we need to add 2 for E0, 3 for ED, 3 for F0 and 4 for F4 to
261
     match the correct index.
262
       * If we subtract from all bytes EF then EO -> 241, ED -> 254, F0 -> 1,
263
         F4 -> 5
264
       * Do saturating sub 240, then E0 -> 1, ED -> 14 and we can do lookup to
265
         match the adjustment
266
       * Add saturating 112, then F0 -> 113, F4 -> 117, all that were > 16 will
267
         be more 128 and lookup in ef_fe_table will return 0 but for F0
268
         and F4 it will be 4 and 5 accordingly
269
   */
270
  /*
271
   * Then just check the appropriate ranges with greater/smaller equal
272
     instructions. Check tail with a naive algorithm.
273
   * To save from previous 16 byte checks we just align previous_first_len to
274
     get correct continuations of the codepoints.
275
   */
276
277
  /*
278
   * Map high nibble of "First Byte" to legal character length minus 1
279
   * 0x00 ~ 0xBF --> 0
280
   * 0xC0 ~ 0xDF --> 1
281
   * 0xE0 ~ 0xEF --> 2
282
   * 0xF0 ~ 0xFF --> 3
283
   */
284
  const __m128i first_len_table =
285
      _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 3);
286
287
  /* Map "First Byte" to 8-th item of range table (0xC2 ~ 0xF4) */
288
  const __m128i first_range_table =
289
      _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8);
290
291
  /*
292
   * Range table, map range index to min and max values
293
   */
294
  const __m128i range_min_table =
295
      _mm_setr_epi8(0x00, 0x80, 0x80, 0x80, 0xA0, 0x80, 0x90, 0x80, 0xC2, 0x7F,
296
                    0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F);
297
298
  const __m128i range_max_table =
299
      _mm_setr_epi8(0x7F, 0xBF, 0xBF, 0xBF, 0xBF, 0x9F, 0xBF, 0x8F, 0xF4, 0x80,
300
                    0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
301
302
  /*
303
   * Tables for fast handling of four special First Bytes(E0,ED,F0,F4), after
304
   * which the Second Byte are not 80~BF. It contains "range index adjustment".
305
   * +------------+---------------+------------------+----------------+
306
   * | First Byte | original range| range adjustment | adjusted range |
307
   * +------------+---------------+------------------+----------------+
308
   * | E0         | 2             | 2                | 4              |
309
   * +------------+---------------+------------------+----------------+
310
   * | ED         | 2             | 3                | 5              |
311
   * +------------+---------------+------------------+----------------+
312
   * | F0         | 3             | 3                | 6              |
313
   * +------------+---------------+------------------+----------------+
314
   * | F4         | 4             | 4                | 8              |
315
   * +------------+---------------+------------------+----------------+
316
   */
317
318
  /* df_ee_table[1] -> E0, df_ee_table[14] -> ED as ED - E0 = 13 */
319
  // The values represent the adjustment in the Range Index table for a correct
320
  // index.
321
  const __m128i df_ee_table =
322
      _mm_setr_epi8(0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0);
323
324
  /* ef_fe_table[1] -> F0, ef_fe_table[5] -> F4, F4 - F0 = 4 */
325
  // The values represent the adjustment in the Range Index table for a correct
326
  // index.
327
  const __m128i ef_fe_table =
328
      _mm_setr_epi8(0, 3, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
329
330
  __m128i prev_input = _mm_set1_epi8(0);
331
  __m128i prev_first_len = _mm_set1_epi8(0);
332
  __m128i error = _mm_set1_epi8(0);
333
  while (end - data >= 16) {
334
    const __m128i input =
335
        _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));
336
337
    /* high_nibbles = input >> 4 */
338
    const __m128i high_nibbles =
339
        _mm_and_si128(_mm_srli_epi16(input, 4), _mm_set1_epi8(0x0F));
340
341
    /* first_len = legal character length minus 1 */
342
    /* 0 for 00~7F, 1 for C0~DF, 2 for E0~EF, 3 for F0~FF */
343
    /* first_len = first_len_table[high_nibbles] */
344
    __m128i first_len = _mm_shuffle_epi8(first_len_table, high_nibbles);
345
346
    /* First Byte: set range index to 8 for bytes within 0xC0 ~ 0xFF */
347
    /* range = first_range_table[high_nibbles] */
348
    __m128i range = _mm_shuffle_epi8(first_range_table, high_nibbles);
349
350
    /* Second Byte: set range index to first_len */
351
    /* 0 for 00~7F, 1 for C0~DF, 2 for E0~EF, 3 for F0~FF */
352
    /* range |= (first_len, prev_first_len) << 1 byte */
353
    range = _mm_or_si128(range, _mm_alignr_epi8(first_len, prev_first_len, 15));
354
355
    /* Third Byte: set range index to saturate_sub(first_len, 1) */
356
    /* 0 for 00~7F, 0 for C0~DF, 1 for E0~EF, 2 for F0~FF */
357
    __m128i tmp1;
358
    __m128i tmp2;
359
    /* tmp1 = saturate_sub(first_len, 1) */
360
    tmp1 = _mm_subs_epu8(first_len, _mm_set1_epi8(1));
361
    /* tmp2 = saturate_sub(prev_first_len, 1) */
362
    tmp2 = _mm_subs_epu8(prev_first_len, _mm_set1_epi8(1));
363
    /* range |= (tmp1, tmp2) << 2 bytes */
364
    range = _mm_or_si128(range, _mm_alignr_epi8(tmp1, tmp2, 14));
365
366
    /* Fourth Byte: set range index to saturate_sub(first_len, 2) */
367
    /* 0 for 00~7F, 0 for C0~DF, 0 for E0~EF, 1 for F0~FF */
368
    /* tmp1 = saturate_sub(first_len, 2) */
369
    tmp1 = _mm_subs_epu8(first_len, _mm_set1_epi8(2));
370
    /* tmp2 = saturate_sub(prev_first_len, 2) */
371
    tmp2 = _mm_subs_epu8(prev_first_len, _mm_set1_epi8(2));
372
    /* range |= (tmp1, tmp2) << 3 bytes */
373
    range = _mm_or_si128(range, _mm_alignr_epi8(tmp1, tmp2, 13));
374
375
    /*
376
     * Now we have below range indices calculated
377
     * Correct cases:
378
     * - 8 for C0~FF
379
     * - 3 for 1st byte after F0~FF
380
     * - 2 for 1st byte after E0~EF or 2nd byte after F0~FF
381
     * - 1 for 1st byte after C0~DF or 2nd byte after E0~EF or
382
     *         3rd byte after F0~FF
383
     * - 0 for others
384
     * Error cases:
385
     *   >9 for non ascii First Byte overlapping
386
     *   E.g., F1 80 C2 90 --> 8 3 10 2, where 10 indicates error
387
     */
388
389
    /* Adjust Second Byte range for special First Bytes(E0,ED,F0,F4) */
390
    /* Overlaps lead to index 9~15, which are illegal in range table */
391
    __m128i shift1;
392
    __m128i pos;
393
    __m128i range2;
394
    /* shift1 = (input, prev_input) << 1 byte */
395
    shift1 = _mm_alignr_epi8(input, prev_input, 15);
396
    pos = _mm_sub_epi8(shift1, _mm_set1_epi8(0xEF));
397
    /*
398
     * shift1:  | EF  F0 ... FE | FF  00  ... ...  DE | DF  E0 ... EE |
399
     * pos:     | 0   1      15 | 16  17           239| 240 241    255|
400
     * pos-240: | 0   0      0  | 0   0            0  | 0   1      15 |
401
     * pos+112: | 112 113    127|       >= 128        |     >= 128    |
402
     */
403
    tmp1 = _mm_subs_epu8(pos, _mm_set1_epi8(-16));
404
    range2 = _mm_shuffle_epi8(df_ee_table, tmp1);
405
    tmp2 = _mm_adds_epu8(pos, _mm_set1_epi8(112));
406
    range2 = _mm_add_epi8(range2, _mm_shuffle_epi8(ef_fe_table, tmp2));
407
408
    range = _mm_add_epi8(range, range2);
409
410
    /* Load min and max values per calculated range index */
411
    __m128i min_range = _mm_shuffle_epi8(range_min_table, range);
412
    __m128i max_range = _mm_shuffle_epi8(range_max_table, range);
413
414
    /* Check value range */
415
    if (ReturnPosition) {
416
      error = _mm_cmplt_epi8(input, min_range);
417
      error = _mm_or_si128(error, _mm_cmpgt_epi8(input, max_range));
418
      /* 5% performance drop from this conditional branch */
419
      if (!_mm_testz_si128(error, error)) {
420
        break;
421
      }
422
    } else {
423
      error = _mm_or_si128(error, _mm_cmplt_epi8(input, min_range));
424
      error = _mm_or_si128(error, _mm_cmpgt_epi8(input, max_range));
425
    }
426
427
    prev_input = input;
428
    prev_first_len = first_len;
429
430
    data += 16;
431
  }
432
  /* If we got to the end, we don't need to skip any bytes backwards */
433
  if (ReturnPosition && (data - (end - len)) == 0) {
434
    return ValidUTF8Span<true>(data, end);
435
  }
436
  /* Find previous codepoint (not 80~BF) */
437
  data -= CodepointSkipBackwards(_mm_extract_epi32(prev_input, 3));
438
  if (ReturnPosition) {
439
    return (data - (end - len)) + ValidUTF8Span<true>(data, end);
440
  }
441
  /* Test if there was any error */
442
  if (!_mm_testz_si128(error, error)) {
443
    return 0;
444
  }
445
  /* Check the tail */
446
  return ValidUTF8Span<false>(data, end);
447
#endif
448
11.6M
}
utf8_validity.cc:unsigned long utf8_range::(anonymous namespace)::ValidUTF8<true>(char const*, unsigned long)
Line
Count
Source
169
52.6k
size_t ValidUTF8(const char* data, size_t len) {
170
52.6k
  if (len == 0) return 1 - ReturnPosition;
171
52.6k
  const char* const end = data + len;
172
52.6k
  data = SkipAscii(data, end);
173
  /* SIMD algorithm always outperforms the naive version for any data of
174
     length >=16.
175
   */
176
52.6k
  if (end - data < 16) {
177
49.3k
    return (ReturnPosition ? (data - (end - len)) : 0) +
178
49.3k
           ValidUTF8Span<ReturnPosition>(data, end);
179
49.3k
  }
180
3.29k
#ifndef __SSE4_1__
181
3.29k
  return (ReturnPosition ? (data - (end - len)) : 0) +
182
3.29k
         ValidUTF8Span<ReturnPosition>(data, end);
183
#else
184
  /* This code checks that utf-8 ranges are structurally valid 16 bytes at once
185
   * using superscalar instructions.
186
   * The mapping between ranges of codepoint and their corresponding utf-8
187
   * sequences is below.
188
   */
189
190
  /*
191
   * U+0000...U+007F     00...7F
192
   * U+0080...U+07FF     C2...DF 80...BF
193
   * U+0800...U+0FFF     E0      A0...BF 80...BF
194
   * U+1000...U+CFFF     E1...EC 80...BF 80...BF
195
   * U+D000...U+D7FF     ED      80...9F 80...BF
196
   * U+E000...U+FFFF     EE...EF 80...BF 80...BF
197
   * U+10000...U+3FFFF   F0      90...BF 80...BF 80...BF
198
   * U+40000...U+FFFFF   F1...F3 80...BF 80...BF 80...BF
199
   * U+100000...U+10FFFF F4      80...8F 80...BF 80...BF
200
   */
201
202
  /* First we compute the type for each byte, as given by the table below.
203
   * This type will be used as an index later on.
204
   */
205
206
  /*
207
   * Index  Min Max Byte Type
208
   *  0     00  7F  Single byte sequence
209
   *  1,2,3 80  BF  Second, third and fourth byte for many of the sequences.
210
   *  4     A0  BF  Second byte after E0
211
   *  5     80  9F  Second byte after ED
212
   *  6     90  BF  Second byte after F0
213
   *  7     80  8F  Second byte after F4
214
   *  8     C2  F4  First non ASCII byte
215
   *  9..15 7F  80  Invalid byte
216
   */
217
218
  /* After the first step we compute the index for all bytes, then we permute
219
     the bytes according to their indices to check the ranges from the range
220
     table.
221
   * The range for a given type can be found in the range_min_table and
222
     range_max_table, the range for type/index X is in range_min_table[X] ...
223
     range_max_table[X].
224
   */
225
226
  /* Algorithm:
227
   * Put index zero to all bytes.
228
   * Find all non ASCII characters, give them index 8.
229
   * For each tail byte in a codepoint sequence, give it an index corresponding
230
     to the 1 based index from the end.
231
   * If the first byte of the codepoint is in the [C0...DF] range, we write
232
     index 1 in the following byte.
233
   * If the first byte of the codepoint is in the range [E0...EF], we write
234
     indices 2 and 1 in the next two bytes.
235
   * If the first byte of the codepoint is in the range [F0...FF] we write
236
     indices 3,2,1 into the next three bytes.
237
   * For finding the number of bytes we need to look at high nibbles (4 bits)
238
     and do the lookup from the table, it can be done with shift by 4 + shuffle
239
     instructions. We call it `first_len`.
240
   * Then we shift first_len by 8 bits to get the indices of the 2nd bytes.
241
   * Saturating sub 1 and shift by 8 bits to get the indices of the 3rd bytes.
242
   * Again to get the indices of the 4th bytes.
243
   * Take OR of all that 4 values and check within range.
244
   */
245
  /* For example:
246
   * input       C3 80 68 E2 80 20 A6 F0 A0 80 AC 20 F0 93 80 80
247
   * first_len   1  0  0  2  0  0  0  3  0  0  0  0  3  0  0  0
248
   * 1st byte    8  0  0  8  0  0  0  8  0  0  0  0  8  0  0  0
249
   * 2nd byte    0  1  0  0  2  0  0  0  3  0  0  0  0  3  0  0 // Shift + sub
250
   * 3rd byte    0  0  0  0  0  1  0  0  0  2  0  0  0  0  2  0 // Shift + sub
251
   * 4th byte    0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  1 // Shift + sub
252
   * Index       8  1  0  8  2  1  0  8  3  2  1  0  8  3  2  1 // OR of results
253
   */
254
255
  /* Checking for errors:
256
   * Error checking is done by looking up the high nibble (4 bits) of each byte
257
     against an error checking table.
258
   * Because the lookup value for the second byte depends of the value of the
259
     first byte in codepoint, we use saturated operations to adjust the index.
260
   * Specifically we need to add 2 for E0, 3 for ED, 3 for F0 and 4 for F4 to
261
     match the correct index.
262
       * If we subtract from all bytes EF then EO -> 241, ED -> 254, F0 -> 1,
263
         F4 -> 5
264
       * Do saturating sub 240, then E0 -> 1, ED -> 14 and we can do lookup to
265
         match the adjustment
266
       * Add saturating 112, then F0 -> 113, F4 -> 117, all that were > 16 will
267
         be more 128 and lookup in ef_fe_table will return 0 but for F0
268
         and F4 it will be 4 and 5 accordingly
269
   */
270
  /*
271
   * Then just check the appropriate ranges with greater/smaller equal
272
     instructions. Check tail with a naive algorithm.
273
   * To save from previous 16 byte checks we just align previous_first_len to
274
     get correct continuations of the codepoints.
275
   */
276
277
  /*
278
   * Map high nibble of "First Byte" to legal character length minus 1
279
   * 0x00 ~ 0xBF --> 0
280
   * 0xC0 ~ 0xDF --> 1
281
   * 0xE0 ~ 0xEF --> 2
282
   * 0xF0 ~ 0xFF --> 3
283
   */
284
  const __m128i first_len_table =
285
      _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 3);
286
287
  /* Map "First Byte" to 8-th item of range table (0xC2 ~ 0xF4) */
288
  const __m128i first_range_table =
289
      _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8);
290
291
  /*
292
   * Range table, map range index to min and max values
293
   */
294
  const __m128i range_min_table =
295
      _mm_setr_epi8(0x00, 0x80, 0x80, 0x80, 0xA0, 0x80, 0x90, 0x80, 0xC2, 0x7F,
296
                    0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F);
297
298
  const __m128i range_max_table =
299
      _mm_setr_epi8(0x7F, 0xBF, 0xBF, 0xBF, 0xBF, 0x9F, 0xBF, 0x8F, 0xF4, 0x80,
300
                    0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
301
302
  /*
303
   * Tables for fast handling of four special First Bytes(E0,ED,F0,F4), after
304
   * which the Second Byte are not 80~BF. It contains "range index adjustment".
305
   * +------------+---------------+------------------+----------------+
306
   * | First Byte | original range| range adjustment | adjusted range |
307
   * +------------+---------------+------------------+----------------+
308
   * | E0         | 2             | 2                | 4              |
309
   * +------------+---------------+------------------+----------------+
310
   * | ED         | 2             | 3                | 5              |
311
   * +------------+---------------+------------------+----------------+
312
   * | F0         | 3             | 3                | 6              |
313
   * +------------+---------------+------------------+----------------+
314
   * | F4         | 4             | 4                | 8              |
315
   * +------------+---------------+------------------+----------------+
316
   */
317
318
  /* df_ee_table[1] -> E0, df_ee_table[14] -> ED as ED - E0 = 13 */
319
  // The values represent the adjustment in the Range Index table for a correct
320
  // index.
321
  const __m128i df_ee_table =
322
      _mm_setr_epi8(0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0);
323
324
  /* ef_fe_table[1] -> F0, ef_fe_table[5] -> F4, F4 - F0 = 4 */
325
  // The values represent the adjustment in the Range Index table for a correct
326
  // index.
327
  const __m128i ef_fe_table =
328
      _mm_setr_epi8(0, 3, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
329
330
  __m128i prev_input = _mm_set1_epi8(0);
331
  __m128i prev_first_len = _mm_set1_epi8(0);
332
  __m128i error = _mm_set1_epi8(0);
333
  while (end - data >= 16) {
334
    const __m128i input =
335
        _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));
336
337
    /* high_nibbles = input >> 4 */
338
    const __m128i high_nibbles =
339
        _mm_and_si128(_mm_srli_epi16(input, 4), _mm_set1_epi8(0x0F));
340
341
    /* first_len = legal character length minus 1 */
342
    /* 0 for 00~7F, 1 for C0~DF, 2 for E0~EF, 3 for F0~FF */
343
    /* first_len = first_len_table[high_nibbles] */
344
    __m128i first_len = _mm_shuffle_epi8(first_len_table, high_nibbles);
345
346
    /* First Byte: set range index to 8 for bytes within 0xC0 ~ 0xFF */
347
    /* range = first_range_table[high_nibbles] */
348
    __m128i range = _mm_shuffle_epi8(first_range_table, high_nibbles);
349
350
    /* Second Byte: set range index to first_len */
351
    /* 0 for 00~7F, 1 for C0~DF, 2 for E0~EF, 3 for F0~FF */
352
    /* range |= (first_len, prev_first_len) << 1 byte */
353
    range = _mm_or_si128(range, _mm_alignr_epi8(first_len, prev_first_len, 15));
354
355
    /* Third Byte: set range index to saturate_sub(first_len, 1) */
356
    /* 0 for 00~7F, 0 for C0~DF, 1 for E0~EF, 2 for F0~FF */
357
    __m128i tmp1;
358
    __m128i tmp2;
359
    /* tmp1 = saturate_sub(first_len, 1) */
360
    tmp1 = _mm_subs_epu8(first_len, _mm_set1_epi8(1));
361
    /* tmp2 = saturate_sub(prev_first_len, 1) */
362
    tmp2 = _mm_subs_epu8(prev_first_len, _mm_set1_epi8(1));
363
    /* range |= (tmp1, tmp2) << 2 bytes */
364
    range = _mm_or_si128(range, _mm_alignr_epi8(tmp1, tmp2, 14));
365
366
    /* Fourth Byte: set range index to saturate_sub(first_len, 2) */
367
    /* 0 for 00~7F, 0 for C0~DF, 0 for E0~EF, 1 for F0~FF */
368
    /* tmp1 = saturate_sub(first_len, 2) */
369
    tmp1 = _mm_subs_epu8(first_len, _mm_set1_epi8(2));
370
    /* tmp2 = saturate_sub(prev_first_len, 2) */
371
    tmp2 = _mm_subs_epu8(prev_first_len, _mm_set1_epi8(2));
372
    /* range |= (tmp1, tmp2) << 3 bytes */
373
    range = _mm_or_si128(range, _mm_alignr_epi8(tmp1, tmp2, 13));
374
375
    /*
376
     * Now we have below range indices calculated
377
     * Correct cases:
378
     * - 8 for C0~FF
379
     * - 3 for 1st byte after F0~FF
380
     * - 2 for 1st byte after E0~EF or 2nd byte after F0~FF
381
     * - 1 for 1st byte after C0~DF or 2nd byte after E0~EF or
382
     *         3rd byte after F0~FF
383
     * - 0 for others
384
     * Error cases:
385
     *   >9 for non ascii First Byte overlapping
386
     *   E.g., F1 80 C2 90 --> 8 3 10 2, where 10 indicates error
387
     */
388
389
    /* Adjust Second Byte range for special First Bytes(E0,ED,F0,F4) */
390
    /* Overlaps lead to index 9~15, which are illegal in range table */
391
    __m128i shift1;
392
    __m128i pos;
393
    __m128i range2;
394
    /* shift1 = (input, prev_input) << 1 byte */
395
    shift1 = _mm_alignr_epi8(input, prev_input, 15);
396
    pos = _mm_sub_epi8(shift1, _mm_set1_epi8(0xEF));
397
    /*
398
     * shift1:  | EF  F0 ... FE | FF  00  ... ...  DE | DF  E0 ... EE |
399
     * pos:     | 0   1      15 | 16  17           239| 240 241    255|
400
     * pos-240: | 0   0      0  | 0   0            0  | 0   1      15 |
401
     * pos+112: | 112 113    127|       >= 128        |     >= 128    |
402
     */
403
    tmp1 = _mm_subs_epu8(pos, _mm_set1_epi8(-16));
404
    range2 = _mm_shuffle_epi8(df_ee_table, tmp1);
405
    tmp2 = _mm_adds_epu8(pos, _mm_set1_epi8(112));
406
    range2 = _mm_add_epi8(range2, _mm_shuffle_epi8(ef_fe_table, tmp2));
407
408
    range = _mm_add_epi8(range, range2);
409
410
    /* Load min and max values per calculated range index */
411
    __m128i min_range = _mm_shuffle_epi8(range_min_table, range);
412
    __m128i max_range = _mm_shuffle_epi8(range_max_table, range);
413
414
    /* Check value range */
415
    if (ReturnPosition) {
416
      error = _mm_cmplt_epi8(input, min_range);
417
      error = _mm_or_si128(error, _mm_cmpgt_epi8(input, max_range));
418
      /* 5% performance drop from this conditional branch */
419
      if (!_mm_testz_si128(error, error)) {
420
        break;
421
      }
422
    } else {
423
      error = _mm_or_si128(error, _mm_cmplt_epi8(input, min_range));
424
      error = _mm_or_si128(error, _mm_cmpgt_epi8(input, max_range));
425
    }
426
427
    prev_input = input;
428
    prev_first_len = first_len;
429
430
    data += 16;
431
  }
432
  /* If we got to the end, we don't need to skip any bytes backwards */
433
  if (ReturnPosition && (data - (end - len)) == 0) {
434
    return ValidUTF8Span<true>(data, end);
435
  }
436
  /* Find previous codepoint (not 80~BF) */
437
  data -= CodepointSkipBackwards(_mm_extract_epi32(prev_input, 3));
438
  if (ReturnPosition) {
439
    return (data - (end - len)) + ValidUTF8Span<true>(data, end);
440
  }
441
  /* Test if there was any error */
442
  if (!_mm_testz_si128(error, error)) {
443
    return 0;
444
  }
445
  /* Check the tail */
446
  return ValidUTF8Span<false>(data, end);
447
#endif
448
52.6k
}
449
450
}  // namespace
451
452
12.0M
bool IsStructurallyValid(absl::string_view str) {
453
12.0M
  return ValidUTF8</*ReturnPosition=*/false>(str.data(), str.size());
454
12.0M
}
455
456
52.6k
size_t SpanStructurallyValid(absl::string_view str) {
457
52.6k
  return ValidUTF8</*ReturnPosition=*/true>(str.data(), str.size());
458
52.6k
}
459
460
}  // namespace utf8_range