Coverage Report

Created: 2025-02-02 06:38

/proc/self/cwd/external/utf8_range/utf8_validity.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright 2022 Google LLC
2
//
3
// Use of this source code is governed by an MIT-style
4
// license that can be found in the LICENSE file or at
5
// https://opensource.org/licenses/MIT.
6
7
/* This is a wrapper for the Google range-sse.cc algorithm which checks whether a
8
 * sequence of bytes is a valid UTF-8 sequence and finds the longest valid prefix of
9
 * the UTF-8 sequence.
10
 *
11
 * The key difference is that it checks for as many ASCII symbols as possible
12
 * and then falls back to the range-sse.cc algorithm. The changes to the
13
 * algorithm are cosmetic, mostly to trick the clang compiler to produce optimal
14
 * code.
15
 *
16
 * For API see the utf8_validity.h header.
17
 */
18
#include "utf8_validity.h"
19
20
#include <cstddef>
21
#include <cstdint>
22
23
#include "absl/strings/ascii.h"
24
#include "absl/strings/string_view.h"
25
26
#ifdef __SSE4_1__
27
#include <emmintrin.h>
28
#include <smmintrin.h>
29
#include <tmmintrin.h>
30
#endif
31
32
namespace utf8_range {
33
namespace {
34
35
1.35k
inline uint64_t UNALIGNED_LOAD64(const void* p) {
36
1.35k
  uint64_t t;
37
1.35k
  memcpy(&t, p, sizeof t);
38
1.35k
  return t;
39
1.35k
}
40
41
0
inline bool TrailByteOk(const char c) {
42
0
  return static_cast<int8_t>(c) <= static_cast<int8_t>(0xBF);
43
0
}
44
45
/* If ReturnPosition is false then it returns 1 if |data| is a valid utf8
46
 * sequence, otherwise returns 0.
47
 * If ReturnPosition is set to true, returns the length in bytes of the prefix
48
   of |data| that is all structurally valid UTF-8.
49
 */
50
template <bool ReturnPosition>
51
958
size_t ValidUTF8Span(const char* data, const char* end) {
52
  /* We return err_pos in the loop which is always 0 if !ReturnPosition */
53
958
  size_t err_pos = 0;
54
958
  size_t codepoint_bytes = 0;
55
  /* The early check is done because of early continue's on codepoints of all
56
   * sizes, i.e. we first check for ascii and if it is, we call continue, then
57
   * for 2 byte codepoints, etc. This is done in order to reduce indentation and
58
   * improve readability of the codepoint validity check.
59
   */
60
958
  while (data + codepoint_bytes < end) {
61
0
    if (ReturnPosition) {
62
0
      err_pos += codepoint_bytes;
63
0
    }
64
0
    data += codepoint_bytes;
65
0
    const size_t len = end - data;
66
0
    const unsigned char byte1 = data[0];
67
68
    /* We do not skip many ascii bytes at the same time as this function is
69
       used for tail checking (< 16 bytes) and for non x86 platforms. We also
70
       don't think that cases where non-ASCII codepoints are followed by ascii
71
       happen often. For small strings it also introduces some penalty. For
72
       purely ascii UTF8 strings (which is the overwhelming case) we call
73
       SkipAscii function which is multiplatform and extremely fast.
74
     */
75
    /* [00..7F] ASCII -> 1 byte */
76
0
    if (absl::ascii_isascii(byte1)) {
77
0
      codepoint_bytes = 1;
78
0
      continue;
79
0
    }
80
    /* [C2..DF], [80..BF] -> 2 bytes */
81
0
    if (len >= 2 && byte1 >= 0xC2 && byte1 <= 0xDF && TrailByteOk(data[1])) {
82
0
      codepoint_bytes = 2;
83
0
      continue;
84
0
    }
85
0
    if (len >= 3) {
86
0
      const unsigned char byte2 = data[1];
87
0
      const unsigned char byte3 = data[2];
88
89
      /* Is byte2, byte3 between [0x80, 0xBF]
90
       * Check for 0x80 was done above.
91
       */
92
0
      if (!TrailByteOk(byte2) || !TrailByteOk(byte3)) {
93
0
        return err_pos;
94
0
      }
95
96
0
      if (/* E0, A0..BF, 80..BF */
97
0
          ((byte1 == 0xE0 && byte2 >= 0xA0) ||
98
           /* E1..EC, 80..BF, 80..BF */
99
0
           (byte1 >= 0xE1 && byte1 <= 0xEC) ||
100
           /* ED, 80..9F, 80..BF */
101
0
           (byte1 == 0xED && byte2 <= 0x9F) ||
102
           /* EE..EF, 80..BF, 80..BF */
103
0
           (byte1 >= 0xEE && byte1 <= 0xEF))) {
104
0
        codepoint_bytes = 3;
105
0
        continue;
106
0
      }
107
0
      if (len >= 4) {
108
0
        const unsigned char byte4 = data[3];
109
        /* Is byte4 between 0x80 ~ 0xBF */
110
0
        if (!TrailByteOk(byte4)) {
111
0
          return err_pos;
112
0
        }
113
114
0
        if (/* F0, 90..BF, 80..BF, 80..BF */
115
0
            ((byte1 == 0xF0 && byte2 >= 0x90) ||
116
             /* F1..F3, 80..BF, 80..BF, 80..BF */
117
0
             (byte1 >= 0xF1 && byte1 <= 0xF3) ||
118
             /* F4, 80..8F, 80..BF, 80..BF */
119
0
             (byte1 == 0xF4 && byte2 <= 0x8F))) {
120
0
          codepoint_bytes = 4;
121
0
          continue;
122
0
        }
123
0
      }
124
0
    }
125
0
    return err_pos;
126
0
  }
127
958
  if (ReturnPosition) {
128
0
    err_pos += codepoint_bytes;
129
0
  }
130
  /* if ReturnPosition is false, this returns 1.
131
   * if ReturnPosition is true, this returns err_pos.
132
   */
133
958
  return err_pos + (1 - ReturnPosition);
134
958
}
utf8_validity.cc:unsigned long utf8_range::(anonymous namespace)::ValidUTF8Span<false>(char const*, char const*)
Line
Count
Source
51
958
size_t ValidUTF8Span(const char* data, const char* end) {
52
  /* We return err_pos in the loop which is always 0 if !ReturnPosition */
53
958
  size_t err_pos = 0;
54
958
  size_t codepoint_bytes = 0;
55
  /* The early check is done because of early continue's on codepoints of all
56
   * sizes, i.e. we first check for ascii and if it is, we call continue, then
57
   * for 2 byte codepoints, etc. This is done in order to reduce indentation and
58
   * improve readability of the codepoint validity check.
59
   */
60
958
  while (data + codepoint_bytes < end) {
61
0
    if (ReturnPosition) {
62
0
      err_pos += codepoint_bytes;
63
0
    }
64
0
    data += codepoint_bytes;
65
0
    const size_t len = end - data;
66
0
    const unsigned char byte1 = data[0];
67
68
    /* We do not skip many ascii bytes at the same time as this function is
69
       used for tail checking (< 16 bytes) and for non x86 platforms. We also
70
       don't think that cases where non-ASCII codepoints are followed by ascii
71
       happen often. For small strings it also introduces some penalty. For
72
       purely ascii UTF8 strings (which is the overwhelming case) we call
73
       SkipAscii function which is multiplatform and extremely fast.
74
     */
75
    /* [00..7F] ASCII -> 1 byte */
76
0
    if (absl::ascii_isascii(byte1)) {
77
0
      codepoint_bytes = 1;
78
0
      continue;
79
0
    }
80
    /* [C2..DF], [80..BF] -> 2 bytes */
81
0
    if (len >= 2 && byte1 >= 0xC2 && byte1 <= 0xDF && TrailByteOk(data[1])) {
82
0
      codepoint_bytes = 2;
83
0
      continue;
84
0
    }
85
0
    if (len >= 3) {
86
0
      const unsigned char byte2 = data[1];
87
0
      const unsigned char byte3 = data[2];
88
89
      /* Is byte2, byte3 between [0x80, 0xBF]
90
       * Check for 0x80 was done above.
91
       */
92
0
      if (!TrailByteOk(byte2) || !TrailByteOk(byte3)) {
93
0
        return err_pos;
94
0
      }
95
96
0
      if (/* E0, A0..BF, 80..BF */
97
0
          ((byte1 == 0xE0 && byte2 >= 0xA0) ||
98
           /* E1..EC, 80..BF, 80..BF */
99
0
           (byte1 >= 0xE1 && byte1 <= 0xEC) ||
100
           /* ED, 80..9F, 80..BF */
101
0
           (byte1 == 0xED && byte2 <= 0x9F) ||
102
           /* EE..EF, 80..BF, 80..BF */
103
0
           (byte1 >= 0xEE && byte1 <= 0xEF))) {
104
0
        codepoint_bytes = 3;
105
0
        continue;
106
0
      }
107
0
      if (len >= 4) {
108
0
        const unsigned char byte4 = data[3];
109
        /* Is byte4 between 0x80 ~ 0xBF */
110
0
        if (!TrailByteOk(byte4)) {
111
0
          return err_pos;
112
0
        }
113
114
0
        if (/* F0, 90..BF, 80..BF, 80..BF */
115
0
            ((byte1 == 0xF0 && byte2 >= 0x90) ||
116
             /* F1..F3, 80..BF, 80..BF, 80..BF */
117
0
             (byte1 >= 0xF1 && byte1 <= 0xF3) ||
118
             /* F4, 80..8F, 80..BF, 80..BF */
119
0
             (byte1 == 0xF4 && byte2 <= 0x8F))) {
120
0
          codepoint_bytes = 4;
121
0
          continue;
122
0
        }
123
0
      }
124
0
    }
125
0
    return err_pos;
126
0
  }
127
958
  if (ReturnPosition) {
128
0
    err_pos += codepoint_bytes;
129
0
  }
130
  /* if ReturnPosition is false, this returns 1.
131
   * if ReturnPosition is true, this returns err_pos.
132
   */
133
958
  return err_pos + (1 - ReturnPosition);
134
958
}
Unexecuted instantiation: utf8_validity.cc:unsigned long utf8_range::(anonymous namespace)::ValidUTF8Span<true>(char const*, char const*)
135
136
/* Returns the number of bytes needed to skip backwards to get to the first
137
   byte of codepoint.
138
 */
139
0
inline int CodepointSkipBackwards(int32_t codepoint_word) {
140
0
  const int8_t* const codepoint =
141
0
      reinterpret_cast<const int8_t*>(&codepoint_word);
142
0
  if (!TrailByteOk(codepoint[3])) {
143
0
    return 1;
144
0
  } else if (!TrailByteOk(codepoint[2])) {
145
0
    return 2;
146
0
  } else if (!TrailByteOk(codepoint[1])) {
147
0
    return 3;
148
0
  }
149
0
  return 0;
150
0
}
151
152
/* Skipping over ASCII as much as possible, per 8 bytes. It is intentional
153
   as most strings to check for validity consist only of 1 byte codepoints.
154
 */
155
958
inline const char* SkipAscii(const char* data, const char* end) {
156
2.31k
  while (8 <= end - data &&
157
2.31k
         (UNALIGNED_LOAD64(data) & 0x8080808080808080) == 0) {
158
1.35k
    data += 8;
159
1.35k
  }
160
4.64k
  while (data < end && absl::ascii_isascii(*data)) {
161
3.69k
    ++data;
162
3.69k
  }
163
958
  return data;
164
958
}
165
166
template <bool ReturnPosition>
167
958
size_t ValidUTF8(const char* data, size_t len) {
168
958
  if (len == 0) return 1 - ReturnPosition;
169
958
  const char* const end = data + len;
170
958
  data = SkipAscii(data, end);
171
  /* SIMD algorithm always outperforms the naive version for any data of
172
     length >=16.
173
   */
174
958
  if (end - data < 16) {
175
958
    return (ReturnPosition ? (data - (end - len)) : 0) +
176
958
           ValidUTF8Span<ReturnPosition>(data, end);
177
958
  }
178
0
#ifndef __SSE4_1__
179
0
  return (ReturnPosition ? (data - (end - len)) : 0) +
180
0
         ValidUTF8Span<ReturnPosition>(data, end);
181
#else
182
  /* This code checks that utf-8 ranges are structurally valid 16 bytes at once
183
   * using superscalar instructions.
184
   * The mapping between ranges of codepoint and their corresponding utf-8
185
   * sequences is below.
186
   */
187
188
  /*
189
   * U+0000...U+007F     00...7F
190
   * U+0080...U+07FF     C2...DF 80...BF
191
   * U+0800...U+0FFF     E0      A0...BF 80...BF
192
   * U+1000...U+CFFF     E1...EC 80...BF 80...BF
193
   * U+D000...U+D7FF     ED      80...9F 80...BF
194
   * U+E000...U+FFFF     EE...EF 80...BF 80...BF
195
   * U+10000...U+3FFFF   F0      90...BF 80...BF 80...BF
196
   * U+40000...U+FFFFF   F1...F3 80...BF 80...BF 80...BF
197
   * U+100000...U+10FFFF F4      80...8F 80...BF 80...BF
198
   */
199
200
  /* First we compute the type for each byte, as given by the table below.
201
   * This type will be used as an index later on.
202
   */
203
204
  /*
205
   * Index  Min Max Byte Type
206
   *  0     00  7F  Single byte sequence
207
   *  1,2,3 80  BF  Second, third and fourth byte for many of the sequences.
208
   *  4     A0  BF  Second byte after E0
209
   *  5     80  9F  Second byte after ED
210
   *  6     90  BF  Second byte after F0
211
   *  7     80  8F  Second byte after F4
212
   *  8     C2  F4  First non ASCII byte
213
   *  9..15 7F  80  Invalid byte
214
   */
215
216
  /* After the first step we compute the index for all bytes, then we permute
217
     the bytes according to their indices to check the ranges from the range
218
     table.
219
   * The range for a given type can be found in the range_min_table and
220
     range_max_table, the range for type/index X is in range_min_table[X] ...
221
     range_max_table[X].
222
   */
223
224
  /* Algorithm:
225
   * Put index zero to all bytes.
226
   * Find all non ASCII characters, give them index 8.
227
   * For each tail byte in a codepoint sequence, give it an index corresponding
228
     to the 1 based index from the end.
229
   * If the first byte of the codepoint is in the [C0...DF] range, we write
230
     index 1 in the following byte.
231
   * If the first byte of the codepoint is in the range [E0...EF], we write
232
     indices 2 and 1 in the next two bytes.
233
   * If the first byte of the codepoint is in the range [F0...FF] we write
234
     indices 3,2,1 into the next three bytes.
235
   * For finding the number of bytes we need to look at high nibbles (4 bits)
236
     and do the lookup from the table, it can be done with shift by 4 + shuffle
237
     instructions. We call it `first_len`.
238
   * Then we shift first_len by 8 bits to get the indices of the 2nd bytes.
239
   * Saturating sub 1 and shift by 8 bits to get the indices of the 3rd bytes.
240
   * Again to get the indices of the 4th bytes.
241
   * Take OR of all that 4 values and check within range.
242
   */
243
  /* For example:
244
   * input       C3 80 68 E2 80 20 A6 F0 A0 80 AC 20 F0 93 80 80
245
   * first_len   1  0  0  2  0  0  0  3  0  0  0  0  3  0  0  0
246
   * 1st byte    8  0  0  8  0  0  0  8  0  0  0  0  8  0  0  0
247
   * 2nd byte    0  1  0  0  2  0  0  0  3  0  0  0  0  3  0  0 // Shift + sub
248
   * 3rd byte    0  0  0  0  0  1  0  0  0  2  0  0  0  0  2  0 // Shift + sub
249
   * 4th byte    0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  1 // Shift + sub
250
   * Index       8  1  0  8  2  1  0  8  3  2  1  0  8  3  2  1 // OR of results
251
   */
252
253
  /* Checking for errors:
254
   * Error checking is done by looking up the high nibble (4 bits) of each byte
255
     against an error checking table.
256
   * Because the lookup value for the second byte depends on the value of the
257
     first byte in codepoint, we use saturated operations to adjust the index.
258
   * Specifically we need to add 2 for E0, 3 for ED, 3 for F0 and 4 for F4 to
259
     match the correct index.
260
       * If we subtract from all bytes EF then E0 -> 241, ED -> 254, F0 -> 1,
261
         F4 -> 5
262
       * Do saturating sub 240, then E0 -> 1, ED -> 14 and we can do lookup to
263
         match the adjustment
264
       * Add saturating 112, then F0 -> 113, F4 -> 117, all that were >= 16 will
265
         be >= 128 and lookup in ef_fe_table will return 0 but for F0
266
         and F4 it will be 3 and 4 accordingly (table entries 1 and 5)
267
   */
268
  /*
269
   * Then just check the appropriate ranges with greater/smaller equal
270
     instructions. Check tail with a naive algorithm.
271
   * To save from previous 16 byte checks we just align previous_first_len to
272
     get correct continuations of the codepoints.
273
   */
274
275
  /*
276
   * Map high nibble of "First Byte" to legal character length minus 1
277
   * 0x00 ~ 0xBF --> 0
278
   * 0xC0 ~ 0xDF --> 1
279
   * 0xE0 ~ 0xEF --> 2
280
   * 0xF0 ~ 0xFF --> 3
281
   */
282
  const __m128i first_len_table =
283
      _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 3);
284
285
  /* Map "First Byte" to 8-th item of range table (0xC2 ~ 0xF4) */
286
  const __m128i first_range_table =
287
      _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8);
288
289
  /*
290
   * Range table, map range index to min and max values
291
   */
292
  const __m128i range_min_table =
293
      _mm_setr_epi8(0x00, 0x80, 0x80, 0x80, 0xA0, 0x80, 0x90, 0x80, 0xC2, 0x7F,
294
                    0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F);
295
296
  const __m128i range_max_table =
297
      _mm_setr_epi8(0x7F, 0xBF, 0xBF, 0xBF, 0xBF, 0x9F, 0xBF, 0x8F, 0xF4, 0x80,
298
                    0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
299
300
  /*
301
   * Tables for fast handling of four special First Bytes(E0,ED,F0,F4), after
302
   * which the Second Byte are not 80~BF. It contains "range index adjustment".
303
   * +------------+---------------+------------------+----------------+
304
   * | First Byte | original range| range adjustment | adjusted range |
305
   * +------------+---------------+------------------+----------------+
306
   * | E0         | 2             | 2                | 4              |
307
   * +------------+---------------+------------------+----------------+
308
   * | ED         | 2             | 3                | 5              |
309
   * +------------+---------------+------------------+----------------+
310
   * | F0         | 3             | 3                | 6              |
311
   * +------------+---------------+------------------+----------------+
312
   * | F4         | 3             | 4                | 7              |
313
   * +------------+---------------+------------------+----------------+
314
   */
315
316
  /* df_ee_table[1] -> E0, df_ee_table[14] -> ED as ED - E0 = 13 */
317
  // The values represent the adjustment in the Range Index table for a correct
318
  // index.
319
  const __m128i df_ee_table =
320
      _mm_setr_epi8(0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0);
321
322
  /* ef_fe_table[1] -> F0, ef_fe_table[5] -> F4, F4 - F0 = 4 */
323
  // The values represent the adjustment in the Range Index table for a correct
324
  // index.
325
  const __m128i ef_fe_table =
326
      _mm_setr_epi8(0, 3, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
327
328
  __m128i prev_input = _mm_set1_epi8(0);
329
  __m128i prev_first_len = _mm_set1_epi8(0);
330
  __m128i error = _mm_set1_epi8(0);
331
  while (end - data >= 16) {
332
    const __m128i input =
333
        _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));
334
335
    /* high_nibbles = input >> 4 */
336
    const __m128i high_nibbles =
337
        _mm_and_si128(_mm_srli_epi16(input, 4), _mm_set1_epi8(0x0F));
338
339
    /* first_len = legal character length minus 1 */
340
    /* 0 for 00~7F, 1 for C0~DF, 2 for E0~EF, 3 for F0~FF */
341
    /* first_len = first_len_table[high_nibbles] */
342
    __m128i first_len = _mm_shuffle_epi8(first_len_table, high_nibbles);
343
344
    /* First Byte: set range index to 8 for bytes within 0xC0 ~ 0xFF */
345
    /* range = first_range_table[high_nibbles] */
346
    __m128i range = _mm_shuffle_epi8(first_range_table, high_nibbles);
347
348
    /* Second Byte: set range index to first_len */
349
    /* 0 for 00~7F, 1 for C0~DF, 2 for E0~EF, 3 for F0~FF */
350
    /* range |= (first_len, prev_first_len) << 1 byte */
351
    range = _mm_or_si128(range, _mm_alignr_epi8(first_len, prev_first_len, 15));
352
353
    /* Third Byte: set range index to saturate_sub(first_len, 1) */
354
    /* 0 for 00~7F, 0 for C0~DF, 1 for E0~EF, 2 for F0~FF */
355
    __m128i tmp1;
356
    __m128i tmp2;
357
    /* tmp1 = saturate_sub(first_len, 1) */
358
    tmp1 = _mm_subs_epu8(first_len, _mm_set1_epi8(1));
359
    /* tmp2 = saturate_sub(prev_first_len, 1) */
360
    tmp2 = _mm_subs_epu8(prev_first_len, _mm_set1_epi8(1));
361
    /* range |= (tmp1, tmp2) << 2 bytes */
362
    range = _mm_or_si128(range, _mm_alignr_epi8(tmp1, tmp2, 14));
363
364
    /* Fourth Byte: set range index to saturate_sub(first_len, 2) */
365
    /* 0 for 00~7F, 0 for C0~DF, 0 for E0~EF, 1 for F0~FF */
366
    /* tmp1 = saturate_sub(first_len, 2) */
367
    tmp1 = _mm_subs_epu8(first_len, _mm_set1_epi8(2));
368
    /* tmp2 = saturate_sub(prev_first_len, 2) */
369
    tmp2 = _mm_subs_epu8(prev_first_len, _mm_set1_epi8(2));
370
    /* range |= (tmp1, tmp2) << 3 bytes */
371
    range = _mm_or_si128(range, _mm_alignr_epi8(tmp1, tmp2, 13));
372
373
    /*
374
     * Now we have below range indices calculated
375
     * Correct cases:
376
     * - 8 for C0~FF
377
     * - 3 for 1st byte after F0~FF
378
     * - 2 for 1st byte after E0~EF or 2nd byte after F0~FF
379
     * - 1 for 1st byte after C0~DF or 2nd byte after E0~EF or
380
     *         3rd byte after F0~FF
381
     * - 0 for others
382
     * Error cases:
383
     *   9~15 for non ascii First Byte overlapping
384
     *   E.g., F1 80 C2 90 --> 8 3 10 2, where 10 indicates error
385
     */
386
387
    /* Adjust Second Byte range for special First Bytes(E0,ED,F0,F4) */
388
    /* Overlaps lead to index 9~15, which are illegal in range table */
389
    __m128i shift1;
390
    __m128i pos;
391
    __m128i range2;
392
    /* shift1 = (input, prev_input) << 1 byte */
393
    shift1 = _mm_alignr_epi8(input, prev_input, 15);
394
    pos = _mm_sub_epi8(shift1, _mm_set1_epi8(0xEF));
395
    /*
396
     * shift1:  | EF  F0 ... FE | FF  00  ... ...  DE | DF  E0 ... EE |
397
     * pos:     | 0   1      15 | 16  17           239| 240 241    255|
398
     * pos-240: | 0   0      0  | 0   0            0  | 0   1      15 |
399
     * pos+112: | 112 113    127|       >= 128        |     >= 128    |
400
     */
401
    tmp1 = _mm_subs_epu8(pos, _mm_set1_epi8(-16));
402
    range2 = _mm_shuffle_epi8(df_ee_table, tmp1);
403
    tmp2 = _mm_adds_epu8(pos, _mm_set1_epi8(112));
404
    range2 = _mm_add_epi8(range2, _mm_shuffle_epi8(ef_fe_table, tmp2));
405
406
    range = _mm_add_epi8(range, range2);
407
408
    /* Load min and max values per calculated range index */
409
    __m128i min_range = _mm_shuffle_epi8(range_min_table, range);
410
    __m128i max_range = _mm_shuffle_epi8(range_max_table, range);
411
412
    /* Check value range */
413
    if (ReturnPosition) {
414
      error = _mm_cmplt_epi8(input, min_range);
415
      error = _mm_or_si128(error, _mm_cmpgt_epi8(input, max_range));
416
      /* 5% performance drop from this conditional branch */
417
      if (!_mm_testz_si128(error, error)) {
418
        break;
419
      }
420
    } else {
421
      error = _mm_or_si128(error, _mm_cmplt_epi8(input, min_range));
422
      error = _mm_or_si128(error, _mm_cmpgt_epi8(input, max_range));
423
    }
424
425
    prev_input = input;
426
    prev_first_len = first_len;
427
428
    data += 16;
429
  }
430
  /* If no bytes were consumed, we don't need to skip any bytes backwards */
431
  if (ReturnPosition && (data - (end - len)) == 0) {
432
    return ValidUTF8Span<true>(data, end);
433
  }
434
  /* Find previous codepoint (not 80~BF) */
435
  data -= CodepointSkipBackwards(_mm_extract_epi32(prev_input, 3));
436
  if (ReturnPosition) {
437
    return (data - (end - len)) + ValidUTF8Span<true>(data, end);
438
  }
439
  /* Test if there was any error */
440
  if (!_mm_testz_si128(error, error)) {
441
    return 0;
442
  }
443
  /* Check the tail */
444
  return ValidUTF8Span<false>(data, end);
445
#endif
446
958
}
utf8_validity.cc:unsigned long utf8_range::(anonymous namespace)::ValidUTF8<false>(char const*, unsigned long)
Line
Count
Source
167
958
size_t ValidUTF8(const char* data, size_t len) {
168
958
  if (len == 0) return 1 - ReturnPosition;
169
958
  const char* const end = data + len;
170
958
  data = SkipAscii(data, end);
171
  /* SIMD algorithm always outperforms the naive version for any data of
172
     length >=16.
173
   */
174
958
  if (end - data < 16) {
175
958
    return (ReturnPosition ? (data - (end - len)) : 0) +
176
958
           ValidUTF8Span<ReturnPosition>(data, end);
177
958
  }
178
0
#ifndef __SSE4_1__
179
0
  return (ReturnPosition ? (data - (end - len)) : 0) +
180
0
         ValidUTF8Span<ReturnPosition>(data, end);
181
#else
182
  /* This code checks that utf-8 ranges are structurally valid 16 bytes at once
183
   * using superscalar instructions.
184
   * The mapping between ranges of codepoint and their corresponding utf-8
185
   * sequences is below.
186
   */
187
188
  /*
189
   * U+0000...U+007F     00...7F
190
   * U+0080...U+07FF     C2...DF 80...BF
191
   * U+0800...U+0FFF     E0      A0...BF 80...BF
192
   * U+1000...U+CFFF     E1...EC 80...BF 80...BF
193
   * U+D000...U+D7FF     ED      80...9F 80...BF
194
   * U+E000...U+FFFF     EE...EF 80...BF 80...BF
195
   * U+10000...U+3FFFF   F0      90...BF 80...BF 80...BF
196
   * U+40000...U+FFFFF   F1...F3 80...BF 80...BF 80...BF
197
   * U+100000...U+10FFFF F4      80...8F 80...BF 80...BF
198
   */
199
200
  /* First we compute the type for each byte, as given by the table below.
201
   * This type will be used as an index later on.
202
   */
203
204
  /*
205
   * Index  Min Max Byte Type
206
   *  0     00  7F  Single byte sequence
207
   *  1,2,3 80  BF  Second, third and fourth byte for many of the sequences.
208
   *  4     A0  BF  Second byte after E0
209
   *  5     80  9F  Second byte after ED
210
   *  6     90  BF  Second byte after F0
211
   *  7     80  8F  Second byte after F4
212
   *  8     C2  F4  First non ASCII byte
213
   *  9..15 7F  80  Invalid byte
214
   */
215
216
  /* After the first step we compute the index for all bytes, then we permute
217
     the bytes according to their indices to check the ranges from the range
218
     table.
219
   * The range for a given type can be found in the range_min_table and
220
     range_max_table, the range for type/index X is in range_min_table[X] ...
221
     range_max_table[X].
222
   */
223
224
  /* Algorithm:
225
   * Put index zero to all bytes.
226
   * Find all non ASCII characters, give them index 8.
227
   * For each tail byte in a codepoint sequence, give it an index corresponding
228
     to the 1 based index from the end.
229
   * If the first byte of the codepoint is in the [C0...DF] range, we write
230
     index 1 in the following byte.
231
   * If the first byte of the codepoint is in the range [E0...EF], we write
232
     indices 2 and 1 in the next two bytes.
233
   * If the first byte of the codepoint is in the range [F0...FF] we write
234
     indices 3,2,1 into the next three bytes.
235
   * For finding the number of bytes we need to look at high nibbles (4 bits)
236
     and do the lookup from the table, it can be done with shift by 4 + shuffle
237
     instructions. We call it `first_len`.
238
   * Then we shift first_len by 8 bits to get the indices of the 2nd bytes.
239
   * Saturating sub 1 and shift by 8 bits to get the indices of the 3rd bytes.
240
   * Again to get the indices of the 4th bytes.
241
   * Take OR of all that 4 values and check within range.
242
   */
243
  /* For example:
244
   * input       C3 80 68 E2 80 20 A6 F0 A0 80 AC 20 F0 93 80 80
245
   * first_len   1  0  0  2  0  0  0  3  0  0  0  0  3  0  0  0
246
   * 1st byte    8  0  0  8  0  0  0  8  0  0  0  0  8  0  0  0
247
   * 2nd byte    0  1  0  0  2  0  0  0  3  0  0  0  0  3  0  0 // Shift + sub
248
   * 3rd byte    0  0  0  0  0  1  0  0  0  2  0  0  0  0  2  0 // Shift + sub
249
   * 4th byte    0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  1 // Shift + sub
250
   * Index       8  1  0  8  2  1  0  8  3  2  1  0  8  3  2  1 // OR of results
251
   */
252
253
  /* Checking for errors:
254
   * Error checking is done by looking up the high nibble (4 bits) of each byte
255
     against an error checking table.
256
   * Because the lookup value for the second byte depends on the value of the
257
     first byte in codepoint, we use saturated operations to adjust the index.
258
   * Specifically we need to add 2 for E0, 3 for ED, 3 for F0 and 4 for F4 to
259
     match the correct index.
260
       * If we subtract from all bytes EF then E0 -> 241, ED -> 254, F0 -> 1,
261
         F4 -> 5
262
       * Do saturating sub 240, then E0 -> 1, ED -> 14 and we can do lookup to
263
         match the adjustment
264
       * Add saturating 112, then F0 -> 113, F4 -> 117, all that were >= 16 will
265
         be >= 128 and lookup in ef_fe_table will return 0 but for F0
266
         and F4 it will be 3 and 4 accordingly (table entries 1 and 5)
267
   */
268
  /*
269
   * Then just check the appropriate ranges with greater/smaller equal
270
     instructions. Check tail with a naive algorithm.
271
   * To save from previous 16 byte checks we just align previous_first_len to
272
     get correct continuations of the codepoints.
273
   */
274
275
  /*
276
   * Map high nibble of "First Byte" to legal character length minus 1
277
   * 0x00 ~ 0xBF --> 0
278
   * 0xC0 ~ 0xDF --> 1
279
   * 0xE0 ~ 0xEF --> 2
280
   * 0xF0 ~ 0xFF --> 3
281
   */
282
  const __m128i first_len_table =
283
      _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 3);
284
285
  /* Map "First Byte" to 8-th item of range table (0xC2 ~ 0xF4) */
286
  const __m128i first_range_table =
287
      _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8);
288
289
  /*
290
   * Range table, map range index to min and max values
291
   */
292
  const __m128i range_min_table =
293
      _mm_setr_epi8(0x00, 0x80, 0x80, 0x80, 0xA0, 0x80, 0x90, 0x80, 0xC2, 0x7F,
294
                    0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F);
295
296
  const __m128i range_max_table =
297
      _mm_setr_epi8(0x7F, 0xBF, 0xBF, 0xBF, 0xBF, 0x9F, 0xBF, 0x8F, 0xF4, 0x80,
298
                    0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
299
300
  /*
301
   * Tables for fast handling of four special First Bytes(E0,ED,F0,F4), after
302
   * which the Second Byte are not 80~BF. It contains "range index adjustment".
303
   * +------------+---------------+------------------+----------------+
304
   * | First Byte | original range| range adjustment | adjusted range |
305
   * +------------+---------------+------------------+----------------+
306
   * | E0         | 2             | 2                | 4              |
307
   * +------------+---------------+------------------+----------------+
308
   * | ED         | 2             | 3                | 5              |
309
   * +------------+---------------+------------------+----------------+
310
   * | F0         | 3             | 3                | 6              |
311
   * +------------+---------------+------------------+----------------+
312
   * | F4         | 3             | 4                | 7              |
313
   * +------------+---------------+------------------+----------------+
314
   */
315
316
  /* df_ee_table[1] -> E0, df_ee_table[14] -> ED as ED - E0 = 13 */
317
  // The values represent the adjustment in the Range Index table for a correct
318
  // index.
319
  const __m128i df_ee_table =
320
      _mm_setr_epi8(0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0);
321
322
  /* ef_fe_table[1] -> F0, ef_fe_table[5] -> F4, F4 - F0 = 4 */
323
  // The values represent the adjustment in the Range Index table for a correct
324
  // index.
325
  const __m128i ef_fe_table =
326
      _mm_setr_epi8(0, 3, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
327
328
  __m128i prev_input = _mm_set1_epi8(0);
329
  __m128i prev_first_len = _mm_set1_epi8(0);
330
  __m128i error = _mm_set1_epi8(0);
331
  while (end - data >= 16) {
332
    const __m128i input =
333
        _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));
334
335
    /* high_nibbles = input >> 4 */
336
    const __m128i high_nibbles =
337
        _mm_and_si128(_mm_srli_epi16(input, 4), _mm_set1_epi8(0x0F));
338
339
    /* first_len = legal character length minus 1 */
340
    /* 0 for 00~7F, 1 for C0~DF, 2 for E0~EF, 3 for F0~FF */
341
    /* first_len = first_len_table[high_nibbles] */
342
    __m128i first_len = _mm_shuffle_epi8(first_len_table, high_nibbles);
343
344
    /* First Byte: set range index to 8 for bytes within 0xC0 ~ 0xFF */
345
    /* range = first_range_table[high_nibbles] */
346
    __m128i range = _mm_shuffle_epi8(first_range_table, high_nibbles);
347
348
    /* Second Byte: set range index to first_len */
349
    /* 0 for 00~7F, 1 for C0~DF, 2 for E0~EF, 3 for F0~FF */
350
    /* range |= (first_len, prev_first_len) << 1 byte */
351
    range = _mm_or_si128(range, _mm_alignr_epi8(first_len, prev_first_len, 15));
352
353
    /* Third Byte: set range index to saturate_sub(first_len, 1) */
354
    /* 0 for 00~7F, 0 for C0~DF, 1 for E0~EF, 2 for F0~FF */
355
    __m128i tmp1;
356
    __m128i tmp2;
357
    /* tmp1 = saturate_sub(first_len, 1) */
358
    tmp1 = _mm_subs_epu8(first_len, _mm_set1_epi8(1));
359
    /* tmp2 = saturate_sub(prev_first_len, 1) */
360
    tmp2 = _mm_subs_epu8(prev_first_len, _mm_set1_epi8(1));
361
    /* range |= (tmp1, tmp2) << 2 bytes */
362
    range = _mm_or_si128(range, _mm_alignr_epi8(tmp1, tmp2, 14));
363
364
    /* Fourth Byte: set range index to saturate_sub(first_len, 2) */
365
    /* 0 for 00~7F, 0 for C0~DF, 0 for E0~EF, 1 for F0~FF */
366
    /* tmp1 = saturate_sub(first_len, 2) */
367
    tmp1 = _mm_subs_epu8(first_len, _mm_set1_epi8(2));
368
    /* tmp2 = saturate_sub(prev_first_len, 2) */
369
    tmp2 = _mm_subs_epu8(prev_first_len, _mm_set1_epi8(2));
370
    /* range |= (tmp1, tmp2) << 3 bytes */
371
    range = _mm_or_si128(range, _mm_alignr_epi8(tmp1, tmp2, 13));
372
373
    /*
374
     * Now we have below range indices calculated
375
     * Correct cases:
376
     * - 8 for C0~FF
377
     * - 3 for 1st byte after F0~FF
378
     * - 2 for 1st byte after E0~EF or 2nd byte after F0~FF
379
     * - 1 for 1st byte after C0~DF or 2nd byte after E0~EF or
380
     *         3rd byte after F0~FF
381
     * - 0 for others
382
     * Error cases:
383
     *   9~15 for non ascii First Byte overlapping
384
     *   E.g., F1 80 C2 90 --> 8 3 10 2, where 10 indicates error
385
     */
386
387
    /* Adjust Second Byte range for special First Bytes(E0,ED,F0,F4) */
388
    /* Overlaps lead to index 9~15, which are illegal in range table */
389
    __m128i shift1;
390
    __m128i pos;
391
    __m128i range2;
392
    /* shift1 = (input, prev_input) << 1 byte */
393
    shift1 = _mm_alignr_epi8(input, prev_input, 15);
394
    pos = _mm_sub_epi8(shift1, _mm_set1_epi8(0xEF));
395
    /*
396
     * shift1:  | EF  F0 ... FE | FF  00  ... ...  DE | DF  E0 ... EE |
397
     * pos:     | 0   1      15 | 16  17           239| 240 241    255|
398
     * pos-240: | 0   0      0  | 0   0            0  | 0   1      15 |
399
     * pos+112: | 112 113    127|       >= 128        |     >= 128    |
400
     */
401
    tmp1 = _mm_subs_epu8(pos, _mm_set1_epi8(-16));
402
    range2 = _mm_shuffle_epi8(df_ee_table, tmp1);
403
    tmp2 = _mm_adds_epu8(pos, _mm_set1_epi8(112));
404
    range2 = _mm_add_epi8(range2, _mm_shuffle_epi8(ef_fe_table, tmp2));
405
406
    range = _mm_add_epi8(range, range2);
407
408
    /* Load min and max values per calculated range index */
409
    __m128i min_range = _mm_shuffle_epi8(range_min_table, range);
410
    __m128i max_range = _mm_shuffle_epi8(range_max_table, range);
411
412
    /* Check value range */
413
    if (ReturnPosition) {
414
      error = _mm_cmplt_epi8(input, min_range);
415
      error = _mm_or_si128(error, _mm_cmpgt_epi8(input, max_range));
416
      /* 5% performance drop from this conditional branch */
417
      if (!_mm_testz_si128(error, error)) {
418
        break;
419
      }
420
    } else {
421
      error = _mm_or_si128(error, _mm_cmplt_epi8(input, min_range));
422
      error = _mm_or_si128(error, _mm_cmpgt_epi8(input, max_range));
423
    }
424
425
    prev_input = input;
426
    prev_first_len = first_len;
427
428
    data += 16;
429
  }
430
  /* If no bytes were consumed, we don't need to skip any bytes backwards */
431
  if (ReturnPosition && (data - (end - len)) == 0) {
432
    return ValidUTF8Span<true>(data, end);
433
  }
434
  /* Find previous codepoint (not 80~BF) */
435
  data -= CodepointSkipBackwards(_mm_extract_epi32(prev_input, 3));
436
  if (ReturnPosition) {
437
    return (data - (end - len)) + ValidUTF8Span<true>(data, end);
438
  }
439
  /* Test if there was any error */
440
  if (!_mm_testz_si128(error, error)) {
441
    return 0;
442
  }
443
  /* Check the tail */
444
  return ValidUTF8Span<false>(data, end);
445
#endif
446
958
}
Unexecuted instantiation: utf8_validity.cc:unsigned long utf8_range::(anonymous namespace)::ValidUTF8<true>(char const*, unsigned long)
447
448
}  // namespace
449
450
958
bool IsStructurallyValid(absl::string_view str) {
451
958
  return ValidUTF8</*ReturnPosition=*/false>(str.data(), str.size());
452
958
}
453
454
0
size_t SpanStructurallyValid(absl::string_view str) {
455
0
  return ValidUTF8</*ReturnPosition=*/true>(str.data(), str.size());
456
0
}
457
458
}  // namespace utf8_range