Coverage Report

Created: 2026-02-14 06:56

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/simdutf/src/icelake/icelake_find.inl.cpp
Line
Count
Source
1
/**
 * Locate the first byte equal to `character` in the range [start, end).
 *
 * The search proceeds in up to four phases:
 *   1. a masked load covering the bytes up to the next 64-byte boundary
 *      (skipped when `start` is already a multiple of 64),
 *   2. a loop consuming 128 bytes (two 512-bit vectors) per iteration,
 *   3. a loop consuming 64 bytes (one 512-bit vector) per iteration,
 *   4. a masked load covering the final 1..63 bytes.
 *
 * @param start     pointer to the first byte to examine
 * @param end       pointer one past the last byte to examine
 * @param character byte value to search for
 * @return pointer to the first matching byte, or `end` when there is no match
 *         or when `start >= end`
 */
simdutf_really_inline const char *util_find(const char *start, const char *end,
                                            char character) noexcept {
  // Handle empty or invalid range
  if (start >= end)
    return end;
  const size_t step = 64;
  __m512i char_vec = _mm512_set1_epi8(character);

  // Handle unaligned beginning with a masked load
  uintptr_t misalignment = reinterpret_cast<uintptr_t>(start) % step;
  if (misalignment != 0) {
    // Number of bytes up to the next 64-byte boundary, clamped to the range.
    // The value is always in 1..63, so the shift below is well defined.
    size_t adjustment = step - misalignment;
    if (size_t(end - start) < adjustment) {
      adjustment = end - start;
    }
    __mmask64 load_mask = 0xFFFFFFFFFFFFFFFF >> (64 - adjustment);
    __m512i data = _mm512_maskz_loadu_epi8(
        load_mask, reinterpret_cast<const __m512i *>(start));
    __mmask64 match_mask = _mm512_cmpeq_epi8_mask(data, char_vec);

    // The masked load zeroes every lane outside load_mask. Without this AND,
    // a search for '\0' would report a match in one of those zeroed lanes,
    // i.e. at a byte that was never actually read.
    match_mask &= load_mask;

    if (match_mask != 0) {
      size_t index = _tzcnt_u64(match_mask);
      return start + index;
    }
    start += adjustment;
  }
  // Process 64 bytes (512 bits) at a time with AVX-512
  // Main loop for full 128-byte chunks
  while (size_t(end - start) >= 2 * step) {
    __m512i data1 =
        _mm512_loadu_si512(reinterpret_cast<const __m512i *>(start));
    __mmask64 mask1 = _mm512_cmpeq_epi8_mask(data1, char_vec);

    __m512i data2 =
        _mm512_loadu_si512(reinterpret_cast<const __m512i *>(start + step));
    __mmask64 mask2 = _mm512_cmpeq_epi8_mask(data2, char_vec);
    // One combined test for "any match in either vector"; only then decide
    // which of the two halves holds the earliest match.
    if (!_kortestz_mask64_u8(mask1, mask2)) {
      if (mask1 != 0) {
        // Found a match, return the first one
        size_t index = _tzcnt_u64(mask1);
        return start + index;
      }
      size_t index = _tzcnt_u64(mask2);
      return start + index + step;
    }
    start += 2 * step;
  }

  // Main loop for full 64-byte chunks
  while (size_t(end - start) >= step) {
    __m512i data = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(start));
    __mmask64 mask = _mm512_cmpeq_epi8_mask(data, char_vec);

    if (mask != 0) {
      // Found a match, return the first one
      size_t index = _tzcnt_u64(mask);
      return start + index;
    }

    start += step;
  }

  // Handle remaining bytes with masked load
  size_t remaining = end - start;
  if (remaining > 0) {
    // Create a mask for the remaining bytes using shifted 0xFFFFFFFFFFFFFFFF
    // (remaining is in 1..63 here, so the shift count is in 1..63).
    __mmask64 load_mask = 0xFFFFFFFFFFFFFFFF >> (64 - remaining);
    __m512i data = _mm512_maskz_loadu_epi8(
        load_mask, reinterpret_cast<const __m512i *>(start));
    __mmask64 match_mask = _mm512_cmpeq_epi8_mask(data, char_vec);

    // Apply load mask to avoid false positives
    match_mask &= load_mask;

    if (match_mask != 0) {
      // Found a match in the remaining bytes
      size_t index = _tzcnt_u64(match_mask);
      return start + index;
    }
  }

  return end;
}
84
85
/**
 * Locate the first 16-bit code unit equal to `character` in [start, end).
 *
 * The search proceeds in up to three phases:
 *   1. a masked load covering the elements up to the next 64-byte boundary
 *      (only when `start` is misaligned by an even number of bytes; an odd
 *      byte offset can never reach a 64-byte boundary in whole elements, so
 *      that case goes straight to the unaligned main loop),
 *   2. a loop consuming 32 elements (one 512-bit vector) per iteration,
 *   3. a masked load covering the final 1..31 elements.
 *
 * @param start     pointer to the first element to examine
 * @param end       pointer one past the last element to examine
 * @param character 16-bit value to search for
 * @return pointer to the first matching element, or `end` when there is no
 *         match or when `start >= end`
 */
simdutf_really_inline const char16_t *util_find(const char16_t *start,
                                                const char16_t *end,
                                                char16_t character) noexcept {
  // Handle empty or invalid range
  if (start >= end)
    return end;

  // Process 32 char16_t (64 bytes, 512 bits) at a time with AVX-512
  const size_t step = 32;
  __m512i char_vec = _mm512_set1_epi16(character);

  // Handle unaligned beginning with a masked load
  uintptr_t misalignment =
      reinterpret_cast<uintptr_t>(start) % (step * sizeof(char16_t));
  if (misalignment != 0 && misalignment % 2 == 0) {
    // Number of elements up to the next 64-byte boundary, clamped to the
    // range. The value is always in 1..31, so the shift below is well defined.
    size_t adjustment =
        (step * sizeof(char16_t) - misalignment) / sizeof(char16_t);
    if (size_t(end - start) < adjustment) {
      adjustment = end - start;
    }
    __mmask32 load_mask = 0xFFFFFFFF >> (32 - adjustment);
    __m512i data = _mm512_maskz_loadu_epi16(
        load_mask, reinterpret_cast<const __m512i *>(start));
    __mmask32 match_mask = _mm512_cmpeq_epi16_mask(data, char_vec);

    // The masked load zeroes every lane outside load_mask. Without this AND,
    // a search for u'\0' would report a match in one of those zeroed lanes,
    // i.e. at an element that was never actually read.
    match_mask &= load_mask;

    if (match_mask != 0) {
      size_t index = _tzcnt_u32(match_mask);
      return start + index;
    }
    start += adjustment;
  }

  // Main loop for full 32-element chunks
  while (size_t(end - start) >= step) {
    __m512i data = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(start));
    __mmask32 mask = _mm512_cmpeq_epi16_mask(data, char_vec);

    if (mask != 0) {
      // Found a match, return the first one
      size_t index = _tzcnt_u32(mask);
      return start + index;
    }

    start += step;
  }

  // Handle remaining elements with masked load
  size_t remaining = end - start;
  if (remaining > 0) {
    // remaining is in 1..31 here, so the shift count is in 1..31.
    __mmask32 load_mask = 0xFFFFFFFF >> (32 - remaining);
    __m512i data = _mm512_maskz_loadu_epi16(
        load_mask, reinterpret_cast<const __m512i *>(start));
    __mmask32 match_mask = _mm512_cmpeq_epi16_mask(data, char_vec);

    // Apply load mask to avoid false positives from zeroed lanes
    match_mask &= load_mask;

    if (match_mask != 0) {
      size_t index = _tzcnt_u32(match_mask);
      return start + index;
    }
  }

  return end;
}