/src/simdutf/src/icelake/icelake_find.inl.cpp
Line | Count | Source |
1 | | simdutf_really_inline const char *util_find(const char *start, const char *end, |
2 | 0 | char character) noexcept { |
3 | | // Handle empty or invalid range |
4 | 0 | if (start >= end) |
5 | 0 | return end; |
6 | 0 | const size_t step = 64; |
7 | 0 | __m512i char_vec = _mm512_set1_epi8(character); |
8 | | |
9 | | // Handle unaligned beginning with a masked load |
10 | 0 | uintptr_t misalignment = reinterpret_cast<uintptr_t>(start) % step; |
11 | 0 | if (misalignment != 0) { |
12 | 0 | size_t adjustment = step - misalignment; |
13 | 0 | if (size_t(end - start) < adjustment) { |
14 | 0 | adjustment = end - start; |
15 | 0 | } |
16 | 0 | __mmask64 load_mask = 0xFFFFFFFFFFFFFFFFULL >> (64 - adjustment);
17 | 0 | __m512i data = _mm512_maskz_loadu_epi8( |
18 | 0 | load_mask, reinterpret_cast<const __m512i *>(start)); |
19 | 0 | __mmask64 match_mask = _mm512_cmpeq_epi8_mask(data, char_vec) & load_mask; // ignore lanes that were not loaded (matters when character == '\0')
20 | |
21 | 0 | if (match_mask != 0) { |
22 | 0 | size_t index = _tzcnt_u64(match_mask); |
23 | 0 | return start + index; |
24 | 0 | } |
25 | 0 | start += adjustment; |
26 | 0 | } |
27 | | // Process the bulk with AVX-512, 64 bytes (512 bits) per load.
28 | | // Main loop: two loads (128 bytes) per iteration; kortestz tests both masks at once.
29 | 0 | while (size_t(end - start) >= 2 * step) { |
30 | 0 | __m512i data1 = |
31 | 0 | _mm512_loadu_si512(reinterpret_cast<const __m512i *>(start)); |
32 | 0 | __mmask64 mask1 = _mm512_cmpeq_epi8_mask(data1, char_vec); |
33 | |
34 | 0 | __m512i data2 = |
35 | 0 | _mm512_loadu_si512(reinterpret_cast<const __m512i *>(start + step)); |
36 | 0 | __mmask64 mask2 = _mm512_cmpeq_epi8_mask(data2, char_vec); |
37 | 0 | if (!_kortestz_mask64_u8(mask1, mask2)) { |
38 | 0 | if (mask1 != 0) { |
39 | | // Found a match, return the first one |
40 | 0 | size_t index = _tzcnt_u64(mask1); |
41 | 0 | return start + index; |
42 | 0 | } |
43 | 0 | size_t index = _tzcnt_u64(mask2); |
44 | 0 | return start + index + step; |
45 | 0 | } |
46 | 0 | start += 2 * step; |
47 | 0 | } |
48 | | |
49 | | // Main loop for full 64-byte chunks |
50 | 0 | while (size_t(end - start) >= step) { |
51 | 0 | __m512i data = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(start)); |
52 | 0 | __mmask64 mask = _mm512_cmpeq_epi8_mask(data, char_vec); |
53 | |
54 | 0 | if (mask != 0) { |
55 | | // Found a match, return the first one |
56 | 0 | size_t index = _tzcnt_u64(mask); |
57 | 0 | return start + index; |
58 | 0 | } |
59 | | |
60 | 0 | start += step; |
61 | 0 | } |
62 | | |
63 | | // Handle remaining bytes with masked load |
64 | 0 | size_t remaining = end - start; |
65 | 0 | if (remaining > 0) { |
66 | | // Build a load mask with only the low `remaining` bits set
67 | 0 | __mmask64 load_mask = 0xFFFFFFFFFFFFFFFFULL >> (64 - remaining);
68 | 0 | __m512i data = _mm512_maskz_loadu_epi8( |
69 | 0 | load_mask, reinterpret_cast<const __m512i *>(start)); |
70 | 0 | __mmask64 match_mask = _mm512_cmpeq_epi8_mask(data, char_vec); |
71 | | |
72 | | // Apply load mask to avoid false positives |
73 | 0 | match_mask &= load_mask; |
74 | |
75 | 0 | if (match_mask != 0) { |
76 | | // Found a match in the remaining bytes |
77 | 0 | size_t index = _tzcnt_u64(match_mask); |
78 | 0 | return start + index; |
79 | 0 | } |
80 | 0 | } |
81 | | |
82 | 0 | return end; |
83 | 0 | } |
84 | | |
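For reference, both overloads implement the contract of a plain scalar search: return a pointer to the first occurrence of `character` in [start, end), or `end` when there is no match. The sketch below is illustrative only (`util_find_scalar` is a hypothetical name, not a simdutf symbol); the AVX-512 code above computes the same answer 64 bytes (or 32 char16_t) per load.

template <typename Char>
const Char *util_find_scalar(const Char *start, const Char *end,
                             Char character) noexcept {
  // Walk the range one element at a time.
  for (const Char *p = start; p != end; ++p) {
    if (*p == character)
      return p; // first match, exactly what the tzcnt-based SIMD paths return
  }
  return end; // not found: the past-the-end pointer, never nullptr
}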
85 | | simdutf_really_inline const char16_t *util_find(const char16_t *start, |
86 | | const char16_t *end, |
87 | 0 | char16_t character) noexcept { |
88 | | // Handle empty or invalid range |
89 | 0 | if (start >= end) |
90 | 0 | return end; |
91 | | |
92 | | // Process 32 char16_t (64 bytes, 512 bits) at a time with AVX-512 |
93 | 0 | const size_t step = 32; |
94 | 0 | __m512i char_vec = _mm512_set1_epi16(character); |
95 | | |
96 | | // Handle an unaligned beginning with a masked load; skipped when the pointer is not 2-byte aligned (the unaligned loads below handle that case)
97 | 0 | uintptr_t misalignment = |
98 | 0 | reinterpret_cast<uintptr_t>(start) % (step * sizeof(char16_t)); |
99 | 0 | if (misalignment != 0 && misalignment % 2 == 0) { |
100 | 0 | size_t adjustment = |
101 | 0 | (step * sizeof(char16_t) - misalignment) / sizeof(char16_t); |
102 | 0 | if (size_t(end - start) < adjustment) { |
103 | 0 | adjustment = end - start; |
104 | 0 | } |
105 | 0 | __mmask32 load_mask = 0xFFFFFFFFu >> (32 - adjustment);
106 | 0 | __m512i data = _mm512_maskz_loadu_epi16( |
107 | 0 | load_mask, reinterpret_cast<const __m512i *>(start)); |
108 | 0 | __mmask32 match_mask = _mm512_cmpeq_epi16_mask(data, char_vec) & load_mask; // ignore lanes that were not loaded (matters when character == 0)
109 | |
110 | 0 | if (match_mask != 0) { |
111 | 0 | size_t index = _tzcnt_u32(match_mask); |
112 | 0 | return start + index; |
113 | 0 | } |
114 | 0 | start += adjustment; |
115 | 0 | } |
116 | | |
117 | | // Main loop for full 32-element chunks |
118 | 0 | while (size_t(end - start) >= step) { |
119 | 0 | __m512i data = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(start)); |
120 | 0 | __mmask32 mask = _mm512_cmpeq_epi16_mask(data, char_vec); |
121 | |
122 | 0 | if (mask != 0) { |
123 | | // Found a match, return the first one |
124 | 0 | size_t index = _tzcnt_u32(mask); |
125 | 0 | return start + index; |
126 | 0 | } |
127 | | |
128 | 0 | start += step; |
129 | 0 | } |
130 | | |
131 | | // Handle remaining elements with masked load |
132 | 0 | size_t remaining = end - start; |
133 | 0 | if (remaining > 0) { |
134 | 0 | __mmask32 load_mask = 0xFFFFFFFFu >> (32 - remaining);
135 | 0 | __m512i data = _mm512_maskz_loadu_epi16( |
136 | 0 | load_mask, reinterpret_cast<const __m512i *>(start)); |
137 | 0 | __mmask32 match_mask = _mm512_cmpeq_epi16_mask(data, char_vec) & load_mask; // ignore lanes that were not loaded (matters when character == 0)
138 | |
139 | 0 | if (match_mask != 0) { |
140 | 0 | size_t index = _tzcnt_u32(match_mask); |
141 | 0 | return start + index; |
142 | 0 | } |
143 | 0 | } |
144 | | |
145 | 0 | return end; |
146 | 0 | } |
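
The partial loads in both functions rely on a prefix mask: shifting an all-ones word right by (width - n) leaves the low n bits set, so for remaining = 5 bytes the mask is 0xFFFFFFFFFFFFFFFF >> 59 = 0x1F and only five bytes are read from memory. Below is a minimal usage sketch, not a simdutf test: a hypothetical driver that assumes this .inl.cpp fragment is visible in the including translation unit and that the build targets an AVX-512 (Icelake) machine.

#include <cstdio>
#include <cstring>

int main() {
  const char text[] = "icelake find: locate the first 'k'";
  const char *end = text + std::strlen(text);

  // Finds the first 'k' (inside "icelake"); util_find returns `end`
  // if the character does not occur in [text, end).
  const char *hit = util_find(text, end, 'k');
  if (hit != end)
    std::printf("found at offset %td\n", hit - text);
  else
    std::printf("not found\n");
  return 0;
}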