/rust/registry/src/index.crates.io-1949cf8c6b5b557f/bstr-1.12.1/src/ascii.rs
Line | Count | Source |
1 | | // The following ~400 lines of code exists for exactly one purpose, which is |
2 | | // to optimize this code: |
3 | | // |
4 | | // byte_slice.iter().position(|&b| b > 0x7F).unwrap_or(byte_slice.len()) |
5 | | // |
6 | | // Yes... Overengineered is a word that comes to mind, but this is effectively |
7 | | // a very similar problem to memchr, and virtually nobody has been able to |
8 | | // resist optimizing the crap out of that (except for perhaps the BSD and MUSL |
9 | | // folks). In particular, this routine makes a very common case (ASCII) very |
10 | | // fast, which seems worth it. We do stop short of adding AVX variants of the |
11 | | // code below in order to retain our sanity and also to avoid needing to deal |
12 | | // with runtime target feature detection. RESIST! |
13 | | // |
14 | | // In order to understand the SIMD version below, it would be good to read this |
15 | | // comment describing how my memchr routine works: |
16 | | // https://github.com/BurntSushi/rust-memchr/blob/b0a29f267f4a7fad8ffcc8fe8377a06498202883/src/x86/sse2.rs#L19-L106 |
17 | | // |
18 | | // The primary difference with memchr is that for ASCII, we can do a bit less |
19 | | // work. In particular, we don't need to detect the presence of a specific |
20 | | // byte, but rather, whether any byte has its most significant bit set. That |
21 | | // means we can effectively skip the _mm_cmpeq_epi8 step and jump straight to |
22 | | // _mm_movemask_epi8. |
23 | | |
24 | | #[cfg(any(test, miri, not(target_arch = "x86_64")))] |
25 | | const USIZE_BYTES: usize = core::mem::size_of::<usize>(); |
26 | | #[cfg(any(test, miri, not(target_arch = "x86_64")))] |
27 | | const ALIGN_MASK: usize = core::mem::align_of::<usize>() - 1; |
28 | | #[cfg(any(test, miri, not(target_arch = "x86_64")))] |
29 | | const FALLBACK_LOOP_SIZE: usize = 2 * USIZE_BYTES; |
30 | | |
31 | | // This is a mask where the most significant bit of each byte in the usize |
32 | | // is set. We test this bit to determine whether a character is ASCII or not. |
33 | | // Namely, a single byte is regarded as an ASCII codepoint if and only if it's |
34 | | // most significant bit is not set. |
35 | | #[cfg(any(test, miri, not(target_arch = "x86_64")))] |
36 | | const ASCII_MASK_U64: u64 = 0x8080808080808080; |
37 | | #[cfg(any(test, miri, not(target_arch = "x86_64")))] |
38 | | const ASCII_MASK: usize = ASCII_MASK_U64 as usize; |
39 | | |
40 | | /// Returns the index of the first non ASCII byte in the given slice. |
41 | | /// |
42 | | /// If slice only contains ASCII bytes, then the length of the slice is |
43 | | /// returned. |
44 | 977k | pub fn first_non_ascii_byte(slice: &[u8]) -> usize { |
45 | | #[cfg(any(miri, not(target_arch = "x86_64")))] |
46 | | { |
47 | | first_non_ascii_byte_fallback(slice) |
48 | | } |
49 | | |
50 | | #[cfg(all(not(miri), target_arch = "x86_64"))] |
51 | | { |
52 | 977k | first_non_ascii_byte_sse2(slice) |
53 | | } |
54 | 977k | } Unexecuted instantiation: bstr::ascii::first_non_ascii_byte bstr::ascii::first_non_ascii_byte Line | Count | Source | 44 | 22.3k | pub fn first_non_ascii_byte(slice: &[u8]) -> usize { | 45 | | #[cfg(any(miri, not(target_arch = "x86_64")))] | 46 | | { | 47 | | first_non_ascii_byte_fallback(slice) | 48 | | } | 49 | | | 50 | | #[cfg(all(not(miri), target_arch = "x86_64"))] | 51 | | { | 52 | 22.3k | first_non_ascii_byte_sse2(slice) | 53 | | } | 54 | 22.3k | } |
bstr::ascii::first_non_ascii_byte Line | Count | Source | 44 | 792k | pub fn first_non_ascii_byte(slice: &[u8]) -> usize { | 45 | | #[cfg(any(miri, not(target_arch = "x86_64")))] | 46 | | { | 47 | | first_non_ascii_byte_fallback(slice) | 48 | | } | 49 | | | 50 | | #[cfg(all(not(miri), target_arch = "x86_64"))] | 51 | | { | 52 | 792k | first_non_ascii_byte_sse2(slice) | 53 | | } | 54 | 792k | } |
bstr::ascii::first_non_ascii_byte Line | Count | Source | 44 | 162k | pub fn first_non_ascii_byte(slice: &[u8]) -> usize { | 45 | | #[cfg(any(miri, not(target_arch = "x86_64")))] | 46 | | { | 47 | | first_non_ascii_byte_fallback(slice) | 48 | | } | 49 | | | 50 | | #[cfg(all(not(miri), target_arch = "x86_64"))] | 51 | | { | 52 | 162k | first_non_ascii_byte_sse2(slice) | 53 | | } | 54 | 162k | } |
|
55 | | |
56 | | #[cfg(any(test, miri, not(target_arch = "x86_64")))] |
57 | | fn first_non_ascii_byte_fallback(slice: &[u8]) -> usize { |
58 | | let start_ptr = slice.as_ptr(); |
59 | | let end_ptr = slice[slice.len()..].as_ptr(); |
60 | | let mut ptr = start_ptr; |
61 | | |
62 | | unsafe { |
63 | | if slice.len() < USIZE_BYTES { |
64 | | return first_non_ascii_byte_slow(start_ptr, end_ptr, ptr); |
65 | | } |
66 | | |
67 | | let chunk = read_unaligned_usize(ptr); |
68 | | let mask = chunk & ASCII_MASK; |
69 | | if mask != 0 { |
70 | | return first_non_ascii_byte_mask(mask); |
71 | | } |
72 | | |
73 | | ptr = ptr_add(ptr, USIZE_BYTES - (start_ptr as usize & ALIGN_MASK)); |
74 | | debug_assert!(ptr > start_ptr); |
75 | | debug_assert!(ptr_sub(end_ptr, USIZE_BYTES) >= start_ptr); |
76 | | if slice.len() >= FALLBACK_LOOP_SIZE { |
77 | | while ptr <= ptr_sub(end_ptr, FALLBACK_LOOP_SIZE) { |
78 | | debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES); |
79 | | |
80 | | let a = *(ptr as *const usize); |
81 | | let b = *(ptr_add(ptr, USIZE_BYTES) as *const usize); |
82 | | if (a | b) & ASCII_MASK != 0 { |
83 | | // What a kludge. We wrap the position finding code into |
84 | | // a non-inlineable function, which makes the codegen in |
85 | | // the tight loop above a bit better by avoiding a |
86 | | // couple extra movs. We pay for it by two additional |
87 | | // stores, but only in the case of finding a non-ASCII |
88 | | // byte. |
89 | | #[inline(never)] |
90 | | unsafe fn findpos( |
91 | | start_ptr: *const u8, |
92 | | ptr: *const u8, |
93 | | ) -> usize { |
94 | | let a = *(ptr as *const usize); |
95 | | let b = *(ptr_add(ptr, USIZE_BYTES) as *const usize); |
96 | | |
97 | | let mut at = sub(ptr, start_ptr); |
98 | | let maska = a & ASCII_MASK; |
99 | | if maska != 0 { |
100 | | return at + first_non_ascii_byte_mask(maska); |
101 | | } |
102 | | |
103 | | at += USIZE_BYTES; |
104 | | let maskb = b & ASCII_MASK; |
105 | | debug_assert!(maskb != 0); |
106 | | return at + first_non_ascii_byte_mask(maskb); |
107 | | } |
108 | | return findpos(start_ptr, ptr); |
109 | | } |
110 | | ptr = ptr_add(ptr, FALLBACK_LOOP_SIZE); |
111 | | } |
112 | | } |
113 | | first_non_ascii_byte_slow(start_ptr, end_ptr, ptr) |
114 | | } |
115 | | } |
116 | | |
117 | | #[cfg(all(not(miri), target_arch = "x86_64"))] |
118 | 977k | fn first_non_ascii_byte_sse2(slice: &[u8]) -> usize { |
119 | | use core::arch::x86_64::*; |
120 | | |
121 | | const VECTOR_SIZE: usize = core::mem::size_of::<__m128i>(); |
122 | | const VECTOR_ALIGN: usize = VECTOR_SIZE - 1; |
123 | | const VECTOR_LOOP_SIZE: usize = 4 * VECTOR_SIZE; |
124 | | |
125 | 977k | let start_ptr = slice.as_ptr(); |
126 | 977k | let end_ptr = slice[slice.len()..].as_ptr(); |
127 | 977k | let mut ptr = start_ptr; |
128 | | |
129 | | unsafe { |
130 | 977k | if slice.len() < VECTOR_SIZE { |
131 | 489k | return first_non_ascii_byte_slow(start_ptr, end_ptr, ptr); |
132 | 488k | } |
133 | | |
134 | 488k | let chunk = _mm_loadu_si128(ptr as *const __m128i); |
135 | 488k | let mask = _mm_movemask_epi8(chunk); |
136 | 488k | if mask != 0 { |
137 | 75.0k | return mask.trailing_zeros() as usize; |
138 | 413k | } |
139 | | |
140 | 413k | ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN)); |
141 | 413k | debug_assert!(ptr > start_ptr); |
142 | 413k | debug_assert!(end_ptr.sub(VECTOR_SIZE) >= start_ptr); |
143 | 413k | if slice.len() >= VECTOR_LOOP_SIZE { |
144 | 4.76M | while ptr <= ptr_sub(end_ptr, VECTOR_LOOP_SIZE) { |
145 | 4.74M | debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); |
146 | | |
147 | 4.74M | let a = _mm_load_si128(ptr as *const __m128i); |
148 | 4.74M | let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i); |
149 | 4.74M | let c = |
150 | 4.74M | _mm_load_si128(ptr.add(2 * VECTOR_SIZE) as *const __m128i); |
151 | 4.74M | let d = |
152 | 4.74M | _mm_load_si128(ptr.add(3 * VECTOR_SIZE) as *const __m128i); |
153 | | |
154 | 4.74M | let or1 = _mm_or_si128(a, b); |
155 | 4.74M | let or2 = _mm_or_si128(c, d); |
156 | 4.74M | let or3 = _mm_or_si128(or1, or2); |
157 | 4.74M | if _mm_movemask_epi8(or3) != 0 { |
158 | 91.4k | let mut at = sub(ptr, start_ptr); |
159 | 91.4k | let mask = _mm_movemask_epi8(a); |
160 | 91.4k | if mask != 0 { |
161 | 22.2k | return at + mask.trailing_zeros() as usize; |
162 | 69.2k | } |
163 | | |
164 | 69.2k | at += VECTOR_SIZE; |
165 | 69.2k | let mask = _mm_movemask_epi8(b); |
166 | 69.2k | if mask != 0 { |
167 | 23.0k | return at + mask.trailing_zeros() as usize; |
168 | 46.2k | } |
169 | | |
170 | 46.2k | at += VECTOR_SIZE; |
171 | 46.2k | let mask = _mm_movemask_epi8(c); |
172 | 46.2k | if mask != 0 { |
173 | 29.3k | return at + mask.trailing_zeros() as usize; |
174 | 16.8k | } |
175 | | |
176 | 16.8k | at += VECTOR_SIZE; |
177 | 16.8k | let mask = _mm_movemask_epi8(d); |
178 | 16.8k | debug_assert!(mask != 0); |
179 | 16.8k | return at + mask.trailing_zeros() as usize; |
180 | 4.65M | } |
181 | 4.65M | ptr = ptr_add(ptr, VECTOR_LOOP_SIZE); |
182 | | } |
183 | 304k | } |
184 | 493k | while ptr <= end_ptr.sub(VECTOR_SIZE) { |
185 | 174k | debug_assert!(sub(end_ptr, ptr) >= VECTOR_SIZE); |
186 | | |
187 | 174k | let chunk = _mm_loadu_si128(ptr as *const __m128i); |
188 | 174k | let mask = _mm_movemask_epi8(chunk); |
189 | 174k | if mask != 0 { |
190 | 2.90k | return sub(ptr, start_ptr) + mask.trailing_zeros() as usize; |
191 | 171k | } |
192 | 171k | ptr = ptr.add(VECTOR_SIZE); |
193 | | } |
194 | 318k | first_non_ascii_byte_slow(start_ptr, end_ptr, ptr) |
195 | | } |
196 | 977k | } Unexecuted instantiation: bstr::ascii::first_non_ascii_byte_sse2 bstr::ascii::first_non_ascii_byte_sse2 Line | Count | Source | 118 | 22.3k | fn first_non_ascii_byte_sse2(slice: &[u8]) -> usize { | 119 | | use core::arch::x86_64::*; | 120 | | | 121 | | const VECTOR_SIZE: usize = core::mem::size_of::<__m128i>(); | 122 | | const VECTOR_ALIGN: usize = VECTOR_SIZE - 1; | 123 | | const VECTOR_LOOP_SIZE: usize = 4 * VECTOR_SIZE; | 124 | | | 125 | 22.3k | let start_ptr = slice.as_ptr(); | 126 | 22.3k | let end_ptr = slice[slice.len()..].as_ptr(); | 127 | 22.3k | let mut ptr = start_ptr; | 128 | | | 129 | | unsafe { | 130 | 22.3k | if slice.len() < VECTOR_SIZE { | 131 | 387 | return first_non_ascii_byte_slow(start_ptr, end_ptr, ptr); | 132 | 22.0k | } | 133 | | | 134 | 22.0k | let chunk = _mm_loadu_si128(ptr as *const __m128i); | 135 | 22.0k | let mask = _mm_movemask_epi8(chunk); | 136 | 22.0k | if mask != 0 { | 137 | 3.30k | return mask.trailing_zeros() as usize; | 138 | 18.7k | } | 139 | | | 140 | 18.7k | ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN)); | 141 | 18.7k | debug_assert!(ptr > start_ptr); | 142 | 18.7k | debug_assert!(end_ptr.sub(VECTOR_SIZE) >= start_ptr); | 143 | 18.7k | if slice.len() >= VECTOR_LOOP_SIZE { | 144 | 194k | while ptr <= ptr_sub(end_ptr, VECTOR_LOOP_SIZE) { | 145 | 194k | debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); | 146 | | | 147 | 194k | let a = _mm_load_si128(ptr as *const __m128i); | 148 | 194k | let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i); | 149 | 194k | let c = | 150 | 194k | _mm_load_si128(ptr.add(2 * VECTOR_SIZE) as *const __m128i); | 151 | 194k | let d = | 152 | 194k | _mm_load_si128(ptr.add(3 * VECTOR_SIZE) as *const __m128i); | 153 | | | 154 | 194k | let or1 = _mm_or_si128(a, b); | 155 | 194k | let or2 = _mm_or_si128(c, d); | 156 | 194k | let or3 = _mm_or_si128(or1, or2); | 157 | 194k | if _mm_movemask_epi8(or3) != 0 { | 158 | 18.4k | let mut at = sub(ptr, start_ptr); | 159 | 18.4k | let mask = _mm_movemask_epi8(a); | 160 | 18.4k | if mask != 0 { | 161 | 2.80k | return at + mask.trailing_zeros() as usize; | 162 | 15.6k | } | 163 | | | 164 | 15.6k | at += VECTOR_SIZE; | 165 | 15.6k | let mask = _mm_movemask_epi8(b); | 166 | 15.6k | if mask != 0 { | 167 | 2.83k | return at + mask.trailing_zeros() as usize; | 168 | 12.8k | } | 169 | | | 170 | 12.8k | at += VECTOR_SIZE; | 171 | 12.8k | let mask = _mm_movemask_epi8(c); | 172 | 12.8k | if mask != 0 { | 173 | 7.99k | return at + mask.trailing_zeros() as usize; | 174 | 4.82k | } | 175 | | | 176 | 4.82k | at += VECTOR_SIZE; | 177 | 4.82k | let mask = _mm_movemask_epi8(d); | 178 | 4.82k | debug_assert!(mask != 0); | 179 | 4.82k | return at + mask.trailing_zeros() as usize; | 180 | 175k | } | 181 | 175k | ptr = ptr_add(ptr, VECTOR_LOOP_SIZE); | 182 | | } | 183 | 135 | } | 184 | 438 | while ptr <= end_ptr.sub(VECTOR_SIZE) { | 185 | 271 | debug_assert!(sub(end_ptr, ptr) >= VECTOR_SIZE); | 186 | | | 187 | 271 | let chunk = _mm_loadu_si128(ptr as *const __m128i); | 188 | 271 | let mask = _mm_movemask_epi8(chunk); | 189 | 271 | if mask != 0 { | 190 | 82 | return sub(ptr, start_ptr) + mask.trailing_zeros() as usize; | 191 | 189 | } | 192 | 189 | ptr = ptr.add(VECTOR_SIZE); | 193 | | } | 194 | 167 | first_non_ascii_byte_slow(start_ptr, end_ptr, ptr) | 195 | | } | 196 | 22.3k | } |
bstr::ascii::first_non_ascii_byte_sse2 Line | Count | Source | 118 | 792k | fn first_non_ascii_byte_sse2(slice: &[u8]) -> usize { | 119 | | use core::arch::x86_64::*; | 120 | | | 121 | | const VECTOR_SIZE: usize = core::mem::size_of::<__m128i>(); | 122 | | const VECTOR_ALIGN: usize = VECTOR_SIZE - 1; | 123 | | const VECTOR_LOOP_SIZE: usize = 4 * VECTOR_SIZE; | 124 | | | 125 | 792k | let start_ptr = slice.as_ptr(); | 126 | 792k | let end_ptr = slice[slice.len()..].as_ptr(); | 127 | 792k | let mut ptr = start_ptr; | 128 | | | 129 | | unsafe { | 130 | 792k | if slice.len() < VECTOR_SIZE { | 131 | 439k | return first_non_ascii_byte_slow(start_ptr, end_ptr, ptr); | 132 | 353k | } | 133 | | | 134 | 353k | let chunk = _mm_loadu_si128(ptr as *const __m128i); | 135 | 353k | let mask = _mm_movemask_epi8(chunk); | 136 | 353k | if mask != 0 { | 137 | 4.45k | return mask.trailing_zeros() as usize; | 138 | 348k | } | 139 | | | 140 | 348k | ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN)); | 141 | 348k | debug_assert!(ptr > start_ptr); | 142 | 348k | debug_assert!(end_ptr.sub(VECTOR_SIZE) >= start_ptr); | 143 | 348k | if slice.len() >= VECTOR_LOOP_SIZE { | 144 | 4.09M | while ptr <= ptr_sub(end_ptr, VECTOR_LOOP_SIZE) { | 145 | 4.07M | debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); | 146 | | | 147 | 4.07M | let a = _mm_load_si128(ptr as *const __m128i); | 148 | 4.07M | let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i); | 149 | 4.07M | let c = | 150 | 4.07M | _mm_load_si128(ptr.add(2 * VECTOR_SIZE) as *const __m128i); | 151 | 4.07M | let d = | 152 | 4.07M | _mm_load_si128(ptr.add(3 * VECTOR_SIZE) as *const __m128i); | 153 | | | 154 | 4.07M | let or1 = _mm_or_si128(a, b); | 155 | 4.07M | let or2 = _mm_or_si128(c, d); | 156 | 4.07M | let or3 = _mm_or_si128(or1, or2); | 157 | 4.07M | if _mm_movemask_epi8(or3) != 0 { | 158 | 34.6k | let mut at = sub(ptr, start_ptr); | 159 | 34.6k | let mask = _mm_movemask_epi8(a); | 160 | 34.6k | if mask != 0 { | 161 | 7.10k | return at + mask.trailing_zeros() as usize; | 162 | 27.5k | } | 163 | | | 164 | 27.5k | at += VECTOR_SIZE; | 165 | 27.5k | let mask = _mm_movemask_epi8(b); | 166 | 27.5k | if mask != 0 { | 167 | 11.7k | return at + mask.trailing_zeros() as usize; | 168 | 15.8k | } | 169 | | | 170 | 15.8k | at += VECTOR_SIZE; | 171 | 15.8k | let mask = _mm_movemask_epi8(c); | 172 | 15.8k | if mask != 0 { | 173 | 9.77k | return at + mask.trailing_zeros() as usize; | 174 | 6.03k | } | 175 | | | 176 | 6.03k | at += VECTOR_SIZE; | 177 | 6.03k | let mask = _mm_movemask_epi8(d); | 178 | 6.03k | debug_assert!(mask != 0); | 179 | 6.03k | return at + mask.trailing_zeros() as usize; | 180 | 4.04M | } | 181 | 4.04M | ptr = ptr_add(ptr, VECTOR_LOOP_SIZE); | 182 | | } | 183 | 299k | } | 184 | 479k | while ptr <= end_ptr.sub(VECTOR_SIZE) { | 185 | 165k | debug_assert!(sub(end_ptr, ptr) >= VECTOR_SIZE); | 186 | | | 187 | 165k | let chunk = _mm_loadu_si128(ptr as *const __m128i); | 188 | 165k | let mask = _mm_movemask_epi8(chunk); | 189 | 165k | if mask != 0 { | 190 | 314 | return sub(ptr, start_ptr) + mask.trailing_zeros() as usize; | 191 | 165k | } | 192 | 165k | ptr = ptr.add(VECTOR_SIZE); | 193 | | } | 194 | 313k | first_non_ascii_byte_slow(start_ptr, end_ptr, ptr) | 195 | | } | 196 | 792k | } |
bstr::ascii::first_non_ascii_byte_sse2 Line | Count | Source | 118 | 162k | fn first_non_ascii_byte_sse2(slice: &[u8]) -> usize { | 119 | | use core::arch::x86_64::*; | 120 | | | 121 | | const VECTOR_SIZE: usize = core::mem::size_of::<__m128i>(); | 122 | | const VECTOR_ALIGN: usize = VECTOR_SIZE - 1; | 123 | | const VECTOR_LOOP_SIZE: usize = 4 * VECTOR_SIZE; | 124 | | | 125 | 162k | let start_ptr = slice.as_ptr(); | 126 | 162k | let end_ptr = slice[slice.len()..].as_ptr(); | 127 | 162k | let mut ptr = start_ptr; | 128 | | | 129 | | unsafe { | 130 | 162k | if slice.len() < VECTOR_SIZE { | 131 | 49.9k | return first_non_ascii_byte_slow(start_ptr, end_ptr, ptr); | 132 | 112k | } | 133 | | | 134 | 112k | let chunk = _mm_loadu_si128(ptr as *const __m128i); | 135 | 112k | let mask = _mm_movemask_epi8(chunk); | 136 | 112k | if mask != 0 { | 137 | 67.2k | return mask.trailing_zeros() as usize; | 138 | 45.5k | } | 139 | | | 140 | 45.5k | ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN)); | 141 | 45.5k | debug_assert!(ptr > start_ptr); | 142 | 45.5k | debug_assert!(end_ptr.sub(VECTOR_SIZE) >= start_ptr); | 143 | 45.5k | if slice.len() >= VECTOR_LOOP_SIZE { | 144 | 475k | while ptr <= ptr_sub(end_ptr, VECTOR_LOOP_SIZE) { | 145 | 473k | debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); | 146 | | | 147 | 473k | let a = _mm_load_si128(ptr as *const __m128i); | 148 | 473k | let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i); | 149 | 473k | let c = | 150 | 473k | _mm_load_si128(ptr.add(2 * VECTOR_SIZE) as *const __m128i); | 151 | 473k | let d = | 152 | 473k | _mm_load_si128(ptr.add(3 * VECTOR_SIZE) as *const __m128i); | 153 | | | 154 | 473k | let or1 = _mm_or_si128(a, b); | 155 | 473k | let or2 = _mm_or_si128(c, d); | 156 | 473k | let or3 = _mm_or_si128(or1, or2); | 157 | 473k | if _mm_movemask_epi8(or3) != 0 { | 158 | 38.3k | let mut at = sub(ptr, start_ptr); | 159 | 38.3k | let mask = _mm_movemask_epi8(a); | 160 | 38.3k | if mask != 0 { | 161 | 12.3k | return at + mask.trailing_zeros() as usize; | 162 | 25.9k | } | 163 | | | 164 | 25.9k | at += VECTOR_SIZE; | 165 | 25.9k | let mask = _mm_movemask_epi8(b); | 166 | 25.9k | if mask != 0 { | 167 | 8.42k | return at + mask.trailing_zeros() as usize; | 168 | 17.5k | } | 169 | | | 170 | 17.5k | at += VECTOR_SIZE; | 171 | 17.5k | let mask = _mm_movemask_epi8(c); | 172 | 17.5k | if mask != 0 { | 173 | 11.5k | return at + mask.trailing_zeros() as usize; | 174 | 6.01k | } | 175 | | | 176 | 6.01k | at += VECTOR_SIZE; | 177 | 6.01k | let mask = _mm_movemask_epi8(d); | 178 | 6.01k | debug_assert!(mask != 0); | 179 | 6.01k | return at + mask.trailing_zeros() as usize; | 180 | 435k | } | 181 | 435k | ptr = ptr_add(ptr, VECTOR_LOOP_SIZE); | 182 | | } | 183 | 5.00k | } | 184 | 13.3k | while ptr <= end_ptr.sub(VECTOR_SIZE) { | 185 | 8.67k | debug_assert!(sub(end_ptr, ptr) >= VECTOR_SIZE); | 186 | | | 187 | 8.67k | let chunk = _mm_loadu_si128(ptr as *const __m128i); | 188 | 8.67k | let mask = _mm_movemask_epi8(chunk); | 189 | 8.67k | if mask != 0 { | 190 | 2.50k | return sub(ptr, start_ptr) + mask.trailing_zeros() as usize; | 191 | 6.17k | } | 192 | 6.17k | ptr = ptr.add(VECTOR_SIZE); | 193 | | } | 194 | 4.63k | first_non_ascii_byte_slow(start_ptr, end_ptr, ptr) | 195 | | } | 196 | 162k | } |
|
197 | | |
198 | | #[inline(always)] |
199 | 808k | unsafe fn first_non_ascii_byte_slow( |
200 | 808k | start_ptr: *const u8, |
201 | 808k | end_ptr: *const u8, |
202 | 808k | mut ptr: *const u8, |
203 | 808k | ) -> usize { |
204 | 808k | debug_assert!(start_ptr <= ptr); |
205 | 808k | debug_assert!(ptr <= end_ptr); |
206 | | |
207 | 6.20M | while ptr < end_ptr { |
208 | 5.41M | if *ptr > 0x7F { |
209 | 19.8k | return sub(ptr, start_ptr); |
210 | 5.39M | } |
211 | 5.39M | ptr = ptr.offset(1); |
212 | | } |
213 | 788k | sub(end_ptr, start_ptr) |
214 | 808k | } Unexecuted instantiation: bstr::ascii::first_non_ascii_byte_slow bstr::ascii::first_non_ascii_byte_slow Line | Count | Source | 199 | 554 | unsafe fn first_non_ascii_byte_slow( | 200 | 554 | start_ptr: *const u8, | 201 | 554 | end_ptr: *const u8, | 202 | 554 | mut ptr: *const u8, | 203 | 554 | ) -> usize { | 204 | 554 | debug_assert!(start_ptr <= ptr); | 205 | 554 | debug_assert!(ptr <= end_ptr); | 206 | | | 207 | 3.41k | while ptr < end_ptr { | 208 | 2.96k | if *ptr > 0x7F { | 209 | 106 | return sub(ptr, start_ptr); | 210 | 2.85k | } | 211 | 2.85k | ptr = ptr.offset(1); | 212 | | } | 213 | 448 | sub(end_ptr, start_ptr) | 214 | 554 | } |
bstr::ascii::first_non_ascii_byte_slow Line | Count | Source | 199 | 752k | unsafe fn first_non_ascii_byte_slow( | 200 | 752k | start_ptr: *const u8, | 201 | 752k | end_ptr: *const u8, | 202 | 752k | mut ptr: *const u8, | 203 | 752k | ) -> usize { | 204 | 752k | debug_assert!(start_ptr <= ptr); | 205 | 752k | debug_assert!(ptr <= end_ptr); | 206 | | | 207 | 5.92M | while ptr < end_ptr { | 208 | 5.17M | if *ptr > 0x7F { | 209 | 956 | return sub(ptr, start_ptr); | 210 | 5.17M | } | 211 | 5.17M | ptr = ptr.offset(1); | 212 | | } | 213 | 752k | sub(end_ptr, start_ptr) | 214 | 752k | } |
bstr::ascii::first_non_ascii_byte_slow Line | Count | Source | 199 | 54.6k | unsafe fn first_non_ascii_byte_slow( | 200 | 54.6k | start_ptr: *const u8, | 201 | 54.6k | end_ptr: *const u8, | 202 | 54.6k | mut ptr: *const u8, | 203 | 54.6k | ) -> usize { | 204 | 54.6k | debug_assert!(start_ptr <= ptr); | 205 | 54.6k | debug_assert!(ptr <= end_ptr); | 206 | | | 207 | 271k | while ptr < end_ptr { | 208 | 235k | if *ptr > 0x7F { | 209 | 18.7k | return sub(ptr, start_ptr); | 210 | 216k | } | 211 | 216k | ptr = ptr.offset(1); | 212 | | } | 213 | 35.8k | sub(end_ptr, start_ptr) | 214 | 54.6k | } |
|
215 | | |
216 | | /// Compute the position of the first ASCII byte in the given mask. |
217 | | /// |
218 | | /// The mask should be computed by `chunk & ASCII_MASK`, where `chunk` is |
219 | | /// 8 contiguous bytes of the slice being checked where *at least* one of those |
220 | | /// bytes is not an ASCII byte. |
221 | | /// |
222 | | /// The position returned is always in the inclusive range [0, 7]. |
223 | | #[cfg(any(test, miri, not(target_arch = "x86_64")))] |
224 | | fn first_non_ascii_byte_mask(mask: usize) -> usize { |
225 | | #[cfg(target_endian = "little")] |
226 | | { |
227 | | mask.trailing_zeros() as usize / 8 |
228 | | } |
229 | | #[cfg(target_endian = "big")] |
230 | | { |
231 | | mask.leading_zeros() as usize / 8 |
232 | | } |
233 | | } |
234 | | |
235 | | /// Increment the given pointer by the given amount. |
236 | 4.65M | unsafe fn ptr_add(ptr: *const u8, amt: usize) -> *const u8 { |
237 | 4.65M | ptr.add(amt) |
238 | 4.65M | } Unexecuted instantiation: bstr::ascii::ptr_add Line | Count | Source | 236 | 175k | unsafe fn ptr_add(ptr: *const u8, amt: usize) -> *const u8 { | 237 | 175k | ptr.add(amt) | 238 | 175k | } |
Line | Count | Source | 236 | 4.04M | unsafe fn ptr_add(ptr: *const u8, amt: usize) -> *const u8 { | 237 | 4.04M | ptr.add(amt) | 238 | 4.04M | } |
Line | Count | Source | 236 | 435k | unsafe fn ptr_add(ptr: *const u8, amt: usize) -> *const u8 { | 237 | 435k | ptr.add(amt) | 238 | 435k | } |
|
239 | | |
240 | | /// Decrement the given pointer by the given amount. |
241 | 4.76M | unsafe fn ptr_sub(ptr: *const u8, amt: usize) -> *const u8 { |
242 | 4.76M | ptr.sub(amt) |
243 | 4.76M | } Unexecuted instantiation: bstr::ascii::ptr_sub Line | Count | Source | 241 | 194k | unsafe fn ptr_sub(ptr: *const u8, amt: usize) -> *const u8 { | 242 | 194k | ptr.sub(amt) | 243 | 194k | } |
Line | Count | Source | 241 | 4.09M | unsafe fn ptr_sub(ptr: *const u8, amt: usize) -> *const u8 { | 242 | 4.09M | ptr.sub(amt) | 243 | 4.09M | } |
Line | Count | Source | 241 | 475k | unsafe fn ptr_sub(ptr: *const u8, amt: usize) -> *const u8 { | 242 | 475k | ptr.sub(amt) | 243 | 475k | } |
|
244 | | |
245 | | #[cfg(any(test, miri, not(target_arch = "x86_64")))] |
246 | | unsafe fn read_unaligned_usize(ptr: *const u8) -> usize { |
247 | | use core::ptr; |
248 | | |
249 | | let mut n: usize = 0; |
250 | | ptr::copy_nonoverlapping(ptr, &mut n as *mut _ as *mut u8, USIZE_BYTES); |
251 | | n |
252 | | } |
253 | | |
254 | | /// Subtract `b` from `a` and return the difference. `a` should be greater than |
255 | | /// or equal to `b`. |
256 | 1.07M | fn sub(a: *const u8, b: *const u8) -> usize { |
257 | 1.07M | debug_assert!(a >= b); |
258 | 1.07M | (a as usize) - (b as usize) |
259 | 1.07M | } Unexecuted instantiation: bstr::ascii::sub Line | Count | Source | 256 | 19.3k | fn sub(a: *const u8, b: *const u8) -> usize { | 257 | 19.3k | debug_assert!(a >= b); | 258 | 19.3k | (a as usize) - (b as usize) | 259 | 19.3k | } |
Line | Count | Source | 256 | 953k | fn sub(a: *const u8, b: *const u8) -> usize { | 257 | 953k | debug_assert!(a >= b); | 258 | 953k | (a as usize) - (b as usize) | 259 | 953k | } |
Line | Count | Source | 256 | 104k | fn sub(a: *const u8, b: *const u8) -> usize { | 257 | 104k | debug_assert!(a >= b); | 258 | 104k | (a as usize) - (b as usize) | 259 | 104k | } |
|
260 | | |
261 | | #[cfg(test)] |
262 | | mod tests { |
263 | | use super::*; |
264 | | |
265 | | // Our testing approach here is to try and exhaustively test every case. |
266 | | // This includes the position at which a non-ASCII byte occurs in addition |
267 | | // to the alignment of the slice that we're searching. |
268 | | |
269 | | #[test] |
270 | | fn positive_fallback_forward() { |
271 | | for i in 0..517 { |
272 | | let s = "a".repeat(i); |
273 | | assert_eq!( |
274 | | i, |
275 | | first_non_ascii_byte_fallback(s.as_bytes()), |
276 | | "i: {:?}, len: {:?}, s: {:?}", |
277 | | i, |
278 | | s.len(), |
279 | | s |
280 | | ); |
281 | | } |
282 | | } |
283 | | |
284 | | #[test] |
285 | | #[cfg(target_arch = "x86_64")] |
286 | | #[cfg(not(miri))] |
287 | | fn positive_sse2_forward() { |
288 | | for i in 0..517 { |
289 | | let b = "a".repeat(i).into_bytes(); |
290 | | assert_eq!(b.len(), first_non_ascii_byte_sse2(&b)); |
291 | | } |
292 | | } |
293 | | |
294 | | #[test] |
295 | | #[cfg(not(miri))] |
296 | | fn negative_fallback_forward() { |
297 | | for i in 0..517 { |
298 | | for align in 0..65 { |
299 | | let mut s = "a".repeat(i); |
300 | | s.push_str("☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃"); |
301 | | let s = s.get(align..).unwrap_or(""); |
302 | | assert_eq!( |
303 | | i.saturating_sub(align), |
304 | | first_non_ascii_byte_fallback(s.as_bytes()), |
305 | | "i: {:?}, align: {:?}, len: {:?}, s: {:?}", |
306 | | i, |
307 | | align, |
308 | | s.len(), |
309 | | s |
310 | | ); |
311 | | } |
312 | | } |
313 | | } |
314 | | |
315 | | #[test] |
316 | | #[cfg(target_arch = "x86_64")] |
317 | | #[cfg(not(miri))] |
318 | | fn negative_sse2_forward() { |
319 | | for i in 0..517 { |
320 | | for align in 0..65 { |
321 | | let mut s = "a".repeat(i); |
322 | | s.push_str("☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃"); |
323 | | let s = s.get(align..).unwrap_or(""); |
324 | | assert_eq!( |
325 | | i.saturating_sub(align), |
326 | | first_non_ascii_byte_sse2(s.as_bytes()), |
327 | | "i: {:?}, align: {:?}, len: {:?}, s: {:?}", |
328 | | i, |
329 | | align, |
330 | | s.len(), |
331 | | s |
332 | | ); |
333 | | } |
334 | | } |
335 | | } |
336 | | } |