/rust/registry/src/index.crates.io-6f17d22bba15001f/memchr-2.4.1/src/memmem/genericsimd.rs
use core::mem::size_of;

use crate::memmem::{util::memcmp, vector::Vector, NeedleInfo};

/// The minimum length of a needle required for this algorithm. The minimum
/// is 2 since a length of 1 should just use memchr and a length of 0 isn't
/// a case handled by this searcher.
pub(crate) const MIN_NEEDLE_LEN: usize = 2;

/// The maximum length of a needle permitted for this algorithm.
///
/// In reality, there is no hard max here. The code below can handle any
/// length needle. (Perhaps that suggests there are missing optimizations.)
/// Instead, this is a heuristic and a bound guaranteeing our linear time
/// complexity.
///
/// It is a heuristic because when a candidate match is found, memcmp is run.
/// For very large needles with lots of false positives, memcmp can make the
/// code run quite slow.
///
/// It is a bound because the worst case behavior with memcmp is multiplicative
/// in the size of the needle and haystack, and we want to keep that additive.
/// This cap ensures we still meet that bound theoretically, since it's just
/// a constant. We aren't acting in bad faith here: memcmp on tiny needles
/// is so fast that even in pathological cases (see the pathological vector
/// benchmarks), this is still just as fast or faster in practice.
///
/// This specific number was chosen by tweaking a bit and running benchmarks.
/// The rare-medium-needle benchmark, for example, gets about 5% faster by
/// using this algorithm instead of a prefilter-accelerated Two-Way. There's
/// also a theoretical desire to keep this number reasonably low, to mitigate
/// the impact of pathological cases. I did try 64, and some benchmarks got a
/// little better, and others (particularly the pathological ones) got a lot
/// worse. So... 32 it is?
pub(crate) const MAX_NEEDLE_LEN: usize = 32;

/// The implementation of the forward vector accelerated substring search.
///
/// This is extremely similar to the prefilter vector module by the same name.
/// The key difference is that this is not a prefilter. Instead, it handles
/// confirming its own matches. The trade-off is that this only works with
/// smaller needles. The speed-up here is that an inlined memcmp on a tiny
/// needle is very quick, even on pathological inputs. This is much better than
/// combining a prefilter with Two-Way, where using Two-Way to confirm the
/// match has higher latency.
///
/// So why not use this for all needles? We could, and it would probably work
/// really well on most inputs. But its worst case is multiplicative and we
/// want to guarantee worst case additive time. Some of the benchmarks try to
/// justify this (see the pathological ones).
///
/// The prefilter variant of this has more comments. Also note that we only
/// implement this for forward searches for now. If you have a compelling use
/// case for accelerated reverse search, please file an issue.
#[derive(Clone, Copy, Debug)]
pub(crate) struct Forward {
    rare1i: u8,
    rare2i: u8,
}
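// Illustrative note (the needle and rare byte choices below are hypothetical,
// not taken from the crate's tests): if the rarest bytes of a needle were
// judged to be, say, b'q' at index 2 and b'z' at index 5, then `rare1i == 2`
// and `rare2i == 5`. The search below loads one vector of haystack bytes at
// `ptr + 2` and compares it against a vector filled with b'q', loads another
// at `ptr + 5` and compares it against b'z', and only runs memcmp at the
// positions where both comparisons hit.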

impl Forward {
    /// Create a new "generic simd" forward searcher. If one could not be
    /// created from the given inputs, then None is returned.
    pub(crate) fn new(ninfo: &NeedleInfo, needle: &[u8]) -> Option<Forward> {
        let (rare1i, rare2i) = ninfo.rarebytes.as_rare_ordered_u8();
        // If the needle is too short or too long, give up. Also, give up
        // if the rare bytes detected are at the same position. (It likely
        // suggests a degenerate case, although it should technically not be
        // possible.)
        if needle.len() < MIN_NEEDLE_LEN
            || needle.len() > MAX_NEEDLE_LEN
            || rare1i == rare2i
        {
            return None;
        }
        Some(Forward { rare1i, rare2i })
    }

    /// Returns the minimum length of haystack that is needed for this searcher
    /// to work for a particular vector. Passing a haystack with a length
    /// smaller than this will cause `fwd_find` to panic.
    #[inline(always)]
    pub(crate) fn min_haystack_len<V: Vector>(&self) -> usize {
        self.rare2i as usize + size_of::<V>()
    }
}
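// Illustrative arithmetic for `min_haystack_len` above (the numbers are
// hypothetical): with a 32-byte vector such as __m256i and `rare2i == 5`,
// the minimum haystack length is 5 + 32 = 37. That is exactly what is needed
// for the unaligned load at `ptr + rare2i` in `fwd_find_in_chunk` to stay in
// bounds when `ptr` sits at its maximum value of `end_ptr - min_haystack_len`.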

/// Searches the given haystack for the given needle. The needle given should
/// be the same as the needle that this searcher was initialized with.
///
/// # Panics
///
/// When the given haystack has a length smaller than `min_haystack_len`.
///
/// # Safety
///
/// Since this is meant to be used with vector functions, callers need to
/// specialize this inside of a function with a `target_feature` attribute.
/// Therefore, callers must ensure that whatever target feature is being used
/// supports the vector functions that this function is specialized for. (For
/// the specific vector functions used, see the Vector trait implementations.)
#[inline(always)]
pub(crate) unsafe fn fwd_find<V: Vector>(
    fwd: &Forward,
    haystack: &[u8],
    needle: &[u8],
) -> Option<usize> {
    // It would be nice if we didn't have this check here, since the meta
    // searcher should handle it for us. But without this, I don't think we
    // guarantee that end_ptr.sub(needle.len()) won't result in UB. We could
    // put it as part of the safety contract, but it makes it more complicated
    // than necessary.
    if haystack.len() < needle.len() {
        return None;
    }
    let min_haystack_len = fwd.min_haystack_len::<V>();
    assert!(haystack.len() >= min_haystack_len, "haystack too small");
    debug_assert!(needle.len() <= haystack.len());
    debug_assert!(
        needle.len() >= MIN_NEEDLE_LEN,
        "needle must be at least {} bytes",
        MIN_NEEDLE_LEN,
    );
    debug_assert!(
        needle.len() <= MAX_NEEDLE_LEN,
        "needle must be at most {} bytes",
        MAX_NEEDLE_LEN,
    );

    let (rare1i, rare2i) = (fwd.rare1i as usize, fwd.rare2i as usize);
    let rare1chunk = V::splat(needle[rare1i]);
    let rare2chunk = V::splat(needle[rare2i]);

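    // The pointers below bracket the haystack: `max_ptr` is the last position
    // at which a vector load at `ptr + rare2i` (the larger of the two rare
    // byte offsets) still fits inside the haystack, i.e.
    // `end_ptr - min_haystack_len`. The main loop only advances `ptr` while
    // `ptr <= max_ptr`; whatever is left over is handled by the masked final
    // load below.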
    let start_ptr = haystack.as_ptr();
    let end_ptr = start_ptr.add(haystack.len());
    let max_ptr = end_ptr.sub(min_haystack_len);
    let mut ptr = start_ptr;

    // N.B. I did experiment with unrolling the loop to deal with size(V)
    // bytes at a time and 2*size(V) bytes at a time. The double unroll was
    // marginally faster while the quadruple unroll was unambiguously slower.
    // In the end, I decided the complexity from unrolling wasn't worth it. I
    // used the memmem/krate/prebuilt/huge-en/ benchmarks to compare.
    while ptr <= max_ptr {
        let m = fwd_find_in_chunk(
            fwd, needle, ptr, end_ptr, rare1chunk, rare2chunk, !0,
        );
        if let Some(chunki) = m {
            return Some(matched(start_ptr, ptr, chunki));
        }
        ptr = ptr.add(size_of::<V>());
    }
    if ptr < end_ptr {
        let remaining = diff(end_ptr, ptr);
        debug_assert!(
            remaining < min_haystack_len,
            "remaining bytes should be smaller than the minimum haystack \
             length of {}, but there are {} bytes remaining",
            min_haystack_len,
            remaining,
        );
        if remaining < needle.len() {
            return None;
        }
        debug_assert!(
            max_ptr < ptr,
            "after main loop, ptr should have exceeded max_ptr",
        );
        let overlap = diff(ptr, max_ptr);
        debug_assert!(
            overlap > 0,
            "overlap ({}) must always be non-zero",
            overlap,
        );
        debug_assert!(
            overlap < size_of::<V>(),
            "overlap ({}) cannot possibly be >= the size of a vector ({})",
            overlap,
            size_of::<V>(),
        );
        // The mask has all of its bits set except for the first N least
        // significant bits, where N=overlap. This way, any matches that
        // occur in fwd_find_in_chunk within the overlap are automatically
        // ignored.
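        // Worked example (with illustrative numbers): for a 16-byte vector
        // and overlap = 3, mask = !((1 << 3) - 1) = !0b111, so the three
        // least significant bits are zero and every other bit is set. Those
        // low bits correspond to the positions already examined by the
        // previous, unmasked load.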
        let mask = !((1 << overlap) - 1);
        ptr = max_ptr;
        let m = fwd_find_in_chunk(
            fwd, needle, ptr, end_ptr, rare1chunk, rare2chunk, mask,
        );
        if let Some(chunki) = m {
            return Some(matched(start_ptr, ptr, chunki));
        }
    }
    None
}

/// Search for an occurrence of two rare bytes from the needle in the chunk
/// pointed to by ptr, with the end of the haystack pointed to by end_ptr. When
/// an occurrence is found, memcmp is run to check if a match occurs at the
/// corresponding position.
///
/// rare1chunk and rare2chunk correspond to vectors with the rare1 and rare2
/// bytes repeated in each 8-bit lane, respectively.
///
/// mask should have bits set corresponding to the positions in the chunk in
/// which matches are considered. This is only used for the last vector load,
/// where the beginning of the vector might have overlapped with the last load
/// in the main loop. The mask lets us avoid visiting positions that have
/// already been discarded as matches.
///
/// # Safety
///
/// It must be safe to do an unaligned read of size(V) bytes starting at both
/// (ptr + rare1i) and (ptr + rare2i). It must also be safe to do unaligned
/// loads on ptr up to (end_ptr - needle.len()).
#[inline(always)]
unsafe fn fwd_find_in_chunk<V: Vector>(
    fwd: &Forward,
    needle: &[u8],
    ptr: *const u8,
    end_ptr: *const u8,
    rare1chunk: V,
    rare2chunk: V,
    mask: u32,
) -> Option<usize> {
    let chunk0 = V::load_unaligned(ptr.add(fwd.rare1i as usize));
    let chunk1 = V::load_unaligned(ptr.add(fwd.rare2i as usize));

    let eq0 = chunk0.cmpeq(rare1chunk);
    let eq1 = chunk1.cmpeq(rare2chunk);

    let mut match_offsets = eq0.and(eq1).movemask() & mask;
    while match_offsets != 0 {
        let offset = match_offsets.trailing_zeros() as usize;
        let ptr = ptr.add(offset);
        if end_ptr.sub(needle.len()) < ptr {
            return None;
        }
        let chunk = core::slice::from_raw_parts(ptr, needle.len());
        if memcmp(needle, chunk) {
            return Some(offset);
        }
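        // Clear the lowest set bit of match_offsets so that the next
        // iteration moves on to the next candidate position, if any.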
        match_offsets &= match_offsets - 1;
    }
    None
}

/// Accepts a chunk-relative offset and returns a haystack-relative offset.
///
/// See the function with the same name in the prefilter variant of this
/// algorithm to learn why it's tagged with inline(never). Even here, where
/// the function is simpler, inlining it leads to poorer codegen. (Although
/// it does improve some benchmarks, like prebuiltiter/huge-en/common-you.)
#[cold]
#[inline(never)]
fn matched(start_ptr: *const u8, ptr: *const u8, chunki: usize) -> usize {
    diff(ptr, start_ptr) + chunki
}

/// Subtract `b` from `a` and return the difference. `a` must be greater than
/// or equal to `b`.
fn diff(a: *const u8, b: *const u8) -> usize {
    debug_assert!(a >= b);
    (a as usize) - (b as usize)
}
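
// A minimal sketch, not part of the memchr crate itself, exercising the two
// bit tricks used above: the tail mask `!((1 << overlap) - 1)` in `fwd_find`
// and the `x &= x - 1` idiom that walks candidate positions in
// `fwd_find_in_chunk`. The concrete values are illustrative assumptions.
#[cfg(test)]
mod genericsimd_bit_tricks_sketch {
    #[test]
    fn tail_mask_clears_only_the_overlap() {
        // With overlap = 3, the three least significant bits are cleared and
        // all higher bits are kept, so candidates that fall in the
        // overlapping region of the final load (shifted back to max_ptr) are
        // ignored.
        let overlap = 3u32;
        let mask: u32 = !((1 << overlap) - 1);
        assert_eq!(mask & 0b0111, 0);
        assert_eq!(mask & 0b1000, 0b1000);
    }

    #[test]
    fn clearing_lowest_set_bit_walks_candidates_in_order() {
        // Simulate a movemask result with candidate offsets 1, 4 and 9.
        let mut match_offsets: u32 = (1 << 1) | (1 << 4) | (1 << 9);
        let mut seen = [0usize; 3];
        let mut n = 0;
        while match_offsets != 0 {
            seen[n] = match_offsets.trailing_zeros() as usize;
            n += 1;
            // Same idiom as in fwd_find_in_chunk: clear the lowest set bit.
            match_offsets &= match_offsets - 1;
        }
        assert_eq!((n, seen), (3, [1, 4, 9]));
    }
}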