/src/serenity/AK/CircularBuffer.cpp
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2022, Lucas Chollet <lucas.chollet@free.fr> |
3 | | * |
4 | | * SPDX-License-Identifier: BSD-2-Clause |
5 | | */ |
6 | | |
7 | | #include <AK/CircularBuffer.h> |
8 | | #include <AK/MemMem.h> |
9 | | #include <AK/Stream.h> |
10 | | |
11 | | namespace AK { |
12 | | |
// Takes ownership of `buffer` as the backing store; the buffer starts out
// empty (reading head at 0, no used space, no seekback history).
CircularBuffer::CircularBuffer(ByteBuffer buffer)
    : m_buffer(move(buffer))
{
}
17 | | |
18 | | ErrorOr<CircularBuffer> CircularBuffer::create_empty(size_t size) |
19 | 62.2k | { |
20 | 62.2k | auto temporary_buffer = TRY(ByteBuffer::create_uninitialized(size)); |
21 | | |
22 | 62.2k | CircularBuffer circular_buffer { move(temporary_buffer) }; |
23 | | |
24 | 62.2k | return circular_buffer; |
25 | 62.2k | } |
26 | | |
27 | | ErrorOr<CircularBuffer> CircularBuffer::create_initialized(ByteBuffer buffer) |
28 | 0 | { |
29 | 0 | CircularBuffer circular_buffer { move(buffer) }; |
30 | |
|
31 | 0 | circular_buffer.m_used_space = circular_buffer.m_buffer.size(); |
32 | |
|
33 | 0 | return circular_buffer; |
34 | 0 | } |
35 | | |
36 | | size_t CircularBuffer::empty_space() const |
37 | 494M | { |
38 | 494M | return capacity() - m_used_space; |
39 | 494M | } |
40 | | |
// Number of bytes currently stored and available for reading.
size_t CircularBuffer::used_space() const
{
    return m_used_space;
}
45 | | |
// Total size of the backing buffer (used + empty space).
size_t CircularBuffer::capacity() const
{
    return m_buffer.size();
}
50 | | |
// Number of bytes behind the write head that can currently be re-read via
// read_with_seekback() / copy_from_seekback(). Grows with writes, capped at
// capacity().
size_t CircularBuffer::seekback_limit() const
{
    return m_seekback_limit;
}
55 | | |
// Number of bytes available for searching: the portion of the seekback area
// that lies before the reading head (i.e. data that has already been
// consumed). This cannot underflow: write() raises both counters in
// lockstep (clamping the limit at capacity, which also bounds m_used_space),
// while reads/discards only decrease m_used_space.
size_t SearchableCircularBuffer::search_limit() const
{
    return m_seekback_limit - m_used_space;
}
60 | | |
61 | | bool CircularBuffer::is_wrapping_around() const |
62 | 560M | { |
63 | 560M | return capacity() <= m_reading_head + m_used_space; |
64 | 560M | } |
65 | | |
// Searches the stored data for `needle`, optionally restricted to the byte
// range [from, until) (offsets relative to the reading head; defaults are
// the full used region). Returns the offset of the first occurrence relative
// to the reading head, or an empty Optional if not found.
Optional<size_t> CircularBuffer::offset_of(StringView needle, Optional<size_t> from, Optional<size_t> until) const
{
    auto const read_from = from.value_or(0);
    auto const read_until = until.value_or(m_used_space);
    VERIFY(read_from <= read_until);

    // The used data occupies at most two contiguous spans of the backing
    // buffer: one up to the physical end, and (when wrapping) a second one
    // at the start of the buffer.
    Array<ReadonlyBytes, 2> spans {};
    spans[0] = next_read_span();
    auto const original_span_0_size = spans[0].size();

    // Skip the first `read_from` bytes of the search window.
    if (read_from > 0)
        spans[0] = spans[0].slice(min(spans[0].size(), read_from));

    // Either the window ends within the first span (trim it), or we may need
    // the wrapped-around tail at the start of the backing buffer.
    if (spans[0].size() + read_from > read_until)
        spans[0] = spans[0].trim(read_until - read_from);
    else if (is_wrapping_around())
        spans[1] = m_buffer.span().slice(max(original_span_0_size, read_from) - original_span_0_size, min(read_until, m_used_space) - original_span_0_size);

    // memmem searches across the span sequence; its result is relative to
    // the start of the sliced window, so shift it back by `read_from`.
    auto maybe_found = AK::memmem(spans.begin(), spans.end(), needle.bytes());
    if (maybe_found.has_value())
        *maybe_found += read_from;

    return maybe_found;
}
90 | | |
91 | | void CircularBuffer::clear() |
92 | 0 | { |
93 | 0 | m_reading_head = 0; |
94 | 0 | m_used_space = 0; |
95 | 0 | m_seekback_limit = 0; |
96 | 0 | } |
97 | | |
98 | | Bytes CircularBuffer::next_write_span() |
99 | 560M | { |
100 | 560M | if (is_wrapping_around()) |
101 | 17.0M | return m_buffer.span().slice(m_reading_head + m_used_space - capacity(), capacity() - m_used_space); |
102 | 543M | return m_buffer.span().slice(m_reading_head + m_used_space, capacity() - (m_reading_head + m_used_space)); |
103 | 560M | } |
104 | | |
// Returns the next contiguous span of readable data, starting `offset` bytes
// past the reading head. Only a single contiguous run is returned; when the
// data wraps around the backing buffer, the remainder needs another call.
// Returns an empty span when `offset` is at or past the end of the used data.
ReadonlyBytes CircularBuffer::next_read_span(size_t offset) const
{
    auto reading_head = m_reading_head;
    auto used_space = m_used_space;

    if (offset > 0) {
        if (offset >= used_space)
            return Bytes {};

        // Advance a virtual reading head, wrapping around the capacity.
        reading_head = (reading_head + offset) % capacity();
        used_space -= offset;
    }

    // Clamp the span to the physical end of the backing buffer.
    return m_buffer.span().slice(reading_head, min(capacity() - reading_head, used_space));
}
120 | | |
// Returns a contiguous span of previously written data that starts
// `distance` bytes behind the write head. The span is clamped to the
// physical end of the backing buffer, so it may be shorter than `distance`
// when the seekback region wraps around.
ReadonlyBytes CircularBuffer::next_seekback_span(size_t distance) const
{
    VERIFY(m_seekback_limit <= capacity());
    VERIFY(distance <= m_seekback_limit);

    // Note: We are adding the capacity once here to ensure that we can wrap around the negative space by using modulo.
    auto read_offset = (capacity() + m_reading_head + m_used_space - distance) % capacity();

    return m_buffer.span().slice(read_offset, min(capacity() - read_offset, distance));
}
131 | | |
// Returns a contiguous span of already-consumed data that starts `distance`
// bytes behind the reading head (as opposed to next_seekback_span, which is
// relative to the write head). Clamped to the physical end of the buffer.
ReadonlyBytes SearchableCircularBuffer::next_search_span(size_t distance) const
{
    VERIFY(search_limit() <= capacity());
    VERIFY(distance <= search_limit());

    // Note: We are adding the capacity once here to ensure that we can wrap around the negative space by using modulo.
    auto read_offset = (capacity() + m_reading_head - distance) % capacity();

    return m_buffer.span().slice(read_offset, min(capacity() - read_offset, distance));
}
142 | | |
// Copies as many bytes as fit from `bytes` into the buffer and returns the
// number of bytes actually written (less than bytes.size() when the buffer
// fills up).
size_t CircularBuffer::write(ReadonlyBytes bytes)
{
    auto remaining = bytes.size();

    // The free space may consist of up to two contiguous runs, so loop until
    // the input is exhausted or no writable span remains.
    while (remaining > 0) {
        auto const next_span = next_write_span();
        if (next_span.size() == 0)
            break;

        auto const written_bytes = bytes.slice(bytes.size() - remaining).copy_trimmed_to(next_span);

        m_used_space += written_bytes;

        // Newly written data extends the seekback window, capped at capacity.
        m_seekback_limit += written_bytes;
        if (m_seekback_limit > capacity())
            m_seekback_limit = capacity();

        remaining -= written_bytes;
    }

    return bytes.size() - remaining;
}
165 | | |
// Moves up to bytes.size() bytes out of the buffer into `bytes` and returns
// the filled prefix of `bytes` (shorter than requested when the buffer runs
// out of data).
Bytes CircularBuffer::read(Bytes bytes)
{
    auto remaining = bytes.size();

    // The readable data may consist of up to two contiguous runs, so loop
    // until the output is full or nothing is left to read.
    while (remaining > 0) {
        auto const next_span = next_read_span();
        if (next_span.size() == 0)
            break;

        auto written_bytes = next_span.copy_trimmed_to(bytes.slice(bytes.size() - remaining));

        m_used_space -= written_bytes;
        m_reading_head += written_bytes;

        // Wrap the reading head back into the backing buffer.
        if (m_reading_head >= capacity())
            m_reading_head -= capacity();

        remaining -= written_bytes;
    }

    return bytes.trim(bytes.size() - remaining);
}
188 | | |
// Copies data starting `distance` bytes behind the write head into `bytes`,
// without consuming any buffer state (const). Returns the filled prefix of
// `bytes`, or an error when `distance` exceeds the current seekback limit.
ErrorOr<Bytes> CircularBuffer::read_with_seekback(Bytes bytes, size_t distance) const
{
    if (distance > m_seekback_limit)
        return Error::from_string_literal("Tried a seekback read beyond the seekback limit");

    auto remaining = bytes.size();

    while (remaining > 0) {
        auto const next_span = next_seekback_span(distance);
        if (next_span.size() == 0)
            break;

        auto written_bytes = next_span.copy_trimmed_to(bytes.slice(bytes.size() - remaining));

        // Advancing through the seekback area shrinks the distance to the
        // write head by the amount just copied.
        distance -= written_bytes;
        remaining -= written_bytes;
    }

    return bytes.trim(bytes.size() - remaining);
}
209 | | |
210 | | ErrorOr<void> CircularBuffer::discard(size_t discarding_size) |
211 | 1.94M | { |
212 | 1.94M | if (m_used_space < discarding_size) |
213 | 0 | return Error::from_string_literal("Can not discard more data than what the buffer contains"); |
214 | 1.94M | m_used_space -= discarding_size; |
215 | 1.94M | m_reading_head = (m_reading_head + discarding_size) % capacity(); |
216 | | |
217 | 1.94M | return {}; |
218 | 1.94M | } |
219 | | |
// Performs a single read from `stream` into the next contiguous free span
// and returns the number of bytes obtained (0 when the buffer is full).
ErrorOr<size_t> CircularBuffer::fill_from_stream(Stream& stream)
{
    auto next_span = next_write_span();
    if (next_span.size() == 0)
        return 0;

    auto bytes = TRY(stream.read_some(next_span));
    m_used_space += bytes.size();

    // Mirror write(): freshly filled data extends the seekback window,
    // capped at capacity.
    m_seekback_limit += bytes.size();
    if (m_seekback_limit > capacity())
        m_seekback_limit = capacity();

    return bytes.size();
}
235 | | |
// Writes the next contiguous span of buffered data to `stream` once and
// returns the number of bytes the stream accepted (0 when the buffer is
// empty). A single call may flush only part of the data when it wraps.
ErrorOr<size_t> CircularBuffer::flush_to_stream(Stream& stream)
{
    auto next_span = next_read_span();
    if (next_span.size() == 0)
        return 0;

    auto written_bytes = TRY(stream.write_some(next_span));

    m_used_space -= written_bytes;
    m_reading_head += written_bytes;

    // Wrap the reading head back into the backing buffer.
    if (m_reading_head >= capacity())
        m_reading_head -= capacity();

    return written_bytes;
}
252 | | |
// Copies `length` bytes of previously written data, starting `distance`
// bytes behind the write head, back into the buffer (a compression-style
// back-reference). When length > distance, the copy overlaps the data being
// produced, repeating it. Returns the number of bytes actually copied
// (limited by the available empty space).
ErrorOr<size_t> CircularBuffer::copy_from_seekback(size_t distance, size_t length)
{
    if (distance > m_seekback_limit)
        return Error::from_string_literal("Tried a seekback copy beyond the seekback limit");

    auto remaining_length = length;
    while (remaining_length > 0) {
        if (empty_space() == 0)
            break;

        auto next_span = next_seekback_span(distance);
        if (next_span.size() == 0)
            break;

        // write() keeps `distance` pointing at the same data, since the
        // write head advances by exactly the amount written.
        auto length_written = write(next_span.trim(remaining_length));
        remaining_length -= length_written;

        // If we copied right from the end of the seekback area (i.e. our length is larger than the distance)
        // and the last copy was one complete "chunk", we can now double the distance to copy twice as much data in one go.
        if (remaining_length > distance && length_written == distance)
            distance *= 2;
    }

    return length - remaining_length;
}
278 | | |
// Forwards the backing store to the CircularBuffer base; the hash/location
// search maps start out empty.
SearchableCircularBuffer::SearchableCircularBuffer(ByteBuffer buffer)
    : CircularBuffer(move(buffer))
{
}
283 | | |
284 | | ErrorOr<SearchableCircularBuffer> SearchableCircularBuffer::create_empty(size_t size) |
285 | 2.72k | { |
286 | 2.72k | auto temporary_buffer = TRY(ByteBuffer::create_uninitialized(size)); |
287 | | |
288 | 2.72k | SearchableCircularBuffer circular_buffer { move(temporary_buffer) }; |
289 | | |
290 | 2.72k | return circular_buffer; |
291 | 2.72k | } |
292 | | |
// Creates a buffer that starts out completely filled with `buffer`'s data
// and pre-populates the search index with a location hash for every
// HASH_CHUNK_SIZE-sized window of that data.
ErrorOr<SearchableCircularBuffer> SearchableCircularBuffer::create_initialized(ByteBuffer buffer)
{
    SearchableCircularBuffer circular_buffer { move(buffer) };

    // Mark the entire backing store as readable data.
    circular_buffer.m_used_space = circular_buffer.m_buffer.size();

    for (size_t i = 0; i + HASH_CHUNK_SIZE <= circular_buffer.m_buffer.size(); i++)
        TRY(circular_buffer.insert_location_hash(circular_buffer.m_buffer.span().slice(i, HASH_CHUNK_SIZE), i));

    return circular_buffer;
}
304 | | |
305 | | ErrorOr<Bytes> SearchableCircularBuffer::read(Bytes bytes) |
306 | 728k | { |
307 | 728k | auto read_bytes_span = CircularBuffer::read(bytes); |
308 | 728k | TRY(hash_new_bytes()); |
309 | 728k | return read_bytes_span; |
310 | 728k | } |
311 | | |
312 | | ErrorOr<void> SearchableCircularBuffer::discard(size_t discarded_bytes) |
313 | 1.94M | { |
314 | 1.94M | TRY(CircularBuffer::discard(discarded_bytes)); |
315 | 1.94M | TRY(hash_new_bytes()); |
316 | 1.94M | return {}; |
317 | 1.94M | } |
318 | | |
319 | | ErrorOr<size_t> SearchableCircularBuffer::flush_to_stream(Stream& stream) |
320 | 0 | { |
321 | 0 | auto flushed_byte_count = TRY(CircularBuffer::flush_to_stream(stream)); |
322 | 0 | TRY(hash_new_bytes()); |
323 | 0 | return flushed_byte_count; |
324 | 0 | } |
325 | | |
// Finds the best (longest) match for the data at the current reading head
// within the search area. Tries the hash-based index first (for needles of
// at least HASH_CHUNK_SIZE bytes), then falls back to a reverse memory
// search for shorter minimum lengths. Returns a Match { distance, length }
// with length in [minimum_length, maximum_length], or an empty Optional if
// no match reaches minimum_length.
Optional<SearchableCircularBuffer::Match> SearchableCircularBuffer::find_copy_in_seekback(size_t maximum_length, size_t minimum_length)
{
    VERIFY(minimum_length > 0);

    // Clip the maximum length to the amount of data that we actually store.
    if (maximum_length > m_used_space)
        maximum_length = m_used_space;

    if (maximum_length < minimum_length)
        return {};

    Optional<Match> best_match;

    Array<u8, HASH_CHUNK_SIZE> needle_storage;
    auto needle = needle_storage.span().trim(min(HASH_CHUNK_SIZE, maximum_length));

    {
        // Copy the bytes at the reading head (distance used_space() behind
        // the write head) into the needle buffer.
        auto needle_read_bytes = MUST(read_with_seekback(needle, used_space()));
        VERIFY(needle_read_bytes.size() == needle.size());
    }

    // Try an efficient hash-based search first.
    if (needle.size() >= HASH_CHUNK_SIZE) {
        auto needle_hash = StringView { needle }.hash();

        auto maybe_starting_offset = m_hash_location_map.get(needle_hash);

        if (maybe_starting_offset.has_value()) {
            Optional<size_t> previous_buffer_offset;
            auto current_buffer_offset = maybe_starting_offset.value();

            // Walk the chain of buffer offsets that share this hash.
            while (true) {
                auto current_search_offset = (capacity() + m_reading_head - current_buffer_offset) % capacity();

                // Validate the hash. In case it is invalid, we can discard the rest of the chain, as the data (and everything older) got updated.
                Array<u8, HASH_CHUNK_SIZE> hash_chunk_at_offset;
                auto hash_chunk_at_offset_span = MUST(read_with_seekback(hash_chunk_at_offset, current_search_offset + used_space()));
                VERIFY(hash_chunk_at_offset_span.size() == HASH_CHUNK_SIZE);
                auto found_chunk_hash = StringView { hash_chunk_at_offset }.hash();
                if (needle_hash != found_chunk_hash) {
                    if (!previous_buffer_offset.has_value())
                        m_hash_location_map.remove(needle_hash);
                    else
                        m_location_chain_map.remove(*previous_buffer_offset);
                    break;
                }

                // Validate the match through the set-distance-based implementation.
                auto maybe_new_match = find_copy_in_seekback(Array { current_search_offset }, maximum_length, HASH_CHUNK_SIZE);

                // If we found a match, record it.
                // If we haven't found a match, we simply got a hash collision, so skip.
                if (maybe_new_match.has_value()) {
                    auto new_match = maybe_new_match.release_value();

                    if (!best_match.has_value() || best_match->length < new_match.length) {
                        best_match = new_match;

                        // If we already found a result with the best possible length, then stop searching.
                        if (best_match->length >= maximum_length)
                            break;
                    }
                }

                // Get the next location with the same hash from the location chain.
                auto maybe_next_buffer_offset = m_location_chain_map.get(current_buffer_offset);

                // End of the chain, nothing more to check.
                if (!maybe_next_buffer_offset.has_value())
                    break;

                previous_buffer_offset = current_buffer_offset;
                current_buffer_offset = maybe_next_buffer_offset.release_value();
            }

            // If we found a match, return it now.
            if (best_match.has_value())
                return best_match;
        }
    }

    // Needle size needs to be limited, otherwise we can only find HASH_CHUNK_SIZE-sized matches using memory search.
    needle = needle.trim(minimum_length);

    // Try a plain memory search for smaller values.
    // Note: This overlaps with the hash search for chunks of size HASH_CHUNK_SIZE for the purpose of validation.
    if (minimum_length <= HASH_CHUNK_SIZE) {
        size_t haystack_offset_from_end = 0;
        Vector<ReadonlyBytes, 2> haystack;

        // Note: memmem_reverse expects memory chunks in the order that it should search in,
        // so haystack[0] needs to be the memory with the highest match priority.
        haystack.append(next_search_span(search_limit()));
        if (haystack[0].size() < search_limit())
            haystack.prepend(next_search_span(search_limit() - haystack[0].size()));

        auto memmem_match = AK::memmem_reverse(haystack.begin(), haystack.end(), needle);
        while (memmem_match.has_value()) {
            auto match_offset = memmem_match.release_value();
            auto corrected_match_distance = haystack_offset_from_end + match_offset;

            // Validate the match through the set-distance-based implementation and extend it to the largest size possible.
            auto maybe_new_match = find_copy_in_seekback(Array { corrected_match_distance }, min(maximum_length, HASH_CHUNK_SIZE), minimum_length);

            // If we weren't able to validate the match at all, either our memmem search returned garbage or our validation function is incorrect. Investigate.
            VERIFY(maybe_new_match.has_value());

            auto new_match = maybe_new_match.release_value();

            if (!best_match.has_value() || best_match->length < new_match.length) {
                best_match = new_match;

                // If we already found a result with the best possible length, then stop searching.
                if (best_match->length >= maximum_length)
                    break;
            }

            auto size_to_discard = match_offset + 1;

            // Trim away the already processed bytes from the haystack.
            // Running out of haystack to discard is fine, in this case we found a match at the largest
            // distance and therefore tried to advance past that.
            haystack_offset_from_end += size_to_discard;
            while (size_to_discard > 0 && haystack.size() > 0) {
                if (haystack[0].size() <= size_to_discard) {
                    size_to_discard -= haystack[0].size();
                    haystack.remove(0);
                } else {
                    haystack[0] = haystack[0].slice(0, haystack[0].size() - size_to_discard);
                    break;
                }
            }

            if (haystack.size() == 0)
                break;

            // Try and find the next match.
            memmem_match = AK::memmem_reverse(haystack.begin(), haystack.end(), needle);
        }

        // If we found a match of size HASH_CHUNK_SIZE, we should have already found that using the hash search. Investigate.
        VERIFY(!best_match.has_value() || best_match->length < HASH_CHUNK_SIZE);
    }

    return best_match;
}
472 | | |
// For each candidate `distance`, measures how far the data at the current
// reading head matches the already-consumed data that many bytes back, and
// returns the best Match found. Candidates outside [minimum_length,
// search_limit()] are skipped; matches shorter than minimum_length are
// discarded. Returns an empty Optional when nothing qualifies.
Optional<SearchableCircularBuffer::Match> SearchableCircularBuffer::find_copy_in_seekback(ReadonlySpan<size_t> distances, size_t maximum_length, size_t minimum_length) const
{
    VERIFY(minimum_length > 0);

    // Clip the maximum length to the amount of data that we actually store.
    if (maximum_length > m_used_space)
        maximum_length = m_used_space;

    if (maximum_length < minimum_length)
        return Optional<Match> {};

    Optional<Match> best_match;

    for (auto distance : distances) {
        // Discard distances outside the valid range.
        if (distance > search_limit() || distance <= 0)
            continue;

        // TODO: This does not yet support looping repetitions.
        if (distance < minimum_length)
            continue;

        auto current_match_length = 0ul;

        // Grow the match one contiguous span at a time, since both the
        // haystack and the needle may wrap around the backing buffer.
        while (current_match_length < maximum_length) {
            auto haystack = next_search_span(distance - current_match_length).trim(maximum_length - current_match_length);
            auto needle = next_read_span(current_match_length).trim(maximum_length - current_match_length);

            auto submatch_length = haystack.matching_prefix_length(needle);

            if (submatch_length == 0)
                break;

            current_match_length += submatch_length;
        }

        // Discard matches that don't reach the minimum length.
        if (current_match_length < minimum_length)
            continue;

        if (!best_match.has_value() || best_match->length < current_match_length)
            best_match = Match { distance, current_match_length };
    }

    return best_match;
}
519 | | |
520 | | ErrorOr<void> SearchableCircularBuffer::insert_location_hash(ReadonlyBytes value, size_t raw_offset) |
521 | 62.2M | { |
522 | 62.2M | VERIFY(value.size() == HASH_CHUNK_SIZE); |
523 | | |
524 | 62.2M | auto value_hash = StringView { value }.hash(); |
525 | | |
526 | | // Discard any old entries for this offset first. This should eliminate accidental loops by breaking the chain. |
527 | | // The actual cleanup is done on access, since we can only remove invalid references when actually walking the chain. |
528 | 62.2M | m_location_chain_map.remove(raw_offset); |
529 | | |
530 | | // Check if we have any existing entries for this hash. |
531 | | // If so, we need to add it to the location chain map instead, as we will soon replace the entry in the hash location map. |
532 | 62.2M | auto existing_entry = m_hash_location_map.get(value_hash); |
533 | | |
534 | 62.2M | if (existing_entry.has_value()) |
535 | 61.0M | TRY(m_location_chain_map.try_set(raw_offset, existing_entry.value())); |
536 | | |
537 | 62.2M | TRY(m_hash_location_map.try_set(value_hash, raw_offset)); |
538 | | |
539 | 62.2M | return {}; |
540 | 62.2M | } |
541 | | |
// Advances m_hash_head up to the current reading head, inserting a location
// hash for every HASH_CHUNK_SIZE-sized window of newly consumed data so it
// becomes findable by the hash-based search. Called after read/discard/flush.
ErrorOr<void> SearchableCircularBuffer::hash_new_bytes()
{
    // Stop early if we don't have enough data overall to hash a full chunk.
    if (search_limit() < HASH_CHUNK_SIZE)
        return {};

    // The last offset at which a full chunk can still start before the
    // reading head (modulo wrap-around).
    auto last_hash_head = (capacity() + m_reading_head - HASH_CHUNK_SIZE) % capacity();

    while (m_hash_head <= last_hash_head) {
        auto recalculation_span = m_buffer.span().slice(m_hash_head, (capacity() + m_reading_head - m_hash_head) % capacity());

        // If the span is smaller than a hash chunk, we need to manually craft some consecutive data to do the hashing.
        if (recalculation_span.size() < HASH_CHUNK_SIZE) {
            auto auxiliary_span = m_buffer.span().slice((m_hash_head + recalculation_span.size()) % capacity());

            // Ensure that our math is correct and that both spans are "adjacent".
            VERIFY(recalculation_span.data() + recalculation_span.size() == m_buffer.data() + m_buffer.size());
            VERIFY(auxiliary_span.data() == m_buffer.data());

            while (recalculation_span.size() > 0 && recalculation_span.size() + auxiliary_span.size() >= HASH_CHUNK_SIZE) {
                // Stitch the tail of the buffer and its wrapped-around start
                // together into one contiguous chunk for hashing.
                Array<u8, HASH_CHUNK_SIZE> temporary_hash_chunk;

                auto copied_from_recalculation_span = recalculation_span.copy_to(temporary_hash_chunk);
                VERIFY(copied_from_recalculation_span == recalculation_span.size());

                auto copied_from_auxiliary_span = auxiliary_span.copy_to(temporary_hash_chunk.span().slice(copied_from_recalculation_span));
                VERIFY(copied_from_recalculation_span + copied_from_auxiliary_span == HASH_CHUNK_SIZE);

                TRY(insert_location_hash(temporary_hash_chunk, recalculation_span.data() - m_buffer.data()));

                recalculation_span = recalculation_span.slice(1);
                m_hash_head++;
            }

            continue;
        }

        // Fast path: the chunk windows are fully contiguous in the buffer.
        for (size_t i = 0; i + HASH_CHUNK_SIZE <= recalculation_span.size(); i++) {
            auto value = recalculation_span.slice(i, HASH_CHUNK_SIZE);
            auto raw_offset = value.data() - m_buffer.data();
            TRY(insert_location_hash(value, raw_offset));
            m_hash_head++;
        }
    }

    return {};
}
589 | | |
590 | | } |