Line data Source code
1 : // Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package sstable
6 :
7 : import (
8 : "context"
9 : "encoding/binary"
10 : "unsafe"
11 :
12 : "github.com/cockroachdb/errors"
13 : "github.com/cockroachdb/pebble/internal/base"
14 : "github.com/cockroachdb/pebble/internal/invariants"
15 : "github.com/cockroachdb/pebble/internal/keyspan"
16 : "github.com/cockroachdb/pebble/internal/manual"
17 : "github.com/cockroachdb/pebble/internal/rangedel"
18 : "github.com/cockroachdb/pebble/internal/rangekey"
19 : )
20 :
21 1 : func uvarintLen(v uint32) int {
22 1 : i := 0
23 1 : for v >= 0x80 {
24 0 : v >>= 7
25 0 : i++
26 0 : }
27 1 : return i + 1
28 : }
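
// Illustrative sketch (not part of the original file): uvarintLen mirrors the
// length that encoding/binary's PutUvarint would produce for the same value.
// A hypothetical cross-check, relying only on the encoding/binary import
// already present in this file, could look like this.
func checkUvarintLen(v uint32) bool {
	var buf [binary.MaxVarintLen32]byte
	n := binary.PutUvarint(buf[:], uint64(v))
	return n == uvarintLen(v)
}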
29 :
30 : type blockWriter struct {
31 : restartInterval int
32 : nEntries int
33 : nextRestart int
34 : buf []byte
35 : // For datablocks in TableFormatPebblev3, we steal the most significant bit
36 : // in restarts for encoding setHasSameKeyPrefixSinceLastRestart. This leaves
37 : // us with 31 bits, which is more than enough (no one needs > 2GB blocks).
38 : // Typically, restarts occur every 16 keys, and by storing this bit with the
39 : // restart, we can optimize for the case where a user wants to skip to the
40 : // next prefix which happens to be in the same data block, but is > 16 keys
41 : // away. We have seen production situations with 100+ versions per MVCC key
42 : // (which share the same prefix). Additionally, for such writers, the prefix
43 : // compression of a key relative to the preceding key is limited to the
44 : // prefix part of the preceding key -- this ensures that when doing
45 : // NextPrefix (see blockIter) we don't need to assemble the full key at each
46 : // step, since by limiting the length of the shared key we ensure that any
47 : // of the keys with the same prefix can be used to assemble the full key
48 : // when the prefix does change.
49 : restarts []uint32
50 : // Do not read curKey directly from outside blockWriter since it can have
51 : // the InternalKeyKindSSTableInternalObsoleteBit set. Use getCurKey() or
52 : // getCurUserKey() instead.
53 : curKey []byte
54 : // curValue excludes the optional prefix provided to
55 : // storeWithOptionalValuePrefix.
56 : curValue []byte
57 : prevKey []byte
58 : tmp [4]byte
59 : // We don't know the state of the sets that were at the end of the previous
60 : // block, so this is initially false. It may be true for the second and later
61 : // restarts in a block. Not having inter-block information is fine since we
62 : // will optimize by stepping through restarts only within the same block.
63 : // Note that the first restart is the first key in the block.
64 : setHasSameKeyPrefixSinceLastRestart bool
65 : }
66 :
67 1 : func (w *blockWriter) clear() {
68 1 : *w = blockWriter{
69 1 : buf: w.buf[:0],
70 1 : restarts: w.restarts[:0],
71 1 : curKey: w.curKey[:0],
72 1 : curValue: w.curValue[:0],
73 1 : prevKey: w.prevKey[:0],
74 1 : }
75 1 : }
76 :
77 : // MaximumBlockSize is an extremely generous maximum block size of 256MiB. We
78 : // explicitly place this limit to reserve a few bits in the restart for
79 : // internal use.
80 : const MaximumBlockSize = 1 << 28
81 : const setHasSameKeyPrefixRestartMask uint32 = 1 << 31
82 : const restartMaskLittleEndianHighByteWithoutSetHasSamePrefix byte = 0b0111_1111
83 : const restartMaskLittleEndianHighByteOnlySetHasSamePrefix byte = 0b1000_0000
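
// Illustrative sketch (not part of the original file): how a restart point is
// packed with the setHasSameKeyPrefix bit and later unpacked. The writer ORs
// setHasSameKeyPrefixRestartMask into the 31-bit offset (see
// storeWithOptionalValuePrefix), and readers strip the bit again, as
// decodeRestart does further below by masking the high byte of the
// little-endian encoding. splitRestart is a hypothetical helper name.
func splitRestart(encoded uint32) (offset uint32, setHasSameKeyPrefix bool) {
	setHasSameKeyPrefix = encoded&setHasSameKeyPrefixRestartMask != 0
	offset = encoded &^ setHasSameKeyPrefixRestartMask
	return offset, setHasSameKeyPrefix
}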
84 :
85 1 : func (w *blockWriter) getCurKey() InternalKey {
86 1 : k := base.DecodeInternalKey(w.curKey)
87 1 : k.Trailer = k.Trailer & trailerObsoleteMask
88 1 : return k
89 1 : }
90 :
91 1 : func (w *blockWriter) getCurUserKey() []byte {
92 1 : n := len(w.curKey) - base.InternalTrailerLen
93 1 : if n < 0 {
94 0 : panic(errors.AssertionFailedf("corrupt key in blockWriter buffer"))
95 : }
96 1 : return w.curKey[:n:n]
97 : }
98 :
99 : // If !addValuePrefix, the valuePrefix is ignored.
100 : func (w *blockWriter) storeWithOptionalValuePrefix(
101 : keySize int,
102 : value []byte,
103 : maxSharedKeyLen int,
104 : addValuePrefix bool,
105 : valuePrefix valuePrefix,
106 : setHasSameKeyPrefix bool,
107 1 : ) {
108 1 : shared := 0
109 1 : if !setHasSameKeyPrefix {
110 1 : w.setHasSameKeyPrefixSinceLastRestart = false
111 1 : }
112 1 : if w.nEntries == w.nextRestart {
113 1 : w.nextRestart = w.nEntries + w.restartInterval
114 1 : restart := uint32(len(w.buf))
115 1 : if w.setHasSameKeyPrefixSinceLastRestart {
116 1 : restart = restart | setHasSameKeyPrefixRestartMask
117 1 : }
118 1 : w.setHasSameKeyPrefixSinceLastRestart = true
119 1 : w.restarts = append(w.restarts, restart)
120 1 : } else {
121 1 : // TODO(peter): Manually inlined version of base.SharedPrefixLen(). This
122 1 : // is 3% faster on BenchmarkWriter on go1.16. Remove if future versions
123 1 : // show this to not be a performance win. For now, functions that use
124 1 : // unsafe cannot be inlined.
125 1 : n := maxSharedKeyLen
126 1 : if n > len(w.prevKey) {
127 1 : n = len(w.prevKey)
128 1 : }
129 1 : asUint64 := func(b []byte, i int) uint64 {
130 1 : return binary.LittleEndian.Uint64(b[i:])
131 1 : }
132 1 : for shared < n-7 && asUint64(w.curKey, shared) == asUint64(w.prevKey, shared) {
133 1 : shared += 8
134 1 : }
135 1 : for shared < n && w.curKey[shared] == w.prevKey[shared] {
136 1 : shared++
137 1 : }
138 : }
139 :
140 1 : lenValuePlusOptionalPrefix := len(value)
141 1 : if addValuePrefix {
142 1 : lenValuePlusOptionalPrefix++
143 1 : }
144 1 : needed := 3*binary.MaxVarintLen32 + len(w.curKey[shared:]) + lenValuePlusOptionalPrefix
145 1 : n := len(w.buf)
146 1 : if cap(w.buf) < n+needed {
147 1 : newCap := 2 * cap(w.buf)
148 1 : if newCap == 0 {
149 1 : newCap = 1024
150 1 : }
151 1 : for newCap < n+needed {
152 1 : newCap *= 2
153 1 : }
154 1 : newBuf := make([]byte, n, newCap)
155 1 : copy(newBuf, w.buf)
156 1 : w.buf = newBuf
157 : }
158 1 : w.buf = w.buf[:n+needed]
159 1 :
160 1 : // TODO(peter): Manually inlined versions of binary.PutUvarint(). This is 15%
161 1 : // faster on BenchmarkWriter on go1.13. Remove if go1.14 or future versions
162 1 : // show this to not be a performance win.
163 1 : {
164 1 : x := uint32(shared)
165 1 : for x >= 0x80 {
166 0 : w.buf[n] = byte(x) | 0x80
167 0 : x >>= 7
168 0 : n++
169 0 : }
170 1 : w.buf[n] = byte(x)
171 1 : n++
172 : }
173 :
174 1 : {
175 1 : x := uint32(keySize - shared)
176 1 : for x >= 0x80 {
177 0 : w.buf[n] = byte(x) | 0x80
178 0 : x >>= 7
179 0 : n++
180 0 : }
181 1 : w.buf[n] = byte(x)
182 1 : n++
183 : }
184 :
185 1 : {
186 1 : x := uint32(lenValuePlusOptionalPrefix)
187 1 : for x >= 0x80 {
188 1 : w.buf[n] = byte(x) | 0x80
189 1 : x >>= 7
190 1 : n++
191 1 : }
192 1 : w.buf[n] = byte(x)
193 1 : n++
194 : }
195 :
196 1 : n += copy(w.buf[n:], w.curKey[shared:])
197 1 : if addValuePrefix {
198 1 : w.buf[n : n+1][0] = byte(valuePrefix)
199 1 : n++
200 1 : }
201 1 : n += copy(w.buf[n:], value)
202 1 : w.buf = w.buf[:n]
203 1 :
204 1 : w.curValue = w.buf[n-len(value):]
205 1 :
206 1 : w.nEntries++
207 : }
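
// Illustrative sketch (not part of the original file): the entry layout
// produced by storeWithOptionalValuePrefix is three uvarints (shared key
// length, unshared key length, value length) followed by the unshared key
// bytes and the value bytes, where the optional 1-byte valuePrefix is counted
// in the value length and written immediately before the value. A simplified,
// hypothetical encoder of a single entry (no restart bookkeeping) could be:
func appendEntry(dst []byte, shared int, key, value, prefix []byte) []byte {
	var tmp [binary.MaxVarintLen32]byte
	dst = append(dst, tmp[:binary.PutUvarint(tmp[:], uint64(shared))]...)
	dst = append(dst, tmp[:binary.PutUvarint(tmp[:], uint64(len(key)-shared))]...)
	dst = append(dst, tmp[:binary.PutUvarint(tmp[:], uint64(len(value)+len(prefix)))]...)
	dst = append(dst, key[shared:]...)
	dst = append(dst, prefix...) // at most one byte in practice
	return append(dst, value...)
}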
208 :
209 1 : func (w *blockWriter) add(key InternalKey, value []byte) {
210 1 : w.addWithOptionalValuePrefix(
211 1 : key, false, value, len(key.UserKey), false, 0, false)
212 1 : }
213 :
214 : // Callers that always set addValuePrefix to false should use add() instead.
215 : //
216 : // isObsolete indicates whether this key-value pair is obsolete in this
217 : // sstable (only applicable when writing data blocks) -- see the comment in
218 : // table.go and the longer one in format.go. addValuePrefix adds a 1 byte
219 : // prefix to the value, specified in valuePrefix -- this is used for data
220 : // blocks in TableFormatPebblev3 onwards for SETs (see the comment in
221 : // format.go, with more details in value_block.go). setHasSameKeyPrefix is
222 : // also used in TableFormatPebblev3 onwards for SETs.
223 : func (w *blockWriter) addWithOptionalValuePrefix(
224 : key InternalKey,
225 : isObsolete bool,
226 : value []byte,
227 : maxSharedKeyLen int,
228 : addValuePrefix bool,
229 : valuePrefix valuePrefix,
230 : setHasSameKeyPrefix bool,
231 1 : ) {
232 1 : w.curKey, w.prevKey = w.prevKey, w.curKey
233 1 :
234 1 : size := key.Size()
235 1 : if cap(w.curKey) < size {
236 1 : w.curKey = make([]byte, 0, size*2)
237 1 : }
238 1 : w.curKey = w.curKey[:size]
239 1 : if isObsolete {
240 1 : key.Trailer = key.Trailer | trailerObsoleteBit
241 1 : }
242 1 : key.Encode(w.curKey)
243 1 :
244 1 : w.storeWithOptionalValuePrefix(
245 1 : size, value, maxSharedKeyLen, addValuePrefix, valuePrefix, setHasSameKeyPrefix)
246 : }
247 :
248 1 : func (w *blockWriter) finish() []byte {
249 1 : // Write the restart points to the buffer.
250 1 : if w.nEntries == 0 {
251 1 : // Every block must have at least one restart point.
252 1 : if cap(w.restarts) > 0 {
253 1 : w.restarts = w.restarts[:1]
254 1 : w.restarts[0] = 0
255 1 : } else {
256 1 : w.restarts = append(w.restarts, 0)
257 1 : }
258 : }
259 1 : tmp4 := w.tmp[:4]
260 1 : for _, x := range w.restarts {
261 1 : binary.LittleEndian.PutUint32(tmp4, x)
262 1 : w.buf = append(w.buf, tmp4...)
263 1 : }
264 1 : binary.LittleEndian.PutUint32(tmp4, uint32(len(w.restarts)))
265 1 : w.buf = append(w.buf, tmp4...)
266 1 : result := w.buf
267 1 :
268 1 : // Reset the block state.
269 1 : w.nEntries = 0
270 1 : w.nextRestart = 0
271 1 : w.buf = w.buf[:0]
272 1 : w.restarts = w.restarts[:0]
273 1 : return result
274 : }
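
// Illustrative sketch (not part of the original file): the layout finish()
// produces is the concatenated entries, followed by each restart offset as a
// little-endian uint32, followed by a final little-endian uint32 holding the
// number of restarts. This is why emptyBlockSize below is 4 and why
// estimatedSize() adds 4 bytes per restart. A hypothetical trailer reader:
func readRestartTrailer(block []byte) (numRestarts uint32, restartArea []byte) {
	numRestarts = binary.LittleEndian.Uint32(block[len(block)-4:])
	restartArea = block[len(block)-4-int(numRestarts)*4 : len(block)-4]
	return numRestarts, restartArea
}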
275 :
276 : // emptyBlockSize holds the size of an empty block. Every block ends
277 : // in a uint32 trailer encoding the number of restart points within the
278 : // block.
279 : const emptyBlockSize = 4
280 :
281 1 : func (w *blockWriter) estimatedSize() int {
282 1 : return len(w.buf) + 4*len(w.restarts) + emptyBlockSize
283 1 : }
284 :
285 : type blockEntry struct {
286 : offset int32
287 : keyStart int32
288 : keyEnd int32
289 : valStart int32
290 : valSize int32
291 : }
292 :
293 : // blockIter is an iterator over a single block of data.
294 : //
295 : // A blockIter provides an additional guarantee around key stability when a
296 : // block has a restart interval of 1 (i.e. when there is no prefix
297 : // compression). Key stability refers to whether the InternalKey.UserKey bytes
298 : // returned by a positioning call will remain stable after a subsequent
299 : // positioning call. The normal case is that a positioning call will invalidate
300 : // any previously returned InternalKey.UserKey. If a block has a restart
301 : // interval of 1 (no prefix compression), blockIter guarantees that
302 : // InternalKey.UserKey will point to the key as stored in the block itself
303 : // which will remain valid until the blockIter is closed. The key stability
304 : // guarantee is used by the range tombstone and range key code, which knows that
305 : // the respective blocks are always encoded with a restart interval of 1. This
306 : // per-block key stability guarantee is sufficient for range tombstones and
307 : // range keys as they are always encoded in a single block. Note: this
308 : // stability guarantee no longer holds for a block iter with synthetic suffix
309 : // replacement, but this doesn't matter, as the user will not open
310 : // an iterator with a synthetic suffix on a block with rangekeys (for now).
311 : //
312 : // A blockIter also provides a value stability guarantee for range deletions and
313 : // range keys since there is only a single range deletion and range key block
314 : // per sstable and the blockIter will not release the bytes for the block until
315 : // it is closed.
316 : //
317 : // Note on why blockIter knows about lazyValueHandling:
318 : //
319 : // blockIter's positioning functions (that return a LazyValue), are too
320 : // complex to inline even prior to lazyValueHandling. blockIter.Next and
321 : // blockIter.First were by far the cheapest and had costs 195 and 180
322 : // respectively, which exceeds the budget of 80. We initially tried to keep
323 : // the lazyValueHandling logic out of blockIter by wrapping it with a
324 : // lazyValueDataBlockIter. singleLevelIter and twoLevelIter would use this
325 : // wrapped iter. The functions in lazyValueDataBlockIter were simple, in that
326 : // they called the corresponding blockIter func and then decided whether the
327 : // value was in fact in-place (so return immediately) or needed further
328 : // handling. But these also turned out too costly for mid-stack inlining since
329 : // simple calls like the following have a high cost that is barely under the
330 : // budget of 80
331 : //
332 : // k, v := i.data.SeekGE(key, flags) // cost 74
333 : // k, v := i.data.Next() // cost 72
334 : //
335 : // We have 2 options for minimizing performance regressions:
336 : // - Include the lazyValueHandling logic in the already non-inlineable
337 : // blockIter functions: Since most of the time is spent in data block iters,
338 : // it is acceptable to take the small hit of unnecessary branching (which
339 : // hopefully branch prediction will predict correctly) for other kinds of
340 : // blocks.
341 : // - Duplicate the logic of singleLevelIterator and twoLevelIterator for the
342 : // v3 sstable and only use the aforementioned lazyValueDataBlockIter for a
343 : // v3 sstable. We would want to manage these copies via code generation.
344 : //
345 : // We have picked the first option here.
346 : type blockIter struct {
347 : cmp Compare
348 : split Split
349 : // offset is the byte index that marks where the current key/value is
350 : // encoded in the block.
351 : offset int32
352 : // nextOffset is the byte index where the next key/value is encoded in the
353 : // block.
354 : nextOffset int32
355 : // A "restart point" in a block is a point where the full key is encoded,
356 : // instead of just having a suffix of the key encoded. See readEntry() for
357 : // how prefix compression of keys works. Keys in between two restart points
358 : // only have a suffix encoded in the block. When restart interval is 1, no
359 : // prefix compression of keys happens. This is the case with range tombstone
360 : // blocks.
361 : //
362 : // All restart offsets are listed in increasing order in
363 : // i.ptr[i.restarts:len(block)-4], while numRestarts is encoded in the last
364 : // 4 bytes of the block as a uint32 (i.ptr[len(block)-4:]). i.restarts can
365 : // therefore be seen as the point where data in the block ends, and a list
366 : // of offsets of all restart points begins.
367 : restarts int32
368 : // Number of restart points in this block. Encoded at the end of the block
369 : // as a uint32.
370 : numRestarts int32
371 : globalSeqNum uint64
372 : ptr unsafe.Pointer
373 : data []byte
374 : // key contains the raw key the iterator is currently pointed at. This may
375 : // point directly to data stored in the block (for a key which has no prefix
376 : // compression), to fullKey (for a prefix compressed key), or to a slice of
377 : // data stored in cachedBuf (during reverse iteration).
378 : //
379 : // NB: In general, key contains the same logical content as ikey
380 : // (i.e. ikey = decode(key)), but if the iterator contains a synthetic suffix
381 : // replacement rule, this will not be the case. Therefore, key should never
382 : // be used after ikey is set.
383 : key []byte
384 : // fullKey is a buffer used for key prefix decompression.
385 : fullKey []byte
386 : // val contains the value the iterator is currently pointed at. If non-nil,
387 : // this points to a slice of the block data.
388 : val []byte
389 : // lazyValue is val turned into a LazyValue, whenever a positioning method
390 : // returns a non-nil key-value pair.
391 : lazyValue base.LazyValue
392 : // ikey contains the decoded InternalKey the iterator is currently pointed
393 : // at. Note that the memory backing ikey.UserKey is either data stored
394 : // directly in the block, fullKey, cachedBuf, or synthSuffixBuf. The key
395 : // stability guarantee for blocks built with a restart interval of 1 is
396 : // achieved by having ikey.UserKey always point to data stored directly in the
397 : // block.
398 : ikey InternalKey
399 : // cached and cachedBuf are used during reverse iteration. They are needed
400 : // because we can't perform prefix decoding in reverse, only in the forward
401 : // direction. In order to iterate in reverse, we decode and cache the entries
402 : // between two restart points.
403 : //
404 : // Note that cached[len(cached)-1] contains the previous entry to the one the
405 : // blockIter is currently pointed at. As usual, nextOffset will contain the
406 : // offset of the next entry. During reverse iteration, nextOffset will be
407 : // updated to point to offset, and we'll set the blockIter to point at the
408 : // entry cached[len(cached)-1]. See Prev() for more details.
409 : //
410 : // For a block encoded with a restart interval of 1, cached and cachedBuf
411 : // will not be used as there are no prefix compressed entries between the
412 : // restart points.
413 : cached []blockEntry
414 : cachedBuf []byte
415 : handle bufferHandle
416 : // firstUserKey is used for block iteration over already loaded blocks.
417 : firstUserKey []byte
418 : lazyValueHandling struct {
419 : vbr *valueBlockReader
420 : hasValuePrefix bool
421 : }
422 : hideObsoletePoints bool
423 :
424 : // syntheticSuffix, if not nil, will replace the decoded ikey.UserKey suffix
425 : // before the key is returned to the user. A sequence of iter operations on a
426 : // block with a syntheticSuffix rule should return keys as if those operations
427 : // ran on a block with keys that all had the syntheticSuffix. As an example:
428 : // any sequence of block iter cmds should return the same keys for the
429 : // following two blocks:
430 : //
431 : // blockA: a@3,b@3,c@3
432 : // blockB: a@1,b@2,c@1 with syntheticSuffix=3
433 : //
434 : // To ensure this, suffix replacement will not change the ordering of keys in
435 : // the block because the iter assumes that no two keys in the block share the
436 : // same prefix. Furthermore, during SeekGE and SeekLT operations, the block
437 : // iterator handles "off by one" errors (explained in more detail in those
438 : // functions) when, for a given key, originalSuffix < searchSuffix <
439 : // replacementSuffix, with integer comparison. To handle these cases, the
440 : // iterator assumes:
441 : //
442 : // pebble.Compare(keyPrefix{replacementSuffix},keyPrefix{originalSuffix}) < 0
443 : // for keys with a suffix.
444 : //
445 : // NB: it is possible for a block iter to add a synthetic suffix on a key
446 : // without a suffix, which implies
447 : // pebble.Compare(keyPrefix{replacementSuffix},keyPrefix{noSuffix}) > 0;
448 : // however, the iterator would never need to handle an off-by-one error in
449 : // this case since originalSuffix (empty) > searchSuffix (non-empty), with
450 : // integer comparison.
451 : //
452 : //
453 : // In addition, we also assume that any block with rangekeys will not contain
454 : // a synthetic suffix.
455 : syntheticSuffix SyntheticSuffix
456 : synthSuffixBuf []byte
457 : }
458 :
459 : // blockIter implements the base.InternalIterator interface.
460 : var _ base.InternalIterator = (*blockIter)(nil)
461 :
462 : func newBlockIter(
463 : cmp Compare, split Split, block block, syntheticSuffix SyntheticSuffix,
464 1 : ) (*blockIter, error) {
465 1 : i := &blockIter{}
466 1 : return i, i.init(cmp, split, block, 0, false, syntheticSuffix)
467 1 : }
468 :
469 0 : func (i *blockIter) String() string {
470 0 : return "block"
471 0 : }
472 :
473 : func (i *blockIter) init(
474 : cmp Compare,
475 : split Split,
476 : block block,
477 : globalSeqNum uint64,
478 : hideObsoletePoints bool,
479 : syntheticSuffix SyntheticSuffix,
480 1 : ) error {
481 1 : numRestarts := int32(binary.LittleEndian.Uint32(block[len(block)-4:]))
482 1 : if numRestarts == 0 {
483 0 : return base.CorruptionErrorf("pebble/table: invalid table (block has no restart points)")
484 0 : }
485 1 : i.syntheticSuffix = syntheticSuffix
486 1 : i.synthSuffixBuf = i.synthSuffixBuf[:0]
487 1 : i.split = split
488 1 : i.cmp = cmp
489 1 : i.restarts = int32(len(block)) - 4*(1+numRestarts)
490 1 : i.numRestarts = numRestarts
491 1 : i.globalSeqNum = globalSeqNum
492 1 : i.ptr = unsafe.Pointer(&block[0])
493 1 : i.data = block
494 1 : i.fullKey = i.fullKey[:0]
495 1 : i.val = nil
496 1 : i.hideObsoletePoints = hideObsoletePoints
497 1 : i.clearCache()
498 1 : if i.restarts > 0 {
499 1 : if err := i.readFirstKey(); err != nil {
500 0 : return err
501 0 : }
502 1 : } else {
503 1 : // Block is empty.
504 1 : i.firstUserKey = nil
505 1 : }
506 1 : return nil
507 : }
508 :
509 : // NB: two cases of hideObsoletePoints:
510 : // - Local sstable iteration: globalSeqNum will be set iff the sstable was
511 : // ingested.
512 : // - Foreign sstable iteration: globalSeqNum is always set.
513 : func (i *blockIter) initHandle(
514 : cmp Compare,
515 : split Split,
516 : block bufferHandle,
517 : globalSeqNum uint64,
518 : hideObsoletePoints bool,
519 : syntheticSuffix SyntheticSuffix,
520 1 : ) error {
521 1 : i.handle.Release()
522 1 : i.handle = block
523 1 : return i.init(cmp, split, block.Get(), globalSeqNum, hideObsoletePoints, syntheticSuffix)
524 1 : }
525 :
526 1 : func (i *blockIter) invalidate() {
527 1 : i.clearCache()
528 1 : i.offset = 0
529 1 : i.nextOffset = 0
530 1 : i.restarts = 0
531 1 : i.numRestarts = 0
532 1 : i.data = nil
533 1 : }
534 :
535 : // isDataInvalidated returns true when the blockIter has been invalidated
536 : // using an invalidate call. NB: this is different from blockIter.Valid
537 : // which is part of the InternalIterator implementation.
538 1 : func (i *blockIter) isDataInvalidated() bool {
539 1 : return i.data == nil
540 1 : }
541 :
542 1 : func (i *blockIter) resetForReuse() blockIter {
543 1 : return blockIter{
544 1 : fullKey: i.fullKey[:0],
545 1 : cached: i.cached[:0],
546 1 : cachedBuf: i.cachedBuf[:0],
547 1 : data: nil,
548 1 : }
549 1 : }
550 :
551 1 : func (i *blockIter) readEntry() {
552 1 : ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(i.offset))
553 1 :
554 1 : // This is an ugly performance hack. Reading entries from blocks is one of
555 1 : // the inner-most routines and decoding the 3 varints per-entry takes
556 1 : // significant time. Neither go1.11 nor go1.12 will inline decodeVarint for
557 1 : // us, so we do it manually. This provides a 10-15% performance improvement
558 1 : // on blockIter benchmarks on both go1.11 and go1.12.
559 1 : //
560 1 : // TODO(peter): remove this hack if go:inline is ever supported.
561 1 :
562 1 : var shared uint32
563 1 : if a := *((*uint8)(ptr)); a < 128 {
564 1 : shared = uint32(a)
565 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
566 1 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
567 0 : shared = uint32(b)<<7 | uint32(a)
568 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
569 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
570 0 : shared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
571 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
572 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
573 0 : shared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
574 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
575 0 : } else {
576 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
577 0 : shared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
578 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
579 0 : }
580 :
581 1 : var unshared uint32
582 1 : if a := *((*uint8)(ptr)); a < 128 {
583 1 : unshared = uint32(a)
584 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
585 1 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
586 0 : unshared = uint32(b)<<7 | uint32(a)
587 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
588 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
589 0 : unshared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
590 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
591 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
592 0 : unshared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
593 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
594 0 : } else {
595 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
596 0 : unshared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
597 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
598 0 : }
599 :
600 1 : var value uint32
601 1 : if a := *((*uint8)(ptr)); a < 128 {
602 1 : value = uint32(a)
603 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
604 1 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
605 1 : value = uint32(b)<<7 | uint32(a)
606 1 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
607 1 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
608 0 : value = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
609 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
610 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
611 0 : value = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
612 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
613 0 : } else {
614 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
615 0 : value = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
616 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
617 0 : }
618 :
619 1 : unsharedKey := getBytes(ptr, int(unshared))
620 1 : // TODO(sumeer): move this into the else block below.
621 1 : i.fullKey = append(i.fullKey[:shared], unsharedKey...)
622 1 : if shared == 0 {
623 1 : // Provide stability for the key across positioning calls if the key
624 1 : // doesn't share a prefix with the previous key. This removes requiring the
625 1 : // key to be copied if the caller knows the block has a restart interval of
626 1 : // 1. An important example of this is range-del blocks.
627 1 : i.key = unsharedKey
628 1 : } else {
629 1 : i.key = i.fullKey
630 1 : }
631 1 : ptr = unsafe.Pointer(uintptr(ptr) + uintptr(unshared))
632 1 : i.val = getBytes(ptr, int(value))
633 1 : i.nextOffset = int32(uintptr(ptr)-uintptr(i.ptr)) + int32(value)
634 : }
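
// Illustrative sketch (not part of the original file): the manually inlined
// varint decoding in readEntry is equivalent to the following readable form
// built on encoding/binary, operating on i.data at i.offset instead of raw
// pointers. It is shown only to document the encoding; the hand-inlined
// version above is what is actually used for performance. readEntrySlow is a
// hypothetical name.
func (i *blockIter) readEntrySlow() (shared, unshared, valueLen uint64, keyAndValue []byte) {
	b := i.data[i.offset:]
	var n int
	shared, n = binary.Uvarint(b)
	b = b[n:]
	unshared, n = binary.Uvarint(b)
	b = b[n:]
	valueLen, n = binary.Uvarint(b)
	b = b[n:]
	// b now begins with the unshared key bytes, followed by the value bytes.
	return shared, unshared, valueLen, b
}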
635 :
636 1 : func (i *blockIter) readFirstKey() error {
637 1 : ptr := i.ptr
638 1 :
639 1 : // This is an ugly performance hack. Reading entries from blocks is one of
640 1 : // the inner-most routines and decoding the 3 varints per-entry takes
641 1 : // significant time. Neither go1.11 nor go1.12 will inline decodeVarint for
642 1 : // us, so we do it manually. This provides a 10-15% performance improvement
643 1 : // on blockIter benchmarks on both go1.11 and go1.12.
644 1 : //
645 1 : // TODO(peter): remove this hack if go:inline is ever supported.
646 1 :
647 1 : if shared := *((*uint8)(ptr)); shared == 0 {
648 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
649 1 : } else {
650 0 : // The shared length is != 0, which is invalid.
651 0 : panic("first key in block must have zero shared length")
652 : }
653 :
654 1 : var unshared uint32
655 1 : if a := *((*uint8)(ptr)); a < 128 {
656 1 : unshared = uint32(a)
657 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
658 1 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
659 0 : unshared = uint32(b)<<7 | uint32(a)
660 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
661 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
662 0 : unshared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
663 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
664 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
665 0 : unshared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
666 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
667 0 : } else {
668 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
669 0 : unshared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
670 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
671 0 : }
672 :
673 : // Skip the value length.
674 1 : if a := *((*uint8)(ptr)); a < 128 {
675 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
676 1 : } else if a := *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); a < 128 {
677 1 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
678 1 : } else if a := *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); a < 128 {
679 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
680 0 : } else if a := *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); a < 128 {
681 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
682 0 : } else {
683 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
684 0 : }
685 :
686 1 : firstKey := getBytes(ptr, int(unshared))
687 1 : // Manually inlining base.DecodeInternalKey provides a 5-10% speedup on
688 1 : // BlockIter benchmarks.
689 1 : if n := len(firstKey) - 8; n >= 0 {
690 1 : i.firstUserKey = firstKey[:n:n]
691 1 : } else {
692 0 : i.firstUserKey = nil
693 0 : return base.CorruptionErrorf("pebble/table: invalid firstKey in block")
694 0 : }
695 1 : return nil
696 : }
697 :
698 : // The sstable internal obsolete bit is set when writing a block and unset by
699 : // blockIter, so no code outside block writing/reading code ever sees it.
700 : const trailerObsoleteBit = uint64(base.InternalKeyKindSSTableInternalObsoleteBit)
701 : const trailerObsoleteMask = (InternalKeySeqNumMax << 8) | uint64(base.InternalKeyKindSSTableInternalObsoleteMask)
702 :
703 1 : func (i *blockIter) decodeInternalKey(key []byte) (hiddenPoint bool) {
704 1 : // Manually inlining base.DecodeInternalKey provides a 5-10% speedup on
705 1 : // BlockIter benchmarks.
706 1 : if n := len(key) - 8; n >= 0 {
707 1 : trailer := binary.LittleEndian.Uint64(key[n:])
708 1 : hiddenPoint = i.hideObsoletePoints &&
709 1 : (trailer&trailerObsoleteBit != 0)
710 1 : i.ikey.Trailer = trailer & trailerObsoleteMask
711 1 : i.ikey.UserKey = key[:n:n]
712 1 : if i.globalSeqNum != 0 {
713 1 : i.ikey.SetSeqNum(i.globalSeqNum)
714 1 : }
715 1 : } else {
716 1 : i.ikey.Trailer = uint64(InternalKeyKindInvalid)
717 1 : i.ikey.UserKey = nil
718 1 : }
719 1 : return hiddenPoint
720 : }
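
// Illustrative sketch (not part of the original file): the 8-byte trailer at
// the end of each encoded key packs the sequence number in the upper bits and
// the kind in the low byte; the writer may set the obsolete bit inside that
// low byte, and decodeInternalKey strips it with trailerObsoleteMask so that
// callers never observe it. Assuming that layout, a hypothetical helper:
func splitTrailer(trailer uint64) (seqNum uint64, kind base.InternalKeyKind, obsolete bool) {
	obsolete = trailer&trailerObsoleteBit != 0
	trailer &= trailerObsoleteMask // strip the internal obsolete bit, as decodeInternalKey does
	return trailer >> 8, base.InternalKeyKind(trailer & 0xff), obsolete
}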
721 :
722 : // maybeReplaceSuffix replaces the suffix in i.ikey.UserKey with
723 : // i.syntheticSuffix. allowInPlace is set to false if there's a chance that
724 : // i.ikey.UserKey points to the same buffer as i.cachedBuf (i.e. during reverse
725 : // iteration).
726 1 : func (i *blockIter) maybeReplaceSuffix(allowInPlace bool) {
727 1 : if i.syntheticSuffix != nil && i.ikey.UserKey != nil {
728 1 : prefixLen := i.split(i.ikey.UserKey)
729 1 : if allowInPlace && cap(i.ikey.UserKey) >= prefixLen+len(i.syntheticSuffix) {
730 1 : i.ikey.UserKey = append(i.ikey.UserKey[:prefixLen], i.syntheticSuffix...)
731 1 : return
732 1 : }
733 : // If ikey is cached or may get cached, we must copy
734 : // UserKey to a new buffer before suffix replacement.
735 1 : i.synthSuffixBuf = append(i.synthSuffixBuf[:0], i.ikey.UserKey[:prefixLen]...)
736 1 : i.synthSuffixBuf = append(i.synthSuffixBuf, i.syntheticSuffix...)
737 1 : i.ikey.UserKey = i.synthSuffixBuf
738 : }
739 : }
740 :
741 1 : func (i *blockIter) clearCache() {
742 1 : i.cached = i.cached[:0]
743 1 : i.cachedBuf = i.cachedBuf[:0]
744 1 : }
745 :
746 1 : func (i *blockIter) cacheEntry() {
747 1 : var valStart int32
748 1 : valSize := int32(len(i.val))
749 1 : if valSize > 0 {
750 1 : valStart = int32(uintptr(unsafe.Pointer(&i.val[0])) - uintptr(i.ptr))
751 1 : }
752 :
753 1 : i.cached = append(i.cached, blockEntry{
754 1 : offset: i.offset,
755 1 : keyStart: int32(len(i.cachedBuf)),
756 1 : keyEnd: int32(len(i.cachedBuf) + len(i.key)),
757 1 : valStart: valStart,
758 1 : valSize: valSize,
759 1 : })
760 1 : i.cachedBuf = append(i.cachedBuf, i.key...)
761 : }
762 :
763 1 : func (i *blockIter) getFirstUserKey() []byte {
764 1 : return i.firstUserKey
765 1 : }
766 :
767 : // SeekGE implements internalIterator.SeekGE, as documented in the pebble
768 : // package.
769 1 : func (i *blockIter) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, base.LazyValue) {
770 1 : if invariants.Enabled && i.isDataInvalidated() {
771 0 : panic(errors.AssertionFailedf("invalidated blockIter used"))
772 : }
773 :
774 1 : i.clearCache()
775 1 : // Find the index of the smallest restart point whose key is > the key
776 1 : // Find the index of the smallest restart point whose key is >= the key
777 1 : i.offset = 0
778 1 : var index int32
779 1 :
780 1 : {
781 1 : // NB: manually inlined sort.Seach is ~5% faster.
782 1 : // NB: manually inlined sort.Search is ~5% faster.
783 1 : // Define f(-1) == false and f(n) == true.
784 1 : // Invariant: f(index-1) == false, f(upper) == true.
785 1 : upper := i.numRestarts
786 1 : for index < upper {
787 1 : h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
788 1 : // index ≤ h < upper
789 1 : offset := decodeRestart(i.data[i.restarts+4*h:])
790 1 : // For a restart point, there are 0 bytes shared with the previous key.
791 1 : // The varint encoding of 0 occupies 1 byte.
792 1 : ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(offset+1))
793 1 :
794 1 : // Decode the key at that restart point, and compare it to the key
795 1 : // sought. See the comment in readEntry for why we manually inline the
796 1 : // varint decoding.
797 1 : var v1 uint32
798 1 : if a := *((*uint8)(ptr)); a < 128 {
799 1 : v1 = uint32(a)
800 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
801 1 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
802 0 : v1 = uint32(b)<<7 | uint32(a)
803 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
804 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
805 0 : v1 = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
806 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
807 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
808 0 : v1 = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
809 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
810 0 : } else {
811 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
812 0 : v1 = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
813 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
814 0 : }
815 :
816 1 : if *((*uint8)(ptr)) < 128 {
817 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
818 1 : } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))) < 128 {
819 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
820 0 : } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))) < 128 {
821 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
822 0 : } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))) < 128 {
823 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
824 0 : } else {
825 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
826 0 : }
827 :
828 : // Manually inlining part of base.DecodeInternalKey provides a 5-10%
829 : // speedup on BlockIter benchmarks.
830 1 : s := getBytes(ptr, int(v1))
831 1 : var k []byte
832 1 : if n := len(s) - 8; n >= 0 {
833 1 : k = s[:n:n]
834 1 : }
835 : // Else k is invalid, and left as nil
836 :
837 1 : if i.cmp(key, k) > 0 {
838 1 : // The search key is greater than the user key at this restart point.
839 1 : // Search beyond this restart point, since we are trying to find the
840 1 : // first restart point with a user key >= the search key.
841 1 : index = h + 1 // preserves f(i-1) == false
842 1 : } else {
843 1 : // k >= search key, so prune everything after index (since index
844 1 : // satisfies the property we are looking for).
845 1 : upper = h // preserves f(j) == true
846 1 : }
847 : }
848 : // index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
849 : // => answer is index.
850 : }
851 :
852 : // index is the first restart point with key >= search key. Define the keys
853 : // between a restart point and the next restart point as belonging to that
854 : // restart point.
855 : //
856 : // Since keys are strictly increasing, if index > 0 then the restart point
857 : // at index-1 will be the first one that has some keys belonging to it that
858 : // could be equal to the search key. If index == 0, then all keys in this
859 : // block are larger than the key sought, and offset remains at zero.
860 1 : if index > 0 {
861 1 : i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
862 1 : }
863 1 : i.readEntry()
864 1 : hiddenPoint := i.decodeInternalKey(i.key)
865 1 :
866 1 : // Iterate from that restart point to somewhere >= the key sought.
867 1 : if !i.valid() {
868 0 : return nil, base.LazyValue{}
869 0 : }
870 :
871 : // A note on seeking in a block with a suffix replacement rule: even though
872 : // the binary search above was conducted on keys without suffix replacement,
873 : // Seek will still return the correct suffix replaced key. A binary
874 : // search without suffix replacement will land on a key that is _less_ than
875 : // the key the search would have landed on if all keys were already suffix
876 : // replaced. Since Seek then conducts forward iteration to the first suffix
877 : // replaced user key that is greater than or equal to the search key, the
878 : // correct key is still returned.
879 : //
880 : // As an example, consider the following block with a restart interval of 1,
881 : // with a replacement suffix of "4":
882 : // - Pre-suffix replacement: apple@1, banana@3
883 : // - Post-suffix replacement: apple@4, banana@4
884 : //
885 : // Suppose the client seeks with apple@3. Assuming suffixes sort in reverse
886 : // chronological order (i.e. apple@1>apple@3), the binary search without
887 : // suffix replacement would return apple@1. A binary search with suffix
888 : // replacement would return banana@4. After beginning forward iteration from
889 : // either returned restart point, forward iteration would
890 : // always return the correct key, banana@4.
891 : //
892 : // Further, if the user searched with apple@0 (i.e. a suffix less than the
893 : // pre replacement suffix) or with apple@5 (a suffix larger than the post
894 : // replacement suffix), the binary search with or without suffix replacement
895 : // would land on the same key, as we assume the following:
896 : // (1) no two keys in the sst share the same prefix.
897 : // (2) pebble.Compare(replacementSuffix,originalSuffix) > 0
898 :
899 1 : i.maybeReplaceSuffix(true /*allowInPlace*/)
900 1 :
901 1 : if !hiddenPoint && i.cmp(i.ikey.UserKey, key) >= 0 {
902 1 : // Initialize i.lazyValue
903 1 : if !i.lazyValueHandling.hasValuePrefix ||
904 1 : base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
905 1 : i.lazyValue = base.MakeInPlaceValue(i.val)
906 1 : } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
907 1 : i.lazyValue = base.MakeInPlaceValue(i.val[1:])
908 1 : } else {
909 1 : i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
910 1 : }
911 1 : return &i.ikey, i.lazyValue
912 : }
913 1 : for i.Next(); i.valid(); i.Next() {
914 1 : if i.cmp(i.ikey.UserKey, key) >= 0 {
915 1 : // i.Next() has already initialized i.lazyValue.
916 1 : return &i.ikey, i.lazyValue
917 1 : }
918 : }
919 1 : return nil, base.LazyValue{}
920 : }
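
// Illustrative sketch (not part of the original file): the manually inlined
// binary search in SeekGE and SeekLT is equivalent to the following readable
// loop over the restart array, which finds the first restart point whose user
// key is >= the search key using slice-based varint decoding instead of the
// unsafe-pointer fast path above. searchRestarts is a hypothetical name.
func (i *blockIter) searchRestarts(key []byte) int32 {
	index, upper := int32(0), i.numRestarts
	for index < upper {
		h := int32(uint(index+upper) >> 1)
		offset := decodeRestart(i.data[i.restarts+4*h:])
		// A restart entry begins with a 1-byte shared length (always 0),
		// then the unshared key length and the value length, then the key.
		ptr := i.data[offset+1:]
		v, n := binary.Uvarint(ptr)
		ptr = ptr[n:]
		_, m := binary.Uvarint(ptr) // skip the value length
		s := ptr[m : m+int(v)]
		var k []byte
		if t := len(s) - 8; t >= 0 {
			k = s[:t:t] // user key with the trailer stripped
		}
		if i.cmp(key, k) > 0 {
			index = h + 1
		} else {
			upper = h
		}
	}
	return index
}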
921 :
922 : // SeekPrefixGE implements internalIterator.SeekPrefixGE, as documented in the
923 : // pebble package.
924 : func (i *blockIter) SeekPrefixGE(
925 : prefix, key []byte, flags base.SeekGEFlags,
926 0 : ) (*base.InternalKey, base.LazyValue) {
927 0 : // This should never be called as prefix iteration is handled by sstable.Iterator.
928 0 : panic("pebble: SeekPrefixGE unimplemented")
929 : }
930 :
931 : // SeekLT implements internalIterator.SeekLT, as documented in the pebble
932 : // package.
933 1 : func (i *blockIter) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, base.LazyValue) {
934 1 : if invariants.Enabled && i.isDataInvalidated() {
935 0 : panic(errors.AssertionFailedf("invalidated blockIter used"))
936 : }
937 :
938 1 : i.clearCache()
939 1 : // Find the index of the smallest restart point whose key is >= the key
940 1 : // sought; index will be numRestarts if there is no such restart point.
941 1 : i.offset = 0
942 1 : var index int32
943 1 :
944 1 : {
945 1 : // NB: manually inlined sort.Search is ~5% faster.
946 1 : //
947 1 : // Define f(-1) == false and f(n) == true.
948 1 : // Invariant: f(index-1) == false, f(upper) == true.
949 1 : upper := i.numRestarts
950 1 : for index < upper {
951 1 : h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
952 1 : // index ≤ h < upper
953 1 : offset := decodeRestart(i.data[i.restarts+4*h:])
954 1 : // For a restart point, there are 0 bytes shared with the previous key.
955 1 : // The varint encoding of 0 occupies 1 byte.
956 1 : ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(offset+1))
957 1 :
958 1 : // Decode the key at that restart point, and compare it to the key
959 1 : // sought. See the comment in readEntry for why we manually inline the
960 1 : // varint decoding.
961 1 : var v1 uint32
962 1 : if a := *((*uint8)(ptr)); a < 128 {
963 1 : v1 = uint32(a)
964 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
965 1 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
966 0 : v1 = uint32(b)<<7 | uint32(a)
967 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
968 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
969 0 : v1 = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
970 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
971 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
972 0 : v1 = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
973 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
974 0 : } else {
975 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
976 0 : v1 = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
977 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
978 0 : }
979 :
980 1 : if *((*uint8)(ptr)) < 128 {
981 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
982 1 : } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))) < 128 {
983 1 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
984 1 : } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))) < 128 {
985 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
986 0 : } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))) < 128 {
987 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
988 0 : } else {
989 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
990 0 : }
991 :
992 : // Manually inlining part of base.DecodeInternalKey provides a 5-10%
993 : // speedup on BlockIter benchmarks.
994 1 : s := getBytes(ptr, int(v1))
995 1 : var k []byte
996 1 : if n := len(s) - 8; n >= 0 {
997 1 : k = s[:n:n]
998 1 : }
999 : // Else k is invalid, and left as nil
1000 :
1001 1 : if i.cmp(key, k) > 0 {
1002 1 : // The search key is greater than the user key at this restart point.
1003 1 : // Search beyond this restart point, since we are trying to find the
1004 1 : // first restart point with a user key >= the search key.
1005 1 : index = h + 1 // preserves f(i-1) == false
1006 1 : } else {
1007 1 : // k >= search key, so prune everything after index (since index
1008 1 : // satisfies the property we are looking for).
1009 1 : upper = h // preserves f(j) == true
1010 1 : }
1011 : }
1012 : // index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
1013 : // => answer is index.
1014 : }
1015 :
1016 1 : if index == 0 {
1017 1 : if i.syntheticSuffix != nil {
1018 1 : // The binary search was conducted on keys without suffix replacement,
1019 1 : // implying the first key in the block may be less than the search key. To
1020 1 : // double check, get the first key in the block with suffix replacement
1021 1 : // and compare to the search key. Consider the following example: suppose
1022 1 : // the user searches with a@3, the first key in the block is a@2 and the
1023 1 : // block contains a suffix replacement rule of 4. Since a@3 sorts before
1024 1 : // a@2, the binary search would return index==0. Without conducting the
1025 1 : // suffix replacement, the SeekLT would incorrectly return nil. With
1026 1 : // suffix replacement though, a@4 should be returned as a@4 sorts before
1027 1 : // a@3.
1028 1 : ikey, lazyVal := i.First()
1029 1 : if i.cmp(ikey.UserKey, key) < 0 {
1030 0 : return ikey, lazyVal
1031 0 : }
1032 : }
1033 : // If index == 0 then all keys in this block are larger than the key
1034 : // sought, so there is no match.
1035 1 : i.offset = -1
1036 1 : i.nextOffset = 0
1037 1 : return nil, base.LazyValue{}
1038 : }
1039 :
1040 : // INVARIANT: index > 0
1041 :
1042 : // Ignoring suffix replacement, index is the first restart point with key >=
1043 : // search key. Define the keys between a restart point and the next restart
1044 : // point as belonging to that restart point. Note that index could be equal to
1045 : // i.numRestarts, i.e., we are past the last restart. Since keys are strictly
1046 : // increasing, the restart point at index-1 will be the first one that
1047 : // has some keys belonging to it that are less than the search key.
1048 : //
1049 : // Next, we will search between the restart at index-1 and the restart point
1050 : // at index, for the first key >= key, and then on finding it, return
1051 : // i.Prev(). We need to know when we have hit the offset for index, since then
1052 : // we can stop searching. targetOffset encodes that offset for index.
1053 1 : targetOffset := i.restarts
1054 1 : i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
1055 1 : if index < i.numRestarts {
1056 1 : targetOffset = decodeRestart(i.data[i.restarts+4*(index):])
1057 1 :
1058 1 : if i.syntheticSuffix != nil {
1059 0 : // The binary search was conducted on keys without suffix replacement,
1060 0 : // implying the returned restart point (index) may be less than the search
1061 0 : // key, breaking the assumption described above.
1062 0 : //
1063 0 : // For example: consider this block with a replacement ts of 4, and
1064 0 : // restart interval of 1: - pre replacement: a@3,b@2,c@3 - post
1065 0 : // restart interval of 1: pre replacement: a@3,b@2,c@3; post
1066 0 : // replacement: a@4,b@4,c@4
1067 0 : // Suppose the client calls SeekLT(b@3), SeekLT must return b@4.
1068 0 : // Suppose the client calls SeekLT(b@3); SeekLT must return b@4.
1069 0 : // If the client calls SeekLT(b@3), the binary search would return b@2,
1070 0 : // the lowest key geq to b@3, pre-suffix replacement. Then, SeekLT will
1071 0 : // begin forward iteration from a@3, the previous restart point, to
1072 0 : // b{suffix}. The iteration stops when it encounters a key geq to the
1073 0 : // search key or if it reaches the upper bound. Without suffix
1074 0 : // replacement, we can assume that the upper bound of this forward
1075 0 : // iteration, b{suffix}, is greater than the search key, as implied by the
1076 0 : // binary search.
1077 0 : //
1078 0 : // If we naively hold this assumption with suffix replacement, the
1079 0 : // iteration would terminate at the upper bound, b@4, call i.Prev, and
1080 0 : // incorrectly return a@4. To correct for this, if the original returned
1081 0 : // incorrectly return a@4. To correct for this, if the key at the originally
1082 0 : // returned index is less than the search key, shift our forward iteration to
1083 0 : // begin at index instead of index-1. With suffix replacement the key at index
1084 0 : // is guaranteed to be the highest restart point less than the search key
1085 0 : // replacement). This property holds because of the invariant that a block
1086 0 : // with suffix replacement will not have two keys that share the same
1087 0 : // prefix. To consider the above example, binary searching with b@3 landed
1088 0 : // naively at a@3, but since b@4<b@3, we shift our forward iteration to
1089 0 : // begin at b@4. We never need to shift by more than one restart point
1090 0 : // (i.e. to c@4) because it's impossible for the search key to be greater
1091 0 : // than the the key at the next restart point in the block because that
1092 0 : // than the key at the next restart point in the block because that
1093 0 : // key in the block shares the same prefix, naive binary search should
1094 0 : // always land at most 1 restart point off the correct one.
1095 0 :
1096 0 : naiveOffset := i.offset
1097 0 : // Shift up to the original binary search result and decode the key.
1098 0 : i.offset = targetOffset
1099 0 : i.readEntry()
1100 0 : i.decodeInternalKey(i.key)
1101 0 : i.maybeReplaceSuffix(false /* allowInPlace */)
1102 0 :
1103 0 : // If the binary search point is actually less than the search key, post
1104 0 : // replacement, bump the target offset.
1105 0 : if i.cmp(i.ikey.UserKey, key) < 0 {
1106 0 : i.offset = targetOffset
1107 0 : if index+1 < i.numRestarts {
1108 0 : // if index+1 is within the i.data bounds, use it to find the target
1109 0 : // offset.
1110 0 : targetOffset = decodeRestart(i.data[i.restarts+4*(index+1):])
1111 0 : } else {
1112 0 : targetOffset = i.restarts
1113 0 : }
1114 0 : } else {
1115 0 : i.offset = naiveOffset
1116 0 : }
1117 : }
1118 : }
1119 :
1120 : // Init nextOffset for the forward iteration below.
1121 1 : i.nextOffset = i.offset
1122 1 :
1123 1 : for {
1124 1 : i.offset = i.nextOffset
1125 1 : i.readEntry()
1126 1 : // When hidden keys are common, there is additional optimization possible
1127 1 : // by not caching entries that are hidden (note that some calls to
1128 1 : // cacheEntry don't decode the internal key before caching, but checking
1129 1 : // whether a key is hidden does not require full decoding). However, we do
1130 1 : // need to use the blockEntry.offset in the cache for the first entry at
1131 1 : // the reset point to do the binary search when the cache is empty -- so
1132 1 : // the restart point to do the binary search when the cache is empty -- so
1133 1 : // we would need to cache that first entry (though not the key) even if it
1134 1 : // was hidden. Our current assumption is that if there are large numbers
1135 1 : // property filters) so we don't bother optimizing.
1136 1 : hiddenPoint := i.decodeInternalKey(i.key)
1137 1 : i.maybeReplaceSuffix(false /*allowInPlace*/)
1138 1 :
1139 1 : // NB: we don't use the hiddenPoint return value of decodeInternalKey
1140 1 : // since we want to stop as soon as we reach a key >= ikey.UserKey, so
1141 1 : // that we can reverse.
1142 1 : if i.cmp(i.ikey.UserKey, key) >= 0 {
1143 1 : // The current key is greater than or equal to our search key. Back up to
1144 1 : // the previous key which was less than our search key. Note that this for
1145 1 : // loop will execute at least once with this if-block not being true, so
1146 1 : // the key we are backing up to is the last one this loop cached.
1147 1 : return i.Prev()
1148 1 : }
1149 :
1150 1 : if i.nextOffset >= targetOffset {
1151 1 : // We've reached the end of the current restart block. Return the
1152 1 : // current key if not hidden, else call Prev().
1153 1 : //
1154 1 : // When the restart interval is 1, the first iteration of the for loop
1155 1 : // will bring us here. In that case ikey is backed by the block so we
1156 1 : // get the desired key stability guarantee for the lifetime of the
1157 1 : // blockIter. That is, we never cache anything and therefore never
1158 1 : // return a key backed by cachedBuf.
1159 1 : if hiddenPoint {
1160 1 : return i.Prev()
1161 1 : }
1162 1 : break
1163 : }
1164 1 : i.cacheEntry()
1165 : }
1166 :
1167 1 : if !i.valid() {
1168 1 : return nil, base.LazyValue{}
1169 1 : }
1170 1 : if !i.lazyValueHandling.hasValuePrefix ||
1171 1 : base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
1172 1 : i.lazyValue = base.MakeInPlaceValue(i.val)
1173 1 : } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
1174 1 : i.lazyValue = base.MakeInPlaceValue(i.val[1:])
1175 1 : } else {
1176 1 : i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
1177 1 : }
1178 1 : return &i.ikey, i.lazyValue
1179 : }
1180 :
1181 : // First implements internalIterator.First, as documented in the pebble
1182 : // package.
1183 1 : func (i *blockIter) First() (*InternalKey, base.LazyValue) {
1184 1 : if invariants.Enabled && i.isDataInvalidated() {
1185 0 : panic(errors.AssertionFailedf("invalidated blockIter used"))
1186 : }
1187 :
1188 1 : i.offset = 0
1189 1 : if !i.valid() {
1190 1 : return nil, base.LazyValue{}
1191 1 : }
1192 1 : i.clearCache()
1193 1 : i.readEntry()
1194 1 : hiddenPoint := i.decodeInternalKey(i.key)
1195 1 : if hiddenPoint {
1196 1 : return i.Next()
1197 1 : }
1198 1 : i.maybeReplaceSuffix(true /*allowInPlace*/)
1199 1 : if !i.lazyValueHandling.hasValuePrefix ||
1200 1 : base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
1201 1 : i.lazyValue = base.MakeInPlaceValue(i.val)
1202 1 : } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
1203 1 : i.lazyValue = base.MakeInPlaceValue(i.val[1:])
1204 1 : } else {
1205 1 : i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
1206 1 : }
1207 1 : return &i.ikey, i.lazyValue
1208 : }
1209 :
1210 1 : func decodeRestart(b []byte) int32 {
1211 1 : _ = b[3] // bounds check hint to compiler; see golang.org/issue/14808
1212 1 : return int32(uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 |
1213 1 : uint32(b[3]&restartMaskLittleEndianHighByteWithoutSetHasSamePrefix)<<24)
1214 1 : }
1215 :
1216 : // Last implements internalIterator.Last, as documented in the pebble package.
1217 1 : func (i *blockIter) Last() (*InternalKey, base.LazyValue) {
1218 1 : if invariants.Enabled && i.isDataInvalidated() {
1219 0 : panic(errors.AssertionFailedf("invalidated blockIter used"))
1220 : }
1221 :
1222 : // Seek forward from the last restart point.
1223 1 : i.offset = decodeRestart(i.data[i.restarts+4*(i.numRestarts-1):])
1224 1 : if !i.valid() {
1225 1 : return nil, base.LazyValue{}
1226 1 : }
1227 :
1228 1 : i.readEntry()
1229 1 : i.clearCache()
1230 1 :
1231 1 : for i.nextOffset < i.restarts {
1232 1 : i.cacheEntry()
1233 1 : i.offset = i.nextOffset
1234 1 : i.readEntry()
1235 1 : }
1236 :
1237 1 : hiddenPoint := i.decodeInternalKey(i.key)
1238 1 : if hiddenPoint {
1239 1 : return i.Prev()
1240 1 : }
1241 1 : i.maybeReplaceSuffix(false /*allowInPlace*/)
1242 1 : if !i.lazyValueHandling.hasValuePrefix ||
1243 1 : base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
1244 1 : i.lazyValue = base.MakeInPlaceValue(i.val)
1245 1 : } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
1246 1 : i.lazyValue = base.MakeInPlaceValue(i.val[1:])
1247 1 : } else {
1248 1 : i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
1249 1 : }
1250 1 : return &i.ikey, i.lazyValue
1251 : }
1252 :
1253 : // Next implements internalIterator.Next, as documented in the pebble
1254 : // package.
1255 1 : func (i *blockIter) Next() (*InternalKey, base.LazyValue) {
1256 1 : if len(i.cachedBuf) > 0 {
1257 1 : // We're switching from reverse iteration to forward iteration. We need to
1258 1 : // populate i.fullKey with the current key we're positioned at so that
1259 1 : // readEntry() can use i.fullKey for key prefix decompression. Note that we
1260 1 : // don't know whether i.key is backed by i.cachedBuf or i.fullKey (if
1261 1 : // SeekLT was the previous call, i.key may be backed by i.fullKey), but
1262 1 : // copying into i.fullKey works for both cases.
1263 1 : //
1264 1 : // TODO(peter): Rather than clearing the cache, we could instead use the
1265 1 : // cache until it is exhausted. This would likely be faster than falling
1266 1 : // through to the normal forward iteration code below.
1267 1 : i.fullKey = append(i.fullKey[:0], i.key...)
1268 1 : i.clearCache()
1269 1 : }
1270 :
1271 : start:
1272 1 : i.offset = i.nextOffset
1273 1 : if !i.valid() {
1274 1 : return nil, base.LazyValue{}
1275 1 : }
1276 1 : i.readEntry()
1277 1 : // Manually inlined version of i.decodeInternalKey(i.key).
1278 1 : if n := len(i.key) - 8; n >= 0 {
1279 1 : trailer := binary.LittleEndian.Uint64(i.key[n:])
1280 1 : hiddenPoint := i.hideObsoletePoints &&
1281 1 : (trailer&trailerObsoleteBit != 0)
1282 1 : i.ikey.Trailer = trailer & trailerObsoleteMask
1283 1 : i.ikey.UserKey = i.key[:n:n]
1284 1 : if i.globalSeqNum != 0 {
1285 1 : i.ikey.SetSeqNum(i.globalSeqNum)
1286 1 : }
1287 1 : if hiddenPoint {
1288 1 : goto start
1289 : }
1290 1 : if i.syntheticSuffix != nil {
1291 1 : // Inlined version of i.maybeReplaceSuffix(true /* allowInPlace */)
1292 1 : prefixLen := i.split(i.ikey.UserKey)
1293 1 : if cap(i.ikey.UserKey) >= prefixLen+len(i.syntheticSuffix) {
1294 1 : i.ikey.UserKey = append(i.ikey.UserKey[:prefixLen], i.syntheticSuffix...)
1295 1 : } else {
1296 1 : i.synthSuffixBuf = append(i.synthSuffixBuf[:0], i.ikey.UserKey[:prefixLen]...)
1297 1 : i.synthSuffixBuf = append(i.synthSuffixBuf, i.syntheticSuffix...)
1298 1 : i.ikey.UserKey = i.synthSuffixBuf
1299 1 : }
1300 : }
1301 0 : } else {
1302 0 : i.ikey.Trailer = uint64(InternalKeyKindInvalid)
1303 0 : i.ikey.UserKey = nil
1304 0 : }
1305 1 : if !i.lazyValueHandling.hasValuePrefix ||
1306 1 : base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
1307 1 : i.lazyValue = base.MakeInPlaceValue(i.val)
1308 1 : } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
1309 1 : i.lazyValue = base.MakeInPlaceValue(i.val[1:])
1310 1 : } else {
1311 1 : i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
1312 1 : }
1313 1 : return &i.ikey, i.lazyValue
1314 : }
1315 :
1316 : // NextPrefix implements (base.InternalIterator).NextPrefix.
1317 1 : func (i *blockIter) NextPrefix(succKey []byte) (*InternalKey, base.LazyValue) {
1318 1 : if i.lazyValueHandling.hasValuePrefix {
1319 1 : return i.nextPrefixV3(succKey)
1320 1 : }
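     : 	// Stepping with Next is much cheaper than a seek when only a few keys
     : 	// share the current prefix, so try a bounded number of Nexts before
     : 	// falling back to SeekGE within the block.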
1321 1 : const nextsBeforeSeek = 3
1322 1 : k, v := i.Next()
1323 1 : for j := 1; k != nil && i.cmp(k.UserKey, succKey) < 0; j++ {
1324 1 : if j >= nextsBeforeSeek {
1325 1 : return i.SeekGE(succKey, base.SeekGEFlagsNone)
1326 1 : }
1327 1 : k, v = i.Next()
1328 : }
1329 1 : return k, v
1330 : }
1331 :
1332 1 : func (i *blockIter) nextPrefixV3(succKey []byte) (*InternalKey, base.LazyValue) {
1333 1 : // Doing nexts that involve a key comparison can be expensive (and the cost
1334 1 : // depends on the key length), so we use the same threshold of 3 that we use
1335 1 : // for TableFormatPebblev2 in blockIter.NextPrefix above. The next fast path
1336 1 : // that looks at setHasSamePrefix takes ~5ns per key, which is ~150x faster
1337 1 : // than doing a SeekGE within the block, so we do this 16 times
1338 1 : // (~5ns*16=80ns), and then switch to looking at restarts. Doing the binary
1339 1 : // search for the restart consumes > 100ns. If the number of versions is >
1340 1 : // 17, we will increment nextFastCount to 17, then do a binary search, and
1341 1 : // on average need to find a key between two restarts, so another 8 steps
1342 1 : // corresponding to nextFastCount, for a mean total of 17 + 8 = 25 such
1343 1 : // steps.
1344 1 : //
1345 1 : // TODO(sumeer): use the configured restartInterval for the sstable when it
1346 1 : // was written (which we don't currently store) instead of the default value
1347 1 : // of 16.
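1347 1 : //
1347 1 : // succKey is expected to sort after every key sharing the current key's
1347 1 : // prefix and at or before the first key of the next prefix (typically it
1347 1 : // is the immediate successor of the current prefix), so observing a key
1347 1 : // >= succKey implies the prefix has changed; see the invariant check
1347 1 : // below.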
1348 1 : const nextCmpThresholdBeforeSeek = 3
1349 1 : const nextFastThresholdBeforeRestarts = 16
1350 1 : nextCmpCount := 0
1351 1 : nextFastCount := 0
1352 1 : usedRestarts := false
1353 1 : // INVARIANT: blockIter is valid.
1354 1 : if invariants.Enabled && !i.valid() {
1355 0 : panic(errors.AssertionFailedf("nextPrefixV3 called on invalid blockIter"))
1356 : }
1357 1 : prevKeyIsSet := i.ikey.Kind() == InternalKeyKindSet
1358 1 : for {
1359 1 : i.offset = i.nextOffset
1360 1 : if !i.valid() {
1361 1 : return nil, base.LazyValue{}
1362 1 : }
1363 : // Need to decode the length integers, so we can compute nextOffset.
1364 1 : ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(i.offset))
1365 1 : // This is an ugly performance hack. Reading entries from blocks is one of
1366 1 : // the inner-most routines and decoding the 3 varints per-entry takes
1367 1 : // significant time. Neither go1.11 nor go1.12 will inline decodeVarint for
1368 1 : // us, so we do it manually. This provides a 10-15% performance improvement
1369 1 : // on blockIter benchmarks on both go1.11 and go1.12.
1370 1 : //
1371 1 : // TODO(peter): remove this hack if go:inline is ever supported.
1372 1 :
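     : 		// Each group of branches below is an unrolled little-endian base-128
     : 		// (uvarint) decode: every byte contributes 7 payload bits, least
     : 		// significant group first, and a set high bit marks a continuation
     : 		// byte. For well-formed encodings of at most five bytes this is
     : 		// equivalent to binary.Uvarint truncated to uint32.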
1373 1 : // Decode the shared key length integer.
1374 1 : var shared uint32
1375 1 : if a := *((*uint8)(ptr)); a < 128 {
1376 1 : shared = uint32(a)
1377 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
1378 1 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
1379 0 : shared = uint32(b)<<7 | uint32(a)
1380 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
1381 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
1382 0 : shared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1383 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
1384 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
1385 0 : shared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1386 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
1387 0 : } else {
1388 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
1389 0 : shared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1390 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
1391 0 : }
1392 : // Decode the unshared key length integer.
1393 1 : var unshared uint32
1394 1 : if a := *((*uint8)(ptr)); a < 128 {
1395 1 : unshared = uint32(a)
1396 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
1397 1 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
1398 0 : unshared = uint32(b)<<7 | uint32(a)
1399 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
1400 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
1401 0 : unshared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1402 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
1403 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
1404 0 : unshared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1405 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
1406 0 : } else {
1407 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
1408 0 : unshared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1409 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
1410 0 : }
1411 : // Decode the value length integer.
1412 1 : var value uint32
1413 1 : if a := *((*uint8)(ptr)); a < 128 {
1414 1 : value = uint32(a)
1415 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
1416 1 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
1417 0 : value = uint32(b)<<7 | uint32(a)
1418 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
1419 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
1420 0 : value = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1421 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
1422 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
1423 0 : value = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1424 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
1425 0 : } else {
1426 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
1427 0 : value = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1428 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
1429 0 : }
1430 : // The starting position of the value.
1431 1 : valuePtr := unsafe.Pointer(uintptr(ptr) + uintptr(unshared))
1432 1 : i.nextOffset = int32(uintptr(valuePtr)-uintptr(i.ptr)) + int32(value)
1433 1 : if invariants.Enabled && unshared < 8 {
1434 0 : // This should not happen since only the key prefix is shared, so even
1435 0 : // if the prefix length is the same as the user key length, the unshared
1436 0 : // portion will still include the trailer.
1437 0 : panic(errors.AssertionFailedf("unshared %d is too small", unshared))
1438 : }
1439 : // The trailer is written in little endian, so the key kind is the first
1440 : // byte of the trailer, which is encoded in the slice [unshared-8:unshared].
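     : 		// (An internal key's trailer is (seqNum<<8)|kind appended to the user
     : 		// key as 8 little-endian bytes, so the kind byte, possibly carrying the
     : 		// obsolete bit, is stored first.)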
1441 1 : keyKind := InternalKeyKind((*[manual.MaxArrayLen]byte)(ptr)[unshared-8])
1442 1 : keyKind = keyKind & base.InternalKeyKindSSTableInternalObsoleteMask
1443 1 : prefixChanged := false
1444 1 : if keyKind == InternalKeyKindSet {
1445 1 : if invariants.Enabled && value == 0 {
1446 0 : panic(errors.AssertionFailedf("value is of length 0, but we expect a valuePrefix"))
1447 : }
1448 1 : valPrefix := *((*valuePrefix)(valuePtr))
1449 1 : if setHasSamePrefix(valPrefix) {
1450 1 : // Fast-path. No need to assemble i.fullKey, or update i.key. We know
1451 1 : // that subsequent keys will not have a shared length that is greater
1452 1 : // than the prefix of the current key, which is also the prefix of
1453 1 : // i.key. Since we are continuing to iterate, we don't need to
1454 1 : // initialize i.ikey and i.lazyValue (these are initialized before
1455 1 : // returning).
1456 1 : nextFastCount++
1457 1 : if nextFastCount > nextFastThresholdBeforeRestarts {
1458 0 : if usedRestarts {
1459 0 : // Exhausted iteration budget. This will never happen unless
1460 0 : // someone is using a restart interval > 16. It is just to guard
1461 0 : // against long restart intervals causing too much iteration.
1462 0 : break
1463 : }
1464 : // Haven't used restarts yet, so find the first restart at or beyond
1465 : // the current offset.
1466 0 : targetOffset := i.offset
1467 0 : var index int32
1468 0 : {
1469 0 : // NB: manually inlined sort.Sort is ~5% faster.
1470 0 : //
1471 0 : // f defined for a restart point is true iff the offset >=
1472 0 : // targetOffset.
1473 0 : // Define f(-1) == false and f(i.numRestarts) == true.
1474 0 : // Invariant: f(index-1) == false, f(upper) == true.
1475 0 : upper := i.numRestarts
1476 0 : for index < upper {
1477 0 : h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
1478 0 : // index ≤ h < upper
1479 0 : offset := decodeRestart(i.data[i.restarts+4*h:])
1480 0 : if offset < targetOffset {
1481 0 : index = h + 1 // preserves f(index-1) == false
1482 0 : } else {
1483 0 : upper = h // preserves f(upper) == true
1484 0 : }
1485 : }
1486 : // index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
1487 : // => answer is index.
1488 : }
1489 0 : usedRestarts = true
1490 0 : nextFastCount = 0
1491 0 : if index == i.numRestarts {
1492 0 : // Already past the last real restart, so iterate a bit more until
1493 0 : // we are done with the block.
1494 0 : continue
1495 : }
1496 : // Have some real restarts after index. NB: index is the first
1497 : // restart at or beyond the current offset.
1498 0 : startingIndex := index
1499 0 : for index != i.numRestarts &&
1500 0 : // The restart at index is 4 bytes written in little endian format
1501 0 : // starting at i.restarts+4*index. The 0th byte is the least
1502 0 : // significant and the 3rd byte is the most significant. Since the
1503 0 : // most significant bit of the 3rd byte is what we use for
1504 0 : // encoding the set-has-same-prefix information, the indexing
1505 0 : // below has +3.
1506 0 : i.data[i.restarts+4*index+3]&restartMaskLittleEndianHighByteOnlySetHasSamePrefix != 0 {
1507 0 : // We still have the same prefix, so move to the next restart.
1508 0 : index++
1509 0 : }
1510 : // index is the first restart that did not have the same prefix.
1511 0 : if index != startingIndex {
1512 0 : // Managed to skip past at least one restart. Resume iteration
1513 0 : // from index-1. Since nextFastCount has been reset to 0, we
1514 0 : // should be able to iterate to the next prefix.
1515 0 : i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
1516 0 : i.readEntry()
1517 0 : }
1518 : // Else, unable to skip past any restart. Resume iteration. Since
1519 : // nextFastCount has been reset to 0, we should be able to iterate
1520 : // to the next prefix.
1521 0 : continue
1522 : }
1523 1 : continue
1524 1 : } else if prevKeyIsSet {
1525 1 : prefixChanged = true
1526 1 : }
1527 1 : } else {
1528 1 : prevKeyIsSet = false
1529 1 : }
1530 : // Slow-path cases:
1531 : // - (Likely) The prefix has changed.
1532 : // - (Unlikely) The prefix has not changed.
1533 : // We assemble the key etc. under the assumption that it is the likely
1534 : // case.
1535 1 : unsharedKey := getBytes(ptr, int(unshared))
1536 1 : // TODO(sumeer): move this into the else block below. This is a bit tricky
1537 1 : // since the current logic assumes we have always copied the latest key
1538 1 : // into fullKey, which is why when we get to the next key we can (a)
1539 1 : // access i.fullKey[:shared], (b) append only the unsharedKey to
1540 1 : // i.fullKey. For (a), we can access i.key[:shared] since that memory is
1541 1 : // valid (even if unshared). For (b), we will need to remember whether
1542 1 : // i.key refers to i.fullKey or not, and can append the unsharedKey only
1543 1 : // in the former case and for the latter case need to copy the shared part
1544 1 : // too. This same comment applies to the other place where we can do this
1545 1 : // optimization, in readEntry().
1546 1 : i.fullKey = append(i.fullKey[:shared], unsharedKey...)
1547 1 : i.val = getBytes(valuePtr, int(value))
1548 1 : if shared == 0 {
1549 1 : // Provide stability for the key across positioning calls if the key
1550 1 : // doesn't share a prefix with the previous key. This avoids requiring the
1551 1 : // key to be copied if the caller knows the block has a restart interval of
1552 1 : // 1. An important example of this is range-del blocks.
1553 1 : i.key = unsharedKey
1554 1 : } else {
1555 1 : i.key = i.fullKey
1556 1 : }
1557 : // Manually inlined version of i.decodeInternalKey(i.key).
1558 1 : hiddenPoint := false
1559 1 : if n := len(i.key) - 8; n >= 0 {
1560 1 : trailer := binary.LittleEndian.Uint64(i.key[n:])
1561 1 : hiddenPoint = i.hideObsoletePoints &&
1562 1 : (trailer&trailerObsoleteBit != 0)
1563 1 : i.ikey.Trailer = trailer & trailerObsoleteMask
1564 1 : i.ikey.UserKey = i.key[:n:n]
1565 1 : if i.globalSeqNum != 0 {
1566 0 : i.ikey.SetSeqNum(i.globalSeqNum)
1567 0 : }
1568 1 : if i.syntheticSuffix != nil {
1569 0 : // Inlined version of i.maybeReplaceSuffix(true /* allowInPlace */)
1570 0 : prefixLen := i.split(i.ikey.UserKey)
1571 0 : if cap(i.ikey.UserKey) >= prefixLen+len(i.syntheticSuffix) {
1572 0 : i.ikey.UserKey = append(i.ikey.UserKey[:prefixLen], i.syntheticSuffix...)
1573 0 : } else {
1574 0 : i.synthSuffixBuf = append(i.synthSuffixBuf[:0], i.ikey.UserKey[:prefixLen]...)
1575 0 : i.synthSuffixBuf = append(i.synthSuffixBuf, i.syntheticSuffix...)
1576 0 : i.ikey.UserKey = i.synthSuffixBuf
1577 0 : }
1578 : }
1579 0 : } else {
1580 0 : i.ikey.Trailer = uint64(InternalKeyKindInvalid)
1581 0 : i.ikey.UserKey = nil
1582 0 : }
1583 1 : nextCmpCount++
1584 1 : if invariants.Enabled && prefixChanged && i.cmp(i.ikey.UserKey, succKey) < 0 {
1585 0 : panic(errors.AssertionFailedf("prefix should have changed but %x < %x",
1586 0 : i.ikey.UserKey, succKey))
1587 : }
1588 1 : if prefixChanged || i.cmp(i.ikey.UserKey, succKey) >= 0 {
1589 1 : // Prefix has changed.
1590 1 : if hiddenPoint {
1591 1 : return i.Next()
1592 1 : }
1593 1 : if invariants.Enabled && !i.lazyValueHandling.hasValuePrefix {
1594 0 : panic(errors.AssertionFailedf("nextPrefixV3 being run for non-v3 sstable"))
1595 : }
1596 1 : if base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
1597 1 : i.lazyValue = base.MakeInPlaceValue(i.val)
1598 1 : } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
1599 1 : i.lazyValue = base.MakeInPlaceValue(i.val[1:])
1600 1 : } else {
1601 0 : i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
1602 0 : }
1603 1 : return &i.ikey, i.lazyValue
1604 : }
1605 : // Else prefix has not changed.
1606 :
1607 1 : if nextCmpCount >= nextCmpThresholdBeforeSeek {
1608 1 : break
1609 : }
1610 : }
1611 1 : return i.SeekGE(succKey, base.SeekGEFlagsNone)
1612 : }
1613 :
1614 : // Prev implements internalIterator.Prev, as documented in the pebble
1615 : // package.
1616 1 : func (i *blockIter) Prev() (*InternalKey, base.LazyValue) {
1617 1 : start:
1618 1 : for n := len(i.cached) - 1; n >= 0; n-- {
1619 1 : i.nextOffset = i.offset
1620 1 : e := &i.cached[n]
1621 1 : i.offset = e.offset
1622 1 : i.val = getBytes(unsafe.Pointer(uintptr(i.ptr)+uintptr(e.valStart)), int(e.valSize))
1623 1 : // Manually inlined version of i.decodeInternalKey(i.key).
1624 1 : i.key = i.cachedBuf[e.keyStart:e.keyEnd]
1625 1 : if n := len(i.key) - 8; n >= 0 {
1626 1 : trailer := binary.LittleEndian.Uint64(i.key[n:])
1627 1 : hiddenPoint := i.hideObsoletePoints &&
1628 1 : (trailer&trailerObsoleteBit != 0)
1629 1 : if hiddenPoint {
1630 1 : continue
1631 : }
1632 1 : i.ikey.Trailer = trailer & trailerObsoleteMask
1633 1 : i.ikey.UserKey = i.key[:n:n]
1634 1 : if i.globalSeqNum != 0 {
1635 1 : i.ikey.SetSeqNum(i.globalSeqNum)
1636 1 : }
1637 1 : if i.syntheticSuffix != nil {
1638 1 : // Inlined version of i.maybeReplaceSuffix(false /* allowInPlace */)
1639 1 : prefixLen := i.split(i.ikey.UserKey)
1640 1 : // If ikey is cached or may get cached, we must not modify UserKey in
1641 1 : // place; build the suffix-replaced key in synthSuffixBuf instead.
1642 1 : i.synthSuffixBuf = append(i.synthSuffixBuf[:0], i.ikey.UserKey[:prefixLen]...)
1643 1 : i.synthSuffixBuf = append(i.synthSuffixBuf, i.syntheticSuffix...)
1644 1 : i.ikey.UserKey = i.synthSuffixBuf
1645 1 : }
1646 0 : } else {
1647 0 : i.ikey.Trailer = uint64(InternalKeyKindInvalid)
1648 0 : i.ikey.UserKey = nil
1649 0 : }
1650 1 : i.cached = i.cached[:n]
1651 1 : if !i.lazyValueHandling.hasValuePrefix ||
1652 1 : base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
1653 1 : i.lazyValue = base.MakeInPlaceValue(i.val)
1654 1 : } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
1655 1 : i.lazyValue = base.MakeInPlaceValue(i.val[1:])
1656 1 : } else {
1657 1 : i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
1658 1 : }
1659 1 : return &i.ikey, i.lazyValue
1660 : }
1661 :
1662 1 : i.clearCache()
1663 1 : if i.offset <= 0 {
1664 1 : i.offset = -1
1665 1 : i.nextOffset = 0
1666 1 : return nil, base.LazyValue{}
1667 1 : }
1668 :
1669 1 : targetOffset := i.offset
1670 1 : var index int32
1671 1 :
1672 1 : {
1673 1 : // NB: manually inlined sort.Search is ~5% faster.
1674 1 : //
1675 1 : // Define f(-1) == false and f(i.numRestarts) == true.
1676 1 : // Invariant: f(index-1) == false, f(upper) == true.
1677 1 : upper := i.numRestarts
1678 1 : for index < upper {
1679 1 : h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
1680 1 : // index ≤ h < upper
1681 1 : offset := decodeRestart(i.data[i.restarts+4*h:])
1682 1 : if offset < targetOffset {
1683 1 : // Looking for the first restart that has offset >= targetOffset, so
1684 1 : // ignore h and earlier.
1685 1 : index = h + 1 // preserves f(index-1) == false
1686 1 : } else {
1687 1 : upper = h // preserves f(upper) == true
1688 1 : }
1689 : }
1690 : // index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
1691 : // => answer is index.
1692 : }
1693 :
1694 : // index is first restart with offset >= targetOffset. Note that
1695 : // targetOffset may not be at a restart point since one can call Prev()
1696 : // after Next() (so the cache was not populated) and targetOffset refers to
1697 : // the current entry. index-1 must have an offset < targetOffset (it can't
1698 : // be equal to targetOffset since the binary search would have selected that
1699 : // as the index).
1700 1 : i.offset = 0
1701 1 : if index > 0 {
1702 1 : i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
1703 1 : }
1704 : // TODO(sumeer): why is the else case not an error given targetOffset is a
1705 : // valid offset.
1706 :
1707 1 : i.readEntry()
1708 1 :
1709 1 : // We stop when i.nextOffset == targetOffset since the targetOffset is the
1710 1 : // entry we are stepping back from, and we don't need to cache the entry
1711 1 : // before it, since it is the candidate to return.
1712 1 : for i.nextOffset < targetOffset {
1713 1 : i.cacheEntry()
1714 1 : i.offset = i.nextOffset
1715 1 : i.readEntry()
1716 1 : }
1717 :
1718 1 : hiddenPoint := i.decodeInternalKey(i.key)
1719 1 : if hiddenPoint {
1720 1 : // Use the cache.
1721 1 : goto start
1722 : }
1723 1 : if i.syntheticSuffix != nil {
1724 1 : // Inlined version of i.maybeReplaceSuffix(false /* allowInPlace */)
1725 1 : prefixLen := i.split(i.ikey.UserKey)
1726 1 : // If ikey is cached or may get cached, we must not modify UserKey in
1727 1 : // place; build the suffix-replaced key in synthSuffixBuf instead.
1728 1 : i.synthSuffixBuf = append(i.synthSuffixBuf[:0], i.ikey.UserKey[:prefixLen]...)
1729 1 : i.synthSuffixBuf = append(i.synthSuffixBuf, i.syntheticSuffix...)
1730 1 : i.ikey.UserKey = i.synthSuffixBuf
1731 1 : }
1732 1 : if !i.lazyValueHandling.hasValuePrefix ||
1733 1 : base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
1734 1 : i.lazyValue = base.MakeInPlaceValue(i.val)
1735 1 : } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
1736 1 : i.lazyValue = base.MakeInPlaceValue(i.val[1:])
1737 1 : } else {
1738 1 : i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
1739 1 : }
1740 1 : return &i.ikey, i.lazyValue
1741 : }
1742 :
1743 : // Key implements internalIterator.Key, as documented in the pebble package.
1744 1 : func (i *blockIter) Key() *InternalKey {
1745 1 : return &i.ikey
1746 1 : }
1747 :
1748 1 : func (i *blockIter) value() base.LazyValue {
1749 1 : return i.lazyValue
1750 1 : }
1751 :
1752 : // Error implements internalIterator.Error, as documented in the pebble
1753 : // package.
1754 1 : func (i *blockIter) Error() error {
1755 1 : return nil // infallible
1756 1 : }
1757 :
1758 : // Close implements internalIterator.Close, as documented in the pebble
1759 : // package.
1760 1 : func (i *blockIter) Close() error {
1761 1 : i.handle.Release()
1762 1 : i.handle = bufferHandle{}
1763 1 : i.val = nil
1764 1 : i.lazyValue = base.LazyValue{}
1765 1 : i.lazyValueHandling.vbr = nil
1766 1 : return nil
1767 1 : }
1768 :
1769 0 : func (i *blockIter) SetBounds(lower, upper []byte) {
1770 0 : // This should never be called as bounds are handled by sstable.Iterator.
1771 0 : panic("pebble: SetBounds unimplemented")
1772 : }
1773 :
1774 0 : func (i *blockIter) SetContext(_ context.Context) {}
1775 :
1776 1 : func (i *blockIter) valid() bool {
1777 1 : return i.offset >= 0 && i.offset < i.restarts
1778 1 : }
1779 :
1780 : // fragmentBlockIter wraps a blockIter, implementing the
1781 : // keyspan.FragmentIterator interface. It's used for reading range deletion and
1782 : // range key blocks.
1783 : //
1784 : // Range deletions and range keys are fragmented before they're persisted to the
1785 : // block. Overlapping fragments have identical bounds. The fragmentBlockIter
1786 : // gathers all the fragments with identical bounds within a block and returns a
1787 : // single keyspan.Span describing all the keys defined over the span.
1788 : //
1789 : // # Memory lifetime
1790 : //
1791 : // A Span returned by fragmentBlockIter is only guaranteed to be stable until
1792 : // the next call to a fragmentBlockIter positioning method. A Span's Keys slice
1793 : // may be reused, so the user must not assume it's stable.
1794 : //
1795 : // Blocks holding range deletions and range keys are configured to use a restart
1796 : // interval of 1. This provides key stability. The caller may treat the various
1797 : // byte slices (start, end, suffix, value) as stable for the lifetime of the
1798 : // iterator.
1799 : type fragmentBlockIter struct {
1800 : blockIter blockIter
1801 : keyBuf [2]keyspan.Key
1802 : span keyspan.Span
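     : 	// dir is +1 after a forward positioning call and -1 after a reverse one.
     : 	// Next and Prev consult it to detect a change of direction, which requires
     : 	// stepping blockIter across the fragments of the current span before
     : 	// gathering the next one.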
1803 : dir int8
1804 : closeHook func(i keyspan.FragmentIterator) error
1805 :
1806 : // elideSameSeqnum, if true, returns only the first-occurring (in forward
1807 : // order) Key for each sequence number.
1808 : elideSameSeqnum bool
1809 : }
1810 :
1811 1 : func (i *fragmentBlockIter) resetForReuse() fragmentBlockIter {
1812 1 : return fragmentBlockIter{blockIter: i.blockIter.resetForReuse()}
1813 1 : }
1814 :
1815 1 : func (i *fragmentBlockIter) decodeSpanKeys(k *InternalKey, internalValue []byte) error {
1816 1 : // TODO(jackson): The use of i.span.Keys to accumulate keys across multiple
1817 1 : // calls to Decode is too confusing and subtle. Refactor to make it
1818 1 : // explicit.
1819 1 :
1820 1 : // Decode the contents of the fragment's value. This always includes at
1821 1 : // least the end key: RANGEDELs store the end key directly as the value,
1822 1 : // whereas the various range key kinds use a more complicated encoding. The
1823 1 : // details of the range key internal value format are documented within the
1824 1 : // internal/rangekey package.
1825 1 : var err error
1826 1 : switch k.Kind() {
1827 1 : case base.InternalKeyKindRangeDelete:
1828 1 : i.span = rangedel.Decode(*k, internalValue, i.span.Keys)
1829 1 : case base.InternalKeyKindRangeKeySet, base.InternalKeyKindRangeKeyUnset, base.InternalKeyKindRangeKeyDelete:
1830 1 : i.span, err = rangekey.Decode(*k, internalValue, i.span.Keys)
1831 0 : default:
1832 0 : i.span = keyspan.Span{}
1833 0 : err = base.CorruptionErrorf("pebble: corrupt keyspan fragment of kind %d", k.Kind())
1834 : }
1835 1 : return err
1836 : }
1837 :
1838 1 : func (i *fragmentBlockIter) elideKeysOfSameSeqNum() {
1839 1 : if invariants.Enabled {
1840 1 : if !i.elideSameSeqnum || len(i.span.Keys) == 0 {
1841 0 : panic("elideKeysOfSameSeqNum called when it should not be")
1842 : }
1843 : }
1844 1 : lastSeqNum := i.span.Keys[0].SeqNum()
1845 1 : k := 1
1846 1 : for j := 1; j < len(i.span.Keys); j++ {
1847 1 : if lastSeqNum != i.span.Keys[j].SeqNum() {
1848 1 : lastSeqNum = i.span.Keys[j].SeqNum()
1849 1 : i.span.Keys[k] = i.span.Keys[j]
1850 1 : k++
1851 1 : }
1852 : }
1853 1 : i.span.Keys = i.span.Keys[:k]
1854 : }
1855 :
1856 : // gatherForward gathers internal keys with identical bounds. Keys defined over
1857 : // spans of the keyspace are fragmented such that any overlapping key spans have
1858 : // identical bounds. When these spans are persisted to a range deletion or range
1859 : // key block, they may be persisted as multiple internal keys in order to encode
1860 : // multiple sequence numbers or key kinds.
1861 : //
1862 : // gatherForward iterates forward, re-combining the fragmented internal keys to
1863 : // reconstruct a keyspan.Span that holds all the keys defined over the span.
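     : //
     : // For example, a range deletion block may hold three fragments with
     : // identical bounds, b#5,RANGEDEL, b#3,RANGEDEL and b#1,RANGEDEL, each
     : // carrying the end key c as its value; gatherForward combines them into a
     : // single Span{Start: b, End: c} whose Keys slice holds all three.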
1864 : func (i *fragmentBlockIter) gatherForward(
1865 : k *InternalKey, lazyValue base.LazyValue,
1866 1 : ) (*keyspan.Span, error) {
1867 1 : i.span = keyspan.Span{}
1868 1 : if k == nil || !i.blockIter.valid() {
1869 1 : return nil, nil
1870 1 : }
1871 : // Use the i.keyBuf array to back the Keys slice to prevent an allocation
1872 : // when a span contains few keys.
1873 1 : i.span.Keys = i.keyBuf[:0]
1874 1 :
1875 1 : // Decode the span's end key and individual keys from the value.
1876 1 : internalValue := lazyValue.InPlaceValue()
1877 1 : if err := i.decodeSpanKeys(k, internalValue); err != nil {
1878 0 : return nil, err
1879 0 : }
1880 1 : prevEnd := i.span.End
1881 1 :
1882 1 : // There might exist additional internal keys with identical bounds encoded
1883 1 : // within the block. Iterate forward, accumulating all the keys with
1884 1 : // identical bounds into i.span.
1885 1 : k, lazyValue = i.blockIter.Next()
1886 1 : internalValue = lazyValue.InPlaceValue()
1887 1 : for k != nil && i.blockIter.cmp(k.UserKey, i.span.Start) == 0 {
1888 1 : if err := i.decodeSpanKeys(k, internalValue); err != nil {
1889 0 : return nil, err
1890 0 : }
1891 :
1892 : // Since k indicates an equal start key, the encoded end key must
1893 : // exactly equal the original end key from the first internal key.
1894 : // Overlapping fragments are required to have exactly equal start and
1895 : // end bounds.
1896 1 : if i.blockIter.cmp(prevEnd, i.span.End) != 0 {
1897 0 : i.span = keyspan.Span{}
1898 0 : return nil, base.CorruptionErrorf("pebble: corrupt keyspan fragmentation")
1899 0 : }
1900 1 : k, lazyValue = i.blockIter.Next()
1901 1 : internalValue = lazyValue.InPlaceValue()
1902 : }
1903 1 : if i.elideSameSeqnum && len(i.span.Keys) > 0 {
1904 1 : i.elideKeysOfSameSeqNum()
1905 1 : }
1906 : // i.blockIter is positioned over the first internal key for the next span.
1907 1 : return &i.span, nil
1908 : }
1909 :
1910 : // gatherBackward gathers internal keys with identical bounds. Keys defined over
1911 : // spans of the keyspace are fragmented such that any overlapping key spans have
1912 : // identical bounds. When these spans are persisted to a range deletion or range
1913 : // key block, they may be persisted as multiple internal keys in order to encode
1914 : // multiple sequence numbers or key kinds.
1915 : //
1916 : // gatherBackward iterates backwards, re-combining the fragmented internal keys
1917 : // to reconstruct a keyspan.Span that holds all the keys defined over the span.
1918 : func (i *fragmentBlockIter) gatherBackward(
1919 : k *InternalKey, lazyValue base.LazyValue,
1920 1 : ) (*keyspan.Span, error) {
1921 1 : i.span = keyspan.Span{}
1922 1 : if k == nil || !i.blockIter.valid() {
1923 1 : return nil, nil
1924 1 : }
1925 : // Use the i.keyBuf array to back the Keys slice to prevent an allocation
1926 : // when a span contains few keys.
1927 1 : i.span.Keys = i.keyBuf[:0]
1928 1 :
1929 1 : // Decode the span's end key and individual keys from the value.
1930 1 : internalValue := lazyValue.InPlaceValue()
1931 1 : if err := i.decodeSpanKeys(k, internalValue); err != nil {
1932 0 : return nil, err
1933 0 : }
1934 1 : prevEnd := i.span.End
1935 1 :
1936 1 : // There might exist additional internal keys with identical bounds encoded
1937 1 : // within the block. Iterate backward, accumulating all the keys with
1938 1 : // identical bounds into i.span.
1939 1 : k, lazyValue = i.blockIter.Prev()
1940 1 : internalValue = lazyValue.InPlaceValue()
1941 1 : for k != nil && i.blockIter.cmp(k.UserKey, i.span.Start) == 0 {
1942 1 : if err := i.decodeSpanKeys(k, internalValue); err != nil {
1943 0 : return nil, err
1944 0 : }
1945 :
1946 : // Since k indicates an equal start key, the encoded end key must
1947 : // exactly equal the original end key from the first internal key.
1948 : // Overlapping fragments are required to have exactly equal start and
1949 : // end bounds.
1950 1 : if i.blockIter.cmp(prevEnd, i.span.End) != 0 {
1951 0 : i.span = keyspan.Span{}
1952 0 : return nil, base.CorruptionErrorf("pebble: corrupt keyspan fragmentation")
1953 0 : }
1954 1 : k, lazyValue = i.blockIter.Prev()
1955 1 : internalValue = lazyValue.InPlaceValue()
1956 : }
1957 : // i.blockIter is positioned over the last internal key for the previous
1958 : // span.
1959 :
1960 : // Backwards iteration encounters internal keys in the wrong order.
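     : 	// (Span.Keys must be ordered by trailer descending, i.e. newest first,
     : 	// but reverse iteration appended them oldest first.)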
1961 1 : keyspan.SortKeysByTrailer(&i.span.Keys)
1962 1 :
1963 1 : if i.elideSameSeqnum && len(i.span.Keys) > 0 {
1964 1 : i.elideKeysOfSameSeqNum()
1965 1 : }
1966 1 : return &i.span, nil
1967 : }
1968 :
1969 : // Close implements (keyspan.FragmentIterator).Close.
1970 1 : func (i *fragmentBlockIter) Close() error {
1971 1 : var err error
1972 1 : if i.closeHook != nil {
1973 0 : err = i.closeHook(i)
1974 0 : }
1975 1 : err = firstError(err, i.blockIter.Close())
1976 1 : return err
1977 : }
1978 :
1979 : // First implements (keyspan.FragmentIterator).First
1980 1 : func (i *fragmentBlockIter) First() (*keyspan.Span, error) {
1981 1 : i.dir = +1
1982 1 : return i.gatherForward(i.blockIter.First())
1983 1 : }
1984 :
1985 : // Last implements (keyspan.FragmentIterator).Last.
1986 1 : func (i *fragmentBlockIter) Last() (*keyspan.Span, error) {
1987 1 : i.dir = -1
1988 1 : return i.gatherBackward(i.blockIter.Last())
1989 1 : }
1990 :
1991 : // Next implements (keyspan.FragmentIterator).Next.
1992 1 : func (i *fragmentBlockIter) Next() (*keyspan.Span, error) {
1993 1 : switch {
1994 1 : case i.dir == -1 && !i.span.Valid():
1995 1 : // Switching directions.
1996 1 : //
1997 1 : // i.blockIter is exhausted, before the first key. Move onto the first.
1998 1 : i.blockIter.First()
1999 1 : i.dir = +1
2000 1 : case i.dir == -1 && i.span.Valid():
2001 1 : // Switching directions.
2002 1 : //
2003 1 : // i.blockIter is currently positioned over the last internal key for
2004 1 : // the previous span. Next it once to move to the first internal key
2005 1 : // that makes up the current span, and gatherForward to land on the
2006 1 : // first internal key making up the next span.
2007 1 : //
2008 1 : // In the diagram below, if the last span returned to the user during
2009 1 : // reverse iteration was [b,c), i.blockIter is currently positioned at
2010 1 : // [a,b). The block iter must be positioned over [d,e) to gather the
2011 1 : // next span's fragments.
2012 1 : //
2013 1 : // ... [a,b) [b,c) [b,c) [b,c) [d,e) ...
2014 1 : // ^ ^
2015 1 : // i.blockIter want
2016 1 : if x, err := i.gatherForward(i.blockIter.Next()); err != nil {
2017 0 : return nil, err
2018 1 : } else if invariants.Enabled && !x.Valid() {
2019 0 : panic("pebble: invariant violation: next entry unexpectedly invalid")
2020 : }
2021 1 : i.dir = +1
2022 : }
2023 : // We know that this blockIter has in-place values.
2024 1 : return i.gatherForward(&i.blockIter.ikey, base.MakeInPlaceValue(i.blockIter.val))
2025 : }
2026 :
2027 : // Prev implements (keyspan.FragmentIterator).Prev.
2028 1 : func (i *fragmentBlockIter) Prev() (*keyspan.Span, error) {
2029 1 : switch {
2030 1 : case i.dir == +1 && !i.span.Valid():
2031 1 : // Switching directions.
2032 1 : //
2033 1 : // i.blockIter is exhausted, after the last key. Move onto the last.
2034 1 : i.blockIter.Last()
2035 1 : i.dir = -1
2036 1 : case i.dir == +1 && i.span.Valid():
2037 1 : // Switching directions.
2038 1 : //
2039 1 : // i.blockIter is currently positioned over the first internal key for
2040 1 : // the next span. Prev it once to move to the last internal key that
2041 1 : // makes up the current span, and gatherBackward to land on the last
2042 1 : // internal key making up the previous span.
2043 1 : //
2044 1 : // In the diagram below, if the last span returned to the user during
2045 1 : // forward iteration was [b,c), i.blockIter is currently positioned at
2046 1 : // [d,e). The block iter must be positioned over [a,b) to gather the
2047 1 : // previous span's fragments.
2048 1 : //
2049 1 : // ... [a,b) [b,c) [b,c) [b,c) [d,e) ...
2050 1 : // ^ ^
2051 1 : // want i.blockIter
2052 1 : if x, err := i.gatherBackward(i.blockIter.Prev()); err != nil {
2053 0 : return nil, err
2054 1 : } else if invariants.Enabled && !x.Valid() {
2055 0 : panic("pebble: invariant violation: previous entry unexpectedly invalid")
2056 : }
2057 1 : i.dir = -1
2058 : }
2059 : // We know that this blockIter has in-place values.
2060 1 : return i.gatherBackward(&i.blockIter.ikey, base.MakeInPlaceValue(i.blockIter.val))
2061 : }
2062 :
2063 : // SeekGE implements (keyspan.FragmentIterator).SeekGE.
2064 1 : func (i *fragmentBlockIter) SeekGE(k []byte) (*keyspan.Span, error) {
2065 1 : if s, err := i.SeekLT(k); err != nil {
2066 0 : return nil, err
2067 1 : } else if s != nil && i.blockIter.cmp(k, s.End) < 0 {
2068 1 : return s, nil
2069 1 : }
2070 : // TODO(jackson): If the above i.SeekLT(k) discovers a span but the span
2071 : // doesn't meet the k < s.End comparison, then there's no need for the
2072 : // SeekLT to gatherBackward.
2073 1 : return i.Next()
2074 : }
2075 :
2076 : // SeekLT implements (keyspan.FragmentIterator).SeekLT.
2077 1 : func (i *fragmentBlockIter) SeekLT(k []byte) (*keyspan.Span, error) {
2078 1 : i.dir = -1
2079 1 : return i.gatherBackward(i.blockIter.SeekLT(k, base.SeekLTFlagsNone))
2080 1 : }
2081 :
2082 : // String implements fmt.Stringer.
2083 0 : func (i *fragmentBlockIter) String() string {
2084 0 : return "fragment-block-iter"
2085 0 : }
2086 :
2087 : // SetCloseHook implements sstable.FragmentIterator.
2088 0 : func (i *fragmentBlockIter) SetCloseHook(fn func(i keyspan.FragmentIterator) error) {
2089 0 : i.closeHook = fn
2090 0 : }
2091 :
2092 : // WrapChildren implements FragmentIterator.
2093 0 : func (i *fragmentBlockIter) WrapChildren(wrap keyspan.WrapFn) {}
|