Line data Source code
1 : // Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package sstable
6 :
7 : import (
8 : "encoding/binary"
9 : "unsafe"
10 :
11 : "github.com/cockroachdb/errors"
12 : "github.com/cockroachdb/pebble/internal/base"
13 : "github.com/cockroachdb/pebble/internal/invariants"
14 : "github.com/cockroachdb/pebble/internal/keyspan"
15 : "github.com/cockroachdb/pebble/internal/manual"
16 : "github.com/cockroachdb/pebble/internal/rangedel"
17 : "github.com/cockroachdb/pebble/internal/rangekey"
18 : )
19 :
20 1 : func uvarintLen(v uint32) int {
21 1 : i := 0
22 1 : for v >= 0x80 {
23 0 : v >>= 7
24 0 : i++
25 0 : }
26 1 : return i + 1
27 : }
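// Illustrative sketch (added for exposition; not part of the original file or
// its coverage data): uvarintLen returns the number of bytes binary.PutUvarint
// would use for a uint32, without writing anything. A hypothetical cross-check:
func checkUvarintLen(v uint32) bool {
	var tmp [binary.MaxVarintLen32]byte
	return uvarintLen(v) == binary.PutUvarint(tmp[:], uint64(v))
}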
28 :
29 : type blockWriter struct {
30 : restartInterval int
31 : nEntries int
32 : nextRestart int
33 : buf []byte
34 : // For datablocks in TableFormatPebblev3, we steal the most significant bit
35 : // in restarts for encoding setHasSameKeyPrefixSinceLastRestart. This leaves
36 : // us with 31 bits, which is more than enough (no one needs > 2GB blocks).
37 : // Typically, restarts occur every 16 keys, and by storing this bit with the
38 : // restart, we can optimize for the case where a user wants to skip to the
39 : // next prefix which happens to be in the same data block, but is > 16 keys
40 : // away. We have seen production situations with 100+ versions per MVCC key
41 : // (which share the same prefix). Additionally, for such writers, the prefix
42 : // compression of the key, which shares bytes with the preceding key, is
43 : // limited to the prefix part of the preceding key -- this ensures that when
44 : // doing NextPrefix (see blockIter) we don't need to assemble the full key
45 : // for each step since by limiting the length of the shared key we are
46 : // ensuring that any of the keys with the same prefix can be used to
47 : // assemble the full key when the prefix does change.
48 : restarts []uint32
49 : // Do not read curKey directly from outside blockWriter since it can have
50 : // the InternalKeyKindSSTableInternalObsoleteBit set. Use getCurKey() or
51 : // getCurUserKey() instead.
52 : curKey []byte
53 : // curValue excludes the optional prefix provided to
54 : // storeWithOptionalValuePrefix.
55 : curValue []byte
56 : prevKey []byte
57 : tmp [4]byte
58 : // We don't know the state of the sets that were at the end of the previous
59 : // block, so this is initially false. It may be true for the second and later
60 : // restarts in a block. Not having inter-block information is fine since we
61 : // will optimize by stepping through restarts only within the same block.
62 : // Note that the first restart is the first key in the block.
63 : setHasSameKeyPrefixSinceLastRestart bool
64 : }
65 :
66 1 : func (w *blockWriter) clear() {
67 1 : *w = blockWriter{
68 1 : buf: w.buf[:0],
69 1 : restarts: w.restarts[:0],
70 1 : curKey: w.curKey[:0],
71 1 : curValue: w.curValue[:0],
72 1 : prevKey: w.prevKey[:0],
73 1 : }
74 1 : }
75 :
76 : // MaximumBlockSize is an extremely generous maximum block size of 256MiB. We
77 : // explicitly place this limit to reserve a few bits in the restart for
78 : // internal use.
79 : const MaximumBlockSize = 1 << 28
80 : const setHasSameKeyPrefixRestartMask uint32 = 1 << 31
81 : const restartMaskLittleEndianHighByteWithoutSetHasSamePrefix byte = 0b0111_1111
82 : const restartMaskLittleEndianHighByteOnlySetHasSamePrefix byte = 0b1000_0000
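// Illustrative sketch (added for exposition; not part of the original file): a
// restart point stores a 31-bit block offset with the
// setHasSameKeyPrefixSinceLastRestart flag in the most significant bit. The
// hypothetical helpers below show the packing; the read path instead uses
// decodeRestart (later in this file), which strips the flag from the high byte
// of the little-endian encoding.
func packRestart(offset uint32, setHasSamePrefix bool) uint32 {
	if setHasSamePrefix {
		return offset | setHasSameKeyPrefixRestartMask
	}
	return offset
}

func unpackRestart(restart uint32) (offset uint32, setHasSamePrefix bool) {
	return restart &^ setHasSameKeyPrefixRestartMask,
		restart&setHasSameKeyPrefixRestartMask != 0
}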
83 :
84 1 : func (w *blockWriter) getCurKey() InternalKey {
85 1 : k := base.DecodeInternalKey(w.curKey)
86 1 : k.Trailer = k.Trailer & trailerObsoleteMask
87 1 : return k
88 1 : }
89 :
90 1 : func (w *blockWriter) getCurUserKey() []byte {
91 1 : n := len(w.curKey) - base.InternalTrailerLen
92 1 : if n < 0 {
93 0 : panic(errors.AssertionFailedf("corrupt key in blockWriter buffer"))
94 : }
95 1 : return w.curKey[:n:n]
96 : }
97 :
98 : // If !addValuePrefix, the valuePrefix is ignored.
99 : func (w *blockWriter) storeWithOptionalValuePrefix(
100 : keySize int,
101 : value []byte,
102 : maxSharedKeyLen int,
103 : addValuePrefix bool,
104 : valuePrefix valuePrefix,
105 : setHasSameKeyPrefix bool,
106 1 : ) {
107 1 : shared := 0
108 1 : if !setHasSameKeyPrefix {
109 1 : w.setHasSameKeyPrefixSinceLastRestart = false
110 1 : }
111 1 : if w.nEntries == w.nextRestart {
112 1 : w.nextRestart = w.nEntries + w.restartInterval
113 1 : restart := uint32(len(w.buf))
114 1 : if w.setHasSameKeyPrefixSinceLastRestart {
115 1 : restart = restart | setHasSameKeyPrefixRestartMask
116 1 : }
117 1 : w.setHasSameKeyPrefixSinceLastRestart = true
118 1 : w.restarts = append(w.restarts, restart)
119 1 : } else {
120 1 : // TODO(peter): Manually inlined version of base.SharedPrefixLen(). This
121 1 : // is 3% faster on BenchmarkWriter on go1.16. Remove if future versions
122 1 : // show this to not be a performance win. For now, functions that use
123 1 : // unsafe cannot be inlined.
124 1 : n := maxSharedKeyLen
125 1 : if n > len(w.prevKey) {
126 1 : n = len(w.prevKey)
127 1 : }
128 1 : asUint64 := func(b []byte, i int) uint64 {
129 1 : return binary.LittleEndian.Uint64(b[i:])
130 1 : }
131 1 : for shared < n-7 && asUint64(w.curKey, shared) == asUint64(w.prevKey, shared) {
132 1 : shared += 8
133 1 : }
134 1 : for shared < n && w.curKey[shared] == w.prevKey[shared] {
135 1 : shared++
136 1 : }
137 : }
138 :
139 1 : lenValuePlusOptionalPrefix := len(value)
140 1 : if addValuePrefix {
141 1 : lenValuePlusOptionalPrefix++
142 1 : }
143 1 : needed := 3*binary.MaxVarintLen32 + len(w.curKey[shared:]) + lenValuePlusOptionalPrefix
144 1 : n := len(w.buf)
145 1 : if cap(w.buf) < n+needed {
146 1 : newCap := 2 * cap(w.buf)
147 1 : if newCap == 0 {
148 1 : newCap = 1024
149 1 : }
150 1 : for newCap < n+needed {
151 1 : newCap *= 2
152 1 : }
153 1 : newBuf := make([]byte, n, newCap)
154 1 : copy(newBuf, w.buf)
155 1 : w.buf = newBuf
156 : }
157 1 : w.buf = w.buf[:n+needed]
158 1 :
159 1 : // TODO(peter): Manually inlined versions of binary.PutUvarint(). This is 15%
160 1 : // faster on BenchmarkWriter on go1.13. Remove if go1.14 or future versions
161 1 : // show this to not be a performance win.
162 1 : {
163 1 : x := uint32(shared)
164 1 : for x >= 0x80 {
165 0 : w.buf[n] = byte(x) | 0x80
166 0 : x >>= 7
167 0 : n++
168 0 : }
169 1 : w.buf[n] = byte(x)
170 1 : n++
171 : }
172 :
173 1 : {
174 1 : x := uint32(keySize - shared)
175 1 : for x >= 0x80 {
176 0 : w.buf[n] = byte(x) | 0x80
177 0 : x >>= 7
178 0 : n++
179 0 : }
180 1 : w.buf[n] = byte(x)
181 1 : n++
182 : }
183 :
184 1 : {
185 1 : x := uint32(lenValuePlusOptionalPrefix)
186 1 : for x >= 0x80 {
187 0 : w.buf[n] = byte(x) | 0x80
188 0 : x >>= 7
189 0 : n++
190 0 : }
191 1 : w.buf[n] = byte(x)
192 1 : n++
193 : }
194 :
195 1 : n += copy(w.buf[n:], w.curKey[shared:])
196 1 : if addValuePrefix {
197 1 : w.buf[n : n+1][0] = byte(valuePrefix)
198 1 : n++
199 1 : }
200 1 : n += copy(w.buf[n:], value)
201 1 : w.buf = w.buf[:n]
202 1 :
203 1 : w.curValue = w.buf[n-len(value):]
204 1 :
205 1 : w.nEntries++
206 : }
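// Illustrative sketch (added for exposition; not part of the original file):
// each entry emitted by storeWithOptionalValuePrefix is three uvarints (shared
// key length, unshared key length, value length including any 1-byte value
// prefix) followed by the unshared key suffix and the value bytes. The
// hypothetical decoder below shows that layout using encoding/binary; the real
// read path in readEntry manually inlines the varint decoding for speed.
func decodeEntrySketch(buf, prevKey []byte) (key, value []byte, n int) {
	shared, n1 := binary.Uvarint(buf)
	unshared, n2 := binary.Uvarint(buf[n1:])
	valueLen, n3 := binary.Uvarint(buf[n1+n2:])
	n = n1 + n2 + n3
	key = append(append(key, prevKey[:shared]...), buf[n:n+int(unshared)]...)
	n += int(unshared)
	// For TableFormatPebblev3 SETs, the first value byte is the valuePrefix.
	value = buf[n : n+int(valueLen)]
	n += int(valueLen)
	return key, value, n
}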
207 :
208 1 : func (w *blockWriter) add(key InternalKey, value []byte) {
209 1 : w.addWithOptionalValuePrefix(
210 1 : key, false, value, len(key.UserKey), false, 0, false)
211 1 : }
212 :
213 : // Callers that always set addValuePrefix to false should use add() instead.
214 : //
215 : // isObsolete indicates whether this key-value pair is obsolete in this
216 : // sstable (only applicable when writing data blocks) -- see the comment in
217 : // table.go and the longer one in format.go. addValuePrefix adds a 1 byte
218 : // prefix to the value, specified in valuePrefix -- this is used for data
219 : // blocks in TableFormatPebblev3 onwards for SETs (see the comment in
220 : // format.go, with more details in value_block.go). setHasSameKeyPrefix is
221 : // also used in TableFormatPebblev3 onwards for SETs.
222 : func (w *blockWriter) addWithOptionalValuePrefix(
223 : key InternalKey,
224 : isObsolete bool,
225 : value []byte,
226 : maxSharedKeyLen int,
227 : addValuePrefix bool,
228 : valuePrefix valuePrefix,
229 : setHasSameKeyPrefix bool,
230 1 : ) {
231 1 : w.curKey, w.prevKey = w.prevKey, w.curKey
232 1 :
233 1 : size := key.Size()
234 1 : if cap(w.curKey) < size {
235 1 : w.curKey = make([]byte, 0, size*2)
236 1 : }
237 1 : w.curKey = w.curKey[:size]
238 1 : if isObsolete {
239 1 : key.Trailer = key.Trailer | trailerObsoleteBit
240 1 : }
241 1 : key.Encode(w.curKey)
242 1 :
243 1 : w.storeWithOptionalValuePrefix(
244 1 : size, value, maxSharedKeyLen, addValuePrefix, valuePrefix, setHasSameKeyPrefix)
245 : }
246 :
247 1 : func (w *blockWriter) finish() []byte {
248 1 : // Write the restart points to the buffer.
249 1 : if w.nEntries == 0 {
250 1 : // Every block must have at least one restart point.
251 1 : if cap(w.restarts) > 0 {
252 1 : w.restarts = w.restarts[:1]
253 1 : w.restarts[0] = 0
254 1 : } else {
255 1 : w.restarts = append(w.restarts, 0)
256 1 : }
257 : }
258 1 : tmp4 := w.tmp[:4]
259 1 : for _, x := range w.restarts {
260 1 : binary.LittleEndian.PutUint32(tmp4, x)
261 1 : w.buf = append(w.buf, tmp4...)
262 1 : }
263 1 : binary.LittleEndian.PutUint32(tmp4, uint32(len(w.restarts)))
264 1 : w.buf = append(w.buf, tmp4...)
265 1 : result := w.buf
266 1 :
267 1 : // Reset the block state.
268 1 : w.nEntries = 0
269 1 : w.nextRestart = 0
270 1 : w.buf = w.buf[:0]
271 1 : w.restarts = w.restarts[:0]
272 1 : return result
273 : }
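// Illustrative sketch (added for exposition; not part of the original file): a
// finished block is the concatenated entries, then each restart offset as a
// 4-byte little-endian uint32, then a trailing uint32 with the number of
// restart points. The hypothetical helper below recovers the raw restart
// offsets, mirroring the arithmetic in blockIter.init; note that for
// TableFormatPebblev3 data blocks the top bit of each offset carries the
// set-has-same-prefix flag, which decodeRestart strips.
func restartOffsetsSketch(b []byte) []uint32 {
	numRestarts := binary.LittleEndian.Uint32(b[len(b)-4:])
	restartsStart := len(b) - 4*int(1+numRestarts)
	offsets := make([]uint32, numRestarts)
	for j := range offsets {
		offsets[j] = binary.LittleEndian.Uint32(b[restartsStart+4*j:])
	}
	return offsets
}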
274 :
275 : // emptyBlockSize holds the size of an empty block. Every block ends
276 : // in a uint32 trailer encoding the number of restart points within the
277 : // block.
278 : const emptyBlockSize = 4
279 :
280 1 : func (w *blockWriter) estimatedSize() int {
281 1 : return len(w.buf) + 4*len(w.restarts) + emptyBlockSize
282 1 : }
283 :
284 : type blockEntry struct {
285 : offset int32
286 : keyStart int32
287 : keyEnd int32
288 : valStart int32
289 : valSize int32
290 : }
291 :
292 : // blockIter is an iterator over a single block of data.
293 : //
294 : // A blockIter provides an additional guarantee around key stability when a
295 : // block has a restart interval of 1 (i.e. when there is no prefix
296 : // compression). Key stability refers to whether the InternalKey.UserKey bytes
297 : // returned by a positioning call will remain stable after a subsequent
298 : // positioning call. The normal case is that a positioning call will invalidate
299 : // any previously returned InternalKey.UserKey. If a block has a restart
300 : // interval of 1 (no prefix compression), blockIter guarantees that
301 : // InternalKey.UserKey will point to the key as stored in the block itself
302 : // which will remain valid until the blockIter is closed. The key stability
303 : // guarantee is used by the range tombstone and range key code, which knows that
304 : // the respective blocks are always encoded with a restart interval of 1. This
305 : // per-block key stability guarantee is sufficient for range tombstones and
306 : // range keys as they are always encoded in a single block.
307 : //
308 : // A blockIter also provides a value stability guarantee for range deletions and
309 : // range keys since there is only a single range deletion and range key block
310 : // per sstable and the blockIter will not release the bytes for the block until
311 : // it is closed.
312 : //
313 : // Note on why blockIter knows about lazyValueHandling:
314 : //
315 : // blockIter's positioning functions (that return a LazyValue) are too
316 : // complex to inline even prior to lazyValueHandling. blockIter.Next and
317 : // blockIter.First were by far the cheapest and had costs 195 and 180
318 : // respectively, which exceed the budget of 80. We initially tried to keep
319 : // the lazyValueHandling logic out of blockIter by wrapping it with a
320 : // lazyValueDataBlockIter. singleLevelIter and twoLevelIter would use this
321 : // wrapped iter. The functions in lazyValueDataBlockIter were simple, in that
322 : // they called the corresponding blockIter func and then decided whether the
323 : // value was in fact in-place (so return immediately) or needed further
324 : // handling. But these also turned out too costly for mid-stack inlining since
325 : // simple calls like the following have a high cost that is barely under the
326 : // budget of 80
327 : //
328 : // k, v := i.data.SeekGE(key, flags) // cost 74
329 : // k, v := i.data.Next() // cost 72
330 : //
331 : // We have 2 options for minimizing performance regressions:
332 : // - Include the lazyValueHandling logic in the already non-inlineable
333 : // blockIter functions: Since most of the time is spent in data block iters,
334 : // it is acceptable to take the small hit of unnecessary branching (which
335 : // hopefully branch prediction will predict correctly) for other kinds of
336 : // blocks.
337 : // - Duplicate the logic of singleLevelIterator and twoLevelIterator for the
338 : // v3 sstable and only use the aforementioned lazyValueDataBlockIter for a
339 : // v3 sstable. We would want to manage these copies via code generation.
340 : //
341 : // We have picked the first option here.
342 : type blockIter struct {
343 : cmp Compare
344 : // offset is the byte index that marks where the current key/value is
345 : // encoded in the block.
346 : offset int32
347 : // nextOffset is the byte index where the next key/value is encoded in the
348 : // block.
349 : nextOffset int32
350 : // A "restart point" in a block is a point where the full key is encoded,
351 : // instead of just having a suffix of the key encoded. See readEntry() for
352 : // how prefix compression of keys works. Keys in between two restart points
353 : // only have a suffix encoded in the block. When restart interval is 1, no
354 : // prefix compression of keys happens. This is the case with range tombstone
355 : // blocks.
356 : //
357 : // All restart offsets are listed in increasing order in
358 : // i.ptr[i.restarts:len(block)-4], while numRestarts is encoded in the last
359 : // 4 bytes of the block as a uint32 (i.ptr[len(block)-4:]). i.restarts can
360 : // therefore be seen as the point where data in the block ends, and a list
361 : // of offsets of all restart points begins.
362 : restarts int32
363 : // Number of restart points in this block. Encoded at the end of the block
364 : // as a uint32.
365 : numRestarts int32
366 : globalSeqNum uint64
367 : ptr unsafe.Pointer
368 : data []byte
369 : // key contains the raw key the iterator is currently pointed at. This may
370 : // point directly to data stored in the block (for a key which has no prefix
371 : // compression), to fullKey (for a prefix compressed key), or to a slice of
372 : // data stored in cachedBuf (during reverse iteration).
373 : key []byte
374 : // fullKey is a buffer used for key prefix decompression.
375 : fullKey []byte
376 : // val contains the value the iterator is currently pointed at. If non-nil,
377 : // this points to a slice of the block data.
378 : val []byte
379 : // lazyValue is val turned into a LazyValue, whenever a positioning method
380 : // returns a non-nil key-value pair.
381 : lazyValue base.LazyValue
382 : // ikey contains the decoded InternalKey the iterator is currently pointed
383 : // at. Note that the memory backing ikey.UserKey is either data stored
384 : // directly in the block, fullKey, or cachedBuf. The key stability guarantee
385 : // for blocks built with a restart interval of 1 is achieved by having
386 : // ikey.UserKey always point to data stored directly in the block.
387 : ikey InternalKey
388 : // cached and cachedBuf are used during reverse iteration. They are needed
389 : // because we can't perform prefix decoding in reverse, only in the forward
390 : // direction. In order to iterate in reverse, we decode and cache the entries
391 : // between two restart points.
392 : //
393 : // Note that cached[len(cached)-1] contains the previous entry to the one the
394 : // blockIter is currently pointed at. As usual, nextOffset will contain the
395 : // offset of the next entry. During reverse iteration, nextOffset will be
396 : // updated to point to offset, and we'll set the blockIter to point at the
397 : // entry cached[len(cached)-1]. See Prev() for more details.
398 : //
399 : // For a block encoded with a restart interval of 1, cached and cachedBuf
400 : // will not be used as there are no prefix compressed entries between the
401 : // restart points.
402 : cached []blockEntry
403 : cachedBuf []byte
404 : handle bufferHandle
405 : // firstUserKey holds the block's first user key, for block iteration over already loaded blocks.
406 : firstUserKey []byte
407 : lazyValueHandling struct {
408 : vbr *valueBlockReader
409 : hasValuePrefix bool
410 : }
411 : hideObsoletePoints bool
412 : }
413 :
414 : // blockIter implements the base.InternalIterator interface.
415 : var _ base.InternalIterator = (*blockIter)(nil)
416 :
417 1 : func newBlockIter(cmp Compare, block block) (*blockIter, error) {
418 1 : i := &blockIter{}
419 1 : return i, i.init(cmp, block, 0, false)
420 1 : }
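// Illustrative sketch (added for exposition; not part of the original file): a
// minimal round trip through blockWriter and blockIter, assuming
// base.MakeInternalKey and base.DefaultComparer behave as in pebble's
// internal/base package.
func blockRoundTripSketch() {
	w := &blockWriter{restartInterval: 16}
	w.add(base.MakeInternalKey([]byte("a"), 1, InternalKeyKindSet), []byte("va"))
	w.add(base.MakeInternalKey([]byte("b"), 1, InternalKeyKindSet), []byte("vb"))
	it, err := newBlockIter(base.DefaultComparer.Compare, w.finish())
	if err != nil {
		panic(err)
	}
	for k, _ := it.First(); k != nil; k, _ = it.Next() {
		_ = k.UserKey // "a", then "b"
	}
}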
421 :
422 0 : func (i *blockIter) String() string {
423 0 : return "block"
424 0 : }
425 :
426 : func (i *blockIter) init(
427 : cmp Compare, block block, globalSeqNum uint64, hideObsoletePoints bool,
428 1 : ) error {
429 1 : numRestarts := int32(binary.LittleEndian.Uint32(block[len(block)-4:]))
430 1 : if numRestarts == 0 {
431 0 : return base.CorruptionErrorf("pebble/table: invalid table (block has no restart points)")
432 0 : }
433 1 : i.cmp = cmp
434 1 : i.restarts = int32(len(block)) - 4*(1+numRestarts)
435 1 : i.numRestarts = numRestarts
436 1 : i.globalSeqNum = globalSeqNum
437 1 : i.ptr = unsafe.Pointer(&block[0])
438 1 : i.data = block
439 1 : i.fullKey = i.fullKey[:0]
440 1 : i.val = nil
441 1 : i.hideObsoletePoints = hideObsoletePoints
442 1 : i.clearCache()
443 1 : if i.restarts > 0 {
444 1 : if err := i.readFirstKey(); err != nil {
445 0 : return err
446 0 : }
447 1 : } else {
448 1 : // Block is empty.
449 1 : i.firstUserKey = nil
450 1 : }
451 1 : return nil
452 : }
453 :
454 : // NB: two cases of hideObsoletePoints:
455 : // - Local sstable iteration: globalSeqNum will be set iff the sstable was
456 : // ingested.
457 : // - Foreign sstable iteration: globalSeqNum is always set.
458 : func (i *blockIter) initHandle(
459 : cmp Compare, block bufferHandle, globalSeqNum uint64, hideObsoletePoints bool,
460 1 : ) error {
461 1 : i.handle.Release()
462 1 : i.handle = block
463 1 : return i.init(cmp, block.Get(), globalSeqNum, hideObsoletePoints)
464 1 : }
465 :
466 1 : func (i *blockIter) invalidate() {
467 1 : i.clearCache()
468 1 : i.offset = 0
469 1 : i.nextOffset = 0
470 1 : i.restarts = 0
471 1 : i.numRestarts = 0
472 1 : i.data = nil
473 1 : }
474 :
475 : // isDataInvalidated returns true when the blockIter has been invalidated
476 : // using an invalidate call. NB: this is different from blockIter.Valid
477 : // which is part of the InternalIterator implementation.
478 1 : func (i *blockIter) isDataInvalidated() bool {
479 1 : return i.data == nil
480 1 : }
481 :
482 1 : func (i *blockIter) resetForReuse() blockIter {
483 1 : return blockIter{
484 1 : fullKey: i.fullKey[:0],
485 1 : cached: i.cached[:0],
486 1 : cachedBuf: i.cachedBuf[:0],
487 1 : data: nil,
488 1 : }
489 1 : }
490 :
491 1 : func (i *blockIter) readEntry() {
492 1 : ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(i.offset))
493 1 :
494 1 : // This is an ugly performance hack. Reading entries from blocks is one of
495 1 : // the inner-most routines and decoding the 3 varints per-entry takes
496 1 : // significant time. Neither go1.11 nor go1.12 will inline decodeVarint for
497 1 : // us, so we do it manually. This provides a 10-15% performance improvement
498 1 : // on blockIter benchmarks on both go1.11 and go1.12.
499 1 : //
500 1 : // TODO(peter): remove this hack if go:inline is ever supported.
501 1 :
502 1 : var shared uint32
503 1 : if a := *((*uint8)(ptr)); a < 128 {
504 1 : shared = uint32(a)
505 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
506 1 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
507 0 : shared = uint32(b)<<7 | uint32(a)
508 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
509 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
510 0 : shared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
511 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
512 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
513 0 : shared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
514 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
515 0 : } else {
516 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
517 0 : shared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
518 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
519 0 : }
520 :
521 1 : var unshared uint32
522 1 : if a := *((*uint8)(ptr)); a < 128 {
523 1 : unshared = uint32(a)
524 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
525 1 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
526 0 : unshared = uint32(b)<<7 | uint32(a)
527 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
528 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
529 0 : unshared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
530 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
531 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
532 0 : unshared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
533 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
534 0 : } else {
535 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
536 0 : unshared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
537 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
538 0 : }
539 :
540 1 : var value uint32
541 1 : if a := *((*uint8)(ptr)); a < 128 {
542 1 : value = uint32(a)
543 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
544 1 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
545 0 : value = uint32(b)<<7 | uint32(a)
546 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
547 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
548 0 : value = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
549 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
550 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
551 0 : value = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
552 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
553 0 : } else {
554 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
555 0 : value = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
556 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
557 0 : }
558 :
559 1 : unsharedKey := getBytes(ptr, int(unshared))
560 1 : // TODO(sumeer): move this into the else block below.
561 1 : i.fullKey = append(i.fullKey[:shared], unsharedKey...)
562 1 : if shared == 0 {
563 1 : // Provide stability for the key across positioning calls if the key
564 1 : // doesn't share a prefix with the previous key. This removes requiring the
565 1 : // key to be copied if the caller knows the block has a restart interval of
566 1 : // 1. An important example of this is range-del blocks.
567 1 : i.key = unsharedKey
568 1 : } else {
569 1 : i.key = i.fullKey
570 1 : }
571 1 : ptr = unsafe.Pointer(uintptr(ptr) + uintptr(unshared))
572 1 : i.val = getBytes(ptr, int(value))
573 1 : i.nextOffset = int32(uintptr(ptr)-uintptr(i.ptr)) + int32(value)
574 : }
575 :
576 1 : func (i *blockIter) readFirstKey() error {
577 1 : ptr := i.ptr
578 1 :
579 1 : // This is an ugly performance hack. Reading entries from blocks is one of
580 1 : // the inner-most routines and decoding the 3 varints per-entry takes
581 1 : // significant time. Neither go1.11 nor go1.12 will inline decodeVarint for
582 1 : // us, so we do it manually. This provides a 10-15% performance improvement
583 1 : // on blockIter benchmarks on both go1.11 and go1.12.
584 1 : //
585 1 : // TODO(peter): remove this hack if go:inline is ever supported.
586 1 :
587 1 : if shared := *((*uint8)(ptr)); shared == 0 {
588 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
589 1 : } else {
590 0 : // The shared length is != 0, which is invalid.
591 0 : panic("first key in block must have zero shared length")
592 : }
593 :
594 1 : var unshared uint32
595 1 : if a := *((*uint8)(ptr)); a < 128 {
596 1 : unshared = uint32(a)
597 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
598 1 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
599 0 : unshared = uint32(b)<<7 | uint32(a)
600 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
601 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
602 0 : unshared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
603 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
604 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
605 0 : unshared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
606 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
607 0 : } else {
608 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
609 0 : unshared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
610 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
611 0 : }
612 :
613 : // Skip the value length.
614 1 : if a := *((*uint8)(ptr)); a < 128 {
615 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
616 1 : } else if a := *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); a < 128 {
617 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
618 0 : } else if a := *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); a < 128 {
619 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
620 0 : } else if a := *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); a < 128 {
621 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
622 0 : } else {
623 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
624 0 : }
625 :
626 1 : firstKey := getBytes(ptr, int(unshared))
627 1 : // Manually inlining base.DecodeInternalKey provides a 5-10% speedup on
628 1 : // BlockIter benchmarks.
629 1 : if n := len(firstKey) - 8; n >= 0 {
630 1 : i.firstUserKey = firstKey[:n:n]
631 1 : } else {
632 0 : i.firstUserKey = nil
633 0 : return base.CorruptionErrorf("pebble/table: invalid firstKey in block")
634 0 : }
635 1 : return nil
636 : }
637 :
638 : // The sstable internal obsolete bit is set when writing a block and unset by
639 : // blockIter, so no code outside block writing/reading code ever sees it.
640 : const trailerObsoleteBit = uint64(base.InternalKeyKindSSTableInternalObsoleteBit)
641 : const trailerObsoleteMask = (InternalKeySeqNumMax << 8) | uint64(base.InternalKeyKindSSTableInternalObsoleteMask)
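// Illustrative sketch (added for exposition; not part of the original file):
// clearing the obsolete bit with trailerObsoleteMask recovers the trailer that
// callers outside the block read/write path expect to see.
func stripObsoleteBitSketch(trailer uint64) (cleaned uint64, obsolete bool) {
	return trailer & trailerObsoleteMask, trailer&trailerObsoleteBit != 0
}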
642 :
643 1 : func (i *blockIter) decodeInternalKey(key []byte) (hiddenPoint bool) {
644 1 : // Manually inlining base.DecodeInternalKey provides a 5-10% speedup on
645 1 : // BlockIter benchmarks.
646 1 : if n := len(key) - 8; n >= 0 {
647 1 : trailer := binary.LittleEndian.Uint64(key[n:])
648 1 : hiddenPoint = i.hideObsoletePoints &&
649 1 : (trailer&trailerObsoleteBit != 0)
650 1 : i.ikey.Trailer = trailer & trailerObsoleteMask
651 1 : i.ikey.UserKey = key[:n:n]
652 1 : if i.globalSeqNum != 0 {
653 1 : i.ikey.SetSeqNum(i.globalSeqNum)
654 1 : }
655 1 : } else {
656 1 : i.ikey.Trailer = uint64(InternalKeyKindInvalid)
657 1 : i.ikey.UserKey = nil
658 1 : }
659 1 : return hiddenPoint
660 : }
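// Illustrative sketch (added for exposition; not part of the original file):
// an encoded InternalKey is the user key followed by an 8-byte little-endian
// trailer packing (seqNum << 8 | kind), which is what decodeInternalKey above
// splits apart. Assumes len(encoded) >= 8.
func splitInternalKeySketch(encoded []byte) (userKey []byte, seqNum uint64, kind InternalKeyKind) {
	n := len(encoded) - 8
	trailer := binary.LittleEndian.Uint64(encoded[n:])
	return encoded[:n:n], trailer >> 8, InternalKeyKind(trailer & 0xff)
}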
661 :
662 1 : func (i *blockIter) clearCache() {
663 1 : i.cached = i.cached[:0]
664 1 : i.cachedBuf = i.cachedBuf[:0]
665 1 : }
666 :
667 1 : func (i *blockIter) cacheEntry() {
668 1 : var valStart int32
669 1 : valSize := int32(len(i.val))
670 1 : if valSize > 0 {
671 1 : valStart = int32(uintptr(unsafe.Pointer(&i.val[0])) - uintptr(i.ptr))
672 1 : }
673 :
674 1 : i.cached = append(i.cached, blockEntry{
675 1 : offset: i.offset,
676 1 : keyStart: int32(len(i.cachedBuf)),
677 1 : keyEnd: int32(len(i.cachedBuf) + len(i.key)),
678 1 : valStart: valStart,
679 1 : valSize: valSize,
680 1 : })
681 1 : i.cachedBuf = append(i.cachedBuf, i.key...)
682 : }
683 :
684 1 : func (i *blockIter) getFirstUserKey() []byte {
685 1 : return i.firstUserKey
686 1 : }
687 :
688 : // SeekGE implements internalIterator.SeekGE, as documented in the pebble
689 : // package.
690 1 : func (i *blockIter) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, base.LazyValue) {
691 1 : if invariants.Enabled && i.isDataInvalidated() {
692 0 : panic(errors.AssertionFailedf("invalidated blockIter used"))
693 : }
694 :
695 1 : i.clearCache()
696 1 : // Find the index of the smallest restart point whose key is > the key
697 1 : // sought; index will be numRestarts if there is no such restart point.
698 1 : i.offset = 0
699 1 : var index int32
700 1 :
701 1 : {
702 1 : // NB: manually inlined sort.Search is ~5% faster.
703 1 : //
704 1 : // Define f(-1) == false and f(n) == true.
705 1 : // Invariant: f(index-1) == false, f(upper) == true.
706 1 : upper := i.numRestarts
707 1 : for index < upper {
708 1 : h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
709 1 : // index ≤ h < upper
710 1 : offset := decodeRestart(i.data[i.restarts+4*h:])
711 1 : // For a restart point, there are 0 bytes shared with the previous key.
712 1 : // The varint encoding of 0 occupies 1 byte.
713 1 : ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(offset+1))
714 1 :
715 1 : // Decode the key at that restart point, and compare it to the key
716 1 : // sought. See the comment in readEntry for why we manually inline the
717 1 : // varint decoding.
718 1 : var v1 uint32
719 1 : if a := *((*uint8)(ptr)); a < 128 {
720 1 : v1 = uint32(a)
721 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
722 1 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
723 0 : v1 = uint32(b)<<7 | uint32(a)
724 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
725 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
726 0 : v1 = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
727 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
728 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
729 0 : v1 = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
730 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
731 0 : } else {
732 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
733 0 : v1 = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
734 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
735 0 : }
736 :
737 1 : if *((*uint8)(ptr)) < 128 {
738 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
739 1 : } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))) < 128 {
740 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
741 0 : } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))) < 128 {
742 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
743 0 : } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))) < 128 {
744 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
745 0 : } else {
746 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
747 0 : }
748 :
749 : // Manually inlining part of base.DecodeInternalKey provides a 5-10%
750 : // speedup on BlockIter benchmarks.
751 1 : s := getBytes(ptr, int(v1))
752 1 : var k []byte
753 1 : if n := len(s) - 8; n >= 0 {
754 1 : k = s[:n:n]
755 1 : }
756 : // Else k is invalid, and left as nil
757 :
758 1 : if i.cmp(key, k) > 0 {
759 1 : // The search key is greater than the user key at this restart point.
760 1 : // Search beyond this restart point, since we are trying to find the
761 1 : // first restart point with a user key >= the search key.
762 1 : index = h + 1 // preserves f(i-1) == false
763 1 : } else {
764 1 : // k >= search key, so prune everything after index (since index
765 1 : // satisfies the property we are looking for).
766 1 : upper = h // preserves f(j) == true
767 1 : }
768 : }
769 : // index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
770 : // => answer is index.
771 : }
772 :
773 : // index is the first restart point with key >= search key. Define the keys
774 : // between a restart point and the next restart point as belonging to that
775 : // restart point.
776 : //
777 : // Since keys are strictly increasing, if index > 0 then the restart point
778 : // at index-1 will be the first one that has some keys belonging to it that
779 : // could be equal to the search key. If index == 0, then all keys in this
780 : // block are larger than the key sought, and offset remains at zero.
781 1 : if index > 0 {
782 1 : i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
783 1 : }
784 1 : i.readEntry()
785 1 : hiddenPoint := i.decodeInternalKey(i.key)
786 1 :
787 1 : // Iterate from that restart point to somewhere >= the key sought.
788 1 : if !i.valid() {
789 0 : return nil, base.LazyValue{}
790 0 : }
791 1 : if !hiddenPoint && i.cmp(i.ikey.UserKey, key) >= 0 {
792 1 : // Initialize i.lazyValue
793 1 : if !i.lazyValueHandling.hasValuePrefix ||
794 1 : base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
795 1 : i.lazyValue = base.MakeInPlaceValue(i.val)
796 1 : } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
797 1 : i.lazyValue = base.MakeInPlaceValue(i.val[1:])
798 1 : } else {
799 1 : i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
800 1 : }
801 1 : return &i.ikey, i.lazyValue
802 : }
803 1 : for i.Next(); i.valid(); i.Next() {
804 1 : if i.cmp(i.ikey.UserKey, key) >= 0 {
805 1 : // i.Next() has already initialized i.lazyValue.
806 1 : return &i.ikey, i.lazyValue
807 1 : }
808 : }
809 1 : return nil, base.LazyValue{}
810 : }
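// Illustrative sketch (added for exposition; not part of the original file):
// the manually inlined binary search in SeekGE is equivalent to this
// sort.Search-style loop over restart points, returning the first restart
// whose user key is >= the sought key. restartUserKey is a hypothetical
// stand-in for the inlined varint/key decoding above.
func seekRestartSketch(
	cmp Compare, numRestarts int32, key []byte, restartUserKey func(h int32) []byte,
) int32 {
	index, upper := int32(0), numRestarts
	for index < upper {
		h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
		if cmp(key, restartUserKey(h)) > 0 {
			index = h + 1 // restart key < search key
		} else {
			upper = h // restart key >= search key
		}
	}
	return index
}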
811 :
812 : // SeekPrefixGE implements internalIterator.SeekPrefixGE, as documented in the
813 : // pebble package.
814 : func (i *blockIter) SeekPrefixGE(
815 : prefix, key []byte, flags base.SeekGEFlags,
816 0 : ) (*base.InternalKey, base.LazyValue) {
817 0 : // This should never be called as prefix iteration is handled by sstable.Iterator.
818 0 : panic("pebble: SeekPrefixGE unimplemented")
819 : }
820 :
821 : // SeekLT implements internalIterator.SeekLT, as documented in the pebble
822 : // package.
823 1 : func (i *blockIter) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, base.LazyValue) {
824 1 : if invariants.Enabled && i.isDataInvalidated() {
825 0 : panic(errors.AssertionFailedf("invalidated blockIter used"))
826 : }
827 :
828 1 : i.clearCache()
829 1 : // Find the index of the smallest restart point whose key is >= the key
830 1 : // sought; index will be numRestarts if there is no such restart point.
831 1 : i.offset = 0
832 1 : var index int32
833 1 :
834 1 : {
835 1 : // NB: manually inlined sort.Search is ~5% faster.
836 1 : //
837 1 : // Define f(-1) == false and f(n) == true.
838 1 : // Invariant: f(index-1) == false, f(upper) == true.
839 1 : upper := i.numRestarts
840 1 : for index < upper {
841 1 : h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
842 1 : // index ≤ h < upper
843 1 : offset := decodeRestart(i.data[i.restarts+4*h:])
844 1 : // For a restart point, there are 0 bytes shared with the previous key.
845 1 : // The varint encoding of 0 occupies 1 byte.
846 1 : ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(offset+1))
847 1 :
848 1 : // Decode the key at that restart point, and compare it to the key
849 1 : // sought. See the comment in readEntry for why we manually inline the
850 1 : // varint decoding.
851 1 : var v1 uint32
852 1 : if a := *((*uint8)(ptr)); a < 128 {
853 1 : v1 = uint32(a)
854 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
855 1 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
856 0 : v1 = uint32(b)<<7 | uint32(a)
857 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
858 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
859 0 : v1 = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
860 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
861 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
862 0 : v1 = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
863 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
864 0 : } else {
865 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
866 0 : v1 = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
867 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
868 0 : }
869 :
870 1 : if *((*uint8)(ptr)) < 128 {
871 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
872 1 : } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))) < 128 {
873 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
874 0 : } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))) < 128 {
875 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
876 0 : } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))) < 128 {
877 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
878 0 : } else {
879 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
880 0 : }
881 :
882 : // Manually inlining part of base.DecodeInternalKey provides a 5-10%
883 : // speedup on BlockIter benchmarks.
884 1 : s := getBytes(ptr, int(v1))
885 1 : var k []byte
886 1 : if n := len(s) - 8; n >= 0 {
887 1 : k = s[:n:n]
888 1 : }
889 : // Else k is invalid, and left as nil
890 :
891 1 : if i.cmp(key, k) > 0 {
892 1 : // The search key is greater than the user key at this restart point.
893 1 : // Search beyond this restart point, since we are trying to find the
894 1 : // first restart point with a user key >= the search key.
895 1 : index = h + 1 // preserves f(i-1) == false
896 1 : } else {
897 1 : // k >= search key, so prune everything after index (since index
898 1 : // satisfies the property we are looking for).
899 1 : upper = h // preserves f(j) == true
900 1 : }
901 : }
902 : // index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
903 : // => answer is index.
904 : }
905 :
906 : // index is the first restart point with key >= search key. Define the keys
907 : // between a restart point and the next restart point as belonging to that
908 : // restart point. Note that index could be equal to i.numRestarts, i.e., we
909 : // are past the last restart.
910 : //
911 : // Since keys are strictly increasing, if index > 0 then the restart point
912 : // at index-1 will be the first one that has some keys belonging to it that
913 : // are less than the search key. If index == 0, then all keys in this block
914 : // are larger than the search key, so there is no match.
915 1 : targetOffset := i.restarts
916 1 : if index > 0 {
917 1 : i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
918 1 : if index < i.numRestarts {
919 1 : targetOffset = decodeRestart(i.data[i.restarts+4*(index):])
920 1 : }
921 1 : } else if index == 0 {
922 1 : // If index == 0 then all keys in this block are larger than the key
923 1 : // sought.
924 1 : i.offset = -1
925 1 : i.nextOffset = 0
926 1 : return nil, base.LazyValue{}
927 1 : }
928 :
929 : // Iterate from that restart point to somewhere >= the key sought, then back
930 : // up to the previous entry. The expectation is that we'll be performing
931 : // reverse iteration, so we cache the entries as we advance forward.
932 1 : i.nextOffset = i.offset
933 1 :
934 1 : for {
935 1 : i.offset = i.nextOffset
936 1 : i.readEntry()
937 1 : // When hidden keys are common, there is additional optimization possible
938 1 : // by not caching entries that are hidden (note that some calls to
939 1 : // cacheEntry don't decode the internal key before caching, but checking
940 1 : // whether a key is hidden does not require full decoding). However, we do
941 1 : // need to use the blockEntry.offset in the cache for the first entry at
942 1 : // the reset point to do the binary search when the cache is empty -- so
943 1 : // we would need to cache that first entry (though not the key) even if
944 1 : // was hidden. Our current assumption is that if there are large numbers
945 1 : // of hidden keys we will be able to skip whole blocks (using block
946 1 : // property filters) so we don't bother optimizing.
947 1 : hiddenPoint := i.decodeInternalKey(i.key)
948 1 :
949 1 : // NB: we don't use the hiddenPoint return value of decodeInternalKey
950 1 : // since we want to stop as soon as we reach a key >= ikey.UserKey, so
951 1 : // that we can reverse.
952 1 : if i.cmp(i.ikey.UserKey, key) >= 0 {
953 1 : // The current key is greater than or equal to our search key. Back up to
954 1 : // the previous key which was less than our search key. Note that this for
955 1 : // loop will execute at least once with this if-block not being true, so
956 1 : // the key we are backing up to is the last one this loop cached.
957 1 : return i.Prev()
958 1 : }
959 :
960 1 : if i.nextOffset >= targetOffset {
961 1 : // We've reached the end of the current restart block. Return the
962 1 : // current key if not hidden, else call Prev().
963 1 : //
964 1 : // When the restart interval is 1, the first iteration of the for loop
965 1 : // will bring us here. In that case ikey is backed by the block so we
966 1 : // get the desired key stability guarantee for the lifetime of the
967 1 : // blockIter. That is, we never cache anything and therefore never
968 1 : // return a key backed by cachedBuf.
969 1 : if hiddenPoint {
970 1 : return i.Prev()
971 1 : }
972 1 : break
973 : }
974 :
975 1 : i.cacheEntry()
976 : }
977 :
978 1 : if !i.valid() {
979 1 : return nil, base.LazyValue{}
980 1 : }
981 1 : if !i.lazyValueHandling.hasValuePrefix ||
982 1 : base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
983 1 : i.lazyValue = base.MakeInPlaceValue(i.val)
984 1 : } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
985 1 : i.lazyValue = base.MakeInPlaceValue(i.val[1:])
986 1 : } else {
987 1 : i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
988 1 : }
989 1 : return &i.ikey, i.lazyValue
990 : }
991 :
992 : // First implements internalIterator.First, as documented in the pebble
993 : // package.
994 1 : func (i *blockIter) First() (*InternalKey, base.LazyValue) {
995 1 : if invariants.Enabled && i.isDataInvalidated() {
996 0 : panic(errors.AssertionFailedf("invalidated blockIter used"))
997 : }
998 :
999 1 : i.offset = 0
1000 1 : if !i.valid() {
1001 1 : return nil, base.LazyValue{}
1002 1 : }
1003 1 : i.clearCache()
1004 1 : i.readEntry()
1005 1 : hiddenPoint := i.decodeInternalKey(i.key)
1006 1 : if hiddenPoint {
1007 1 : return i.Next()
1008 1 : }
1009 1 : if !i.lazyValueHandling.hasValuePrefix ||
1010 1 : base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
1011 1 : i.lazyValue = base.MakeInPlaceValue(i.val)
1012 1 : } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
1013 1 : i.lazyValue = base.MakeInPlaceValue(i.val[1:])
1014 1 : } else {
1015 1 : i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
1016 1 : }
1017 1 : return &i.ikey, i.lazyValue
1018 : }
1019 :
1020 1 : func decodeRestart(b []byte) int32 {
1021 1 : _ = b[3] // bounds check hint to compiler; see golang.org/issue/14808
1022 1 : return int32(uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 |
1023 1 : uint32(b[3]&restartMaskLittleEndianHighByteWithoutSetHasSamePrefix)<<24)
1024 1 : }
1025 :
1026 : // Last implements internalIterator.Last, as documented in the pebble package.
1027 1 : func (i *blockIter) Last() (*InternalKey, base.LazyValue) {
1028 1 : if invariants.Enabled && i.isDataInvalidated() {
1029 0 : panic(errors.AssertionFailedf("invalidated blockIter used"))
1030 : }
1031 :
1032 : // Seek forward from the last restart point.
1033 1 : i.offset = decodeRestart(i.data[i.restarts+4*(i.numRestarts-1):])
1034 1 : if !i.valid() {
1035 1 : return nil, base.LazyValue{}
1036 1 : }
1037 :
1038 1 : i.readEntry()
1039 1 : i.clearCache()
1040 1 :
1041 1 : for i.nextOffset < i.restarts {
1042 1 : i.cacheEntry()
1043 1 : i.offset = i.nextOffset
1044 1 : i.readEntry()
1045 1 : }
1046 :
1047 1 : hiddenPoint := i.decodeInternalKey(i.key)
1048 1 : if hiddenPoint {
1049 1 : return i.Prev()
1050 1 : }
1051 1 : if !i.lazyValueHandling.hasValuePrefix ||
1052 1 : base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
1053 1 : i.lazyValue = base.MakeInPlaceValue(i.val)
1054 1 : } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
1055 1 : i.lazyValue = base.MakeInPlaceValue(i.val[1:])
1056 1 : } else {
1057 1 : i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
1058 1 : }
1059 1 : return &i.ikey, i.lazyValue
1060 : }
1061 :
1062 : // Next implements internalIterator.Next, as documented in the pebble
1063 : // package.
1064 1 : func (i *blockIter) Next() (*InternalKey, base.LazyValue) {
1065 1 : if len(i.cachedBuf) > 0 {
1066 1 : // We're switching from reverse iteration to forward iteration. We need to
1067 1 : // populate i.fullKey with the current key we're positioned at so that
1068 1 : // readEntry() can use i.fullKey for key prefix decompression. Note that we
1069 1 : // don't know whether i.key is backed by i.cachedBuf or i.fullKey (if
1070 1 : // SeekLT was the previous call, i.key may be backed by i.fullKey), but
1071 1 : // copying into i.fullKey works for both cases.
1072 1 : //
1073 1 : // TODO(peter): Rather than clearing the cache, we could instead use the
1074 1 : // cache until it is exhausted. This would likely be faster than falling
1075 1 : // through to the normal forward iteration code below.
1076 1 : i.fullKey = append(i.fullKey[:0], i.key...)
1077 1 : i.clearCache()
1078 1 : }
1079 :
1080 : start:
1081 1 : i.offset = i.nextOffset
1082 1 : if !i.valid() {
1083 1 : return nil, base.LazyValue{}
1084 1 : }
1085 1 : i.readEntry()
1086 1 : // Manually inlined version of i.decodeInternalKey(i.key).
1087 1 : if n := len(i.key) - 8; n >= 0 {
1088 1 : trailer := binary.LittleEndian.Uint64(i.key[n:])
1089 1 : hiddenPoint := i.hideObsoletePoints &&
1090 1 : (trailer&trailerObsoleteBit != 0)
1091 1 : i.ikey.Trailer = trailer & trailerObsoleteMask
1092 1 : i.ikey.UserKey = i.key[:n:n]
1093 1 : if i.globalSeqNum != 0 {
1094 1 : i.ikey.SetSeqNum(i.globalSeqNum)
1095 1 : }
1096 1 : if hiddenPoint {
1097 1 : goto start
1098 : }
1099 0 : } else {
1100 0 : i.ikey.Trailer = uint64(InternalKeyKindInvalid)
1101 0 : i.ikey.UserKey = nil
1102 0 : }
1103 1 : if !i.lazyValueHandling.hasValuePrefix ||
1104 1 : base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
1105 1 : i.lazyValue = base.MakeInPlaceValue(i.val)
1106 1 : } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
1107 1 : i.lazyValue = base.MakeInPlaceValue(i.val[1:])
1108 1 : } else {
1109 1 : i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
1110 1 : }
1111 1 : return &i.ikey, i.lazyValue
1112 : }
1113 :
1114 : // NextPrefix implements (base.InternalIterator).NextPrefix.
1115 1 : func (i *blockIter) NextPrefix(succKey []byte) (*InternalKey, base.LazyValue) {
1116 1 : if i.lazyValueHandling.hasValuePrefix {
1117 1 : return i.nextPrefixV3(succKey)
1118 1 : }
1119 1 : const nextsBeforeSeek = 3
1120 1 : k, v := i.Next()
1121 1 : for j := 1; k != nil && i.cmp(k.UserKey, succKey) < 0; j++ {
1122 1 : if j >= nextsBeforeSeek {
1123 1 : return i.SeekGE(succKey, base.SeekGEFlagsNone)
1124 1 : }
1125 1 : k, v = i.Next()
1126 : }
1127 1 : return k, v
1128 : }
1129 :
1130 1 : func (i *blockIter) nextPrefixV3(succKey []byte) (*InternalKey, base.LazyValue) {
1131 1 : // Doing nexts that involve a key comparison can be expensive (and the cost
1132 1 : // depends on the key length), so we use the same threshold of 3 that we use
1133 1 : // for TableFormatPebblev2 in blockIter.NextPrefix above. The next fast path
1134 1 : // that looks at setHasSamePrefix takes ~5ns per key, which is ~150x faster
1135 1 : // than doing a SeekGE within the block, so we do this 16 times
1136 1 : // (~5ns*16=80ns), and then switch to looking at restarts. Doing the binary
1137 1 : // search for the restart consumes > 100ns. If the number of versions is >
1138 1 : // 17, we will increment nextFastCount to 17, then do a binary search, and
1139 1 : // on average need to find a key between two restarts, so another 8 steps
1140 1 : // corresponding to nextFastCount, for a mean total of 17 + 8 = 25 such
1141 1 : // steps.
1142 1 : //
1143 1 : // TODO(sumeer): use the configured restartInterval for the sstable when it
1144 1 : // was written (which we don't currently store) instead of the default value
1145 1 : // of 16.
1146 1 : const nextCmpThresholdBeforeSeek = 3
1147 1 : const nextFastThresholdBeforeRestarts = 16
1148 1 : nextCmpCount := 0
1149 1 : nextFastCount := 0
1150 1 : usedRestarts := false
1151 1 : // INVARIANT: blockIter is valid.
1152 1 : if invariants.Enabled && !i.valid() {
1153 0 : panic(errors.AssertionFailedf("nextPrefixV3 called on invalid blockIter"))
1154 : }
1155 1 : prevKeyIsSet := i.ikey.Kind() == InternalKeyKindSet
1156 1 : for {
1157 1 : i.offset = i.nextOffset
1158 1 : if !i.valid() {
1159 1 : return nil, base.LazyValue{}
1160 1 : }
1161 : // Need to decode the length integers, so we can compute nextOffset.
1162 1 : ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(i.offset))
1163 1 : // This is an ugly performance hack. Reading entries from blocks is one of
1164 1 : // the inner-most routines and decoding the 3 varints per-entry takes
1165 1 : // significant time. Neither go1.11 nor go1.12 will inline decodeVarint for
1166 1 : // us, so we do it manually. This provides a 10-15% performance improvement
1167 1 : // on blockIter benchmarks on both go1.11 and go1.12.
1168 1 : //
1169 1 : // TODO(peter): remove this hack if go:inline is ever supported.
1170 1 :
1171 1 : // Decode the shared key length integer.
1172 1 : var shared uint32
1173 1 : if a := *((*uint8)(ptr)); a < 128 {
1174 1 : shared = uint32(a)
1175 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
1176 1 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
1177 0 : shared = uint32(b)<<7 | uint32(a)
1178 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
1179 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
1180 0 : shared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1181 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
1182 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
1183 0 : shared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1184 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
1185 0 : } else {
1186 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
1187 0 : shared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1188 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
1189 0 : }
1190 : // Decode the unshared key length integer.
1191 1 : var unshared uint32
1192 1 : if a := *((*uint8)(ptr)); a < 128 {
1193 1 : unshared = uint32(a)
1194 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
1195 1 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
1196 0 : unshared = uint32(b)<<7 | uint32(a)
1197 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
1198 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
1199 0 : unshared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1200 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
1201 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
1202 0 : unshared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1203 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
1204 0 : } else {
1205 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
1206 0 : unshared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1207 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
1208 0 : }
1209 : // Decode the value length integer.
1210 1 : var value uint32
1211 1 : if a := *((*uint8)(ptr)); a < 128 {
1212 1 : value = uint32(a)
1213 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
1214 1 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
1215 0 : value = uint32(b)<<7 | uint32(a)
1216 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
1217 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
1218 0 : value = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1219 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
1220 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
1221 0 : value = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1222 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
1223 0 : } else {
1224 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
1225 0 : value = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1226 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
1227 0 : }
1228 : // The starting position of the value.
1229 1 : valuePtr := unsafe.Pointer(uintptr(ptr) + uintptr(unshared))
1230 1 : i.nextOffset = int32(uintptr(valuePtr)-uintptr(i.ptr)) + int32(value)
1231 1 : if invariants.Enabled && unshared < 8 {
1232 0 : // This should not happen since only the key prefix is shared, so even
1233 0 : // if the prefix length is the same as the user key length, the unshared
1234 0 : // will include the trailer.
1235 0 : panic(errors.AssertionFailedf("unshared %d is too small", unshared))
1236 : }
1237 : // The trailer is written in little endian, so the key kind is the first
1238 : // byte in the trailer that is encoded in the slice [unshared-8:unshared].
1239 1 : keyKind := InternalKeyKind((*[manual.MaxArrayLen]byte)(ptr)[unshared-8])
1240 1 : keyKind = keyKind & base.InternalKeyKindSSTableInternalObsoleteMask
1241 1 : prefixChanged := false
1242 1 : if keyKind == InternalKeyKindSet {
1243 1 : if invariants.Enabled && value == 0 {
1244 0 : panic(errors.AssertionFailedf("value is of length 0, but we expect a valuePrefix"))
1245 : }
1246 1 : valPrefix := *((*valuePrefix)(valuePtr))
1247 1 : if setHasSamePrefix(valPrefix) {
1248 1 : // Fast-path. No need to assemble i.fullKey, or update i.key. We know
1249 1 : // that subsequent keys will not have a shared length that is greater
1250 1 : // than the prefix of the current key, which is also the prefix of
1251 1 : // i.key. Since we are continuing to iterate, we don't need to
1252 1 : // initialize i.ikey and i.lazyValue (these are initialized before
1253 1 : // returning).
1254 1 : nextFastCount++
1255 1 : if nextFastCount > nextFastThresholdBeforeRestarts {
1256 1 : if usedRestarts {
1257 0 : // Exhausted iteration budget. This will never happen unless
1258 0 : // someone is using a restart interval > 16. It is just to guard
1259 0 : // against long restart intervals causing too much iteration.
1260 0 : break
1261 : }
1262 : // Haven't used restarts yet, so find the first restart at or beyond
1263 : // the current offset.
1264 1 : targetOffset := i.offset
1265 1 : var index int32
1266 1 : {
1267 1 : // NB: manually inlined sort.Search is ~5% faster.
1268 1 : //
1269 1 : // f defined for a restart point is true iff the offset >=
1270 1 : // targetOffset.
1271 1 : // Define f(-1) == false and f(i.numRestarts) == true.
1272 1 : // Invariant: f(index-1) == false, f(upper) == true.
1273 1 : upper := i.numRestarts
1274 1 : for index < upper {
1275 1 : h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
1276 1 : // index ≤ h < upper
1277 1 : offset := decodeRestart(i.data[i.restarts+4*h:])
1278 1 : if offset < targetOffset {
1279 1 : index = h + 1 // preserves f(index-1) == false
1280 1 : } else {
1281 1 : upper = h // preserves f(upper) == true
1282 1 : }
1283 : }
1284 : // index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
1285 : // => answer is index.
1286 : }
1287 1 : usedRestarts = true
1288 1 : nextFastCount = 0
1289 1 : if index == i.numRestarts {
1290 1 : // Already past the last real restart, so iterate a bit more until
1291 1 : // we are done with the block.
1292 1 : continue
1293 : }
1294 : // Have some real restarts after index. NB: index is the first
1295 : // restart at or beyond the current offset.
1296 1 : startingIndex := index
1297 1 : for index != i.numRestarts &&
1298 1 : // The restart at index is 4 bytes written in little endian format
1299 1 : // starting at i.restart+4*index. The 0th byte is the least
1300 1 : // significant and the 3rd byte is the most significant. Since the
1301 1 : // most significant bit of the 3rd byte is what we use for
1302 1 : // encoding the set-has-same-prefix information, the indexing
1303 1 : // below has +3.
1304 1 : i.data[i.restarts+4*index+3]&restartMaskLittleEndianHighByteOnlySetHasSamePrefix != 0 {
1305 1 : // We still have the same prefix, so move to the next restart.
1306 1 : index++
1307 1 : }
1308 : // index is the first restart that did not have the same prefix.
1309 1 : if index != startingIndex {
1310 1 : // Managed to skip past at least one restart. Resume iteration
1311 1 : // from index-1. Since nextFastCount has been reset to 0, we
1312 1 : // should be able to iterate to the next prefix.
1313 1 : i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
1314 1 : i.readEntry()
1315 1 : }
1316 : // Else, unable to skip past any restart. Resume iteration. Since
1317 : // nextFastCount has been reset to 0, we should be able to iterate
1318 : // to the next prefix.
1319 1 : continue
1320 : }
1321 1 : continue
1322 1 : } else if prevKeyIsSet {
1323 1 : prefixChanged = true
1324 1 : }
1325 1 : } else {
1326 1 : prevKeyIsSet = false
1327 1 : }
1328 : // Slow-path cases:
1329 : // - (Likely) The prefix has changed.
1330 : // - (Unlikely) The prefix has not changed.
1331 : // We assemble the key etc. under the assumption that it is the likely
1332 : // case.
1333 1 : unsharedKey := getBytes(ptr, int(unshared))
1334 1 : // TODO(sumeer): move this into the else block below. This is a bit tricky
1335 1 : // since the current logic assumes we have always copied the latest key
1336 1 : // into fullKey, which is why when we get to the next key we can (a)
1337 1 : // access i.fullKey[:shared], (b) append only the unsharedKey to
1338 1 : // i.fullKey. For (a), we can access i.key[:shared] since that memory is
1339 1 : // valid (even if unshared). For (b), we will need to remember whether
1340 1 : // i.key refers to i.fullKey or not, and can append the unsharedKey only
1341 1 : // in the former case and for the latter case need to copy the shared part
1342 1 : // too. This same comment applies to the other place where we can do this
1343 1 : // optimization, in readEntry().
1344 1 : i.fullKey = append(i.fullKey[:shared], unsharedKey...)
1345 1 : i.val = getBytes(valuePtr, int(value))
1346 1 : if shared == 0 {
1347 1 : // Provide stability for the key across positioning calls if the key
1348 1 : // doesn't share a prefix with the previous key. This avoids requiring the
1349 1 : // key to be copied if the caller knows the block has a restart interval of
1350 1 : // 1. An important example of this is range-del blocks.
1351 1 : i.key = unsharedKey
1352 1 : } else {
1353 1 : i.key = i.fullKey
1354 1 : }
1355 : // Manually inlined version of i.decodeInternalKey(i.key).
1356 1 : hiddenPoint := false
1357 1 : if n := len(i.key) - 8; n >= 0 {
1358 1 : trailer := binary.LittleEndian.Uint64(i.key[n:])
1359 1 : hiddenPoint = i.hideObsoletePoints &&
1360 1 : (trailer&trailerObsoleteBit != 0)
1361 1 : i.ikey.Trailer = trailer & trailerObsoleteMask
1362 1 : i.ikey.UserKey = i.key[:n:n]
1363 1 : if i.globalSeqNum != 0 {
1364 0 : i.ikey.SetSeqNum(i.globalSeqNum)
1365 0 : }
1366 0 : } else {
1367 0 : i.ikey.Trailer = uint64(InternalKeyKindInvalid)
1368 0 : i.ikey.UserKey = nil
1369 0 : }
1370 1 : nextCmpCount++
1371 1 : if invariants.Enabled && prefixChanged && i.cmp(i.ikey.UserKey, succKey) < 0 {
1372 0 : panic(errors.AssertionFailedf("prefix should have changed but %x < %x",
1373 0 : i.ikey.UserKey, succKey))
1374 : }
1375 1 : if prefixChanged || i.cmp(i.ikey.UserKey, succKey) >= 0 {
1376 1 : // Prefix has changed.
1377 1 : if hiddenPoint {
1378 1 : return i.Next()
1379 1 : }
1380 1 : if invariants.Enabled && !i.lazyValueHandling.hasValuePrefix {
1381 0 : panic(errors.AssertionFailedf("nextPrefixV3 being run for non-v3 sstable"))
1382 : }
1383 1 : if base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
1384 1 : i.lazyValue = base.MakeInPlaceValue(i.val)
1385 1 : } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
1386 1 : i.lazyValue = base.MakeInPlaceValue(i.val[1:])
1387 1 : } else {
1388 0 : i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
1389 0 : }
1390 1 : return &i.ikey, i.lazyValue
1391 : }
1392 : // Else prefix has not changed.
1393 :
1394 1 : if nextCmpCount >= nextCmpThresholdBeforeSeek {
1395 1 : break
1396 : }
1397 : }
1398 1 : return i.SeekGE(succKey, base.SeekGEFlagsNone)
1399 : }
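// Illustrative sketch (not part of the original source): the restart handling
// in the loop above reads only the high byte of each 4-byte restart word, but
// conceptually every restart packs a 31-bit block offset together with the
// set-has-same-prefix flag in the most significant bit. The helper below is
// hypothetical; the real code masks the high byte directly via
// restartMaskLittleEndianHighByteOnlySetHasSamePrefix and uses decodeRestart
// for the offset.
//
//	func decodeRestartSketch(b []byte) (offset int32, setHasSamePrefix bool) {
//		// Restarts are stored little endian; bit 31 carries the flag and the
//		// remaining bits hold the offset of the restart point in the block.
//		word := binary.LittleEndian.Uint32(b)
//		return int32(word &^ (1 << 31)), word&(1<<31) != 0
//	}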
1400 :
1401 : // Prev implements internalIterator.Prev, as documented in the pebble
1402 : // package.
1403 1 : func (i *blockIter) Prev() (*InternalKey, base.LazyValue) {
1404 1 : start:
1405 1 : for n := len(i.cached) - 1; n >= 0; n-- {
1406 1 : i.nextOffset = i.offset
1407 1 : e := &i.cached[n]
1408 1 : i.offset = e.offset
1409 1 : i.val = getBytes(unsafe.Pointer(uintptr(i.ptr)+uintptr(e.valStart)), int(e.valSize))
1410 1 : // Manually inlined version of i.decodeInternalKey(i.key).
1411 1 : i.key = i.cachedBuf[e.keyStart:e.keyEnd]
1412 1 : if n := len(i.key) - 8; n >= 0 {
1413 1 : trailer := binary.LittleEndian.Uint64(i.key[n:])
1414 1 : hiddenPoint := i.hideObsoletePoints &&
1415 1 : (trailer&trailerObsoleteBit != 0)
1416 1 : if hiddenPoint {
1417 1 : continue
1418 : }
1419 1 : i.ikey.Trailer = trailer & trailerObsoleteMask
1420 1 : i.ikey.UserKey = i.key[:n:n]
1421 1 : if i.globalSeqNum != 0 {
1422 1 : i.ikey.SetSeqNum(i.globalSeqNum)
1423 1 : }
1424 0 : } else {
1425 0 : i.ikey.Trailer = uint64(InternalKeyKindInvalid)
1426 0 : i.ikey.UserKey = nil
1427 0 : }
1428 1 : i.cached = i.cached[:n]
1429 1 : if !i.lazyValueHandling.hasValuePrefix ||
1430 1 : base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
1431 1 : i.lazyValue = base.MakeInPlaceValue(i.val)
1432 1 : } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
1433 1 : i.lazyValue = base.MakeInPlaceValue(i.val[1:])
1434 1 : } else {
1435 1 : i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
1436 1 : }
1437 1 : return &i.ikey, i.lazyValue
1438 : }
1439 :
1440 1 : i.clearCache()
1441 1 : if i.offset <= 0 {
1442 1 : i.offset = -1
1443 1 : i.nextOffset = 0
1444 1 : return nil, base.LazyValue{}
1445 1 : }
1446 :
1447 1 : targetOffset := i.offset
1448 1 : var index int32
1449 1 :
1450 1 : {
1451 1 : // NB: manually inlined sort.Search is ~5% faster.
1452 1 : //
1453 1 : // Define f(-1) == false and f(i.numRestarts) == true.
1454 1 : // Invariant: f(index-1) == false, f(upper) == true.
1455 1 : upper := i.numRestarts
1456 1 : for index < upper {
1457 1 : h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
1458 1 : // index ≤ h < upper
1459 1 : offset := decodeRestart(i.data[i.restarts+4*h:])
1460 1 : if offset < targetOffset {
1461 1 : // Looking for the first restart that has offset >= targetOffset, so
1462 1 : // ignore h and earlier.
1463 1 : index = h + 1 // preserves f(index-1) == false
1464 1 : } else {
1465 1 : upper = h // preserves f(upper) == true
1466 1 : }
1467 : }
1468 : // index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
1469 : // => answer is index.
1470 : }
1471 :
1472 : // index is first restart with offset >= targetOffset. Note that
1473 : // targetOffset may not be at a restart point since one can call Prev()
1474 : // after Next() (so the cache was not populated) and targetOffset refers to
1475 : // the current entry. index-1 must have an offset < targetOffset (it can't
1476 : // be equal to targetOffset since the binary search would have selected that
1477 : // as the index).
1478 1 : i.offset = 0
1479 1 : if index > 0 {
1480 1 : i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
1481 1 : }
1482 : // TODO(sumeer): why is the else case not an error, given that targetOffset
1483 : // is a valid offset?
1484 :
1485 1 : i.readEntry()
1486 1 :
1487 1 : // We stop when i.nextOffset == targetOffset since targetOffset is the offset of
1488 1 : // the entry we are stepping back from, and we don't need to cache the entry
1489 1 : // before it, since it is the candidate to return.
1490 1 : for i.nextOffset < targetOffset {
1491 1 : i.cacheEntry()
1492 1 : i.offset = i.nextOffset
1493 1 : i.readEntry()
1494 1 : }
1495 :
1496 1 : hiddenPoint := i.decodeInternalKey(i.key)
1497 1 : if hiddenPoint {
1498 1 : // Use the cache.
1499 1 : goto start
1500 : }
1501 1 : if !i.lazyValueHandling.hasValuePrefix ||
1502 1 : base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
1503 1 : i.lazyValue = base.MakeInPlaceValue(i.val)
1504 1 : } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
1505 1 : i.lazyValue = base.MakeInPlaceValue(i.val[1:])
1506 1 : } else {
1507 1 : i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
1508 1 : }
1509 1 : return &i.ikey, i.lazyValue
1510 : }
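// Illustrative sketch (not part of the original source): the manually inlined
// binary searches in Prev and in nextPrefixV3 compute the same index as
// sort.Search over the restart array, namely the first restart whose decoded
// offset is >= targetOffset. A hypothetical, non-inlined equivalent (assuming
// decodeRestart's signature as used above) would be:
//
//	import "sort"
//
//	func (i *blockIter) firstRestartAtOrAfter(targetOffset int32) int32 {
//		return int32(sort.Search(int(i.numRestarts), func(j int) bool {
//			return decodeRestart(i.data[i.restarts+4*int32(j):]) >= targetOffset
//		}))
//	}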
1511 :
1512 : // Key implements internalIterator.Key, as documented in the pebble package.
1513 1 : func (i *blockIter) Key() *InternalKey {
1514 1 : return &i.ikey
1515 1 : }
1516 :
1517 1 : func (i *blockIter) value() base.LazyValue {
1518 1 : return i.lazyValue
1519 1 : }
1520 :
1521 : // Error implements internalIterator.Error, as documented in the pebble
1522 : // package.
1523 1 : func (i *blockIter) Error() error {
1524 1 : return nil // infallible
1525 1 : }
1526 :
1527 : // Close implements internalIterator.Close, as documented in the pebble
1528 : // package.
1529 1 : func (i *blockIter) Close() error {
1530 1 : i.handle.Release()
1531 1 : i.handle = bufferHandle{}
1532 1 : i.val = nil
1533 1 : i.lazyValue = base.LazyValue{}
1534 1 : i.lazyValueHandling.vbr = nil
1535 1 : return nil
1536 1 : }
1537 :
1538 0 : func (i *blockIter) SetBounds(lower, upper []byte) {
1539 0 : // This should never be called as bounds are handled by sstable.Iterator.
1540 0 : panic("pebble: SetBounds unimplemented")
1541 : }
1542 :
1543 1 : func (i *blockIter) valid() bool {
1544 1 : return i.offset >= 0 && i.offset < i.restarts
1545 1 : }
1546 :
1547 : // fragmentBlockIter wraps a blockIter, implementing the
1548 : // keyspan.FragmentIterator interface. It's used for reading range deletion and
1549 : // range key blocks.
1550 : //
1551 : // Range deletions and range keys are fragmented before they're persisted to the
1552 : // block. Overlapping fragments have identical bounds. The fragmentBlockIter
1553 : // gathers all the fragments with identical bounds within a block and returns a
1554 : // single keyspan.Span describing all the keys defined over the span.
1555 : //
1556 : // # Memory lifetime
1557 : //
1558 : // A Span returned by fragmentBlockIter is only guaranteed to be stable until
1559 : // the next call to a fragmentBlockIter positioning method. A Span's Keys slice
1560 : // may be reused, so the user must not assume it's stable.
1561 : //
1562 : // Blocks holding range deletions and range keys are configured to use a restart
1563 : // interval of 1. This provides key stability. The caller may treat the various
1564 : // byte slices (start, end, suffix, value) as stable for the lifetime of the
1565 : // iterator.
1566 : type fragmentBlockIter struct {
1567 : blockIter blockIter
1568 : keyBuf [2]keyspan.Key
1569 : span keyspan.Span
1570 : err error
1571 : dir int8
1572 : closeHook func(i keyspan.FragmentIterator) error
1573 :
1574 : // elideSameSeqnum, if true, causes the iterator to return only the
1575 : // first-occurring (in forward order) Key for each sequence number.
1576 : elideSameSeqnum bool
1577 : }
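// Illustrative sketch (not part of the original source): typical forward
// iteration over a fragmentBlockIter through the keyspan.FragmentIterator
// interface it implements. Each returned Span is only stable until the next
// positioning call, so a caller that retains spans should copy the Keys slice;
// the start/end/suffix byte slices are stable for the iterator's lifetime per
// the comment above.
//
//	func collectSpans(it keyspan.FragmentIterator) ([]keyspan.Span, error) {
//		var spans []keyspan.Span
//		for s := it.First(); s != nil; s = it.Next() {
//			// Copy Keys: the iterator may reuse the backing slice on the
//			// next positioning call.
//			spans = append(spans, keyspan.Span{
//				Start: s.Start,
//				End:   s.End,
//				Keys:  append([]keyspan.Key(nil), s.Keys...),
//			})
//		}
//		return spans, it.Error()
//	}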
1578 :
1579 1 : func (i *fragmentBlockIter) resetForReuse() fragmentBlockIter {
1580 1 : return fragmentBlockIter{blockIter: i.blockIter.resetForReuse()}
1581 1 : }
1582 :
1583 1 : func (i *fragmentBlockIter) decodeSpanKeys(k *InternalKey, internalValue []byte) {
1584 1 : // TODO(jackson): The use of i.span.Keys to accumulate keys across multiple
1585 1 : // calls to Decode is too confusing and subtle. Refactor to make it
1586 1 : // explicit.
1587 1 :
1588 1 : // decode the contents of the fragment's value. This always includes at
1589 1 : // least the end key: RANGEDELs store the end key directly as the value,
1590 1 : // whereas the various range key kinds store are more complicated. The
1591 1 : // details of the range key internal value format are documented within the
1592 1 : // internal/rangekey package.
1593 1 : switch k.Kind() {
1594 1 : case base.InternalKeyKindRangeDelete:
1595 1 : i.span = rangedel.Decode(*k, internalValue, i.span.Keys)
1596 1 : i.err = nil
1597 1 : case base.InternalKeyKindRangeKeySet, base.InternalKeyKindRangeKeyUnset, base.InternalKeyKindRangeKeyDelete:
1598 1 : i.span, i.err = rangekey.Decode(*k, internalValue, i.span.Keys)
1599 0 : default:
1600 0 : i.span = keyspan.Span{}
1601 0 : i.err = base.CorruptionErrorf("pebble: corrupt keyspan fragment of kind %d", k.Kind())
1602 : }
1603 : }
1604 :
1605 1 : func (i *fragmentBlockIter) elideKeysOfSameSeqNum() {
1606 1 : if invariants.Enabled {
1607 1 : if !i.elideSameSeqnum || len(i.span.Keys) == 0 {
1608 0 : panic("elideKeysOfSameSeqNum called when it should not be")
1609 : }
1610 : }
1611 1 : lastSeqNum := i.span.Keys[0].SeqNum()
1612 1 : k := 1
1613 1 : for j := 1; j < len(i.span.Keys); j++ {
1614 1 : if lastSeqNum != i.span.Keys[j].SeqNum() {
1615 1 : lastSeqNum = i.span.Keys[j].SeqNum()
1616 1 : i.span.Keys[k] = i.span.Keys[j]
1617 1 : k++
1618 1 : }
1619 : }
1620 1 : i.span.Keys = i.span.Keys[:k]
1621 : }
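// Worked example (not part of the original source): if i.span.Keys holds keys
// with sequence numbers [7, 7, 7, 5, 2, 2] (keys sharing a seqnum are
// adjacent, as they are within a gathered span), elideKeysOfSameSeqNum
// compacts the slice in place so only the first key of each run remains,
// leaving keys with sequence numbers [7, 5, 2].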
1622 :
1623 : // gatherForward gathers internal keys with identical bounds. Keys defined over
1624 : // spans of the keyspace are fragmented such that any overlapping key spans have
1625 : // identical bounds. When these spans are persisted to a range deletion or range
1626 : // key block, they may be persisted as multiple internal keys in order to encode
1627 : // multiple sequence numbers or key kinds.
1628 : //
1629 : // gatherForward iterates forward, re-combining the fragmented internal keys to
1630 : // reconstruct a keyspan.Span that holds all the keys defined over the span.
1631 1 : func (i *fragmentBlockIter) gatherForward(k *InternalKey, lazyValue base.LazyValue) *keyspan.Span {
1632 1 : i.span = keyspan.Span{}
1633 1 : if k == nil || !i.blockIter.valid() {
1634 1 : return nil
1635 1 : }
1636 1 : i.err = nil
1637 1 : // Use the i.keyBuf array to back the Keys slice to prevent an allocation
1638 1 : // when a span contains few keys.
1639 1 : i.span.Keys = i.keyBuf[:0]
1640 1 :
1641 1 : // Decode the span's end key and individual keys from the value.
1642 1 : internalValue := lazyValue.InPlaceValue()
1643 1 : i.decodeSpanKeys(k, internalValue)
1644 1 : if i.err != nil {
1645 0 : return nil
1646 0 : }
1647 1 : prevEnd := i.span.End
1648 1 :
1649 1 : // There might exist additional internal keys with identical bounds encoded
1650 1 : // within the block. Iterate forward, accumulating all the keys with
1651 1 : // identical bounds into i.span.
1652 1 : k, lazyValue = i.blockIter.Next()
1653 1 : internalValue = lazyValue.InPlaceValue()
1654 1 : for k != nil && i.blockIter.cmp(k.UserKey, i.span.Start) == 0 {
1655 1 : i.decodeSpanKeys(k, internalValue)
1656 1 : if i.err != nil {
1657 0 : return nil
1658 0 : }
1659 :
1660 : // Since k indicates an equal start key, the encoded end key must
1661 : // exactly equal the original end key from the first internal key.
1662 : // Overlapping fragments are required to have exactly equal start and
1663 : // end bounds.
1664 1 : if i.blockIter.cmp(prevEnd, i.span.End) != 0 {
1665 0 : i.err = base.CorruptionErrorf("pebble: corrupt keyspan fragmentation")
1666 0 : i.span = keyspan.Span{}
1667 0 : return nil
1668 0 : }
1669 1 : k, lazyValue = i.blockIter.Next()
1670 1 : internalValue = lazyValue.InPlaceValue()
1671 : }
1672 1 : if i.elideSameSeqnum && len(i.span.Keys) > 0 {
1673 1 : i.elideKeysOfSameSeqNum()
1674 1 : }
1675 : // i.blockIter is positioned over the first internal key for the next span.
1676 1 : return &i.span
1677 : }
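// Illustrative sketch (not part of the original source): suppose a range key
// block holds three internal keys with identical bounds, in this order:
//
//	b.RANGEKEYSET.10  -> value encodes end key c plus a (suffix, value) pair
//	b.RANGEKEYUNSET.8 -> value encodes end key c plus a suffix
//	b.RANGEKEYDEL.5   -> value encodes end key c
//
// Starting from the first of these, gatherForward returns a single
// keyspan.Span with Start=b, End=c and three Keys (seqnums 10, 8, 5), and
// leaves i.blockIter positioned on the first internal key of the next span.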
1678 :
1679 : // gatherBackward gathers internal keys with identical bounds. Keys defined over
1680 : // spans of the keyspace are fragmented such that any overlapping key spans have
1681 : // identical bounds. When these spans are persisted to a range deletion or range
1682 : // key block, they may be persisted as multiple internal keys in order to encode
1683 : // multiple sequence numbers or key kinds.
1684 : //
1685 : // gatherBackward iterates backwards, re-combining the fragmented internal keys
1686 : // to reconstruct a keyspan.Span that holds all the keys defined over the span.
1687 1 : func (i *fragmentBlockIter) gatherBackward(k *InternalKey, lazyValue base.LazyValue) *keyspan.Span {
1688 1 : i.span = keyspan.Span{}
1689 1 : if k == nil || !i.blockIter.valid() {
1690 1 : return nil
1691 1 : }
1692 1 : i.err = nil
1693 1 : // Use the i.keyBuf array to back the Keys slice to prevent an allocation
1694 1 : // when a span contains few keys.
1695 1 : i.span.Keys = i.keyBuf[:0]
1696 1 :
1697 1 : // Decode the span's end key and individual keys from the value.
1698 1 : internalValue := lazyValue.InPlaceValue()
1699 1 : i.decodeSpanKeys(k, internalValue)
1700 1 : if i.err != nil {
1701 0 : return nil
1702 0 : }
1703 1 : prevEnd := i.span.End
1704 1 :
1705 1 : // There might exist additional internal keys with identical bounds encoded
1706 1 : // within the block. Iterate backward, accumulating all the keys with
1707 1 : // identical bounds into i.span.
1708 1 : k, lazyValue = i.blockIter.Prev()
1709 1 : internalValue = lazyValue.InPlaceValue()
1710 1 : for k != nil && i.blockIter.cmp(k.UserKey, i.span.Start) == 0 {
1711 1 : i.decodeSpanKeys(k, internalValue)
1712 1 : if i.err != nil {
1713 0 : return nil
1714 0 : }
1715 :
1716 : // Since k indicates an equal start key, the encoded end key must
1717 : // exactly equal the original end key from the first internal key.
1718 : // Overlapping fragments are required to have exactly equal start and
1719 : // end bounds.
1720 1 : if i.blockIter.cmp(prevEnd, i.span.End) != 0 {
1721 0 : i.err = base.CorruptionErrorf("pebble: corrupt keyspan fragmentation")
1722 0 : i.span = keyspan.Span{}
1723 0 : return nil
1724 0 : }
1725 1 : k, lazyValue = i.blockIter.Prev()
1726 1 : internalValue = lazyValue.InPlaceValue()
1727 : }
1728 : // i.blockIter is positioned over the last internal key for the previous
1729 : // span.
1730 :
1731 : // Backwards iteration encounters internal keys in the wrong order.
1732 1 : keyspan.SortKeysByTrailer(&i.span.Keys)
1733 1 :
1734 1 : if i.elideSameSeqnum && len(i.span.Keys) > 0 {
1735 1 : i.elideKeysOfSameSeqNum()
1736 1 : }
1737 1 : return &i.span
1738 : }
1739 :
1740 : // Error implements (keyspan.FragmentIterator).Error.
1741 1 : func (i *fragmentBlockIter) Error() error {
1742 1 : return i.err
1743 1 : }
1744 :
1745 : // Close implements (keyspan.FragmentIterator).Close.
1746 1 : func (i *fragmentBlockIter) Close() error {
1747 1 : var err error
1748 1 : if i.closeHook != nil {
1749 0 : err = i.closeHook(i)
1750 0 : }
1751 1 : err = firstError(err, i.blockIter.Close())
1752 1 : return err
1753 : }
1754 :
1755 : // First implements (keyspan.FragmentIterator).First
1756 1 : func (i *fragmentBlockIter) First() *keyspan.Span {
1757 1 : i.dir = +1
1758 1 : return i.gatherForward(i.blockIter.First())
1759 1 : }
1760 :
1761 : // Last implements (keyspan.FragmentIterator).Last.
1762 1 : func (i *fragmentBlockIter) Last() *keyspan.Span {
1763 1 : i.dir = -1
1764 1 : return i.gatherBackward(i.blockIter.Last())
1765 1 : }
1766 :
1767 : // Next implements (keyspan.FragmentIterator).Next.
1768 1 : func (i *fragmentBlockIter) Next() *keyspan.Span {
1769 1 : switch {
1770 1 : case i.dir == -1 && !i.span.Valid():
1771 1 : // Switching directions.
1772 1 : //
1773 1 : // i.blockIter is exhausted, before the first key. Move onto the first.
1774 1 : i.blockIter.First()
1775 1 : i.dir = +1
1776 1 : case i.dir == -1 && i.span.Valid():
1777 1 : // Switching directions.
1778 1 : //
1779 1 : // i.blockIter is currently positioned over the last internal key for
1780 1 : // the previous span. Next it once to move to the first internal key
1781 1 : // that makes up the current span, and gatherForward to land on the
1782 1 : // first internal key making up the next span.
1783 1 : //
1784 1 : // In the diagram below, if the last span returned to the user during
1785 1 : // reverse iteration was [b,c), i.blockIter is currently positioned at
1786 1 : // [a,b). The block iter must be positioned over [d,e) to gather the
1787 1 : // next span's fragments.
1788 1 : //
1789 1 : // ... [a,b) [b,c) [b,c) [b,c) [d,e) ...
1790 1 : // ^ ^
1791 1 : // i.blockIter want
1792 1 : if x := i.gatherForward(i.blockIter.Next()); invariants.Enabled && !x.Valid() {
1793 0 : panic("pebble: invariant violation: next entry unexpectedly invalid")
1794 : }
1795 1 : i.dir = +1
1796 : }
1797 : // We know that this blockIter has in-place values.
1798 1 : return i.gatherForward(&i.blockIter.ikey, base.MakeInPlaceValue(i.blockIter.val))
1799 : }
1800 :
1801 : // Prev implements (keyspan.FragmentIterator).Prev.
1802 1 : func (i *fragmentBlockIter) Prev() *keyspan.Span {
1803 1 : switch {
1804 1 : case i.dir == +1 && !i.span.Valid():
1805 1 : // Switching directions.
1806 1 : //
1807 1 : // i.blockIter is exhausted, after the last key. Move onto the last.
1808 1 : i.blockIter.Last()
1809 1 : i.dir = -1
1810 1 : case i.dir == +1 && i.span.Valid():
1811 1 : // Switching directions.
1812 1 : //
1813 1 : // i.blockIter is currently positioned over the first internal key for
1814 1 : // the next span. Prev it once to move to the last internal key that
1815 1 : // makes up the current span, and gatherBackward to land on the last
1816 1 : // internal key making up the previous span.
1817 1 : //
1818 1 : // In the diagram below, if the last span returned to the user during
1819 1 : // forward iteration was [b,c), i.blockIter is currently positioned at
1820 1 : // [d,e). The block iter must be positioned over [a,b) to gather the
1821 1 : // previous span's fragments.
1822 1 : //
1823 1 : // ... [a,b) [b,c) [b,c) [b,c) [d,e) ...
1824 1 : // ^ ^
1825 1 : // want i.blockIter
1826 1 : if x := i.gatherBackward(i.blockIter.Prev()); invariants.Enabled && !x.Valid() {
1827 0 : panic("pebble: invariant violation: previous entry unexpectedly invalid")
1828 : }
1829 1 : i.dir = -1
1830 : }
1831 : // We know that this blockIter has in-place values.
1832 1 : return i.gatherBackward(&i.blockIter.ikey, base.MakeInPlaceValue(i.blockIter.val))
1833 : }
1834 :
1835 : // SeekGE implements (keyspan.FragmentIterator).SeekGE.
1836 1 : func (i *fragmentBlockIter) SeekGE(k []byte) *keyspan.Span {
1837 1 : if s := i.SeekLT(k); s != nil && i.blockIter.cmp(k, s.End) < 0 {
1838 1 : return s
1839 1 : }
1840 : // TODO(jackson): If the above i.SeekLT(k) discovers a span but the span
1841 : // doesn't meet the k < s.End comparison, then there's no need for the
1842 : // SeekLT to gatherBackward.
1843 1 : return i.Next()
1844 : }
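// Worked example (not part of the original source): with spans [a,c) and
// [e,g) in the block, SeekGE(b) first does SeekLT(b), which gathers [a,c);
// since b < c the span covers b and is returned. SeekGE(d) also gathers
// [a,c) via SeekLT(d), but d >= c, so the iterator falls through to Next()
// and returns [e,g).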
1845 :
1846 : // SeekLT implements (keyspan.FragmentIterator).SeekLT.
1847 1 : func (i *fragmentBlockIter) SeekLT(k []byte) *keyspan.Span {
1848 1 : i.dir = -1
1849 1 : return i.gatherBackward(i.blockIter.SeekLT(k, base.SeekLTFlagsNone))
1850 1 : }
1851 :
1852 : // String implements fmt.Stringer.
1853 0 : func (i *fragmentBlockIter) String() string {
1854 0 : return "fragment-block-iter"
1855 0 : }
1856 :
1857 : // SetCloseHook implements sstable.FragmentIterator.
1858 0 : func (i *fragmentBlockIter) SetCloseHook(fn func(i keyspan.FragmentIterator) error) {
1859 0 : i.closeHook = fn
1860 0 : }