Line data Source code
1 : // Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package sstable
6 :
7 : import (
8 : "bytes"
9 : "context"
10 : "encoding/binary"
11 : "unsafe"
12 :
13 : "github.com/cockroachdb/errors"
14 : "github.com/cockroachdb/pebble/internal/base"
15 : "github.com/cockroachdb/pebble/internal/invariants"
16 : "github.com/cockroachdb/pebble/internal/manual"
17 : "golang.org/x/exp/slices"
18 : )
19 :
20 : // blockIter is an iterator over a single block of data.
21 : //
22 : // A blockIter provides an additional guarantee around key stability when a
23 : // block has a restart interval of 1 (i.e. when there is no prefix
24 : // compression). Key stability refers to whether the InternalKey.UserKey bytes
25 : // returned by a positioning call will remain stable after a subsequent
26 : // positioning call. The normal case is that a positioning call will invalidate
27 : // any previously returned InternalKey.UserKey. If a block has a restart
28 : // interval of 1 (no prefix compression), blockIter guarantees that
29 : // InternalKey.UserKey will point to the key as stored in the block itself
30 : // which will remain valid until the blockIter is closed. The key stability
31 : // guarantee is used by the range tombstone and range key code, which knows that
32 : // the respective blocks are always encoded with a restart interval of 1. This
33 : // per-block key stability guarantee is sufficient for range tombstones and
34 : // range keys as they are always encoded in a single block. Note: this
35 : // stability guarantee no longer holds for a block iter with synthetic suffix
36 : // replacement, but this doesn't matter, as the user will not open
37 : // an iterator with a synthetic suffix on a block with rangekeys (for now).
38 : //
39 : // A blockIter also provides a value stability guarantee for range deletions and
40 : // range keys since there is only a single range deletion and range key block
41 : // per sstable and the blockIter will not release the bytes for the block until
42 : // it is closed.
43 : //
44 : // Note on why blockIter knows about lazyValueHandling:
45 : //
46 : // blockIter's positioning functions (that return a LazyValue), are too
47 : // complex to inline even prior to lazyValueHandling. blockIter.Next and
48 : // blockIter.First were by far the cheapest and had costs 195 and 180
49 : // respectively, which exceed the budget of 80. We initially tried to keep
50 : // the lazyValueHandling logic out of blockIter by wrapping it with a
51 : // lazyValueDataBlockIter. singleLevelIter and twoLevelIter would use this
52 : // wrapped iter. The functions in lazyValueDataBlockIter were simple, in that
53 : // they called the corresponding blockIter func and then decided whether the
54 : // value was in fact in-place (so return immediately) or needed further
55 : // handling. But these also turned out too costly for mid-stack inlining since
56 : // simple calls like the following have a high cost that is barely under the
57 : // budget of 80
58 : //
59 : // k, v := i.data.SeekGE(key, flags) // cost 74
60 : // k, v := i.data.Next() // cost 72
61 : //
62 : // We have 2 options for minimizing performance regressions:
63 : // - Include the lazyValueHandling logic in the already non-inlineable
64 : // blockIter functions: Since most of the time is spent in data block iters,
65 : // it is acceptable to take the small hit of unnecessary branching (which
66 : // hopefully branch prediction will predict correctly) for other kinds of
67 : // blocks.
68 : // - Duplicate the logic of singleLevelIterator and twoLevelIterator for the
69 : // v3 sstable and only use the aforementioned lazyValueDataBlockIter for a
70 : // v3 sstable. We would want to manage these copies via code generation.
71 : //
72 : // We have picked the first option here.
73 : type blockIter struct {
74 : cmp Compare
75 : split Split
76 :
77 : // Iterator transforms.
78 : //
79 : // SyntheticSuffix, if set, will replace the decoded ikey.UserKey suffix
80 : // before the key is returned to the user. A sequence of iter operations on a
81 : // block with a syntheticSuffix rule should return keys as if those operations
82 : // ran on a block with keys that all had the syntheticSuffix. As an example:
83 : // any sequence of block iter cmds should return the same keys for the
84 : // following two blocks:
85 : //
86 : // blockA: a@3,b@3,c@3
87 : // blockB: a@1,b@2,c@1 with syntheticSuffix=3
88 : //
89 : // To ensure this, suffix replacement will not change the ordering of keys in
90 : // the block because the iter assumes that no two keys in the block share the
91 : // same prefix. Furthermore, during SeekGE and SeekLT operations, the block
92 : // iterator handles "off by one" errors (explained in more detail in those
93 : // functions) when, for a given key, originalSuffix < searchSuffix <
94 : // replacementSuffix, with integer comparison. To handle these cases, the
95 : // iterator assumes:
96 : //
97 : // pebble.Compare(keyPrefix{replacementSuffix},keyPrefix{originalSuffix}) < 0
98 : // for keys with a suffix.
99 : //
100 : // NB: it is possible for a block iter to add a synthetic suffix on a key
101 : // without a suffix, which implies
102 : // pebble.Compare(keyPrefix{replacementSuffix},keyPrefix{noSuffix}) > 0 ,
103 : // however, the iterator would never need to handle an off by one error in
104 : // this case since originalSuffix (empty) > searchSuffix (non empty), with
105 : // integer comparison.
106 : //
107 : //
108 : // In addition, we also assume that any block with rangekeys will not contain
109 : // a synthetic suffix.
110 : transforms IterTransforms
111 :
112 : // offset is the byte index that marks where the current key/value is
113 : // encoded in the block.
114 : offset int32
115 : // nextOffset is the byte index where the next key/value is encoded in the
116 : // block.
117 : nextOffset int32
118 : // A "restart point" in a block is a point where the full key is encoded,
119 : // instead of just having a suffix of the key encoded. See readEntry() for
120 : // how prefix compression of keys works. Keys in between two restart points
121 : // only have a suffix encoded in the block. When restart interval is 1, no
122 : // prefix compression of keys happens. This is the case with range tombstone
123 : // blocks.
124 : //
125 : // All restart offsets are listed in increasing order in
126 : // i.ptr[i.restarts:len(block)-4], while numRestarts is encoded in the last
127 : // 4 bytes of the block as a uint32 (i.ptr[len(block)-4:]). i.restarts can
128 : // therefore be seen as the point where data in the block ends, and a list
129 : // of offsets of all restart points begins.
130 : restarts int32
131 : // Number of restart points in this block. Encoded at the end of the block
132 : // as a uint32.
133 : numRestarts int32
134 : ptr unsafe.Pointer
135 : data []byte
136 : // key contains the raw key the iterator is currently pointed at. This may
137 : // point directly to data stored in the block (for a key which has no prefix
138 : // compression), to fullKey (for a prefix compressed key), or to a slice of
139 : // data stored in cachedBuf (during reverse iteration).
140 : //
141 : // NB: In general, key contains the same logical content as ikey
142 : // (i.e. ikey = decode(key)), but if the iterator contains a synthetic suffix
143 : // replacement rule, this will not be the case. Therefore, key should never
144 : // be used after ikey is set.
145 : key []byte
146 : // fullKey is a buffer used for key prefix decompression. Note that if
147 : // transforms.SyntheticPrefix is not nil, fullKey always starts with that
148 : // prefix.
149 : fullKey []byte
150 : // val contains the value the iterator is currently pointed at. If non-nil,
151 : // this points to a slice of the block data.
152 : val []byte
153 : // ikv contains the decoded internal KV the iterator is currently positioned
154 : // at.
155 : //
156 : // ikv.InternalKey contains the decoded InternalKey the iterator is
157 : // currently pointed at. Note that the memory backing ikv.UserKey is either
158 : // data stored directly in the block, fullKey, or cachedBuf. The key
159 : // stability guarantee for blocks built with a restart interval of 1 is
160 : // achieved by having ikv.UserKey always point to data stored directly in
161 : // the block.
162 : //
163 : // ikv.LazyValue is val turned into a LazyValue, whenever a positioning
164 : // method returns a non-nil key-value pair.
165 : ikv base.InternalKV
166 : // cached and cachedBuf are used during reverse iteration. They are needed
167 : // because we can't perform prefix decoding in reverse, only in the forward
168 : // direction. In order to iterate in reverse, we decode and cache the entries
169 : // between two restart points.
170 : //
171 : // Note that cached[len(cached)-1] contains the previous entry to the one the
172 : // blockIter is currently pointed at. As usual, nextOffset will contain the
173 : // offset of the next entry. During reverse iteration, nextOffset will be
174 : // updated to point to offset, and we'll set the blockIter to point at the
175 : // entry cached[len(cached)-1]. See Prev() for more details.
176 : //
177 : // For a block encoded with a restart interval of 1, cached and cachedBuf
178 : // will not be used as there are no prefix compressed entries between the
179 : // restart points.
180 : cached []blockEntry
181 : cachedBuf []byte
182 : handle bufferHandle
183 : // firstUserKey is the first user key in the block; it supports block iteration over already-loaded blocks.
184 : firstUserKey []byte
185 : lazyValueHandling struct {
186 : vbr *valueBlockReader
187 : hasValuePrefix bool
188 : }
189 : synthSuffixBuf []byte
190 : firstUserKeyWithPrefixBuf []byte
191 : }
192 :
193 : type blockEntry struct {
194 : offset int32
195 : keyStart int32
196 : keyEnd int32
197 : valStart int32
198 : valSize int32
199 : }
200 :
201 : // blockIter implements the base.InternalIterator interface.
202 : var _ base.InternalIterator = (*blockIter)(nil)
203 :
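// A minimal usage sketch, assuming blockData is a serialized block produced
// elsewhere in this package and cmp/split come from the comparer the block
// was written with; process is a hypothetical caller-side function:
//
//	it, err := newBlockIter(cmp, split, blockData, IterTransforms{})
//	if err != nil {
//		return err
//	}
//	for kv := it.First(); kv != nil; kv = it.Next() {
//		// kv.K.UserKey is only stable across positioning calls if the block
//		// was written with a restart interval of 1 (see the type comment).
//		process(kv)
//	}
//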
204 : func newBlockIter(
205 : cmp Compare, split Split, block block, transforms IterTransforms,
206 2 : ) (*blockIter, error) {
207 2 : i := &blockIter{}
208 2 : return i, i.init(cmp, split, block, transforms)
209 2 : }
210 :
211 0 : func (i *blockIter) String() string {
212 0 : return "block"
213 0 : }
214 :
215 2 : func (i *blockIter) init(cmp Compare, split Split, block block, transforms IterTransforms) error {
216 2 : numRestarts := int32(binary.LittleEndian.Uint32(block[len(block)-4:]))
217 2 : if numRestarts == 0 {
218 0 : return base.CorruptionErrorf("pebble/table: invalid table (block has no restart points)")
219 0 : }
220 2 : i.transforms = transforms
221 2 : i.synthSuffixBuf = i.synthSuffixBuf[:0]
222 2 : i.split = split
223 2 : i.cmp = cmp
224 2 : i.restarts = int32(len(block)) - 4*(1+numRestarts)
225 2 : i.numRestarts = numRestarts
226 2 : i.ptr = unsafe.Pointer(&block[0])
227 2 : i.data = block
228 2 : if i.transforms.SyntheticPrefix.IsSet() {
229 2 : i.fullKey = append(i.fullKey[:0], i.transforms.SyntheticPrefix...)
230 2 : } else {
231 2 : i.fullKey = i.fullKey[:0]
232 2 : }
233 2 : i.val = nil
234 2 : i.clearCache()
235 2 : if i.restarts > 0 {
236 2 : if err := i.readFirstKey(); err != nil {
237 0 : return err
238 0 : }
239 2 : } else {
240 2 : // Block is empty.
241 2 : i.firstUserKey = nil
242 2 : }
243 2 : return nil
244 : }
245 :
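// The tail arithmetic in init follows the block layout: entries, then
// numRestarts 4-byte restart offsets, then a final uint32 holding
// numRestarts. A sketch of the same computation (block is a []byte):
//
//	numRestarts := int32(binary.LittleEndian.Uint32(block[len(block)-4:]))
//	restarts := int32(len(block)) - 4*(1+numRestarts) // start of the restart array
//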
246 : // NB: two cases of hideObsoletePoints:
247 : // - Local sstable iteration: syntheticSeqNum will be set iff the sstable was
248 : // ingested.
249 : // - Foreign sstable iteration: syntheticSeqNum is always set.
250 : func (i *blockIter) initHandle(
251 : cmp Compare, split Split, block bufferHandle, transforms IterTransforms,
252 2 : ) error {
253 2 : i.handle.Release()
254 2 : i.handle = block
255 2 : return i.init(cmp, split, block.Get(), transforms)
256 2 : }
257 :
258 2 : func (i *blockIter) invalidate() {
259 2 : i.clearCache()
260 2 : i.offset = 0
261 2 : i.nextOffset = 0
262 2 : i.restarts = 0
263 2 : i.numRestarts = 0
264 2 : i.data = nil
265 2 : }
266 :
267 : // isDataInvalidated returns true when the blockIter has been invalidated
268 : // using an invalidate call. NB: this is different from blockIter.Valid
269 : // which is part of the InternalIterator implementation.
270 2 : func (i *blockIter) isDataInvalidated() bool {
271 2 : return i.data == nil
272 2 : }
273 :
274 2 : func (i *blockIter) resetForReuse() blockIter {
275 2 : return blockIter{
276 2 : fullKey: i.fullKey[:0],
277 2 : cached: i.cached[:0],
278 2 : cachedBuf: i.cachedBuf[:0],
279 2 : firstUserKeyWithPrefixBuf: i.firstUserKeyWithPrefixBuf[:0],
280 2 : data: nil,
281 2 : }
282 2 : }
283 :
284 2 : func (i *blockIter) readEntry() {
285 2 : ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(i.offset))
286 2 :
287 2 : // This is an ugly performance hack. Reading entries from blocks is one of
288 2 : // the inner-most routines and decoding the 3 varints per-entry takes
289 2 : // significant time. Neither go1.11 nor go1.12 will inline decodeVarint for
290 2 : // us, so we do it manually. This provides a 10-15% performance improvement
291 2 : // on blockIter benchmarks on both go1.11 and go1.12.
292 2 : //
293 2 : // TODO(peter): remove this hack if go:inline is ever supported.
294 2 :
295 2 : var shared uint32
296 2 : if a := *((*uint8)(ptr)); a < 128 {
297 2 : shared = uint32(a)
298 2 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
299 2 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
300 0 : shared = uint32(b)<<7 | uint32(a)
301 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
302 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
303 0 : shared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
304 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
305 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
306 0 : shared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
307 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
308 0 : } else {
309 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
310 0 : shared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
311 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
312 0 : }
313 :
314 2 : var unshared uint32
315 2 : if a := *((*uint8)(ptr)); a < 128 {
316 2 : unshared = uint32(a)
317 2 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
318 2 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
319 0 : unshared = uint32(b)<<7 | uint32(a)
320 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
321 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
322 0 : unshared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
323 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
324 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
325 0 : unshared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
326 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
327 0 : } else {
328 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
329 0 : unshared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
330 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
331 0 : }
332 :
333 2 : var value uint32
334 2 : if a := *((*uint8)(ptr)); a < 128 {
335 2 : value = uint32(a)
336 2 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
337 2 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
338 2 : value = uint32(b)<<7 | uint32(a)
339 2 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
340 2 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
341 1 : value = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
342 1 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
343 1 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
344 1 : value = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
345 1 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
346 1 : } else {
347 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
348 0 : value = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
349 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
350 0 : }
351 2 : shared += uint32(len(i.transforms.SyntheticPrefix))
352 2 : unsharedKey := getBytes(ptr, int(unshared))
353 2 : // TODO(sumeer): move this into the else block below.
354 2 : i.fullKey = append(i.fullKey[:shared], unsharedKey...)
355 2 : if shared == 0 {
356 2 : // Provide stability for the key across positioning calls if the key
357 2 : // doesn't share a prefix with the previous key. This removes requiring the
358 2 : // key to be copied if the caller knows the block has a restart interval of
359 2 : // 1. An important example of this is range-del blocks.
360 2 : i.key = unsharedKey
361 2 : } else {
362 2 : i.key = i.fullKey
363 2 : }
364 2 : ptr = unsafe.Pointer(uintptr(ptr) + uintptr(unshared))
365 2 : i.val = getBytes(ptr, int(value))
366 2 : i.nextOffset = int32(uintptr(ptr)-uintptr(i.ptr)) + int32(value)
367 : }
368 :
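// For reference, the layout that readEntry decodes is three uvarints (shared
// key length, unshared key length, value length) followed by the unshared key
// bytes and the value bytes. A sketch of the corresponding encoding side,
// using encoding/binary (the buffer and lengths are illustrative):
//
//	buf := make([]byte, 0, 64)
//	var tmp [binary.MaxVarintLen32]byte
//	buf = append(buf, tmp[:binary.PutUvarint(tmp[:], uint64(shared))]...)
//	buf = append(buf, tmp[:binary.PutUvarint(tmp[:], uint64(len(unsharedKey)))]...)
//	buf = append(buf, tmp[:binary.PutUvarint(tmp[:], uint64(len(value)))]...)
//	buf = append(buf, unsharedKey...)
//	buf = append(buf, value...)
//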
369 2 : func (i *blockIter) readFirstKey() error {
370 2 : ptr := i.ptr
371 2 :
372 2 : // This is an ugly performance hack. Reading entries from blocks is one of
373 2 : // the inner-most routines and decoding the 3 varints per-entry takes
374 2 : // significant time. Neither go1.11 nor go1.12 will inline decodeVarint for
375 2 : // us, so we do it manually. This provides a 10-15% performance improvement
376 2 : // on blockIter benchmarks on both go1.11 and go1.12.
377 2 : //
378 2 : // TODO(peter): remove this hack if go:inline is ever supported.
379 2 :
380 2 : if shared := *((*uint8)(ptr)); shared == 0 {
381 2 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
382 2 : } else {
383 0 : // The shared length is != 0, which is invalid.
384 0 : panic("first key in block must have zero shared length")
385 : }
386 :
387 2 : var unshared uint32
388 2 : if a := *((*uint8)(ptr)); a < 128 {
389 2 : unshared = uint32(a)
390 2 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
391 2 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
392 0 : unshared = uint32(b)<<7 | uint32(a)
393 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
394 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
395 0 : unshared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
396 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
397 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
398 0 : unshared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
399 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
400 0 : } else {
401 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
402 0 : unshared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
403 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
404 0 : }
405 :
406 : // Skip the value length.
407 2 : if a := *((*uint8)(ptr)); a < 128 {
408 2 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
409 2 : } else if a := *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); a < 128 {
410 2 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
411 2 : } else if a := *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); a < 128 {
412 1 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
413 1 : } else if a := *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); a < 128 {
414 1 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
415 1 : } else {
416 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
417 0 : }
418 :
419 2 : firstKey := getBytes(ptr, int(unshared))
420 2 : // Manually inlining base.DecodeInternalKey provides a 5-10% speedup on
421 2 : // BlockIter benchmarks.
422 2 : if n := len(firstKey) - 8; n >= 0 {
423 2 : i.firstUserKey = firstKey[:n:n]
424 2 : } else {
425 0 : i.firstUserKey = nil
426 0 : return base.CorruptionErrorf("pebble/table: invalid firstKey in block")
427 0 : }
428 2 : if i.transforms.SyntheticPrefix != nil {
429 2 : i.firstUserKeyWithPrefixBuf = slices.Grow(i.firstUserKeyWithPrefixBuf[:0], len(i.transforms.SyntheticPrefix)+len(i.firstUserKey))
430 2 : i.firstUserKeyWithPrefixBuf = append(i.firstUserKeyWithPrefixBuf, i.transforms.SyntheticPrefix...)
431 2 : i.firstUserKeyWithPrefixBuf = append(i.firstUserKeyWithPrefixBuf, i.firstUserKey...)
432 2 : i.firstUserKey = i.firstUserKeyWithPrefixBuf
433 2 : }
434 2 : return nil
435 : }
436 :
437 : // The sstable internal obsolete bit is set when writing a block and unset by
438 : // blockIter, so no code outside block writing/reading code ever sees it.
439 : const trailerObsoleteBit = uint64(base.InternalKeyKindSSTableInternalObsoleteBit)
440 : const trailerObsoleteMask = (InternalKeySeqNumMax << 8) | uint64(base.InternalKeyKindSSTableInternalObsoleteMask)
441 :
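// A trailer packs the sequence number and kind byte as seqNum<<8 | kind, so
// trailerObsoleteMask keeps the full sequence number plus the kind bits while
// clearing only the obsolete bit. A sketch of how decodeInternalKey below
// applies these constants (key and hideObsoletePoints are illustrative):
//
//	trailer := binary.LittleEndian.Uint64(key[len(key)-8:])
//	hidden := hideObsoletePoints && trailer&trailerObsoleteBit != 0
//	visible := trailer & trailerObsoleteMask // what ends up in ikv.K.Trailer
//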
442 2 : func (i *blockIter) decodeInternalKey(key []byte) (hiddenPoint bool) {
443 2 : // Manually inlining base.DecodeInternalKey provides a 5-10% speedup on
444 2 : // BlockIter benchmarks.
445 2 : if n := len(key) - 8; n >= 0 {
446 2 : trailer := binary.LittleEndian.Uint64(key[n:])
447 2 : hiddenPoint = i.transforms.HideObsoletePoints &&
448 2 : (trailer&trailerObsoleteBit != 0)
449 2 : i.ikv.K.Trailer = trailer & trailerObsoleteMask
450 2 : i.ikv.K.UserKey = key[:n:n]
451 2 : if n := i.transforms.SyntheticSeqNum; n != 0 {
452 2 : i.ikv.K.SetSeqNum(uint64(n))
453 2 : }
454 2 : } else {
455 2 : i.ikv.K.Trailer = uint64(InternalKeyKindInvalid)
456 2 : i.ikv.K.UserKey = nil
457 2 : }
458 2 : return hiddenPoint
459 : }
460 :
461 : // maybeReplaceSuffix replaces the suffix in i.ikey.UserKey with
462 : // i.transforms.syntheticSuffix.
463 2 : func (i *blockIter) maybeReplaceSuffix() {
464 2 : if i.transforms.SyntheticSuffix.IsSet() && i.ikv.K.UserKey != nil {
465 2 : prefixLen := i.split(i.ikv.K.UserKey)
466 2 : // If ikey is cached or may get cached, we must copy
467 2 : // UserKey to a new buffer before suffix replacement.
468 2 : i.synthSuffixBuf = append(i.synthSuffixBuf[:0], i.ikv.K.UserKey[:prefixLen]...)
469 2 : i.synthSuffixBuf = append(i.synthSuffixBuf, i.transforms.SyntheticSuffix...)
470 2 : i.ikv.K.UserKey = i.synthSuffixBuf
471 2 : }
472 : }
473 :
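// A sketch of the suffix replacement above, assuming "@"-separated MVCC-style
// keys and a split function that returns the prefix length including the "@"
// (both are assumptions for illustration, not this package's comparer):
//
//	// userKey = "a@1", syntheticSuffix = "7", split(userKey) = 2
//	prefixLen := split(userKey)                    // prefix is "a@"
//	buf := append(buf[:0], userKey[:prefixLen]...) // copy; the block's bytes must not be modified
//	buf = append(buf, syntheticSuffix...)          // result "a@7" is the key surfaced to the caller
//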
474 2 : func (i *blockIter) clearCache() {
475 2 : i.cached = i.cached[:0]
476 2 : i.cachedBuf = i.cachedBuf[:0]
477 2 : }
478 :
479 2 : func (i *blockIter) cacheEntry() {
480 2 : var valStart int32
481 2 : valSize := int32(len(i.val))
482 2 : if valSize > 0 {
483 2 : valStart = int32(uintptr(unsafe.Pointer(&i.val[0])) - uintptr(i.ptr))
484 2 : }
485 :
486 2 : i.cached = append(i.cached, blockEntry{
487 2 : offset: i.offset,
488 2 : keyStart: int32(len(i.cachedBuf)),
489 2 : keyEnd: int32(len(i.cachedBuf) + len(i.key)),
490 2 : valStart: valStart,
491 2 : valSize: valSize,
492 2 : })
493 2 : i.cachedBuf = append(i.cachedBuf, i.key...)
494 : }
495 :
496 2 : func (i *blockIter) getFirstUserKey() []byte {
497 2 : return i.firstUserKey
498 2 : }
499 :
500 : // SeekGE implements internalIterator.SeekGE, as documented in the pebble
501 : // package.
502 2 : func (i *blockIter) SeekGE(key []byte, flags base.SeekGEFlags) *base.InternalKV {
503 2 : if invariants.Enabled && i.isDataInvalidated() {
504 0 : panic(errors.AssertionFailedf("invalidated blockIter used"))
505 : }
506 2 : searchKey := key
507 2 : if i.transforms.SyntheticPrefix != nil {
508 2 : // The seek key is before or after the entire block of keys that start with
509 2 : // SyntheticPrefix. To determine which, we need to compare against a valid
510 2 : // key in the block. We use firstUserKey which has the synthetic prefix.
511 2 : if !bytes.HasPrefix(key, i.transforms.SyntheticPrefix) {
512 1 : if i.cmp(i.firstUserKey, key) >= 0 {
513 1 : return i.First()
514 1 : }
515 : // Set the offset to the end of the block to mimic the offset of an
516 : // invalid iterator. This ensures a subsequent i.Prev() returns a valid
517 : // result.
518 1 : i.offset = i.restarts
519 1 : i.nextOffset = i.restarts
520 1 : return nil
521 : }
522 2 : searchKey = key[len(i.transforms.SyntheticPrefix):]
523 : }
524 :
525 2 : i.clearCache()
526 2 : // Find the index of the smallest restart point whose key is > the key
527 2 : // sought; index will be numRestarts if there is no such restart point.
528 2 : i.offset = 0
529 2 : var index int32
530 2 :
531 2 : {
532 2 : // NB: manually inlined sort.Search is ~5% faster.
533 2 : //
534 2 : // Define f(-1) == false and f(n) == true.
535 2 : // Invariant: f(index-1) == false, f(upper) == true.
536 2 : upper := i.numRestarts
537 2 : for index < upper {
538 2 : h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
539 2 : // index ≤ h < upper
540 2 : offset := decodeRestart(i.data[i.restarts+4*h:])
541 2 : // For a restart point, there are 0 bytes shared with the previous key.
542 2 : // The varint encoding of 0 occupies 1 byte.
543 2 : ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(offset+1))
544 2 :
545 2 : // Decode the key at that restart point, and compare it to the key
546 2 : // sought. See the comment in readEntry for why we manually inline the
547 2 : // varint decoding.
548 2 : var v1 uint32
549 2 : if a := *((*uint8)(ptr)); a < 128 {
550 2 : v1 = uint32(a)
551 2 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
552 2 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
553 0 : v1 = uint32(b)<<7 | uint32(a)
554 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
555 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
556 0 : v1 = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
557 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
558 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
559 0 : v1 = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
560 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
561 0 : } else {
562 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
563 0 : v1 = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
564 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
565 0 : }
566 :
567 2 : if *((*uint8)(ptr)) < 128 {
568 2 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
569 2 : } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))) < 128 {
570 2 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
571 2 : } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))) < 128 {
572 1 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
573 1 : } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))) < 128 {
574 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
575 0 : } else {
576 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
577 0 : }
578 :
579 : // Manually inlining part of base.DecodeInternalKey provides a 5-10%
580 : // speedup on BlockIter benchmarks.
581 2 : s := getBytes(ptr, int(v1))
582 2 : var k []byte
583 2 : if n := len(s) - 8; n >= 0 {
584 2 : k = s[:n:n]
585 2 : }
586 : // Else k is invalid, and left as nil
587 :
588 2 : if i.cmp(searchKey, k) > 0 {
589 2 : // The search key is greater than the user key at this restart point.
590 2 : // Search beyond this restart point, since we are trying to find the
591 2 : // first restart point with a user key >= the search key.
592 2 : index = h + 1 // preserves f(i-1) == false
593 2 : } else {
594 2 : // k >= search key, so prune everything after index (since index
595 2 : // satisfies the property we are looking for).
596 2 : upper = h // preserves f(j) == true
597 2 : }
598 : }
599 : // index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
600 : // => answer is index.
601 : }
602 :
603 : // index is the first restart point with key >= search key. Define the keys
604 : // between a restart point and the next restart point as belonging to that
605 : // restart point.
606 : //
607 : // Since keys are strictly increasing, if index > 0 then the restart point
608 : // at index-1 will be the first one that has some keys belonging to it that
609 : // could be equal to the search key. If index == 0, then all keys in this
610 : // block are larger than the key sought, and offset remains at zero.
611 2 : if index > 0 {
612 2 : i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
613 2 : }
614 2 : i.readEntry()
615 2 : hiddenPoint := i.decodeInternalKey(i.key)
616 2 :
617 2 : // Iterate from that restart point to somewhere >= the key sought.
618 2 : if !i.valid() {
619 1 : return nil
620 1 : }
621 :
622 : // A note on seeking in a block with a suffix replacement rule: even though
623 : // the binary search above was conducted on keys without suffix replacement,
624 : // Seek will still return the correct suffix replaced key. A binary
625 : // search without suffix replacement will land on a key that is _less_ than
626 : // the key the search would have landed on if all keys were already suffix
627 : // replaced. Since Seek then conducts forward iteration to the first suffix
628 : // replaced user key that is greater than or equal to the search key, the
629 : // correct key is still returned.
630 : //
631 : // As an example, consider the following block with a restart interval of 1,
632 : // with a replacement suffix of "4":
633 : // - Pre-suffix replacement: apple@1, banana@3
634 : // - Post-suffix replacement: apple@4, banana@4
635 : //
636 : // Suppose the client seeks with apple@3. Assuming suffixes sort in reverse
637 : // chronological order (i.e. apple@1>apple@3), the binary search without
638 : // suffix replacement would return apple@1. A binary search with suffix
639 : // replacement would return banana@4. After beginning forward iteration from
640 : // either returned restart point, forward iteration would
641 : // always return the correct key, banana@4.
642 : //
643 : // Further, if the user searched with apple@0 (i.e. a suffix less than the
644 : // pre replacement suffix) or with apple@5 (a suffix larger than the post
645 : // replacement suffix), the binary search with or without suffix replacement
646 : // would land on the same key, as we assume the following:
647 : // (1) no two keys in the sst share the same prefix.
648 : // (2) pebble.Compare(replacementSuffix,originalSuffix) > 0
649 :
650 2 : i.maybeReplaceSuffix()
651 2 :
652 2 : if !hiddenPoint && i.cmp(i.ikv.K.UserKey, key) >= 0 {
653 2 : // Initialize i.lazyValue
654 2 : if !i.lazyValueHandling.hasValuePrefix ||
655 2 : base.TrailerKind(i.ikv.K.Trailer) != InternalKeyKindSet {
656 2 : i.ikv.V = base.MakeInPlaceValue(i.val)
657 2 : } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
658 2 : i.ikv.V = base.MakeInPlaceValue(i.val[1:])
659 2 : } else {
660 2 : i.ikv.V = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
661 2 : }
662 2 : return &i.ikv
663 : }
664 2 : for i.Next(); i.valid(); i.Next() {
665 2 : if i.cmp(i.ikv.K.UserKey, key) >= 0 {
666 2 : // i.Next() has already initialized i.ikv.LazyValue.
667 2 : return &i.ikv
668 2 : }
669 : }
670 2 : return nil
671 : }
672 :
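// The manually inlined restart-point binary search in SeekGE (and in SeekLT
// below) is equivalent to the following sort.Search formulation, kept out of
// the hot path purely for speed. decodeRestartKey is a hypothetical helper
// that returns the user key stored at a restart offset:
//
//	index := sort.Search(int(i.numRestarts), func(j int) bool {
//		offset := decodeRestart(i.data[i.restarts+4*int32(j):])
//		return i.cmp(searchKey, decodeRestartKey(i.data, offset)) <= 0
//	})
//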
673 : // SeekPrefixGE implements internalIterator.SeekPrefixGE, as documented in the
674 : // pebble package.
675 0 : func (i *blockIter) SeekPrefixGE(prefix, key []byte, flags base.SeekGEFlags) *base.InternalKV {
676 0 : // This should never be called as prefix iteration is handled by sstable.Iterator.
677 0 : panic("pebble: SeekPrefixGE unimplemented")
678 : }
679 :
680 : // SeekLT implements internalIterator.SeekLT, as documented in the pebble
681 : // package.
682 2 : func (i *blockIter) SeekLT(key []byte, flags base.SeekLTFlags) *base.InternalKV {
683 2 : if invariants.Enabled && i.isDataInvalidated() {
684 0 : panic(errors.AssertionFailedf("invalidated blockIter used"))
685 : }
686 2 : searchKey := key
687 2 : if i.transforms.SyntheticPrefix != nil {
688 2 : // The seek key is before or after the entire block of keys that start with
689 2 : // SyntheticPrefix. To determine which, we need to compare against a valid
690 2 : // key in the block. We use firstUserKey which has the synthetic prefix.
691 2 : if !bytes.HasPrefix(key, i.transforms.SyntheticPrefix) {
692 1 : if i.cmp(i.firstUserKey, key) < 0 {
693 1 : return i.Last()
694 1 : }
695 : // Set the offset to the beginning of the block to mimic an exhausted
696 : // iterator that has conducted backward iteration. This ensures a
697 : // subsequent Next() call returns the first key in the block.
698 1 : i.offset = -1
699 1 : i.nextOffset = 0
700 1 : return nil
701 : }
702 2 : searchKey = key[len(i.transforms.SyntheticPrefix):]
703 : }
704 :
705 2 : i.clearCache()
706 2 : // Find the index of the smallest restart point whose key is >= the key
707 2 : // sought; index will be numRestarts if there is no such restart point.
708 2 : i.offset = 0
709 2 : var index int32
710 2 :
711 2 : {
712 2 : // NB: manually inlined sort.Search is ~5% faster.
713 2 : //
714 2 : // Define f(-1) == false and f(n) == true.
715 2 : // Invariant: f(index-1) == false, f(upper) == true.
716 2 : upper := i.numRestarts
717 2 : for index < upper {
718 2 : h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
719 2 : // index ≤ h < upper
720 2 : offset := decodeRestart(i.data[i.restarts+4*h:])
721 2 : // For a restart point, there are 0 bytes shared with the previous key.
722 2 : // The varint encoding of 0 occupies 1 byte.
723 2 : ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(offset+1))
724 2 :
725 2 : // Decode the key at that restart point, and compare it to the key
726 2 : // sought. See the comment in readEntry for why we manually inline the
727 2 : // varint decoding.
728 2 : var v1 uint32
729 2 : if a := *((*uint8)(ptr)); a < 128 {
730 2 : v1 = uint32(a)
731 2 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
732 2 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
733 0 : v1 = uint32(b)<<7 | uint32(a)
734 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
735 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
736 0 : v1 = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
737 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
738 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
739 0 : v1 = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
740 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
741 0 : } else {
742 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
743 0 : v1 = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
744 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
745 0 : }
746 :
747 2 : if *((*uint8)(ptr)) < 128 {
748 2 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
749 2 : } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))) < 128 {
750 1 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
751 2 : } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))) < 128 {
752 1 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
753 1 : } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))) < 128 {
754 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
755 0 : } else {
756 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
757 0 : }
758 :
759 : // Manually inlining part of base.DecodeInternalKey provides a 5-10%
760 : // speedup on BlockIter benchmarks.
761 2 : s := getBytes(ptr, int(v1))
762 2 : var k []byte
763 2 : if n := len(s) - 8; n >= 0 {
764 2 : k = s[:n:n]
765 2 : }
766 : // Else k is invalid, and left as nil
767 :
768 2 : if i.cmp(searchKey, k) > 0 {
769 2 : // The search key is greater than the user key at this restart point.
770 2 : // Search beyond this restart point, since we are trying to find the
771 2 : // first restart point with a user key >= the search key.
772 2 : index = h + 1 // preserves f(i-1) == false
773 2 : } else {
774 2 : // k >= search key, so prune everything after index (since index
775 2 : // satisfies the property we are looking for).
776 2 : upper = h // preserves f(j) == true
777 2 : }
778 : }
779 : // index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
780 : // => answer is index.
781 : }
782 :
783 2 : if index == 0 {
784 2 : if i.transforms.SyntheticSuffix.IsSet() {
785 2 : // The binary search was conducted on keys without suffix replacement,
786 2 : // implying the first key in the block may be less than the search key. To
787 2 : // double check, get the first key in the block with suffix replacement
788 2 : // and compare to the search key. Consider the following example: suppose
789 2 : // the user searches with a@3, the first key in the block is a@2 and the
790 2 : // block contains a suffix replacement rule of 4. Since a@3 sorts before
791 2 : // a@2, the binary search would return index==0. Without conducting the
792 2 : // suffix replacement, the SeekLT would incorrectly return nil. With
793 2 : // suffix replacement though, a@4 should be returned as a@4 sorts before
794 2 : // a@3.
795 2 : ikv := i.First()
796 2 : if i.cmp(ikv.K.UserKey, key) < 0 {
797 2 : return ikv
798 2 : }
799 : }
800 : // If index == 0 then all keys in this block are larger than the key
801 : // sought, so there is no match.
802 2 : i.offset = -1
803 2 : i.nextOffset = 0
804 2 : return nil
805 : }
806 :
807 : // INVARIANT: index > 0
808 :
809 : // Ignoring suffix replacement, index is the first restart point with key >=
810 : // search key. Define the keys between a restart point and the next restart
811 : // point as belonging to that restart point. Note that index could be equal to
812 : // i.numRestarts, i.e., we are past the last restart. Since keys are strictly
813 : // increasing, then the restart point at index-1 will be the first one that
814 : // has some keys belonging to it that are less than the search key.
815 : //
816 : // Next, we will search between the restart at index-1 and the restart point
817 : // at index, for the first key >= key, and then on finding it, return
818 : // i.Prev(). We need to know when we have hit the offset for index, since then
819 : // we can stop searching. targetOffset encodes that offset for index.
820 2 : targetOffset := i.restarts
821 2 : i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
822 2 : if index < i.numRestarts {
823 2 : targetOffset = decodeRestart(i.data[i.restarts+4*(index):])
824 2 :
825 2 : if i.transforms.SyntheticSuffix.IsSet() {
826 1 : // The binary search was conducted on keys without suffix replacement,
827 1 : // implying the returned restart point (index) may be less than the search
828 1 : // key, breaking the assumption described above.
829 1 : //
830 1 : // For example, consider this block with a replacement ts of 4 and a
831 1 : // restart interval of 1: pre replacement: a@3,b@2,c@3; post
832 1 : // replacement: a@4,b@4,c@4.
833 1 : //
834 1 : // Suppose the client calls SeekLT(b@3); SeekLT must return b@4.
835 1 : //
836 1 : // If the client calls SeekLT(b@3), the binary search would return b@2,
837 1 : // the lowest key geq to b@3, pre-suffix replacement. Then, SeekLT will
838 1 : // begin forward iteration from a@3, the previous restart point, to
839 1 : // b{suffix}. The iteration stops when it encounters a key geq to the
840 1 : // search key or if it reaches the upper bound. Without suffix
841 1 : // replacement, we can assume that the upper bound of this forward
842 1 : // iteration, b{suffix}, is greater than the search key, as implied by the
843 1 : // binary search.
844 1 : //
845 1 : // If we naively hold this assumption with suffix replacement, the
846 1 : // iteration would terminate at the upper bound, b@4, call i.Prev, and
847 1 : // incorrectly return a@4. To correct for this, if the original returned
848 1 : // index is less than the search key, shift our forward iteration to begin
849 1 : // at index instead of index -1. With suffix replacement the key at index
850 1 : // is guaranteed to be the highest restart point less than the search key
851 1 : // (i.e. the same property of index-1 for a block without suffix
852 1 : // replacement). This property holds because of the invariant that a block
853 1 : // with suffix replacement will not have two keys that share the same
854 1 : // prefix. To consider the above example, binary searching with b@3 landed
855 1 : // naively at a@3, but since b@4<b@3, we shift our forward iteration to
856 1 : // begin at b@4. We never need to shift by more than one restart point
857 1 : // (i.e. to c@4) because it's impossible for the search key to be greater
858 1 : // than the key at the next restart point in the block because that
859 1 : // key will always have a different prefix. Put another way, because no
860 1 : // key in the block shares the same prefix, naive binary search should
861 1 : // always land at most 1 restart point off the correct one.
862 1 :
863 1 : naiveOffset := i.offset
864 1 : // Shift up to the original binary search result and decode the key.
865 1 : i.offset = targetOffset
866 1 : i.readEntry()
867 1 : i.decodeInternalKey(i.key)
868 1 : i.maybeReplaceSuffix()
869 1 :
870 1 : // If the binary search point is actually less than the search key, post
871 1 : // replacement, bump the target offset.
872 1 : if i.cmp(i.ikv.K.UserKey, key) < 0 {
873 1 : i.offset = targetOffset
874 1 : if index+1 < i.numRestarts {
875 1 : // if index+1 is within the i.data bounds, use it to find the target
876 1 : // offset.
877 1 : targetOffset = decodeRestart(i.data[i.restarts+4*(index+1):])
878 1 : } else {
879 1 : targetOffset = i.restarts
880 1 : }
881 1 : } else {
882 1 : i.offset = naiveOffset
883 1 : }
884 : }
885 : }
886 :
887 : // Init nextOffset for the forward iteration below.
888 2 : i.nextOffset = i.offset
889 2 :
890 2 : for {
891 2 : i.offset = i.nextOffset
892 2 : i.readEntry()
893 2 : // When hidden keys are common, there is additional optimization possible
894 2 : // by not caching entries that are hidden (note that some calls to
895 2 : // cacheEntry don't decode the internal key before caching, but checking
896 2 : // whether a key is hidden does not require full decoding). However, we do
897 2 : // need to use the blockEntry.offset in the cache for the first entry at
898 2 : // the restart point to do the binary search when the cache is empty -- so
899 2 : // we would need to cache that first entry (though not the key) even if it
900 2 : // was hidden. Our current assumption is that if there are large numbers
901 2 : // of hidden keys we will be able to skip whole blocks (using block
902 2 : // property filters) so we don't bother optimizing.
903 2 : hiddenPoint := i.decodeInternalKey(i.key)
904 2 : i.maybeReplaceSuffix()
905 2 :
906 2 : // NB: we don't use the hiddenPoint return value of decodeInternalKey
907 2 : // since we want to stop as soon as we reach a key >= ikey.UserKey, so
908 2 : // that we can reverse.
909 2 : if i.cmp(i.ikv.K.UserKey, key) >= 0 {
910 2 : // The current key is greater than or equal to our search key. Back up to
911 2 : // the previous key which was less than our search key. Note that this for
912 2 : // loop will execute at least once with this if-block not being true, so
913 2 : // the key we are backing up to is the last one this loop cached.
914 2 : return i.Prev()
915 2 : }
916 :
917 2 : if i.nextOffset >= targetOffset {
918 2 : // We've reached the end of the current restart block. Return the
919 2 : // current key if not hidden, else call Prev().
920 2 : //
921 2 : // When the restart interval is 1, the first iteration of the for loop
922 2 : // will bring us here. In that case ikey is backed by the block so we
923 2 : // get the desired key stability guarantee for the lifetime of the
924 2 : // blockIter. That is, we never cache anything and therefore never
925 2 : // return a key backed by cachedBuf.
926 2 : if hiddenPoint {
927 2 : return i.Prev()
928 2 : }
929 2 : break
930 : }
931 2 : i.cacheEntry()
932 : }
933 :
934 2 : if !i.valid() {
935 2 : return nil
936 2 : }
937 2 : if !i.lazyValueHandling.hasValuePrefix ||
938 2 : base.TrailerKind(i.ikv.K.Trailer) != InternalKeyKindSet {
939 2 : i.ikv.V = base.MakeInPlaceValue(i.val)
940 2 : } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
941 2 : i.ikv.V = base.MakeInPlaceValue(i.val[1:])
942 2 : } else {
943 2 : i.ikv.V = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
944 2 : }
945 2 : return &i.ikv
946 : }
947 :
948 : // First implements internalIterator.First, as documented in the pebble
949 : // package.
950 2 : func (i *blockIter) First() *base.InternalKV {
951 2 : if invariants.Enabled && i.isDataInvalidated() {
952 0 : panic(errors.AssertionFailedf("invalidated blockIter used"))
953 : }
954 :
955 2 : i.offset = 0
956 2 : if !i.valid() {
957 2 : return nil
958 2 : }
959 2 : i.clearCache()
960 2 : i.readEntry()
961 2 : hiddenPoint := i.decodeInternalKey(i.key)
962 2 : if hiddenPoint {
963 2 : return i.Next()
964 2 : }
965 2 : i.maybeReplaceSuffix()
966 2 : if !i.lazyValueHandling.hasValuePrefix ||
967 2 : base.TrailerKind(i.ikv.K.Trailer) != InternalKeyKindSet {
968 2 : i.ikv.V = base.MakeInPlaceValue(i.val)
969 2 : } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
970 2 : i.ikv.V = base.MakeInPlaceValue(i.val[1:])
971 2 : } else {
972 2 : i.ikv.V = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
973 2 : }
974 2 : return &i.ikv
975 : }
976 :
977 2 : func decodeRestart(b []byte) int32 {
978 2 : _ = b[3] // bounds check hint to compiler; see golang.org/issue/14808
979 2 : return int32(uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 |
980 2 : uint32(b[3]&restartMaskLittleEndianHighByteWithoutSetHasSamePrefix)<<24)
981 2 : }
982 :
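// Each restart is a 4-byte little-endian offset stored in the block's tail,
// with the most significant bit of the high byte borrowed for the
// set-has-same-prefix flag used by nextPrefixV3; decodeRestart masks that bit
// off. A sketch of the same decoding using encoding/binary (h illustrative):
//
//	word := binary.LittleEndian.Uint32(i.data[i.restarts+4*h:])
//	offset := int32(word &^ (1 << 31)) // what decodeRestart returns
//	samePrefix := word&(1<<31) != 0    // consumed by nextPrefixV3
//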
983 : // Last implements internalIterator.Last, as documented in the pebble package.
984 2 : func (i *blockIter) Last() *base.InternalKV {
985 2 : if invariants.Enabled && i.isDataInvalidated() {
986 0 : panic(errors.AssertionFailedf("invalidated blockIter used"))
987 : }
988 :
989 : // Seek forward from the last restart point.
990 2 : i.offset = decodeRestart(i.data[i.restarts+4*(i.numRestarts-1):])
991 2 : if !i.valid() {
992 2 : return nil
993 2 : }
994 :
995 2 : i.readEntry()
996 2 : i.clearCache()
997 2 :
998 2 : for i.nextOffset < i.restarts {
999 2 : i.cacheEntry()
1000 2 : i.offset = i.nextOffset
1001 2 : i.readEntry()
1002 2 : }
1003 :
1004 2 : hiddenPoint := i.decodeInternalKey(i.key)
1005 2 : if hiddenPoint {
1006 2 : return i.Prev()
1007 2 : }
1008 2 : i.maybeReplaceSuffix()
1009 2 : if !i.lazyValueHandling.hasValuePrefix ||
1010 2 : base.TrailerKind(i.ikv.K.Trailer) != InternalKeyKindSet {
1011 2 : i.ikv.V = base.MakeInPlaceValue(i.val)
1012 2 : } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
1013 2 : i.ikv.V = base.MakeInPlaceValue(i.val[1:])
1014 2 : } else {
1015 2 : i.ikv.V = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
1016 2 : }
1017 2 : return &i.ikv
1018 : }
1019 :
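// Reverse iteration sketch: Last seeks to the final restart point and scans
// forward, caching the prefix-decompressed entries in cached/cachedBuf so
// that Prev (used above) can walk them backwards without re-decoding. process
// is a hypothetical caller-side function:
//
//	for kv := it.Last(); kv != nil; kv = it.Prev() {
//		process(kv)
//	}
//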
1020 : // Next implements internalIterator.Next, as documented in the pebble
1021 : // package.
1022 2 : func (i *blockIter) Next() *base.InternalKV {
1023 2 : if len(i.cachedBuf) > 0 {
1024 2 : // We're switching from reverse iteration to forward iteration. We need to
1025 2 : // populate i.fullKey with the current key we're positioned at so that
1026 2 : // readEntry() can use i.fullKey for key prefix decompression. Note that we
1027 2 : // don't know whether i.key is backed by i.cachedBuf or i.fullKey (if
1028 2 : // SeekLT was the previous call, i.key may be backed by i.fullKey), but
1029 2 : // copying into i.fullKey works for both cases.
1030 2 : //
1031 2 : // TODO(peter): Rather than clearing the cache, we could instead use the
1032 2 : // cache until it is exhausted. This would likely be faster than falling
1033 2 : // through to the normal forward iteration code below.
1034 2 : i.fullKey = append(i.fullKey[:0], i.key...)
1035 2 : i.clearCache()
1036 2 : }
1037 :
1038 : start:
1039 2 : i.offset = i.nextOffset
1040 2 : if !i.valid() {
1041 2 : return nil
1042 2 : }
1043 2 : i.readEntry()
1044 2 : // Manually inlined version of i.decodeInternalKey(i.key).
1045 2 : if n := len(i.key) - 8; n >= 0 {
1046 2 : trailer := binary.LittleEndian.Uint64(i.key[n:])
1047 2 : hiddenPoint := i.transforms.HideObsoletePoints &&
1048 2 : (trailer&trailerObsoleteBit != 0)
1049 2 : i.ikv.K.Trailer = trailer & trailerObsoleteMask
1050 2 : i.ikv.K.UserKey = i.key[:n:n]
1051 2 : if n := i.transforms.SyntheticSeqNum; n != 0 {
1052 2 : i.ikv.K.SetSeqNum(uint64(n))
1053 2 : }
1054 2 : if hiddenPoint {
1055 2 : goto start
1056 : }
1057 2 : if i.transforms.SyntheticSuffix.IsSet() {
1058 2 : // Inlined version of i.maybeReplaceSuffix()
1059 2 : prefixLen := i.split(i.ikv.K.UserKey)
1060 2 : i.synthSuffixBuf = append(i.synthSuffixBuf[:0], i.ikv.K.UserKey[:prefixLen]...)
1061 2 : i.synthSuffixBuf = append(i.synthSuffixBuf, i.transforms.SyntheticSuffix...)
1062 2 : i.ikv.K.UserKey = i.synthSuffixBuf
1063 2 : }
1064 0 : } else {
1065 0 : i.ikv.K.Trailer = uint64(InternalKeyKindInvalid)
1066 0 : i.ikv.K.UserKey = nil
1067 0 : }
1068 2 : if !i.lazyValueHandling.hasValuePrefix ||
1069 2 : base.TrailerKind(i.ikv.K.Trailer) != InternalKeyKindSet {
1070 2 : i.ikv.V = base.MakeInPlaceValue(i.val)
1071 2 : } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
1072 2 : i.ikv.V = base.MakeInPlaceValue(i.val[1:])
1073 2 : } else {
1074 2 : i.ikv.V = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
1075 2 : }
1076 2 : return &i.ikv
1077 : }
1078 :
1079 : // NextPrefix implements (base.InternalIterator).NextPrefix.
1080 2 : func (i *blockIter) NextPrefix(succKey []byte) *base.InternalKV {
1081 2 : if i.lazyValueHandling.hasValuePrefix {
1082 2 : return i.nextPrefixV3(succKey)
1083 2 : }
1084 2 : const nextsBeforeSeek = 3
1085 2 : kv := i.Next()
1086 2 : for j := 1; kv != nil && i.cmp(kv.K.UserKey, succKey) < 0; j++ {
1087 2 : if j >= nextsBeforeSeek {
1088 2 : return i.SeekGE(succKey, base.SeekGEFlagsNone)
1089 2 : }
1090 2 : kv = i.Next()
1091 : }
1092 2 : return kv
1093 : }
1094 :
1095 2 : func (i *blockIter) nextPrefixV3(succKey []byte) *base.InternalKV {
1096 2 : // Doing nexts that involve a key comparison can be expensive (and the cost
1097 2 : // depends on the key length), so we use the same threshold of 3 that we use
1098 2 : // for TableFormatPebblev2 in blockIter.nextPrefix above. The next fast path
1099 2 : // that looks at setHasSamePrefix takes ~5ns per key, which is ~150x faster
1100 2 : // than doing a SeekGE within the block, so we do this 16 times
1101 2 : // (~5ns*16=80ns), and then switch to looking at restarts. Doing the binary
1102 2 : // search for the restart consumes > 100ns. If the number of versions is >
1103 2 : // 17, we will increment nextFastCount to 17, then do a binary search, and
1104 2 : // on average need to find a key between two restarts, so another 8 steps
1105 2 : // corresponding to nextFastCount, for a mean total of 17 + 8 = 25 such
1106 2 : // steps.
1107 2 : //
1108 2 : // TODO(sumeer): use the configured restartInterval for the sstable when it
1109 2 : // was written (which we don't currently store) instead of the default value
1110 2 : // of 16.
1111 2 : const nextCmpThresholdBeforeSeek = 3
1112 2 : const nextFastThresholdBeforeRestarts = 16
1113 2 : nextCmpCount := 0
1114 2 : nextFastCount := 0
1115 2 : usedRestarts := false
1116 2 : // INVARIANT: blockIter is valid.
1117 2 : if invariants.Enabled && !i.valid() {
1118 0 : panic(errors.AssertionFailedf("nextPrefixV3 called on invalid blockIter"))
1119 : }
1120 2 : prevKeyIsSet := i.ikv.Kind() == InternalKeyKindSet
1121 2 : for {
1122 2 : i.offset = i.nextOffset
1123 2 : if !i.valid() {
1124 2 : return nil
1125 2 : }
1126 : // Need to decode the length integers, so we can compute nextOffset.
1127 2 : ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(i.offset))
1128 2 : // This is an ugly performance hack. Reading entries from blocks is one of
1129 2 : // the inner-most routines and decoding the 3 varints per-entry takes
1130 2 : // significant time. Neither go1.11 nor go1.12 will inline decodeVarint for
1131 2 : // us, so we do it manually. This provides a 10-15% performance improvement
1132 2 : // on blockIter benchmarks on both go1.11 and go1.12.
1133 2 : //
1134 2 : // TODO(peter): remove this hack if go:inline is ever supported.
1135 2 :
1136 2 : // Decode the shared key length integer.
1137 2 : var shared uint32
1138 2 : if a := *((*uint8)(ptr)); a < 128 {
1139 2 : shared = uint32(a)
1140 2 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
1141 2 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
1142 0 : shared = uint32(b)<<7 | uint32(a)
1143 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
1144 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
1145 0 : shared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1146 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
1147 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
1148 0 : shared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1149 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
1150 0 : } else {
1151 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
1152 0 : shared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1153 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
1154 0 : }
1155 : // Decode the unshared key length integer.
1156 2 : var unshared uint32
1157 2 : if a := *((*uint8)(ptr)); a < 128 {
1158 2 : unshared = uint32(a)
1159 2 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
1160 2 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
1161 0 : unshared = uint32(b)<<7 | uint32(a)
1162 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
1163 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
1164 0 : unshared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1165 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
1166 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
1167 0 : unshared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1168 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
1169 0 : } else {
1170 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
1171 0 : unshared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1172 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
1173 0 : }
1174 : // Decode the value length integer.
1175 2 : var value uint32
1176 2 : if a := *((*uint8)(ptr)); a < 128 {
1177 2 : value = uint32(a)
1178 2 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
1179 2 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
1180 0 : value = uint32(b)<<7 | uint32(a)
1181 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
1182 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
1183 0 : value = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1184 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
1185 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
1186 0 : value = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1187 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
1188 0 : } else {
1189 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
1190 0 : value = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1191 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
1192 0 : }
1193 2 : if i.transforms.SyntheticPrefix != nil {
1194 0 : shared += uint32(len(i.transforms.SyntheticPrefix))
1195 0 : }
1196 : // The starting position of the value.
1197 2 : valuePtr := unsafe.Pointer(uintptr(ptr) + uintptr(unshared))
1198 2 : i.nextOffset = int32(uintptr(valuePtr)-uintptr(i.ptr)) + int32(value)
1199 2 : if invariants.Enabled && unshared < 8 {
1200 0 : // This should not happen since only the key prefix is shared, so even
1201 0 : // if the prefix length is the same as the user key length, the unshared
1202 0 : // will include the trailer.
1203 0 : panic(errors.AssertionFailedf("unshared %d is too small", unshared))
1204 : }
1205 : // The trailer is written in little endian, so the key kind is the first
1206 : // byte in the trailer that is encoded in the slice [unshared-8:unshared].
1207 2 : keyKind := InternalKeyKind((*[manual.MaxArrayLen]byte)(ptr)[unshared-8])
1208 2 : keyKind = keyKind & base.InternalKeyKindSSTableInternalObsoleteMask
1209 2 : prefixChanged := false
1210 2 : if keyKind == InternalKeyKindSet {
1211 2 : if invariants.Enabled && value == 0 {
1212 0 : panic(errors.AssertionFailedf("value is of length 0, but we expect a valuePrefix"))
1213 : }
1214 2 : valPrefix := *((*valuePrefix)(valuePtr))
1215 2 : if setHasSamePrefix(valPrefix) {
1216 2 : // Fast-path. No need to assemble i.fullKey, or update i.key. We know
1217 2 : // that subsequent keys will not have a shared length that is greater
1218 2 : // than the prefix of the current key, which is also the prefix of
1219 2 : // i.key. Since we are continuing to iterate, we don't need to
1220 2 : // initialize i.ikey and i.lazyValue (these are initialized before
1221 2 : // returning).
1222 2 : nextFastCount++
1223 2 : if nextFastCount > nextFastThresholdBeforeRestarts {
1224 1 : if usedRestarts {
1225 0 : // Exhausted iteration budget. This will never happen unless
1226 0 : // someone is using a restart interval > 16. It is just to guard
1227 0 : // against long restart intervals causing too much iteration.
1228 0 : break
1229 : }
1230 : // Haven't used restarts yet, so find the first restart at or beyond
1231 : // the current offset.
1232 1 : targetOffset := i.offset
1233 1 : var index int32
1234 1 : {
1235 1 : // NB: manually inlined sort.Search is ~5% faster.
1236 1 : //
1237 1 : // f defined for a restart point is true iff the offset >=
1238 1 : // targetOffset.
1239 1 : // Define f(-1) == false and f(i.numRestarts) == true.
1240 1 : // Invariant: f(index-1) == false, f(upper) == true.
1241 1 : upper := i.numRestarts
1242 1 : for index < upper {
1243 1 : h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
1244 1 : // index ≤ h < upper
1245 1 : offset := decodeRestart(i.data[i.restarts+4*h:])
1246 1 : if offset < targetOffset {
1247 1 : index = h + 1 // preserves f(index-1) == false
1248 1 : } else {
1249 1 : upper = h // preserves f(upper) == true
1250 1 : }
1251 : }
1252 : // index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
1253 : // => answer is index.
1254 : }
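// The scope above is a hand-inlined form of the following, shown only as an
// illustrative equivalent:
//
//	index = int32(sort.Search(int(i.numRestarts), func(j int) bool {
//		return decodeRestart(i.data[i.restarts+4*int32(j):]) >= targetOffset
//	}))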
1255 1 : usedRestarts = true
1256 1 : nextFastCount = 0
1257 1 : if index == i.numRestarts {
1258 1 : // Already past the last real restart, so iterate a bit more until
1259 1 : // we are done with the block.
1260 1 : continue
1261 : }
1262 : // Have some real restarts after index. NB: index is the first
1263 : // restart at or beyond the current offset.
1264 1 : startingIndex := index
1265 1 : for index != i.numRestarts &&
1266 1 : // The restart at index is 4 bytes written in little endian format
1267 1 : // starting at i.restarts+4*index. The 0th byte is the least
1268 1 : // significant and the 3rd byte is the most significant. Since the
1269 1 : // most significant bit of the 3rd byte is what we use for
1270 1 : // encoding the set-has-same-prefix information, the indexing
1271 1 : // below has +3.
1272 1 : i.data[i.restarts+4*index+3]&restartMaskLittleEndianHighByteOnlySetHasSamePrefix != 0 {
1273 1 : // We still have the same prefix, so move to the next restart.
1274 1 : index++
1275 1 : }
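// A sketch of the same check on the whole 32-bit restart word, under the
// assumption that the set-has-same-prefix flag occupies the most significant
// bit of the little-endian encoded restart:
//
//	word := binary.LittleEndian.Uint32(i.data[i.restarts+4*index:])
//	sameSetPrefix := word&(1<<31) != 0
//	offset := word &^ (1 << 31) // the offset decodeRestart recovers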
1276 : // index is the first restart that did not have the same prefix.
1277 1 : if index != startingIndex {
1278 1 : // Managed to skip past at least one restart. Resume iteration
1279 1 : // from index-1. Since nextFastCount has been reset to 0, we
1280 1 : // should be able to iterate to the next prefix.
1281 1 : i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
1282 1 : i.readEntry()
1283 1 : }
1284 : // Else, unable to skip past any restart. Resume iteration. Since
1285 : // nextFastCount has been reset to 0, we should be able to iterate
1286 : // to the next prefix.
1287 1 : continue
1288 : }
1289 2 : continue
1290 2 : } else if prevKeyIsSet {
1291 2 : prefixChanged = true
1292 2 : }
1293 2 : } else {
1294 2 : prevKeyIsSet = false
1295 2 : }
1296 : // Slow-path cases:
1297 : // - (Likely) The prefix has changed.
1298 : // - (Unlikely) The prefix has not changed.
1299 : // We assemble the key etc. under the assumption that it is the likely
1300 : // case.
1301 2 : unsharedKey := getBytes(ptr, int(unshared))
1302 2 : // TODO(sumeer): move this into the else block below. This is a bit tricky
1303 2 : // since the current logic assumes we have always copied the latest key
1304 2 : // into fullKey, which is why when we get to the next key we can (a)
1305 2 : // access i.fullKey[:shared], (b) append only the unsharedKey to
1306 2 : // i.fullKey. For (a), we can access i.key[:shared] since that memory is
1307 2 : // valid (even if unshared). For (b), we will need to remember whether
1308 2 : // i.key refers to i.fullKey or not, and can append the unsharedKey only
1309 2 : // in the former case and for the latter case need to copy the shared part
1310 2 : // too. This same comment applies to the other place where we can do this
1311 2 : // optimization, in readEntry().
1312 2 : i.fullKey = append(i.fullKey[:shared], unsharedKey...)
1313 2 : i.val = getBytes(valuePtr, int(value))
1314 2 : if shared == 0 {
1315 2 : // Provide stability for the key across positioning calls if the key
1316 2 : // doesn't share a prefix with the previous key. This removes the need to
1317 2 : // copy the key if the caller knows the block has a restart interval of
1318 2 : // 1. An important example of this is range-del blocks.
1319 2 : i.key = unsharedKey
1320 2 : } else {
1321 2 : i.key = i.fullKey
1322 2 : }
1323 : // Manually inlined version of i.decodeInternalKey(i.key).
1324 2 : hiddenPoint := false
1325 2 : if n := len(i.key) - 8; n >= 0 {
1326 2 : trailer := binary.LittleEndian.Uint64(i.key[n:])
1327 2 : hiddenPoint = i.transforms.HideObsoletePoints &&
1328 2 : (trailer&trailerObsoleteBit != 0)
1329 2 : i.ikv.K = base.InternalKey{
1330 2 : Trailer: trailer & trailerObsoleteMask,
1331 2 : UserKey: i.key[:n:n],
1332 2 : }
1333 2 : if n := i.transforms.SyntheticSeqNum; n != 0 {
1334 1 : i.ikv.K.SetSeqNum(uint64(n))
1335 1 : }
1336 2 : if i.transforms.SyntheticSuffix.IsSet() {
1337 0 : // Inlined version of i.maybeReplaceSuffix()
1338 0 : prefixLen := i.split(i.ikv.K.UserKey)
1339 0 : i.synthSuffixBuf = append(i.synthSuffixBuf[:0], i.ikv.K.UserKey[:prefixLen]...)
1340 0 : i.synthSuffixBuf = append(i.synthSuffixBuf, i.transforms.SyntheticSuffix...)
1341 0 : i.ikv.K.UserKey = i.synthSuffixBuf
1342 0 : }
1343 0 : } else {
1344 0 : i.ikv.K.Trailer = uint64(InternalKeyKindInvalid)
1345 0 : i.ikv.K.UserKey = nil
1346 0 : }
1347 2 : nextCmpCount++
1348 2 : if invariants.Enabled && prefixChanged && i.cmp(i.ikv.K.UserKey, succKey) < 0 {
1349 0 : panic(errors.AssertionFailedf("prefix should have changed but %x < %x",
1350 0 : i.ikv.K.UserKey, succKey))
1351 : }
1352 2 : if prefixChanged || i.cmp(i.ikv.K.UserKey, succKey) >= 0 {
1353 2 : // Prefix has changed.
1354 2 : if hiddenPoint {
1355 1 : return i.Next()
1356 1 : }
1357 2 : if invariants.Enabled && !i.lazyValueHandling.hasValuePrefix {
1358 0 : panic(errors.AssertionFailedf("nextPrefixV3 being run for non-v3 sstable"))
1359 : }
1360 2 : if base.TrailerKind(i.ikv.K.Trailer) != InternalKeyKindSet {
1361 2 : i.ikv.V = base.MakeInPlaceValue(i.val)
1362 2 : } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
1363 2 : i.ikv.V = base.MakeInPlaceValue(i.val[1:])
1364 2 : } else {
1365 0 : i.ikv.V = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
1366 0 : }
1367 2 : return &i.ikv
1368 : }
1369 : // Else prefix has not changed.
1370 :
1371 2 : if nextCmpCount >= nextCmpThresholdBeforeSeek {
1372 1 : break
1373 : }
1374 : }
1375 1 : return i.SeekGE(succKey, base.SeekGEFlagsNone)
1376 : }
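// A rough usage sketch of the prefix-skipping fast path above, from the
// perspective of an enclosing iterator (the names below are illustrative
// assumptions, not part of this package's API):
//
//	kv := it.SeekGE(key, base.SeekGEFlagsNone)
//	// succKey must be the immediate successor of the current key's prefix,
//	// e.g. produced via the comparer's ImmediateSuccessor.
//	kv = it.NextPrefix(succKey)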
1377 :
1378 : // Prev implements internalIterator.Prev, as documented in the pebble
1379 : // package.
1380 2 : func (i *blockIter) Prev() *base.InternalKV {
1381 2 : start:
1382 2 : for n := len(i.cached) - 1; n >= 0; n-- {
1383 2 : i.nextOffset = i.offset
1384 2 : e := &i.cached[n]
1385 2 : i.offset = e.offset
1386 2 : i.val = getBytes(unsafe.Pointer(uintptr(i.ptr)+uintptr(e.valStart)), int(e.valSize))
1387 2 : // Manually inlined version of i.decodeInternalKey(i.key).
1388 2 : i.key = i.cachedBuf[e.keyStart:e.keyEnd]
1389 2 : if n := len(i.key) - 8; n >= 0 {
1390 2 : trailer := binary.LittleEndian.Uint64(i.key[n:])
1391 2 : hiddenPoint := i.transforms.HideObsoletePoints &&
1392 2 : (trailer&trailerObsoleteBit != 0)
1393 2 : if hiddenPoint {
1394 2 : continue
1395 : }
1396 2 : i.ikv.K = base.InternalKey{
1397 2 : Trailer: trailer & trailerObsoleteMask,
1398 2 : UserKey: i.key[:n:n],
1399 2 : }
1400 2 : if n := i.transforms.SyntheticSeqNum; n != 0 {
1401 2 : i.ikv.K.SetSeqNum(uint64(n))
1402 2 : }
1403 2 : if i.transforms.SyntheticSuffix.IsSet() {
1404 2 : // Inlined version of i.maybeReplaceSuffix()
1405 2 : prefixLen := i.split(i.ikv.K.UserKey)
1406 2 : // If the key is cached or may get cached, we must copy UserKey
1407 2 : // before suffix replacement rather than modifying it in place.
1408 2 : i.synthSuffixBuf = append(i.synthSuffixBuf[:0], i.ikv.K.UserKey[:prefixLen]...)
1409 2 : i.synthSuffixBuf = append(i.synthSuffixBuf, i.transforms.SyntheticSuffix...)
1410 2 : i.ikv.K.UserKey = i.synthSuffixBuf
1411 2 : }
1412 0 : } else {
1413 0 : i.ikv.K.Trailer = uint64(InternalKeyKindInvalid)
1414 0 : i.ikv.K.UserKey = nil
1415 0 : }
1416 2 : i.cached = i.cached[:n]
1417 2 : if !i.lazyValueHandling.hasValuePrefix ||
1418 2 : base.TrailerKind(i.ikv.K.Trailer) != InternalKeyKindSet {
1419 2 : i.ikv.V = base.MakeInPlaceValue(i.val)
1420 2 : } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
1421 2 : i.ikv.V = base.MakeInPlaceValue(i.val[1:])
1422 2 : } else {
1423 2 : i.ikv.V = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
1424 2 : }
1425 2 : return &i.ikv
1426 : }
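// Each cached element records where the entry's key and value bytes live; a
// rough sketch of the shape of these entries (the struct name and field types
// are assumptions for illustration; the field names appear in the loop above):
//
//	type blockEntry struct {
//		offset   int32 // offset of the entry within the block
//		keyStart int32 // start of the key within i.cachedBuf
//		keyEnd   int32 // end of the key within i.cachedBuf
//		valStart int32 // offset of the value within the block data
//		valSize  int32 // length of the value in bytes
//	}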
1427 :
1428 2 : i.clearCache()
1429 2 : if i.offset <= 0 {
1430 2 : i.offset = -1
1431 2 : i.nextOffset = 0
1432 2 : return nil
1433 2 : }
1434 :
1435 2 : targetOffset := i.offset
1436 2 : var index int32
1437 2 :
1438 2 : {
1439 2 : // NB: manually inlined sort.Search is ~5% faster.
1440 2 : //
1441 2 : // Define f(-1) == false and f(n) == true.
1442 2 : // Invariant: f(index-1) == false, f(upper) == true.
1443 2 : upper := i.numRestarts
1444 2 : for index < upper {
1445 2 : h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
1446 2 : // index ≤ h < upper
1447 2 : offset := decodeRestart(i.data[i.restarts+4*h:])
1448 2 : if offset < targetOffset {
1449 2 : // Looking for the first restart that has offset >= targetOffset, so
1450 2 : // ignore h and earlier.
1451 2 : index = h + 1 // preserves f(index-1) == false
1452 2 : } else {
1453 2 : upper = h // preserves f(upper) == true
1454 2 : }
1455 : }
1456 : // index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
1457 : // => answer is index.
1458 : }
1459 :
1460 : // index is the first restart with offset >= targetOffset. Note that
1461 : // targetOffset may not be at a restart point since one can call Prev()
1462 : // after Next() (so the cache was not populated) and targetOffset refers to
1463 : // the current entry. index-1 must have an offset < targetOffset (it can't
1464 : // be equal to targetOffset since the binary search would have selected that
1465 : // as the index).
1466 2 : i.offset = 0
1467 2 : if index > 0 {
1468 2 : i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
1469 2 : }
1470 : // TODO(sumeer): why is the else case not an error, given that targetOffset
1471 : // is a valid offset?
1472 :
1473 2 : i.readEntry()
1474 2 :
1475 2 : // We stop when i.nextOffset == targetOffset since targetOffset is the
1476 2 : // offset of the entry we are stepping back from. We don't need to cache
1477 2 : // the entry just before it, since it is the candidate to return.
1478 2 : for i.nextOffset < targetOffset {
1479 2 : i.cacheEntry()
1480 2 : i.offset = i.nextOffset
1481 2 : i.readEntry()
1482 2 : }
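// In outline, this slow path of Prev (reached once the cache of previously
// read entries is exhausted) is:
//
//	1. Binary search the restart array for the first restart whose offset is
//	   >= targetOffset, the offset of the entry we are stepping back from.
//	2. Jump to the restart before it (or to offset 0) and read forward,
//	   caching each entry while i.nextOffset < targetOffset.
//	3. The final entry read is the predecessor of targetOffset; it is decoded
//	   and returned below without being cached.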
1483 :
1484 2 : hiddenPoint := i.decodeInternalKey(i.key)
1485 2 : if hiddenPoint {
1486 2 : // Use the cache.
1487 2 : goto start
1488 : }
1489 2 : if i.transforms.SyntheticSuffix.IsSet() {
1490 2 : // Inlined version of i.maybeReplaceSuffix()
1491 2 : prefixLen := i.split(i.ikv.K.UserKey)
1492 2 : // If ikey is cached or may get cached, we must de-reference
1493 2 : // UserKey before suffix replacement.
1494 2 : i.synthSuffixBuf = append(i.synthSuffixBuf[:0], i.ikv.K.UserKey[:prefixLen]...)
1495 2 : i.synthSuffixBuf = append(i.synthSuffixBuf, i.transforms.SyntheticSuffix...)
1496 2 : i.ikv.K.UserKey = i.synthSuffixBuf
1497 2 : }
1498 2 : if !i.lazyValueHandling.hasValuePrefix ||
1499 2 : base.TrailerKind(i.ikv.K.Trailer) != InternalKeyKindSet {
1500 2 : i.ikv.V = base.MakeInPlaceValue(i.val)
1501 2 : } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
1502 2 : i.ikv.V = base.MakeInPlaceValue(i.val[1:])
1503 2 : } else {
1504 2 : i.ikv.V = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
1505 2 : }
1506 2 : return &i.ikv
1507 : }
1508 :
1509 : // Key returns the internal key at the current iterator position.
1510 2 : func (i *blockIter) Key() *InternalKey {
1511 2 : return &i.ikv.K
1512 2 : }
1513 :
1514 : // KV returns the internal KV at the current iterator position.
1515 2 : func (i *blockIter) KV() *base.InternalKV {
1516 2 : return &i.ikv
1517 2 : }
1518 :
1519 2 : func (i *blockIter) value() base.LazyValue {
1520 2 : return i.ikv.V
1521 2 : }
1522 :
1523 : // Error implements internalIterator.Error, as documented in the pebble
1524 : // package.
1525 2 : func (i *blockIter) Error() error {
1526 2 : return nil // infallible
1527 2 : }
1528 :
1529 : // Close implements internalIterator.Close, as documented in the pebble
1530 : // package.
1531 2 : func (i *blockIter) Close() error {
1532 2 : i.handle.Release()
1533 2 : i.handle = bufferHandle{}
1534 2 : i.val = nil
1535 2 : i.ikv = base.InternalKV{}
1536 2 : i.lazyValueHandling.vbr = nil
1537 2 : return nil
1538 2 : }
1539 :
1540 0 : func (i *blockIter) SetBounds(lower, upper []byte) {
1541 0 : // This should never be called as bounds are handled by sstable.Iterator.
1542 0 : panic("pebble: SetBounds unimplemented")
1543 : }
1544 :
1545 0 : func (i *blockIter) SetContext(_ context.Context) {}
1546 :
1547 2 : func (i *blockIter) valid() bool {
1548 2 : return i.offset >= 0 && i.offset < i.restarts
1549 2 : }