Line data Source code
1 : // Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package sstable
6 :
7 : import (
8 : "bytes"
9 : "context"
10 : "encoding/binary"
11 : "unsafe"
12 :
13 : "github.com/cockroachdb/errors"
14 : "github.com/cockroachdb/pebble/internal/base"
15 : "github.com/cockroachdb/pebble/internal/invariants"
16 : "github.com/cockroachdb/pebble/internal/manual"
17 : "golang.org/x/exp/slices"
18 : )
19 :
20 : // blockIter is an iterator over a single block of data.
21 : //
22 : // A blockIter provides an additional guarantee around key stability when a
23 : // block has a restart interval of 1 (i.e. when there is no prefix
24 : // compression). Key stability refers to whether the InternalKey.UserKey bytes
25 : // returned by a positioning call will remain stable after a subsequent
26 : // positioning call. The normal case is that a positioning call will invalidate
27 : // any previously returned InternalKey.UserKey. If a block has a restart
28 : // interval of 1 (no prefix compression), blockIter guarantees that
29 : // InternalKey.UserKey will point to the key as stored in the block itself
30 : // which will remain valid until the blockIter is closed. The key stability
31 : // guarantee is used by the range tombstone and range key code, which knows that
32 : // the respective blocks are always encoded with a restart interval of 1. This
33 : // per-block key stability guarantee is sufficient for range tombstones and
34 : // range keys as they are always encoded in a single block. Note: this
35 : // stability guarantee no longer holds for a block iter with synthetic suffix
36 : // replacement, but this doesn't matter, as the user will not open
37 : // an iterator with a synthetic suffix on a block with rangekeys (for now).
38 : //
39 : // A blockIter also provides a value stability guarantee for range deletions and
40 : // range keys since there is only a single range deletion and range key block
41 : // per sstable and the blockIter will not release the bytes for the block until
42 : // it is closed.
43 : //
44 : // Note on why blockIter knows about lazyValueHandling:
45 : //
46 : // blockIter's positioning functions (that return a LazyValue), are too
47 : // complex to inline even prior to lazyValueHandling. blockIter.Next and
48 : // blockIter.First were by far the cheapest and had costs 195 and 180
49 : // respectively, which exceed the budget of 80. We initially tried to keep
50 : // the lazyValueHandling logic out of blockIter by wrapping it with a
51 : // lazyValueDataBlockIter. singleLevelIter and twoLevelIter would use this
52 : // wrapped iter. The functions in lazyValueDataBlockIter were simple, in that
53 : // they called the corresponding blockIter func and then decided whether the
54 : // value was in fact in-place (so return immediately) or needed further
55 : // handling. But these also turned out too costly for mid-stack inlining since
56 : // simple calls like the following have a high cost that is barely under the
57 : // budget of 80
58 : //
59 : // k, v := i.data.SeekGE(key, flags) // cost 74
60 : // k, v := i.data.Next() // cost 72
61 : //
62 : // We have 2 options for minimizing performance regressions:
63 : // - Include the lazyValueHandling logic in the already non-inlineable
64 : // blockIter functions: Since most of the time is spent in data block iters,
65 : // it is acceptable to take the small hit of unnecessary branching (which
66 : // hopefully branch prediction will predict correctly) for other kinds of
67 : // blocks.
68 : // - Duplicate the logic of singleLevelIterator and twoLevelIterator for the
69 : // v3 sstable and only use the aforementioned lazyValueDataBlockIter for a
70 : // v3 sstable. We would want to manage these copies via code generation.
71 : //
72 : // We have picked the first option here.
73 : type blockIter struct {
74 : cmp Compare
75 : split Split
76 :
77 : // Iterator transforms.
78 : //
79 : // SyntheticSuffix, if set, will replace the decoded ikey.UserKey suffix
80 : // before the key is returned to the user. A sequence of iter operations on a
81 : // block with a syntheticSuffix rule should return keys as if those operations
82 : // ran on a block with keys that all had the syntheticSuffix. As an example:
83 : // any sequence of block iter cmds should return the same keys for the
84 : // following two blocks:
85 : //
86 : // blockA: a@3,b@3,c@3
87 : // blockB: a@1,b@2,c@1 with syntheticSuffix=3
88 : //
89 : // To ensure this, Suffix replacement will not change the ordering of keys in
90 : // the block because the iter assumes that no two keys in the block share the
91 : // same prefix. Furthermore, during SeekGE and SeekLT operations, the block
92 : // iterator handles "off by one" errors (explained in more detail in those
93 : // functions) when, for a given key, originalSuffix < searchSuffix <
94 : // replacementSuffix, with integer comparison. To handle these cases, the
95 : // iterator assumes:
96 : //
97 : // pebble.Compare(keyPrefix{replacementSuffix},keyPrefix{originalSuffix}) < 0
98 : // for keys with a suffix.
99 : //
100 : // NB: it is possible for a block iter to add a synthetic suffix on a key
101 : // without a suffix, which implies
102 : // pebble.Compare(keyPrefix{replacementSuffix},keyPrefix{noSuffix}) > 0 ,
103 : // however, the iterator would never need to handle an off by one error in
104 : // this case since originalSuffix (empty) > searchSuffix (non empty), with
105 : // integer comparison.
106 : //
107 : //
108 : // In addition, we also assume that any block with rangekeys will not contain
109 : // a synthetic suffix.
110 : transforms IterTransforms
111 :
112 : // offset is the byte index that marks where the current key/value is
113 : // encoded in the block.
114 : offset int32
115 : // nextOffset is the byte index where the next key/value is encoded in the
116 : // block.
117 : nextOffset int32
118 : // A "restart point" in a block is a point where the full key is encoded,
119 : // instead of just having a suffix of the key encoded. See readEntry() for
120 : // how prefix compression of keys works. Keys in between two restart points
121 : // only have a suffix encoded in the block. When restart interval is 1, no
122 : // prefix compression of keys happens. This is the case with range tombstone
123 : // blocks.
124 : //
125 : // All restart offsets are listed in increasing order in
126 : // i.ptr[i.restarts:len(block)-4], while numRestarts is encoded in the last
127 : // 4 bytes of the block as a uint32 (i.ptr[len(block)-4:]). i.restarts can
128 : // therefore be seen as the point where data in the block ends, and a list
129 : // of offsets of all restart points begins. (A short sketch of this trailer layout follows the struct definition.)
130 : restarts int32
131 : // Number of restart points in this block. Encoded at the end of the block
132 : // as a uint32.
133 : numRestarts int32
134 : ptr unsafe.Pointer
135 : data []byte
136 : // key contains the raw key the iterator is currently pointed at. This may
137 : // point directly to data stored in the block (for a key which has no prefix
138 : // compression), to fullKey (for a prefix compressed key), or to a slice of
139 : // data stored in cachedBuf (during reverse iteration).
140 : //
141 : // NB: In general, key contains the same logical content as ikey
142 : // (i.e. ikey = decode(key)), but if the iterator contains a synthetic suffix
143 : // replacement rule, this will not be the case. Therefore, key should never
144 : // be used after ikey is set.
145 : key []byte
146 : // fullKey is a buffer used for key prefix decompression. Note that if
147 : // transforms.SyntheticPrefix is not nil, fullKey always starts with that
148 : // prefix.
149 : fullKey []byte
150 : // val contains the value the iterator is currently pointed at. If non-nil,
151 : // this points to a slice of the block data.
152 : val []byte
153 : // ikv contains the decoded internal KV the iterator is currently positioned
154 : // at.
155 : //
156 : // ikv.InternalKey contains the decoded InternalKey the iterator is
157 : // currently pointed at. Note that the memory backing ikv.UserKey is either
158 : // data stored directly in the block, fullKey, or cachedBuf. The key
159 : // stability guarantee for blocks built with a restart interval of 1 is
160 : // achieved by having ikv.UserKey always point to data stored directly in
161 : // the block.
162 : //
163 : // ikv.LazyValue is val turned into a LazyValue, whenever a positioning
164 : // method returns a non-nil key-value pair.
165 : ikv base.InternalKV
166 : // cached and cachedBuf are used during reverse iteration. They are needed
167 : // because we can't perform prefix decoding in reverse, only in the forward
168 : // direction. In order to iterate in reverse, we decode and cache the entries
169 : // between two restart points.
170 : //
171 : // Note that cached[len(cached)-1] contains the previous entry to the one the
172 : // blockIter is currently pointed at. As usual, nextOffset will contain the
173 : // offset of the next entry. During reverse iteration, nextOffset will be
174 : // updated to point to offset, and we'll set the blockIter to point at the
175 : // entry cached[len(cached)-1]. See Prev() for more details.
176 : //
177 : // For a block encoded with a restart interval of 1, cached and cachedBuf
178 : // will not be used as there are no prefix compressed entries between the
179 : // restart points.
180 : cached []blockEntry
181 : cachedBuf []byte
182 : handle bufferHandle
183 : // firstUserKey is the first user key in the block; it is used for block iteration over already-loaded blocks.
184 : firstUserKey []byte
185 : lazyValueHandling struct {
186 : vbr *valueBlockReader
187 : hasValuePrefix bool
188 : }
189 : synthSuffixBuf []byte
190 : firstUserKeyWithPrefixBuf []byte
191 : }
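// Illustrative sketch (not used by the iterator): the block trailer layout
// described in the restarts and numRestarts field comments above. It mirrors
// the parsing performed in init below; the real decodeRestart additionally
// masks off a high bit that TableFormatPebblev3 repurposes to record
// set-has-same-prefix information for each restart point.
func blockTrailerSketch(block []byte) (restartsStart, numRestarts int32) {
	// The final 4 bytes of the block hold the number of restart points.
	numRestarts = int32(binary.LittleEndian.Uint32(block[len(block)-4:]))
	// The numRestarts 4-byte restart offsets immediately precede that count;
	// everything before restartsStart is prefix-compressed key/value data.
	restartsStart = int32(len(block)) - 4*(1+numRestarts)
	return restartsStart, numRestarts
}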
192 :
193 : type blockEntry struct {
194 : offset int32
195 : keyStart int32
196 : keyEnd int32
197 : valStart int32
198 : valSize int32
199 : }
200 :
201 : // blockIter implements the base.InternalIterator interface.
202 : var _ base.InternalIterator = (*blockIter)(nil)
203 :
204 : func newBlockIter(
205 : cmp Compare, split Split, block block, transforms IterTransforms,
206 1 : ) (*blockIter, error) {
207 1 : i := &blockIter{}
208 1 : return i, i.init(cmp, split, block, transforms)
209 1 : }
210 :
211 0 : func (i *blockIter) String() string {
212 0 : return "block"
213 0 : }
214 :
215 1 : func (i *blockIter) init(cmp Compare, split Split, block block, transforms IterTransforms) error {
216 1 : numRestarts := int32(binary.LittleEndian.Uint32(block[len(block)-4:]))
217 1 : if numRestarts == 0 {
218 0 : return base.CorruptionErrorf("pebble/table: invalid table (block has no restart points)")
219 0 : }
220 1 : i.transforms = transforms
221 1 : i.synthSuffixBuf = i.synthSuffixBuf[:0]
222 1 : i.split = split
223 1 : i.cmp = cmp
224 1 : i.restarts = int32(len(block)) - 4*(1+numRestarts)
225 1 : i.numRestarts = numRestarts
226 1 : i.ptr = unsafe.Pointer(&block[0])
227 1 : i.data = block
228 1 : if i.transforms.SyntheticPrefix.IsSet() {
229 1 : i.fullKey = append(i.fullKey[:0], i.transforms.SyntheticPrefix...)
230 1 : } else {
231 1 : i.fullKey = i.fullKey[:0]
232 1 : }
233 1 : i.val = nil
234 1 : i.clearCache()
235 1 : if i.restarts > 0 {
236 1 : if err := i.readFirstKey(); err != nil {
237 0 : return err
238 0 : }
239 1 : } else {
240 1 : // Block is empty.
241 1 : i.firstUserKey = nil
242 1 : }
243 1 : return nil
244 : }
245 :
246 : // NB: two cases of hideObsoletePoints:
247 : // - Local sstable iteration: syntheticSeqNum will be set iff the sstable was
248 : // ingested.
249 : // - Foreign sstable iteration: syntheticSeqNum is always set.
250 : func (i *blockIter) initHandle(
251 : cmp Compare, split Split, block bufferHandle, transforms IterTransforms,
252 1 : ) error {
253 1 : i.handle.Release()
254 1 : i.handle = block
255 1 : return i.init(cmp, split, block.Get(), transforms)
256 1 : }
257 :
258 1 : func (i *blockIter) invalidate() {
259 1 : i.clearCache()
260 1 : i.offset = 0
261 1 : i.nextOffset = 0
262 1 : i.restarts = 0
263 1 : i.numRestarts = 0
264 1 : i.data = nil
265 1 : }
266 :
267 : // isDataInvalidated returns true when the blockIter has been invalidated
268 : // using an invalidate call. NB: this is different from blockIter.Valid
269 : // which is part of the InternalIterator implementation.
270 1 : func (i *blockIter) isDataInvalidated() bool {
271 1 : return i.data == nil
272 1 : }
273 :
274 1 : func (i *blockIter) resetForReuse() blockIter {
275 1 : return blockIter{
276 1 : fullKey: i.fullKey[:0],
277 1 : cached: i.cached[:0],
278 1 : cachedBuf: i.cachedBuf[:0],
279 1 : firstUserKeyWithPrefixBuf: i.firstUserKeyWithPrefixBuf[:0],
280 1 : data: nil,
281 1 : }
282 1 : }
283 :
284 1 : func (i *blockIter) readEntry() {
285 1 : ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(i.offset))
286 1 :
287 1 : // This is an ugly performance hack. Reading entries from blocks is one of
288 1 : // the inner-most routines and decoding the 3 varints per-entry takes
289 1 : // significant time. Neither go1.11 nor go1.12 will inline decodeVarint for
290 1 : // us, so we do it manually. This provides a 10-15% performance improvement
291 1 : // on blockIter benchmarks on both go1.11 and go1.12.
292 1 : //
293 1 : // TODO(peter): remove this hack if go:inline is ever supported.
294 1 :
295 1 : var shared uint32
296 1 : if a := *((*uint8)(ptr)); a < 128 {
297 1 : shared = uint32(a)
298 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
299 1 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
300 0 : shared = uint32(b)<<7 | uint32(a)
301 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
302 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
303 0 : shared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
304 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
305 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
306 0 : shared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
307 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
308 0 : } else {
309 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
310 0 : shared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
311 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
312 0 : }
313 :
314 1 : var unshared uint32
315 1 : if a := *((*uint8)(ptr)); a < 128 {
316 1 : unshared = uint32(a)
317 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
318 1 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
319 0 : unshared = uint32(b)<<7 | uint32(a)
320 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
321 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
322 0 : unshared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
323 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
324 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
325 0 : unshared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
326 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
327 0 : } else {
328 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
329 0 : unshared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
330 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
331 0 : }
332 :
333 1 : var value uint32
334 1 : if a := *((*uint8)(ptr)); a < 128 {
335 1 : value = uint32(a)
336 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
337 1 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
338 1 : value = uint32(b)<<7 | uint32(a)
339 1 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
340 1 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
341 0 : value = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
342 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
343 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
344 0 : value = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
345 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
346 0 : } else {
347 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
348 0 : value = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
349 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
350 0 : }
351 1 : shared += uint32(len(i.transforms.SyntheticPrefix))
352 1 : unsharedKey := getBytes(ptr, int(unshared))
353 1 : // TODO(sumeer): move this into the else block below.
354 1 : i.fullKey = append(i.fullKey[:shared], unsharedKey...)
355 1 : if shared == 0 {
356 1 : // Provide stability for the key across positioning calls if the key
357 1 : // doesn't share a prefix with the previous key. This removes requiring the
358 1 : // key to be copied if the caller knows the block has a restart interval of
359 1 : // 1. An important example of this is range-del blocks.
360 1 : i.key = unsharedKey
361 1 : } else {
362 1 : i.key = i.fullKey
363 1 : }
364 1 : ptr = unsafe.Pointer(uintptr(ptr) + uintptr(unshared))
365 1 : i.val = getBytes(ptr, int(value))
366 1 : i.nextOffset = int32(uintptr(ptr)-uintptr(i.ptr)) + int32(value)
367 : }
368 :
369 1 : func (i *blockIter) readFirstKey() error {
370 1 : ptr := i.ptr
371 1 :
372 1 : // This is an ugly performance hack. Reading entries from blocks is one of
373 1 : // the inner-most routines and decoding the 3 varints per-entry takes
374 1 : // significant time. Neither go1.11 nor go1.12 will inline decodeVarint for
375 1 : // us, so we do it manually. This provides a 10-15% performance improvement
376 1 : // on blockIter benchmarks on both go1.11 and go1.12.
377 1 : //
378 1 : // TODO(peter): remove this hack if go:inline is ever supported.
379 1 :
380 1 : if shared := *((*uint8)(ptr)); shared == 0 {
381 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
382 1 : } else {
383 0 : // The shared length is != 0, which is invalid.
384 0 : panic("first key in block must have zero shared length")
385 : }
386 :
387 1 : var unshared uint32
388 1 : if a := *((*uint8)(ptr)); a < 128 {
389 1 : unshared = uint32(a)
390 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
391 1 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
392 0 : unshared = uint32(b)<<7 | uint32(a)
393 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
394 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
395 0 : unshared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
396 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
397 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
398 0 : unshared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
399 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
400 0 : } else {
401 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
402 0 : unshared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
403 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
404 0 : }
405 :
406 : // Skip the value length.
407 1 : if a := *((*uint8)(ptr)); a < 128 {
408 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
409 1 : } else if a := *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); a < 128 {
410 1 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
411 1 : } else if a := *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); a < 128 {
412 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
413 0 : } else if a := *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); a < 128 {
414 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
415 0 : } else {
416 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
417 0 : }
418 :
419 1 : firstKey := getBytes(ptr, int(unshared))
420 1 : // Manually inlining base.DecodeInternalKey provides a 5-10% speedup on
421 1 : // BlockIter benchmarks.
422 1 : if n := len(firstKey) - 8; n >= 0 {
423 1 : i.firstUserKey = firstKey[:n:n]
424 1 : } else {
425 0 : i.firstUserKey = nil
426 0 : return base.CorruptionErrorf("pebble/table: invalid firstKey in block")
427 0 : }
428 1 : if i.transforms.SyntheticPrefix != nil {
429 1 : i.firstUserKeyWithPrefixBuf = slices.Grow(i.firstUserKeyWithPrefixBuf[:0], len(i.transforms.SyntheticPrefix)+len(i.firstUserKey))
430 1 : i.firstUserKeyWithPrefixBuf = append(i.firstUserKeyWithPrefixBuf, i.transforms.SyntheticPrefix...)
431 1 : i.firstUserKeyWithPrefixBuf = append(i.firstUserKeyWithPrefixBuf, i.firstUserKey...)
432 1 : i.firstUserKey = i.firstUserKeyWithPrefixBuf
433 1 : }
434 1 : return nil
435 : }
436 :
437 : // The sstable internal obsolete bit is set when writing a block and unset by
438 : // blockIter, so no code outside block writing/reading code ever sees it.
439 : const trailerObsoleteBit = uint64(base.InternalKeyKindSSTableInternalObsoleteBit)
440 : const trailerObsoleteMask = (InternalKeySeqNumMax << 8) | uint64(base.InternalKeyKindSSTableInternalObsoleteMask)
441 :
442 1 : func (i *blockIter) decodeInternalKey(key []byte) (hiddenPoint bool) {
443 1 : // Manually inlining base.DecodeInternalKey provides a 5-10% speedup on
444 1 : // BlockIter benchmarks.
445 1 : if n := len(key) - 8; n >= 0 {
446 1 : trailer := binary.LittleEndian.Uint64(key[n:])
447 1 : hiddenPoint = i.transforms.HideObsoletePoints &&
448 1 : (trailer&trailerObsoleteBit != 0)
449 1 : i.ikv.K.Trailer = trailer & trailerObsoleteMask
450 1 : i.ikv.K.UserKey = key[:n:n]
451 1 : if n := i.transforms.SyntheticSeqNum; n != 0 {
452 1 : i.ikv.K.SetSeqNum(uint64(n))
453 1 : }
454 1 : } else {
455 1 : i.ikv.K.Trailer = uint64(InternalKeyKindInvalid)
456 1 : i.ikv.K.UserKey = nil
457 1 : }
458 1 : return hiddenPoint
459 : }
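// Illustrative sketch (not part of the iterator): decodeInternalKey above
// relies on the internal key encoding in which the user key is followed by an
// 8-byte little-endian trailer packing the sequence number and kind (compare
// trailerObsoleteMask above). This helper shows only that split, without the
// obsolete-bit, sequence-number, and suffix transforms the real method applies.
func splitInternalKeySketch(encoded []byte) (userKey []byte, trailer uint64, ok bool) {
	n := len(encoded) - 8
	if n < 0 {
		// Too short to contain a trailer; the real method marks the key invalid.
		return nil, 0, false
	}
	return encoded[:n:n], binary.LittleEndian.Uint64(encoded[n:]), true
}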
460 :
461 : // maybeReplaceSuffix replaces the suffix in i.ikv.K.UserKey with
462 : // i.transforms.SyntheticSuffix.
463 1 : func (i *blockIter) maybeReplaceSuffix() {
464 1 : if i.transforms.SyntheticSuffix.IsSet() && i.ikv.K.UserKey != nil {
465 1 : prefixLen := i.split(i.ikv.K.UserKey)
466 1 : // If ikey is cached or may get cached, we must copy
467 1 : // UserKey to a new buffer before suffix replacement.
468 1 : i.synthSuffixBuf = append(i.synthSuffixBuf[:0], i.ikv.K.UserKey[:prefixLen]...)
469 1 : i.synthSuffixBuf = append(i.synthSuffixBuf, i.transforms.SyntheticSuffix...)
470 1 : i.ikv.K.UserKey = i.synthSuffixBuf
471 1 : }
472 : }
473 :
474 1 : func (i *blockIter) clearCache() {
475 1 : i.cached = i.cached[:0]
476 1 : i.cachedBuf = i.cachedBuf[:0]
477 1 : }
478 :
479 1 : func (i *blockIter) cacheEntry() {
480 1 : var valStart int32
481 1 : valSize := int32(len(i.val))
482 1 : if valSize > 0 {
483 1 : valStart = int32(uintptr(unsafe.Pointer(&i.val[0])) - uintptr(i.ptr))
484 1 : }
485 :
486 1 : i.cached = append(i.cached, blockEntry{
487 1 : offset: i.offset,
488 1 : keyStart: int32(len(i.cachedBuf)),
489 1 : keyEnd: int32(len(i.cachedBuf) + len(i.key)),
490 1 : valStart: valStart,
491 1 : valSize: valSize,
492 1 : })
493 1 : i.cachedBuf = append(i.cachedBuf, i.key...)
494 : }
495 :
496 1 : func (i *blockIter) getFirstUserKey() []byte {
497 1 : return i.firstUserKey
498 1 : }
499 :
500 : // SeekGE implements internalIterator.SeekGE, as documented in the pebble
501 : // package.
502 1 : func (i *blockIter) SeekGE(key []byte, flags base.SeekGEFlags) *base.InternalKV {
503 1 : if invariants.Enabled && i.isDataInvalidated() {
504 0 : panic(errors.AssertionFailedf("invalidated blockIter used"))
505 : }
506 1 : searchKey := key
507 1 : if i.transforms.SyntheticPrefix != nil {
508 1 : // The seek key is before or after the entire block of keys that start with
509 1 : // SyntheticPrefix. To determine which, we need to compare against a valid
510 1 : // key in the block. We use firstUserKey which has the synthetic prefix.
511 1 : if !bytes.HasPrefix(key, i.transforms.SyntheticPrefix) {
512 0 : if i.cmp(i.firstUserKey, key) >= 0 {
513 0 : return i.First()
514 0 : }
515 : // Set the offset to the end of the block to mimic the offset of an
516 : // invalid iterator. This ensures a subsequent i.Prev() returns a valid
517 : // result.
518 0 : i.offset = i.restarts
519 0 : i.nextOffset = i.restarts
520 0 : return nil
521 : }
522 1 : searchKey = key[len(i.transforms.SyntheticPrefix):]
523 : }
524 :
525 1 : i.clearCache()
526 1 : // Find the index of the smallest restart point whose key is > the key
527 1 : // sought; index will be numRestarts if there is no such restart point.
528 1 : i.offset = 0
529 1 : var index int32
530 1 :
531 1 : {
532 1 : // NB: manually inlined sort.Search is ~5% faster.
533 1 : //
534 1 : // Define f(-1) == false and f(n) == true.
535 1 : // Invariant: f(index-1) == false, f(upper) == true.
536 1 : upper := i.numRestarts
537 1 : for index < upper {
538 1 : h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
539 1 : // index ≤ h < upper
540 1 : offset := decodeRestart(i.data[i.restarts+4*h:])
541 1 : // For a restart point, there are 0 bytes shared with the previous key.
542 1 : // The varint encoding of 0 occupies 1 byte.
543 1 : ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(offset+1))
544 1 :
545 1 : // Decode the key at that restart point, and compare it to the key
546 1 : // sought. See the comment in readEntry for why we manually inline the
547 1 : // varint decoding.
548 1 : var v1 uint32
549 1 : if a := *((*uint8)(ptr)); a < 128 {
550 1 : v1 = uint32(a)
551 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
552 1 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
553 0 : v1 = uint32(b)<<7 | uint32(a)
554 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
555 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
556 0 : v1 = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
557 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
558 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
559 0 : v1 = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
560 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
561 0 : } else {
562 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
563 0 : v1 = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
564 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
565 0 : }
566 :
567 1 : if *((*uint8)(ptr)) < 128 {
568 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
569 1 : } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))) < 128 {
570 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
571 0 : } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))) < 128 {
572 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
573 0 : } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))) < 128 {
574 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
575 0 : } else {
576 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
577 0 : }
578 :
579 : // Manually inlining part of base.DecodeInternalKey provides a 5-10%
580 : // speedup on BlockIter benchmarks.
581 1 : s := getBytes(ptr, int(v1))
582 1 : var k []byte
583 1 : if n := len(s) - 8; n >= 0 {
584 1 : k = s[:n:n]
585 1 : }
586 : // Else k is invalid, and left as nil
587 :
588 1 : if i.cmp(searchKey, k) > 0 {
589 1 : // The search key is greater than the user key at this restart point.
590 1 : // Search beyond this restart point, since we are trying to find the
591 1 : // first restart point with a user key >= the search key.
592 1 : index = h + 1 // preserves f(index-1) == false
593 1 : } else {
594 1 : // k >= search key, so prune everything after index (since index
595 1 : // satisfies the property we are looking for).
596 1 : upper = h // preserves f(j) == true
597 1 : }
598 : }
599 : // index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
600 : // => answer is index.
601 : }
602 :
603 : // index is the first restart point with key >= search key. Define the keys
604 : // between a restart point and the next restart point as belonging to that
605 : // restart point.
606 : //
607 : // Since keys are strictly increasing, if index > 0 then the restart point
608 : // at index-1 will be the first one that has some keys belonging to it that
609 : // could be equal to the search key. If index == 0, then all keys in this
610 : // block are larger than the key sought, and offset remains at zero.
611 1 : if index > 0 {
612 1 : i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
613 1 : }
614 1 : i.readEntry()
615 1 : hiddenPoint := i.decodeInternalKey(i.key)
616 1 :
617 1 : // Iterate from that restart point to somewhere >= the key sought.
618 1 : if !i.valid() {
619 0 : return nil
620 0 : }
621 :
622 : // A note on seeking in a block with a suffix replacement rule: even though
623 : // the binary search above was conducted on keys without suffix replacement,
624 : // Seek will still return the correct suffix replaced key. A binary
625 : // search without suffix replacement will land on a key that is _less_ than
626 : // the key the search would have landed on if all keys were already suffix
627 : // replaced. Since Seek then conducts forward iteration to the first suffix
628 : // replaced user key that is greater than or equal to the search key, the
629 : // correct key is still returned.
630 : //
631 : // As an example, consider the following block with a restart interval of 1,
632 : // with a replacement suffix of "4":
633 : // - Pre-suffix replacement: apple@1, banana@3
634 : // - Post-suffix replacement: apple@4, banana@4
635 : //
636 : // Suppose the client seeks with apple@3. Assuming suffixes sort in reverse
637 : // chronological order (i.e. apple@1>apple@3), the binary search without
638 : // suffix replacement would return apple@1. A binary search with suffix
639 : // replacement would return banana@4. After beginning forward iteration from
640 : // either returned restart point, forward iteration would
641 : // always return the correct key, banana@4.
642 : //
643 : // Further, if the user searched with apple@0 (i.e. a suffix less than the
644 : // pre replacement suffix) or with apple@5 (a suffix larger than the post
645 : // replacement suffix), the binary search with or without suffix replacement
646 : // would land on the same key, as we assume the following:
647 : // (1) no two keys in the sst share the same prefix.
648 : // (2) pebble.Compare(replacementSuffix,originalSuffix) > 0
649 :
650 1 : i.maybeReplaceSuffix()
651 1 :
652 1 : if !hiddenPoint && i.cmp(i.ikv.K.UserKey, key) >= 0 {
653 1 : // Initialize i.lazyValue
654 1 : if !i.lazyValueHandling.hasValuePrefix ||
655 1 : base.TrailerKind(i.ikv.K.Trailer) != InternalKeyKindSet {
656 1 : i.ikv.V = base.MakeInPlaceValue(i.val)
657 1 : } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
658 1 : i.ikv.V = base.MakeInPlaceValue(i.val[1:])
659 1 : } else {
660 1 : i.ikv.V = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
661 1 : }
662 1 : return &i.ikv
663 : }
664 1 : for i.Next(); i.valid(); i.Next() {
665 1 : if i.cmp(i.ikv.K.UserKey, key) >= 0 {
666 1 : // i.Next() has already initialized i.ikv.LazyValue.
667 1 : return &i.ikv
668 1 : }
669 : }
670 1 : return nil
671 : }
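// Illustrative sketch (not used by the iterator): the manually inlined binary
// search over restart points in SeekGE above (and in SeekLT below) is
// equivalent to this sort.Search formulation, which the hot path avoids since
// the closure-based version benchmarks ~5% slower. restartUserKey is a
// hypothetical helper standing in for the inline restart-point decoding, and
// the standard "sort" import would be required.
func seekRestartSketch(
	cmp Compare, searchKey []byte, numRestarts int32, restartUserKey func(h int32) []byte,
) int32 {
	return int32(sort.Search(int(numRestarts), func(h int) bool {
		// True from the first restart point whose user key is >= searchKey onward.
		return cmp(searchKey, restartUserKey(int32(h))) <= 0
	}))
}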
672 :
673 : // SeekPrefixGE implements internalIterator.SeekPrefixGE, as documented in the
674 : // pebble package.
675 0 : func (i *blockIter) SeekPrefixGE(prefix, key []byte, flags base.SeekGEFlags) *base.InternalKV {
676 0 : // This should never be called as prefix iteration is handled by sstable.Iterator.
677 0 : panic("pebble: SeekPrefixGE unimplemented")
678 : }
679 :
680 : // SeekLT implements internalIterator.SeekLT, as documented in the pebble
681 : // package.
682 1 : func (i *blockIter) SeekLT(key []byte, flags base.SeekLTFlags) *base.InternalKV {
683 1 : if invariants.Enabled && i.isDataInvalidated() {
684 0 : panic(errors.AssertionFailedf("invalidated blockIter used"))
685 : }
686 1 : searchKey := key
687 1 : if i.transforms.SyntheticPrefix != nil {
688 1 : // The seek key is before or after the entire block of keys that start with
689 1 : // SyntheticPrefix. To determine which, we need to compare against a valid
690 1 : // key in the block. We use firstUserKey which has the synthetic prefix.
691 1 : if !bytes.HasPrefix(key, i.transforms.SyntheticPrefix) {
692 0 : if i.cmp(i.firstUserKey, key) < 0 {
693 0 : return i.Last()
694 0 : }
695 : // Set the offset to the beginning of the block to mimic an exhausted
696 0 : // iterator that has conducted backward iteration. This ensures a
697 : // subsequent Next() call returns the first key in the block.
698 0 : i.offset = -1
699 0 : i.nextOffset = 0
700 0 : return nil
701 : }
702 1 : searchKey = key[len(i.transforms.SyntheticPrefix):]
703 : }
704 :
705 1 : i.clearCache()
706 1 : // Find the index of the smallest restart point whose key is >= the key
707 1 : // sought; index will be numRestarts if there is no such restart point.
708 1 : i.offset = 0
709 1 : var index int32
710 1 :
711 1 : {
712 1 : // NB: manually inlined sort.Search is ~5% faster.
713 1 : //
714 1 : // Define f(-1) == false and f(n) == true.
715 1 : // Invariant: f(index-1) == false, f(upper) == true.
716 1 : upper := i.numRestarts
717 1 : for index < upper {
718 1 : h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
719 1 : // index ≤ h < upper
720 1 : offset := decodeRestart(i.data[i.restarts+4*h:])
721 1 : // For a restart point, there are 0 bytes shared with the previous key.
722 1 : // The varint encoding of 0 occupies 1 byte.
723 1 : ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(offset+1))
724 1 :
725 1 : // Decode the key at that restart point, and compare it to the key
726 1 : // sought. See the comment in readEntry for why we manually inline the
727 1 : // varint decoding.
728 1 : var v1 uint32
729 1 : if a := *((*uint8)(ptr)); a < 128 {
730 1 : v1 = uint32(a)
731 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
732 1 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
733 0 : v1 = uint32(b)<<7 | uint32(a)
734 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
735 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
736 0 : v1 = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
737 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
738 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
739 0 : v1 = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
740 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
741 0 : } else {
742 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
743 0 : v1 = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
744 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
745 0 : }
746 :
747 1 : if *((*uint8)(ptr)) < 128 {
748 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
749 1 : } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))) < 128 {
750 1 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
751 1 : } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))) < 128 {
752 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
753 0 : } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))) < 128 {
754 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
755 0 : } else {
756 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
757 0 : }
758 :
759 : // Manually inlining part of base.DecodeInternalKey provides a 5-10%
760 : // speedup on BlockIter benchmarks.
761 1 : s := getBytes(ptr, int(v1))
762 1 : var k []byte
763 1 : if n := len(s) - 8; n >= 0 {
764 1 : k = s[:n:n]
765 1 : }
766 : // Else k is invalid, and left as nil
767 :
768 1 : if i.cmp(searchKey, k) > 0 {
769 1 : // The search key is greater than the user key at this restart point.
770 1 : // Search beyond this restart point, since we are trying to find the
771 1 : // first restart point with a user key >= the search key.
772 1 : index = h + 1 // preserves f(index-1) == false
773 1 : } else {
774 1 : // k >= search key, so prune everything after index (since index
775 1 : // satisfies the property we are looking for).
776 1 : upper = h // preserves f(j) == true
777 1 : }
778 : }
779 : // index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
780 : // => answer is index.
781 : }
782 :
783 1 : if index == 0 {
784 1 : if i.transforms.SyntheticSuffix.IsSet() {
785 1 : // The binary search was conducted on keys without suffix replacement,
786 1 : // implying the first key in the block may be less than the search key. To
787 1 : // double check, get the first key in the block with suffix replacement
788 1 : // and compare to the search key. Consider the following example: suppose
789 1 : // the user searches with a@3, the first key in the block is a@2 and the
790 1 : // block contains a suffix replacement rule of 4. Since a@3 sorts before
791 1 : // a@2, the binary search would return index==0. Without conducting the
792 1 : // suffix replacement, the SeekLT would incorrectly return nil. With
793 1 : // suffix replacement though, a@4 should be returned as a@4 sorts before
794 1 : // a@3.
795 1 : ikv := i.First()
796 1 : if i.cmp(ikv.K.UserKey, key) < 0 {
797 1 : return ikv
798 1 : }
799 : }
800 : // If index == 0 then all keys in this block are larger than the key
801 : // sought, so there is no match.
802 1 : i.offset = -1
803 1 : i.nextOffset = 0
804 1 : return nil
805 : }
806 :
807 : // INVARIANT: index > 0
808 :
809 : // Ignoring suffix replacement, index is the first restart point with key >=
810 : // search key. Define the keys between a restart point and the next restart
811 : // point as belonging to that restart point. Note that index could be equal to
812 : // i.numRestarts, i.e., we are past the last restart. Since keys are strictly
813 : // increasing, then the restart point at index-1 will be the first one that
814 : // has some keys belonging to it that are less than the search key.
815 : //
816 : // Next, we will search between the restart at index-1 and the restart point
817 : // at index, for the first key >= key, and then on finding it, return
818 : // i.Prev(). We need to know when we have hit the offset for index, since then
819 : // we can stop searching. targetOffset encodes that offset for index.
820 1 : targetOffset := i.restarts
821 1 : i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
822 1 : if index < i.numRestarts {
823 1 : targetOffset = decodeRestart(i.data[i.restarts+4*(index):])
824 1 :
825 1 : if i.transforms.SyntheticSuffix.IsSet() {
826 0 : // The binary search was conducted on keys without suffix replacement,
827 0 : // implying the returned restart point (index) may be less than the search
828 0 : // key, breaking the assumption described above.
829 0 : //
830 0 : // For example: consider this block with a replacement ts of 4, and
831 0 : // restart interval of 1: - pre replacement: a@3,b@2,c@3 - post
832 0 : // replacement: a@4,b@4,c@4
833 0 : //
834 0 : // Suppose the client calls SeekLT(b@3), SeekLT must return b@4.
835 0 : //
836 0 : // If the client calls SeekLT(b@3), the binary search would return b@2,
837 0 : // the lowest key geq to b@3, pre-suffix replacement. Then, SeekLT will
838 0 : // begin forward iteration from a@3, the previous restart point, to
839 0 : // b{suffix}. The iteration stops when it encounters a key geq to the
840 0 : // search key or if it reaches the upper bound. Without suffix
841 0 : // replacement, we can assume that the upper bound of this forward
842 0 : // iteration, b{suffix}, is greater than the search key, as implied by the
843 0 : // binary search.
844 0 : //
845 0 : // If we naively hold this assumption with suffix replacement, the
846 0 : // iteration would terminate at the upper bound, b@4, call i.Prev, and
847 0 : // incorrectly return a@4. To correct for this, if the original returned
848 0 : // index is less than the search key, shift our forward iteration to begin
849 0 : // at index instead of index -1. With suffix replacement the key at index
850 0 : // is guaranteed to be the highest restart point less than the search key
851 0 : // (i.e. the same property of index-1 for a block without suffix
852 0 : // replacement). This property holds because of the invariant that a block
853 0 : // with suffix replacement will not have two keys that share the same
854 0 : // prefix. To consider the above example, binary searching with b@3 landed
855 0 : // naively at a@3, but since b@4<b@3, we shift our forward iteration to
856 0 : // begin at b@4. We never need to shift by more than one restart point
857 0 : // (i.e. to c@4) because it's impossible for the search key to be greater
858 0 : // than the key at the next restart point in the block because that
859 0 : // key will always have a different prefix. Put another way, because no
860 0 : // key in the block shares the same prefix, naive binary search should
861 0 : // always land at most 1 restart point off the correct one.
862 0 :
863 0 : naiveOffset := i.offset
864 0 : // Shift up to the original binary search result and decode the key.
865 0 : i.offset = targetOffset
866 0 : i.readEntry()
867 0 : i.decodeInternalKey(i.key)
868 0 : i.maybeReplaceSuffix()
869 0 :
870 0 : // If the binary search point is actually less than the search key, post
871 0 : // replacement, bump the target offset.
872 0 : if i.cmp(i.ikv.K.UserKey, key) < 0 {
873 0 : i.offset = targetOffset
874 0 : if index+1 < i.numRestarts {
875 0 : // if index+1 is within the i.data bounds, use it to find the target
876 0 : // offset.
877 0 : targetOffset = decodeRestart(i.data[i.restarts+4*(index+1):])
878 0 : } else {
879 0 : targetOffset = i.restarts
880 0 : }
881 0 : } else {
882 0 : i.offset = naiveOffset
883 0 : }
884 : }
885 : }
886 :
887 : // Init nextOffset for the forward iteration below.
888 1 : i.nextOffset = i.offset
889 1 :
890 1 : for {
891 1 : i.offset = i.nextOffset
892 1 : i.readEntry()
893 1 : // When hidden keys are common, there is additional optimization possible
894 1 : // by not caching entries that are hidden (note that some calls to
895 1 : // cacheEntry don't decode the internal key before caching, but checking
896 1 : // whether a key is hidden does not require full decoding). However, we do
897 1 : // need to use the blockEntry.offset in the cache for the first entry at
898 1 : // the reset point to do the binary search when the cache is empty -- so
899 1 : // we would need to cache that first entry (though not the key) even if
900 1 : // was hidden. Our current assumption is that if there are large numbers
901 1 : // of hidden keys we will be able to skip whole blocks (using block
902 1 : // property filters) so we don't bother optimizing.
903 1 : hiddenPoint := i.decodeInternalKey(i.key)
904 1 : i.maybeReplaceSuffix()
905 1 :
906 1 : // NB: we don't use the hiddenPoint return value of decodeInternalKey
907 1 : // since we want to stop as soon as we reach a key >= the search key, so
908 1 : // that we can reverse.
909 1 : if i.cmp(i.ikv.K.UserKey, key) >= 0 {
910 1 : // The current key is greater than or equal to our search key. Back up to
911 1 : // the previous key which was less than our search key. Note that this for
912 1 : // loop will execute at least once with this if-block not being true, so
913 1 : // the key we are backing up to is the last one this loop cached.
914 1 : return i.Prev()
915 1 : }
916 :
917 1 : if i.nextOffset >= targetOffset {
918 1 : // We've reached the end of the current restart block. Return the
919 1 : // current key if not hidden, else call Prev().
920 1 : //
921 1 : // When the restart interval is 1, the first iteration of the for loop
922 1 : // will bring us here. In that case ikey is backed by the block so we
923 1 : // get the desired key stability guarantee for the lifetime of the
924 1 : // blockIter. That is, we never cache anything and therefore never
925 1 : // return a key backed by cachedBuf.
926 1 : if hiddenPoint {
927 1 : return i.Prev()
928 1 : }
929 1 : break
930 : }
931 1 : i.cacheEntry()
932 : }
933 :
934 1 : if !i.valid() {
935 1 : return nil
936 1 : }
937 1 : if !i.lazyValueHandling.hasValuePrefix ||
938 1 : base.TrailerKind(i.ikv.K.Trailer) != InternalKeyKindSet {
939 1 : i.ikv.V = base.MakeInPlaceValue(i.val)
940 1 : } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
941 1 : i.ikv.V = base.MakeInPlaceValue(i.val[1:])
942 1 : } else {
943 1 : i.ikv.V = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
944 1 : }
945 1 : return &i.ikv
946 : }
947 :
948 : // First implements internalIterator.First, as documented in the pebble
949 : // package.
950 1 : func (i *blockIter) First() *base.InternalKV {
951 1 : if invariants.Enabled && i.isDataInvalidated() {
952 0 : panic(errors.AssertionFailedf("invalidated blockIter used"))
953 : }
954 :
955 1 : i.offset = 0
956 1 : if !i.valid() {
957 1 : return nil
958 1 : }
959 1 : i.clearCache()
960 1 : i.readEntry()
961 1 : hiddenPoint := i.decodeInternalKey(i.key)
962 1 : if hiddenPoint {
963 1 : return i.Next()
964 1 : }
965 1 : i.maybeReplaceSuffix()
966 1 : if !i.lazyValueHandling.hasValuePrefix ||
967 1 : base.TrailerKind(i.ikv.K.Trailer) != InternalKeyKindSet {
968 1 : i.ikv.V = base.MakeInPlaceValue(i.val)
969 1 : } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
970 1 : i.ikv.V = base.MakeInPlaceValue(i.val[1:])
971 1 : } else {
972 1 : i.ikv.V = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
973 1 : }
974 1 : return &i.ikv
975 : }
976 :
977 1 : func decodeRestart(b []byte) int32 {
978 1 : _ = b[3] // bounds check hint to compiler; see golang.org/issue/14808
979 1 : return int32(uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 |
980 1 : uint32(b[3]&restartMaskLittleEndianHighByteWithoutSetHasSamePrefix)<<24)
981 1 : }
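// Illustrative sketch (not used by the iterator): as the mask name in
// decodeRestart above suggests, each 4-byte restart entry packs the entry
// offset in its low bits and, in the most significant bit of the high byte,
// the set-has-same-prefix flag that nextPrefixV3 below consults. This helper
// decodes both; decodeRestart returns only the offset.
func decodeRestartSketch(b []byte) (offset int32, setHasSamePrefix bool) {
	offset = int32(uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 |
		uint32(b[3]&restartMaskLittleEndianHighByteWithoutSetHasSamePrefix)<<24)
	setHasSamePrefix = b[3]&restartMaskLittleEndianHighByteOnlySetHasSamePrefix != 0
	return offset, setHasSamePrefix
}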
982 :
983 : // Last implements internalIterator.Last, as documented in the pebble package.
984 1 : func (i *blockIter) Last() *base.InternalKV {
985 1 : if invariants.Enabled && i.isDataInvalidated() {
986 0 : panic(errors.AssertionFailedf("invalidated blockIter used"))
987 : }
988 :
989 : // Seek forward from the last restart point.
990 1 : i.offset = decodeRestart(i.data[i.restarts+4*(i.numRestarts-1):])
991 1 : if !i.valid() {
992 1 : return nil
993 1 : }
994 :
995 1 : i.readEntry()
996 1 : i.clearCache()
997 1 :
998 1 : for i.nextOffset < i.restarts {
999 1 : i.cacheEntry()
1000 1 : i.offset = i.nextOffset
1001 1 : i.readEntry()
1002 1 : }
1003 :
1004 1 : hiddenPoint := i.decodeInternalKey(i.key)
1005 1 : if hiddenPoint {
1006 1 : return i.Prev()
1007 1 : }
1008 1 : i.maybeReplaceSuffix()
1009 1 : if !i.lazyValueHandling.hasValuePrefix ||
1010 1 : base.TrailerKind(i.ikv.K.Trailer) != InternalKeyKindSet {
1011 1 : i.ikv.V = base.MakeInPlaceValue(i.val)
1012 1 : } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
1013 1 : i.ikv.V = base.MakeInPlaceValue(i.val[1:])
1014 1 : } else {
1015 1 : i.ikv.V = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
1016 1 : }
1017 1 : return &i.ikv
1018 : }
1019 :
1020 : // Next implements internalIterator.Next, as documented in the pebble
1021 : // package.
1022 1 : func (i *blockIter) Next() *base.InternalKV {
1023 1 : if len(i.cachedBuf) > 0 {
1024 1 : // We're switching from reverse iteration to forward iteration. We need to
1025 1 : // populate i.fullKey with the current key we're positioned at so that
1026 1 : // readEntry() can use i.fullKey for key prefix decompression. Note that we
1027 1 : // don't know whether i.key is backed by i.cachedBuf or i.fullKey (if
1028 1 : // SeekLT was the previous call, i.key may be backed by i.fullKey), but
1029 1 : // copying into i.fullKey works for both cases.
1030 1 : //
1031 1 : // TODO(peter): Rather than clearing the cache, we could instead use the
1032 1 : // cache until it is exhausted. This would likely be faster than falling
1033 1 : // through to the normal forward iteration code below.
1034 1 : i.fullKey = append(i.fullKey[:0], i.key...)
1035 1 : i.clearCache()
1036 1 : }
1037 :
1038 : start:
1039 1 : i.offset = i.nextOffset
1040 1 : if !i.valid() {
1041 1 : return nil
1042 1 : }
1043 1 : i.readEntry()
1044 1 : // Manually inlined version of i.decodeInternalKey(i.key).
1045 1 : if n := len(i.key) - 8; n >= 0 {
1046 1 : trailer := binary.LittleEndian.Uint64(i.key[n:])
1047 1 : hiddenPoint := i.transforms.HideObsoletePoints &&
1048 1 : (trailer&trailerObsoleteBit != 0)
1049 1 : i.ikv.K.Trailer = trailer & trailerObsoleteMask
1050 1 : i.ikv.K.UserKey = i.key[:n:n]
1051 1 : if n := i.transforms.SyntheticSeqNum; n != 0 {
1052 1 : i.ikv.K.SetSeqNum(uint64(n))
1053 1 : }
1054 1 : if hiddenPoint {
1055 1 : goto start
1056 : }
1057 1 : if i.transforms.SyntheticSuffix.IsSet() {
1058 1 : // Inlined version of i.maybeReplaceSuffix()
1059 1 : prefixLen := i.split(i.ikv.K.UserKey)
1060 1 : i.synthSuffixBuf = append(i.synthSuffixBuf[:0], i.ikv.K.UserKey[:prefixLen]...)
1061 1 : i.synthSuffixBuf = append(i.synthSuffixBuf, i.transforms.SyntheticSuffix...)
1062 1 : i.ikv.K.UserKey = i.synthSuffixBuf
1063 1 : }
1064 0 : } else {
1065 0 : i.ikv.K.Trailer = uint64(InternalKeyKindInvalid)
1066 0 : i.ikv.K.UserKey = nil
1067 0 : }
1068 1 : if !i.lazyValueHandling.hasValuePrefix ||
1069 1 : base.TrailerKind(i.ikv.K.Trailer) != InternalKeyKindSet {
1070 1 : i.ikv.V = base.MakeInPlaceValue(i.val)
1071 1 : } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
1072 1 : i.ikv.V = base.MakeInPlaceValue(i.val[1:])
1073 1 : } else {
1074 1 : i.ikv.V = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
1075 1 : }
1076 1 : return &i.ikv
1077 : }
1078 :
1079 : // NextPrefix implements (base.InternalIterator).NextPrefix.
1080 1 : func (i *blockIter) NextPrefix(succKey []byte) *base.InternalKV {
1081 1 : if i.lazyValueHandling.hasValuePrefix {
1082 1 : return i.nextPrefixV3(succKey)
1083 1 : }
1084 1 : const nextsBeforeSeek = 3
1085 1 : kv := i.Next()
1086 1 : for j := 1; kv != nil && i.cmp(kv.K.UserKey, succKey) < 0; j++ {
1087 1 : if j >= nextsBeforeSeek {
1088 1 : return i.SeekGE(succKey, base.SeekGEFlagsNone)
1089 1 : }
1090 1 : kv = i.Next()
1091 : }
1092 1 : return kv
1093 : }
1094 :
1095 1 : func (i *blockIter) nextPrefixV3(succKey []byte) *base.InternalKV {
1096 1 : // Doing nexts that involve a key comparison can be expensive (and the cost
1097 1 : // depends on the key length), so we use the same threshold of 3 that we use
1098 1 : // for TableFormatPebblev2 in blockIter.nextPrefix above. The next fast path
1099 1 : // that looks at setHasSamePrefix takes ~5ns per key, which is ~150x faster
1100 1 : // than doing a SeekGE within the block, so we do this 16 times
1101 1 : // (~5ns*16=80ns), and then switch to looking at restarts. Doing the binary
1102 1 : // search for the restart consumes > 100ns. If the number of versions is >
1103 1 : // 17, we will increment nextFastCount to 17, then do a binary search, and
1104 1 : // on average need to find a key between two restarts, so another 8 steps
1105 1 : // corresponding to nextFastCount, for a mean total of 17 + 8 = 25 such
1106 1 : // steps.
1107 1 : //
1108 1 : // TODO(sumeer): use the configured restartInterval for the sstable when it
1109 1 : // was written (which we don't currently store) instead of the default value
1110 1 : // of 16.
1111 1 : const nextCmpThresholdBeforeSeek = 3
1112 1 : const nextFastThresholdBeforeRestarts = 16
1113 1 : nextCmpCount := 0
1114 1 : nextFastCount := 0
1115 1 : usedRestarts := false
1116 1 : // INVARIANT: blockIter is valid.
1117 1 : if invariants.Enabled && !i.valid() {
1118 0 : panic(errors.AssertionFailedf("nextPrefixV3 called on invalid blockIter"))
1119 : }
1120 1 : prevKeyIsSet := i.ikv.Kind() == InternalKeyKindSet
1121 1 : for {
1122 1 : i.offset = i.nextOffset
1123 1 : if !i.valid() {
1124 1 : return nil
1125 1 : }
1126 : // Need to decode the length integers, so we can compute nextOffset.
1127 1 : ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(i.offset))
1128 1 : // This is an ugly performance hack. Reading entries from blocks is one of
1129 1 : // the inner-most routines and decoding the 3 varints per-entry takes
1130 1 : // significant time. Neither go1.11 nor go1.12 will inline decodeVarint for
1131 1 : // us, so we do it manually. This provides a 10-15% performance improvement
1132 1 : // on blockIter benchmarks on both go1.11 and go1.12.
1133 1 : //
1134 1 : // TODO(peter): remove this hack if go:inline is ever supported.
1135 1 :
1136 1 : // Decode the shared key length integer.
1137 1 : var shared uint32
1138 1 : if a := *((*uint8)(ptr)); a < 128 {
1139 1 : shared = uint32(a)
1140 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
1141 1 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
1142 0 : shared = uint32(b)<<7 | uint32(a)
1143 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
1144 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
1145 0 : shared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1146 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
1147 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
1148 0 : shared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1149 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
1150 0 : } else {
1151 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
1152 0 : shared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1153 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
1154 0 : }
1155 : // Decode the unshared key length integer.
1156 1 : var unshared uint32
1157 1 : if a := *((*uint8)(ptr)); a < 128 {
1158 1 : unshared = uint32(a)
1159 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
1160 1 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
1161 0 : unshared = uint32(b)<<7 | uint32(a)
1162 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
1163 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
1164 0 : unshared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1165 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
1166 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
1167 0 : unshared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1168 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
1169 0 : } else {
1170 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
1171 0 : unshared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1172 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
1173 0 : }
1174 : // Decode the value length integer.
1175 1 : var value uint32
1176 1 : if a := *((*uint8)(ptr)); a < 128 {
1177 1 : value = uint32(a)
1178 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
1179 1 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
1180 0 : value = uint32(b)<<7 | uint32(a)
1181 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
1182 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
1183 0 : value = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1184 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
1185 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
1186 0 : value = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1187 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
1188 0 : } else {
1189 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
1190 0 : value = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1191 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
1192 0 : }
1193 1 : if i.transforms.SyntheticPrefix != nil {
1194 0 : shared += uint32(len(i.transforms.SyntheticPrefix))
1195 0 : }
1196 : // The starting position of the value.
1197 1 : valuePtr := unsafe.Pointer(uintptr(ptr) + uintptr(unshared))
1198 1 : i.nextOffset = int32(uintptr(valuePtr)-uintptr(i.ptr)) + int32(value)
1199 1 : if invariants.Enabled && unshared < 8 {
1200 0 : // This should not happen since only the key prefix is shared, so even
1201 0 : // if the prefix length is the same as the user key length, the unshared
1202 0 : // will include the trailer.
1203 0 : panic(errors.AssertionFailedf("unshared %d is too small", unshared))
1204 : }
1205 : // The trailer is written in little endian, so the key kind is the first
1206 : // byte in the trailer that is encoded in the slice [unshared-8:unshared].
1207 1 : keyKind := InternalKeyKind((*[manual.MaxArrayLen]byte)(ptr)[unshared-8])
1208 1 : keyKind = keyKind & base.InternalKeyKindSSTableInternalObsoleteMask
1209 1 : prefixChanged := false
1210 1 : if keyKind == InternalKeyKindSet {
1211 1 : if invariants.Enabled && value == 0 {
1212 0 : panic(errors.AssertionFailedf("value is of length 0, but we expect a valuePrefix"))
1213 : }
1214 1 : valPrefix := *((*valuePrefix)(valuePtr))
1215 1 : if setHasSamePrefix(valPrefix) {
1216 1 : // Fast-path. No need to assemble i.fullKey, or update i.key. We know
1217 1 : // that subsequent keys will not have a shared length that is greater
1218 1 : // than the prefix of the current key, which is also the prefix of
1219 1 : // i.key. Since we are continuing to iterate, we don't need to
1220 1 : // initialize i.ikey and i.lazyValue (these are initialized before
1221 1 : // returning).
1222 1 : nextFastCount++
1223 1 : if nextFastCount > nextFastThresholdBeforeRestarts {
1224 0 : if usedRestarts {
1225 0 : // Exhausted iteration budget. This will never happen unless
1226 0 : // someone is using a restart interval > 16. It is just to guard
1227 0 : // against long restart intervals causing too much iteration.
1228 0 : break
1229 : }
1230 : // Haven't used restarts yet, so find the first restart at or beyond
1231 : // the current offset.
1232 0 : targetOffset := i.offset
1233 0 : var index int32
1234 0 : {
1235 0 : 					// NB: manually inlined sort.Search is ~5% faster.
1236 0 : //
1237 0 : // f defined for a restart point is true iff the offset >=
1238 0 : // targetOffset.
1239 0 : // Define f(-1) == false and f(i.numRestarts) == true.
1240 0 : // Invariant: f(index-1) == false, f(upper) == true.
1241 0 : upper := i.numRestarts
1242 0 : for index < upper {
1243 0 : h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
1244 0 : // index ≤ h < upper
1245 0 : offset := decodeRestart(i.data[i.restarts+4*h:])
1246 0 : if offset < targetOffset {
1247 0 : index = h + 1 // preserves f(index-1) == false
1248 0 : } else {
1249 0 : upper = h // preserves f(upper) == true
1250 0 : }
1251 : }
1252 : // index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
1253 : // => answer is index.
1254 : }
1255 0 : usedRestarts = true
1256 0 : nextFastCount = 0
1257 0 : if index == i.numRestarts {
1258 0 : // Already past the last real restart, so iterate a bit more until
1259 0 : // we are done with the block.
1260 0 : continue
1261 : }
1262 : // Have some real restarts after index. NB: index is the first
1263 : // restart at or beyond the current offset.
1264 0 : startingIndex := index
1265 0 : for index != i.numRestarts &&
1266 0 : // The restart at index is 4 bytes written in little endian format
1267 0 : 					// starting at i.restarts+4*index. The 0th byte is the least
1268 0 : // significant and the 3rd byte is the most significant. Since the
1269 0 : // most significant bit of the 3rd byte is what we use for
1270 0 : // encoding the set-has-same-prefix information, the indexing
1271 0 : // below has +3.
1272 0 : i.data[i.restarts+4*index+3]&restartMaskLittleEndianHighByteOnlySetHasSamePrefix != 0 {
1273 0 : // We still have the same prefix, so move to the next restart.
1274 0 : index++
1275 0 : }
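     : 				// Sketch of the restart-point encoding assumed by the check above:
     : 				// each restart is a 4-byte little-endian word whose top bit doubles
     : 				// as the set-has-same-prefix flag:
     : 				//
     : 				//   word := binary.LittleEndian.Uint32(i.data[i.restarts+4*index:])
     : 				//   sameSetPrefix := word&(1<<31) != 0
     : 				//   offset := int32(word &^ (1 << 31))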
1276 : // index is the first restart that did not have the same prefix.
1277 0 : if index != startingIndex {
1278 0 : // Managed to skip past at least one restart. Resume iteration
1279 0 : // from index-1. Since nextFastCount has been reset to 0, we
1280 0 : // should be able to iterate to the next prefix.
1281 0 : i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
1282 0 : i.readEntry()
1283 0 : }
1284 : // Else, unable to skip past any restart. Resume iteration. Since
1285 : // nextFastCount has been reset to 0, we should be able to iterate
1286 : // to the next prefix.
1287 0 : continue
1288 : }
1289 1 : continue
1290 1 : } else if prevKeyIsSet {
1291 1 : prefixChanged = true
1292 1 : }
1293 1 : } else {
1294 1 : prevKeyIsSet = false
1295 1 : }
1296 : // Slow-path cases:
1297 : // - (Likely) The prefix has changed.
1298 : // - (Unlikely) The prefix has not changed.
1299 : // We assemble the key etc. under the assumption that it is the likely
1300 : // case.
1301 1 : unsharedKey := getBytes(ptr, int(unshared))
1302 1 : // TODO(sumeer): move this into the else block below. This is a bit tricky
1303 1 : // since the current logic assumes we have always copied the latest key
1304 1 : // into fullKey, which is why when we get to the next key we can (a)
1305 1 : // access i.fullKey[:shared], (b) append only the unsharedKey to
1306 1 : // i.fullKey. For (a), we can access i.key[:shared] since that memory is
1307 1 : // valid (even if unshared). For (b), we will need to remember whether
1308 1 : // i.key refers to i.fullKey or not, and can append the unsharedKey only
1309 1 : // in the former case and for the latter case need to copy the shared part
1310 1 : // too. This same comment applies to the other place where we can do this
1311 1 : // optimization, in readEntry().
1312 1 : i.fullKey = append(i.fullKey[:shared], unsharedKey...)
1313 1 : i.val = getBytes(valuePtr, int(value))
1314 1 : if shared == 0 {
1315 1 : // Provide stability for the key across positioning calls if the key
1316 1 : 		// doesn't share a prefix with the previous key. This avoids the need to
1317 1 : 		// copy the key when the caller knows the block has a restart interval of
1318 1 : // 1. An important example of this is range-del blocks.
1319 1 : i.key = unsharedKey
1320 1 : } else {
1321 1 : i.key = i.fullKey
1322 1 : }
1323 : // Manually inlined version of i.decodeInternalKey(i.key).
1324 1 : hiddenPoint := false
1325 1 : if n := len(i.key) - 8; n >= 0 {
1326 1 : trailer := binary.LittleEndian.Uint64(i.key[n:])
1327 1 : hiddenPoint = i.transforms.HideObsoletePoints &&
1328 1 : (trailer&trailerObsoleteBit != 0)
1329 1 : i.ikv.K = base.InternalKey{
1330 1 : Trailer: trailer & trailerObsoleteMask,
1331 1 : UserKey: i.key[:n:n],
1332 1 : }
1333 1 : if n := i.transforms.SyntheticSeqNum; n != 0 {
1334 0 : i.ikv.K.SetSeqNum(uint64(n))
1335 0 : }
1336 1 : if i.transforms.SyntheticSuffix.IsSet() {
1337 0 : // Inlined version of i.maybeReplaceSuffix()
1338 0 : prefixLen := i.split(i.ikv.K.UserKey)
1339 0 : i.synthSuffixBuf = append(i.synthSuffixBuf[:0], i.ikv.K.UserKey[:prefixLen]...)
1340 0 : i.synthSuffixBuf = append(i.synthSuffixBuf, i.transforms.SyntheticSuffix...)
1341 0 : i.ikv.K.UserKey = i.synthSuffixBuf
1342 0 : }
1343 0 : } else {
1344 0 : i.ikv.K.Trailer = uint64(InternalKeyKindInvalid)
1345 0 : i.ikv.K.UserKey = nil
1346 0 : }
1347 1 : nextCmpCount++
1348 1 : if invariants.Enabled && prefixChanged && i.cmp(i.ikv.K.UserKey, succKey) < 0 {
1349 0 : panic(errors.AssertionFailedf("prefix should have changed but %x < %x",
1350 0 : i.ikv.K.UserKey, succKey))
1351 : }
1352 1 : if prefixChanged || i.cmp(i.ikv.K.UserKey, succKey) >= 0 {
1353 1 : // Prefix has changed.
1354 1 : if hiddenPoint {
1355 1 : return i.Next()
1356 1 : }
1357 1 : if invariants.Enabled && !i.lazyValueHandling.hasValuePrefix {
1358 0 : panic(errors.AssertionFailedf("nextPrefixV3 being run for non-v3 sstable"))
1359 : }
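     : 		// Value dispatch (here and in Prev below): for SET keys in sstables
     : 		// that carry a valuePrefix, i.val[0] is the prefix byte. A value-handle
     : 		// prefix means the value lives in a value block and is resolved lazily
     : 		// via i.lazyValueHandling.vbr; otherwise the in-place value follows the
     : 		// prefix byte, hence i.val[1:]. Non-SET keys carry no prefix byte.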
1360 1 : if base.TrailerKind(i.ikv.K.Trailer) != InternalKeyKindSet {
1361 1 : i.ikv.V = base.MakeInPlaceValue(i.val)
1362 1 : } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
1363 1 : i.ikv.V = base.MakeInPlaceValue(i.val[1:])
1364 1 : } else {
1365 0 : i.ikv.V = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
1366 0 : }
1367 1 : return &i.ikv
1368 : }
1369 : // Else prefix has not changed.
1370 :
1371 1 : if nextCmpCount >= nextCmpThresholdBeforeSeek {
1372 0 : break
1373 : }
1374 : }
1375 0 : return i.SeekGE(succKey, base.SeekGEFlagsNone)
1376 : }
1377 :
1378 : // Prev implements internalIterator.Prev, as documented in the pebble
1379 : // package.
1380 1 : func (i *blockIter) Prev() *base.InternalKV {
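     : 	// Prev first drains entries cached by a preceding forward scan, most
     : 	// recent first. Once the cache is empty, it binary searches the restart
     : 	// points for the last restart before the current offset and re-reads
     : 	// forward from there, caching the skipped entries along the way.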
1381 1 : start:
1382 1 : for n := len(i.cached) - 1; n >= 0; n-- {
1383 1 : i.nextOffset = i.offset
1384 1 : e := &i.cached[n]
1385 1 : i.offset = e.offset
1386 1 : i.val = getBytes(unsafe.Pointer(uintptr(i.ptr)+uintptr(e.valStart)), int(e.valSize))
1387 1 : // Manually inlined version of i.decodeInternalKey(i.key).
1388 1 : i.key = i.cachedBuf[e.keyStart:e.keyEnd]
1389 1 : if n := len(i.key) - 8; n >= 0 {
1390 1 : trailer := binary.LittleEndian.Uint64(i.key[n:])
1391 1 : hiddenPoint := i.transforms.HideObsoletePoints &&
1392 1 : (trailer&trailerObsoleteBit != 0)
1393 1 : if hiddenPoint {
1394 1 : continue
1395 : }
1396 1 : i.ikv.K = base.InternalKey{
1397 1 : Trailer: trailer & trailerObsoleteMask,
1398 1 : UserKey: i.key[:n:n],
1399 1 : }
1400 1 : if n := i.transforms.SyntheticSeqNum; n != 0 {
1401 1 : i.ikv.K.SetSeqNum(uint64(n))
1402 1 : }
1403 1 : if i.transforms.SyntheticSuffix.IsSet() {
1404 0 : // Inlined version of i.maybeReplaceSuffix()
1405 0 : prefixLen := i.split(i.ikv.K.UserKey)
1406 0 : 			// If ikey is cached or may get cached, we must copy the bytes that
1407 0 : 			// UserKey points to before performing suffix replacement.
1408 0 : i.synthSuffixBuf = append(i.synthSuffixBuf[:0], i.ikv.K.UserKey[:prefixLen]...)
1409 0 : i.synthSuffixBuf = append(i.synthSuffixBuf, i.transforms.SyntheticSuffix...)
1410 0 : i.ikv.K.UserKey = i.synthSuffixBuf
1411 0 : }
1412 0 : } else {
1413 0 : i.ikv.K.Trailer = uint64(InternalKeyKindInvalid)
1414 0 : i.ikv.K.UserKey = nil
1415 0 : }
1416 1 : i.cached = i.cached[:n]
1417 1 : if !i.lazyValueHandling.hasValuePrefix ||
1418 1 : base.TrailerKind(i.ikv.K.Trailer) != InternalKeyKindSet {
1419 1 : i.ikv.V = base.MakeInPlaceValue(i.val)
1420 1 : } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
1421 1 : i.ikv.V = base.MakeInPlaceValue(i.val[1:])
1422 1 : } else {
1423 1 : i.ikv.V = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
1424 1 : }
1425 1 : return &i.ikv
1426 : }
1427 :
1428 1 : i.clearCache()
1429 1 : if i.offset <= 0 {
1430 1 : i.offset = -1
1431 1 : i.nextOffset = 0
1432 1 : return nil
1433 1 : }
1434 :
1435 1 : targetOffset := i.offset
1436 1 : var index int32
1437 1 :
1438 1 : {
1439 1 : 		// NB: manually inlined sort.Search is ~5% faster.
1440 1 : //
1441 1 : 		// Define f(-1) == false and f(i.numRestarts) == true.
1442 1 : // Invariant: f(index-1) == false, f(upper) == true.
1443 1 : upper := i.numRestarts
1444 1 : for index < upper {
1445 1 : h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
1446 1 : // index ≤ h < upper
1447 1 : offset := decodeRestart(i.data[i.restarts+4*h:])
1448 1 : if offset < targetOffset {
1449 1 : // Looking for the first restart that has offset >= targetOffset, so
1450 1 : // ignore h and earlier.
1451 1 : 				index = h + 1 // preserves f(index-1) == false
1452 1 : } else {
1453 1 : 				upper = h // preserves f(upper) == true
1454 1 : }
1455 : }
1456 : // index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
1457 : // => answer is index.
1458 : }
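     : 	// The manually inlined search above is, in effect, the following use of
     : 	// sort.Search (sketch; the sort package is not imported here):
     : 	//
     : 	//   index = int32(sort.Search(int(i.numRestarts), func(j int) bool {
     : 	//   	return decodeRestart(i.data[i.restarts+4*int32(j):]) >= targetOffset
     : 	//   }))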
1459 :
1460 : // index is first restart with offset >= targetOffset. Note that
1461 : // targetOffset may not be at a restart point since one can call Prev()
1462 : // after Next() (so the cache was not populated) and targetOffset refers to
1463 : // the current entry. index-1 must have an offset < targetOffset (it can't
1464 : // be equal to targetOffset since the binary search would have selected that
1465 : // as the index).
1466 1 : i.offset = 0
1467 1 : if index > 0 {
1468 1 : i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
1469 1 : }
1470 : 	// TODO(sumeer): why is the else case not an error, given that targetOffset
1471 : 	// is a valid offset?
1472 :
1473 1 : i.readEntry()
1474 1 :
1475 1 : 	// We stop when i.nextOffset == targetOffset since targetOffset is the offset
1476 1 : 	// of the entry we are stepping back from; the entry immediately before it
1477 1 : 	// need not be cached, since it is the candidate to return.
1478 1 : for i.nextOffset < targetOffset {
1479 1 : i.cacheEntry()
1480 1 : i.offset = i.nextOffset
1481 1 : i.readEntry()
1482 1 : }
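     : 	// Worked example of the loop above (sketch, with made-up offsets): if the
     : 	// block holds entries at offsets 0, 12, 27 and 41, targetOffset == 41, and
     : 	// the chosen restart is at offset 0, then the entries at 0 and 12 are
     : 	// cached and the loop stops with the entry at 27 as the candidate to
     : 	// return.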
1483 :
1484 1 : hiddenPoint := i.decodeInternalKey(i.key)
1485 1 : if hiddenPoint {
1486 1 : // Use the cache.
1487 1 : goto start
1488 : }
1489 1 : if i.transforms.SyntheticSuffix.IsSet() {
1490 0 : // Inlined version of i.maybeReplaceSuffix()
1491 0 : prefixLen := i.split(i.ikv.K.UserKey)
1492 0 : 		// If ikey is cached or may get cached, we must copy the bytes that
1493 0 : 		// UserKey points to before performing suffix replacement.
1494 0 : i.synthSuffixBuf = append(i.synthSuffixBuf[:0], i.ikv.K.UserKey[:prefixLen]...)
1495 0 : i.synthSuffixBuf = append(i.synthSuffixBuf, i.transforms.SyntheticSuffix...)
1496 0 : i.ikv.K.UserKey = i.synthSuffixBuf
1497 0 : }
1498 1 : if !i.lazyValueHandling.hasValuePrefix ||
1499 1 : base.TrailerKind(i.ikv.K.Trailer) != InternalKeyKindSet {
1500 1 : i.ikv.V = base.MakeInPlaceValue(i.val)
1501 1 : } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
1502 1 : i.ikv.V = base.MakeInPlaceValue(i.val[1:])
1503 1 : } else {
1504 1 : i.ikv.V = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
1505 1 : }
1506 1 : return &i.ikv
1507 : }
1508 :
1509 : // Key returns the internal key at the current iterator position.
1510 1 : func (i *blockIter) Key() *InternalKey {
1511 1 : return &i.ikv.K
1512 1 : }
1513 :
1514 : // KV returns the internal KV at the current iterator position.
1515 1 : func (i *blockIter) KV() *base.InternalKV {
1516 1 : return &i.ikv
1517 1 : }
1518 :
1519 1 : func (i *blockIter) value() base.LazyValue {
1520 1 : return i.ikv.V
1521 1 : }
1522 :
1523 : // Error implements internalIterator.Error, as documented in the pebble
1524 : // package.
1525 1 : func (i *blockIter) Error() error {
1526 1 : return nil // infallible
1527 1 : }
1528 :
1529 : // Close implements internalIterator.Close, as documented in the pebble
1530 : // package.
1531 1 : func (i *blockIter) Close() error {
1532 1 : i.handle.Release()
1533 1 : i.handle = bufferHandle{}
1534 1 : i.val = nil
1535 1 : i.ikv = base.InternalKV{}
1536 1 : i.lazyValueHandling.vbr = nil
1537 1 : return nil
1538 1 : }
1539 :
1540 0 : func (i *blockIter) SetBounds(lower, upper []byte) {
1541 0 : // This should never be called as bounds are handled by sstable.Iterator.
1542 0 : panic("pebble: SetBounds unimplemented")
1543 : }
1544 :
1545 0 : func (i *blockIter) SetContext(_ context.Context) {}
1546 :
1547 1 : func (i *blockIter) valid() bool {
1548 1 : return i.offset >= 0 && i.offset < i.restarts
1549 1 : }
|