Line data Source code
1 : // Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package sstable
6 :
7 : import (
8 : "bytes"
9 : "context"
10 : "encoding/binary"
11 : "unsafe"
12 :
13 : "github.com/cockroachdb/errors"
14 : "github.com/cockroachdb/pebble/internal/base"
15 : "github.com/cockroachdb/pebble/internal/invariants"
16 : "github.com/cockroachdb/pebble/internal/manual"
17 : "golang.org/x/exp/slices"
18 : )
19 :
20 : // blockIter is an iterator over a single block of data.
21 : //
22 : // A blockIter provides an additional guarantee around key stability when a
23 : // block has a restart interval of 1 (i.e. when there is no prefix
24 : // compression). Key stability refers to whether the InternalKey.UserKey bytes
25 : // returned by a positioning call will remain stable after a subsequent
26 : // positioning call. The normal case is that a positioning call will invalidate
27 : // any previously returned InternalKey.UserKey. If a block has a restart
28 : // interval of 1 (no prefix compression), blockIter guarantees that
29 : // InternalKey.UserKey will point to the key as stored in the block itself
30 : // which will remain valid until the blockIter is closed. The key stability
31 : // guarantee is used by the range tombstone and range key code, which knows that
32 : // the respective blocks are always encoded with a restart interval of 1. This
33 : // per-block key stability guarantee is sufficient for range tombstones and
34 : // range keys as they are always encoded in a single block. Note: this
35 : // stability guarantee no longer holds for a block iter with synthetic suffix
36 : // replacement, but this doesn't matter, as the user will not open
37 : // an iterator with a synthetic suffix on a block with range keys (for now).
38 : //
39 : // A blockIter also provides a value stability guarantee for range deletions and
40 : // range keys since there is only a single range deletion and range key block
41 : // per sstable and the blockIter will not release the bytes for the block until
42 : // it is closed.
43 : //
44 : // Note on why blockIter knows about lazyValueHandling:
45 : //
46 : // blockIter's positioning functions (that return a LazyValue), are too
47 : // complex to inline even prior to lazyValueHandling. blockIter.Next and
48 : // blockIter.First were by far the cheapest and had costs 195 and 180
49 : // respectively, which exceeds the budget of 80. We initially tried to keep
50 : // the lazyValueHandling logic out of blockIter by wrapping it with a
51 : // lazyValueDataBlockIter. singleLevelIter and twoLevelIter would use this
52 : // wrapped iter. The functions in lazyValueDataBlockIter were simple, in that
53 : // they called the corresponding blockIter func and then decided whether the
54 : // value was in fact in-place (so return immediately) or needed further
55 : // handling. But these also turned out too costly for mid-stack inlining since
56 : // simple calls like the following have a high cost that is barely under the
57 : // budget of 80
58 : //
59 : // k, v := i.data.SeekGE(key, flags) // cost 74
60 : // k, v := i.data.Next() // cost 72
61 : //
62 : // We have 2 options for minimizing performance regressions:
63 : // - Include the lazyValueHandling logic in the already non-inlineable
64 : // blockIter functions: Since most of the time is spent in data block iters,
65 : // it is acceptable to take the small hit of unnecessary branching (which
66 : // hopefully branch prediction will predict correctly) for other kinds of
67 : // blocks.
68 : // - Duplicate the logic of singleLevelIterator and twoLevelIterator for the
69 : // v3 sstable and only use the aforementioned lazyValueDataBlockIter for a
70 : // v3 sstable. We would want to manage these copies via code generation.
71 : //
72 : // We have picked the first option here.
73 : type blockIter struct {
74 : cmp Compare
75 : split Split
76 :
77 : // Iterator transforms.
78 : //
79 : // SyntheticSuffix, if set, will replace the decoded ikey.UserKey suffix
80 : // before the key is returned to the user. A sequence of iter operations on a
81 : // block with a syntheticSuffix rule should return keys as if those operations
82 : // ran on a block with keys that all had the syntheticSuffix. As an example:
83 : // any sequence of block iter cmds should return the same keys for the
84 : // following two blocks:
85 : //
86 : // blockA: a@3,b@3,c@3
87 : // blockB: a@1,b@2,c@1 with syntheticSuffix=3
88 : //
89 : // To ensure this, suffix replacement will not change the ordering of keys in
90 : // the block because the iter assumes that no two keys in the block share the
91 : // same prefix. Furthermore, during SeekGE and SeekLT operations, the block
92 : // iterator handles "off by one" errors (explained in more detail in those
93 : // functions) when, for a given key, originalSuffix < searchSuffix <
94 : // replacementSuffix, with integer comparison. To handle these cases, the
95 : // iterator assumes:
96 : //
97 : // pebble.Compare(keyPrefix{replacementSuffix},keyPrefix{originalSuffix}) < 0
98 : // for keys with a suffix.
99 : //
100 : // NB: it is possible for a block iter to add a synthetic suffix on a key
101 : // without a suffix, which implies
102 : // pebble.Compare(keyPrefix{replacementSuffix},keyPrefix{noSuffix}) > 0 ,
103 : // however, the iterator would never need to handle an off by one error in
104 : // this case since originalSuffix (empty) > searchSuffix (non empty), with
105 : // integer comparison.
106 : //
107 : //
108 : // In addition, we also assume that any block with rangekeys will not contain
109 : // a synthetic suffix.
110 : transforms IterTransforms
111 :
112 : // offset is the byte index that marks where the current key/value is
113 : // encoded in the block.
114 : offset int32
115 : // nextOffset is the byte index where the next key/value is encoded in the
116 : // block.
117 : nextOffset int32
118 : // A "restart point" in a block is a point where the full key is encoded,
119 : // instead of just having a suffix of the key encoded. See readEntry() for
120 : // how prefix compression of keys works. Keys in between two restart points
121 : // only have a suffix encoded in the block. When restart interval is 1, no
122 : // prefix compression of keys happens. This is the case with range tombstone
123 : // blocks.
124 : //
125 : // All restart offsets are listed in increasing order in
126 : // i.ptr[i.restarts:len(block)-4], while numRestarts is encoded in the last
127 : // 4 bytes of the block as a uint32 (i.ptr[len(block)-4:]). i.restarts can
128 : // therefore be seen as the point where data in the block ends, and a list
129 : // of offsets of all restart points begins.
130 : restarts int32
131 : // Number of restart points in this block. Encoded at the end of the block
132 : // as a uint32.
133 : numRestarts int32
134 : ptr unsafe.Pointer
135 : data []byte
136 : // key contains the raw key the iterator is currently pointed at. This may
137 : // point directly to data stored in the block (for a key which has no prefix
138 : // compression), to fullKey (for a prefix compressed key), or to a slice of
139 : // data stored in cachedBuf (during reverse iteration).
140 : //
141 : // NB: In general, key contains the same logical content as ikey
142 : // (i.e. ikey = decode(key)), but if the iterator contains a synthetic suffix
143 : // replacement rule, this will not be the case. Therefore, key should never
144 : // be used after ikey is set.
145 : key []byte
146 : // fullKey is a buffer used for key prefix decompression. Note that if
147 : // transforms.SyntheticPrefix is not nil, fullKey always starts with that
148 : // prefix.
149 : fullKey []byte
150 : // val contains the value the iterator is currently pointed at. If non-nil,
151 : // this points to a slice of the block data.
152 : val []byte
153 : // lazyValue is val turned into a LazyValue, whenever a positioning method
154 : // returns a non-nil key-value pair.
155 : lazyValue base.LazyValue
156 : // ikey contains the decoded InternalKey the iterator is currently pointed
157 : // at. Note that the memory backing ikey.UserKey is either data stored
158 : // directly in the block, fullKey, cachedBuf, or synthSuffixBuf. The key
159 : // stability guarantee for blocks built with a restart interval of 1 is
160 : // achieved by having ikey.UserKey always point to data stored directly in the
161 : // block.
162 : ikey InternalKey
163 : // cached and cachedBuf are used during reverse iteration. They are needed
164 : // because we can't perform prefix decoding in reverse, only in the forward
165 : // direction. In order to iterate in reverse, we decode and cache the entries
166 : // between two restart points.
167 : //
168 : // Note that cached[len(cached)-1] contains the previous entry to the one the
169 : // blockIter is currently pointed at. As usual, nextOffset will contain the
170 : // offset of the next entry. During reverse iteration, nextOffset will be
171 : // updated to point to offset, and we'll set the blockIter to point at the
172 : // entry cached[len(cached)-1]. See Prev() for more details.
173 : //
174 : // For a block encoded with a restart interval of 1, cached and cachedBuf
175 : // will not be used as there are no prefix compressed entries between the
176 : // restart points.
177 : cached []blockEntry
178 : cachedBuf []byte
179 : handle bufferHandle
180 : // firstUserKey is used for block iteration over already loaded blocks.
181 : firstUserKey []byte
182 : lazyValueHandling struct {
183 : vbr *valueBlockReader
184 : hasValuePrefix bool
185 : }
186 : synthSuffixBuf []byte
187 : firstUserKeyWithPrefixBuf []byte
188 : }
189 :
190 : type blockEntry struct {
191 : offset int32
192 : keyStart int32
193 : keyEnd int32
194 : valStart int32
195 : valSize int32
196 : }
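: // The restart layout described above can be summarized with a small sketch
: // (not part of this file; the raw block slice b is hypothetical). It mirrors
: // what init and decodeRestart do: the last 4 bytes of a block hold
: // numRestarts, and the 4*numRestarts bytes before them hold the restart
: // offsets.
: //
: //	numRestarts := int32(binary.LittleEndian.Uint32(b[len(b)-4:]))
: //	restarts := int32(len(b)) - 4*(1+numRestarts)
: //	for j := int32(0); j < numRestarts; j++ {
: //		offset := decodeRestart(b[restarts+4*j:])
: //		_ = offset // byte offset of the j-th fully-encoded (restart) key in b
: //	}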
197 :
198 : // blockIter implements the base.InternalIterator interface.
199 : var _ base.InternalIterator = (*blockIter)(nil)
200 :
201 : func newBlockIter(
202 : cmp Compare, split Split, block block, transforms IterTransforms,
203 1 : ) (*blockIter, error) {
204 1 : i := &blockIter{}
205 1 : return i, i.init(cmp, split, block, transforms)
206 1 : }
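: // A minimal usage sketch (hypothetical inputs: a raw block b and the cmp and
: // split functions of the comparer in use); it simply walks the block forward:
: //
: //	it, err := newBlockIter(cmp, split, b, IterTransforms{})
: //	if err != nil {
: //		// handle the error
: //	}
: //	for k, v := it.First(); k != nil; k, v = it.Next() {
: //		_, _ = k, v // process the entry
: //	}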
207 :
208 0 : func (i *blockIter) String() string {
209 0 : return "block"
210 0 : }
211 :
212 1 : func (i *blockIter) init(cmp Compare, split Split, block block, transforms IterTransforms) error {
213 1 : numRestarts := int32(binary.LittleEndian.Uint32(block[len(block)-4:]))
214 1 : if numRestarts == 0 {
215 0 : return base.CorruptionErrorf("pebble/table: invalid table (block has no restart points)")
216 0 : }
217 1 : i.transforms = transforms
218 1 : i.synthSuffixBuf = i.synthSuffixBuf[:0]
219 1 : i.split = split
220 1 : i.cmp = cmp
221 1 : i.restarts = int32(len(block)) - 4*(1+numRestarts)
222 1 : i.numRestarts = numRestarts
223 1 : i.ptr = unsafe.Pointer(&block[0])
224 1 : i.data = block
225 1 : if i.transforms.SyntheticPrefix.IsSet() {
226 1 : i.fullKey = append(i.fullKey[:0], i.transforms.SyntheticPrefix...)
227 1 : } else {
228 1 : i.fullKey = i.fullKey[:0]
229 1 : }
230 1 : i.val = nil
231 1 : i.clearCache()
232 1 : if i.restarts > 0 {
233 1 : if err := i.readFirstKey(); err != nil {
234 0 : return err
235 0 : }
236 1 : } else {
237 1 : // Block is empty.
238 1 : i.firstUserKey = nil
239 1 : }
240 1 : return nil
241 : }
242 :
243 : // NB: two cases of hideObsoletePoints:
244 : // - Local sstable iteration: syntheticSeqNum will be set iff the sstable was
245 : // ingested.
246 : // - Foreign sstable iteration: syntheticSeqNum is always set.
247 : func (i *blockIter) initHandle(
248 : cmp Compare, split Split, block bufferHandle, transforms IterTransforms,
249 1 : ) error {
250 1 : i.handle.Release()
251 1 : i.handle = block
252 1 : return i.init(cmp, split, block.Get(), transforms)
253 1 : }
254 :
255 1 : func (i *blockIter) invalidate() {
256 1 : i.clearCache()
257 1 : i.offset = 0
258 1 : i.nextOffset = 0
259 1 : i.restarts = 0
260 1 : i.numRestarts = 0
261 1 : i.data = nil
262 1 : }
263 :
264 : // isDataInvalidated returns true when the blockIter has been invalidated
265 : // using an invalidate call. NB: this is different from blockIter.Valid
266 : // which is part of the InternalIterator implementation.
267 1 : func (i *blockIter) isDataInvalidated() bool {
268 1 : return i.data == nil
269 1 : }
270 :
271 1 : func (i *blockIter) resetForReuse() blockIter {
272 1 : return blockIter{
273 1 : fullKey: i.fullKey[:0],
274 1 : cached: i.cached[:0],
275 1 : cachedBuf: i.cachedBuf[:0],
276 1 : firstUserKeyWithPrefixBuf: i.firstUserKeyWithPrefixBuf[:0],
277 1 : data: nil,
278 1 : }
279 1 : }
280 :
281 1 : func (i *blockIter) readEntry() {
282 1 : ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(i.offset))
283 1 :
284 1 : // This is an ugly performance hack. Reading entries from blocks is one of
285 1 : // the inner-most routines and decoding the 3 varints per-entry takes
286 1 : // significant time. Neither go1.11 or go1.12 will inline decodeVarint for
287 1 : // us, so we do it manually. This provides a 10-15% performance improvement
288 1 : // on blockIter benchmarks on both go1.11 and go1.12.
289 1 : //
290 1 : // TODO(peter): remove this hack if go:inline is ever supported.
291 1 :
292 1 : var shared uint32
293 1 : if a := *((*uint8)(ptr)); a < 128 {
294 1 : shared = uint32(a)
295 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
296 1 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
297 0 : shared = uint32(b)<<7 | uint32(a)
298 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
299 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
300 0 : shared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
301 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
302 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
303 0 : shared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
304 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
305 0 : } else {
306 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
307 0 : shared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
308 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
309 0 : }
310 :
311 1 : var unshared uint32
312 1 : if a := *((*uint8)(ptr)); a < 128 {
313 1 : unshared = uint32(a)
314 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
315 1 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
316 0 : unshared = uint32(b)<<7 | uint32(a)
317 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
318 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
319 0 : unshared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
320 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
321 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
322 0 : unshared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
323 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
324 0 : } else {
325 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
326 0 : unshared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
327 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
328 0 : }
329 :
330 1 : var value uint32
331 1 : if a := *((*uint8)(ptr)); a < 128 {
332 1 : value = uint32(a)
333 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
334 1 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
335 1 : value = uint32(b)<<7 | uint32(a)
336 1 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
337 1 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
338 1 : value = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
339 1 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
340 1 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
341 1 : value = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
342 1 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
343 1 : } else {
344 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
345 0 : value = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
346 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
347 0 : }
348 1 : shared += uint32(len(i.transforms.SyntheticPrefix))
349 1 : unsharedKey := getBytes(ptr, int(unshared))
350 1 : // TODO(sumeer): move this into the else block below.
351 1 : i.fullKey = append(i.fullKey[:shared], unsharedKey...)
352 1 : if shared == 0 {
353 1 : // Provide stability for the key across positioning calls if the key
354 1 : // doesn't share a prefix with the previous key. This removes requiring the
355 1 : // key to be copied if the caller knows the block has a restart interval of
356 1 : // 1. An important example of this is range-del blocks.
357 1 : i.key = unsharedKey
358 1 : } else {
359 1 : i.key = i.fullKey
360 1 : }
361 1 : ptr = unsafe.Pointer(uintptr(ptr) + uintptr(unshared))
362 1 : i.val = getBytes(ptr, int(value))
363 1 : i.nextOffset = int32(uintptr(ptr)-uintptr(i.ptr)) + int32(value)
364 : }
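: // For reference, a non-inlined sketch of the entry layout readEntry decodes
: // (an illustrative equivalent using encoding/binary, not the code used
: // above): each entry is <shared><unshared><valueLen> as varints, followed by
: // the unshared key bytes and then the value bytes.
: //
: //	func decodeEntry(data []byte) (shared, unshared, valueLen uint64, rest []byte) {
: //		var n int
: //		shared, n = binary.Uvarint(data)
: //		data = data[n:]
: //		unshared, n = binary.Uvarint(data)
: //		data = data[n:]
: //		valueLen, n = binary.Uvarint(data)
: //		rest = data[n:]
: //		// rest[:unshared] is appended to the first `shared` bytes of the
: //		// previous key; rest[unshared:unshared+valueLen] is the value.
: //		return shared, unshared, valueLen, rest
: //	}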
365 :
366 1 : func (i *blockIter) readFirstKey() error {
367 1 : ptr := i.ptr
368 1 :
369 1 : // This is an ugly performance hack. Reading entries from blocks is one of
370 1 : // the inner-most routines and decoding the 3 varints per-entry takes
371 1 : // significant time. Neither go1.11 nor go1.12 will inline decodeVarint for
372 1 : // us, so we do it manually. This provides a 10-15% performance improvement
373 1 : // on blockIter benchmarks on both go1.11 and go1.12.
374 1 : //
375 1 : // TODO(peter): remove this hack if go:inline is ever supported.
376 1 :
377 1 : if shared := *((*uint8)(ptr)); shared == 0 {
378 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
379 1 : } else {
380 0 : // The shared length is != 0, which is invalid.
381 0 : panic("first key in block must have zero shared length")
382 : }
383 :
384 1 : var unshared uint32
385 1 : if a := *((*uint8)(ptr)); a < 128 {
386 1 : unshared = uint32(a)
387 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
388 1 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
389 0 : unshared = uint32(b)<<7 | uint32(a)
390 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
391 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
392 0 : unshared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
393 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
394 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
395 0 : unshared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
396 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
397 0 : } else {
398 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
399 0 : unshared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
400 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
401 0 : }
402 :
403 : // Skip the value length.
404 1 : if a := *((*uint8)(ptr)); a < 128 {
405 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
406 1 : } else if a := *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); a < 128 {
407 1 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
408 1 : } else if a := *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); a < 128 {
409 1 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
410 1 : } else if a := *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); a < 128 {
411 1 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
412 1 : } else {
413 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
414 0 : }
415 :
416 1 : firstKey := getBytes(ptr, int(unshared))
417 1 : // Manually inlining base.DecodeInternalKey provides a 5-10% speedup on
418 1 : // BlockIter benchmarks.
419 1 : if n := len(firstKey) - 8; n >= 0 {
420 1 : i.firstUserKey = firstKey[:n:n]
421 1 : } else {
422 0 : i.firstUserKey = nil
423 0 : return base.CorruptionErrorf("pebble/table: invalid firstKey in block")
424 0 : }
425 1 : if i.transforms.SyntheticPrefix != nil {
426 1 : i.firstUserKeyWithPrefixBuf = slices.Grow(i.firstUserKeyWithPrefixBuf[:0], len(i.transforms.SyntheticPrefix)+len(i.firstUserKey))
427 1 : i.firstUserKeyWithPrefixBuf = append(i.firstUserKeyWithPrefixBuf, i.transforms.SyntheticPrefix...)
428 1 : i.firstUserKeyWithPrefixBuf = append(i.firstUserKeyWithPrefixBuf, i.firstUserKey...)
429 1 : i.firstUserKey = i.firstUserKeyWithPrefixBuf
430 1 : }
431 1 : return nil
432 : }
433 :
434 : // The sstable internal obsolete bit is set when writing a block and unset by
435 : // blockIter, so no code outside block writing/reading code ever sees it.
436 : const trailerObsoleteBit = uint64(base.InternalKeyKindSSTableInternalObsoleteBit)
437 : const trailerObsoleteMask = (InternalKeySeqNumMax << 8) | uint64(base.InternalKeyKindSSTableInternalObsoleteMask)
438 :
439 1 : func (i *blockIter) decodeInternalKey(key []byte) (hiddenPoint bool) {
440 1 : // Manually inlining base.DecodeInternalKey provides a 5-10% speedup on
441 1 : // BlockIter benchmarks.
442 1 : if n := len(key) - 8; n >= 0 {
443 1 : trailer := binary.LittleEndian.Uint64(key[n:])
444 1 : hiddenPoint = i.transforms.HideObsoletePoints &&
445 1 : (trailer&trailerObsoleteBit != 0)
446 1 : i.ikey.Trailer = trailer & trailerObsoleteMask
447 1 : i.ikey.UserKey = key[:n:n]
448 1 : if n := i.transforms.SyntheticSeqNum; n != 0 {
449 1 : i.ikey.SetSeqNum(uint64(n))
450 1 : }
451 1 : } else {
452 1 : i.ikey.Trailer = uint64(InternalKeyKindInvalid)
453 1 : i.ikey.UserKey = nil
454 1 : }
455 1 : return hiddenPoint
456 : }
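: // As a reference sketch of the trailer layout decodeInternalKey relies on
: // (illustrative only, with hypothetical local names): the low byte of the
: // 8-byte little-endian trailer carries the key kind (plus, inside sstables,
: // the obsolete bit), and the upper seven bytes carry the sequence number.
: // trailerObsoleteMask strips only the obsolete bit.
: //
: //	trailer := binary.LittleEndian.Uint64(key[len(key)-8:])
: //	kindByte := uint8(trailer)   // kind, possibly with the obsolete bit set
: //	seqNum := trailer >> 8       // sequence number
: //	cleaned := trailer & trailerObsoleteMask
: //	_, _, _ = kindByte, seqNum, cleaned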
457 :
458 : // maybeReplaceSuffix replaces the suffix in i.ikey.UserKey with
459 : // i.transforms.syntheticSuffix.
460 1 : func (i *blockIter) maybeReplaceSuffix() {
461 1 : if i.transforms.SyntheticSuffix.IsSet() && i.ikey.UserKey != nil {
462 1 : prefixLen := i.split(i.ikey.UserKey)
463 1 : // If ikey is cached or may get cached, we must copy
464 1 : // UserKey to a new buffer before suffix replacement.
465 1 : i.synthSuffixBuf = append(i.synthSuffixBuf[:0], i.ikey.UserKey[:prefixLen]...)
466 1 : i.synthSuffixBuf = append(i.synthSuffixBuf, i.transforms.SyntheticSuffix...)
467 1 : i.ikey.UserKey = i.synthSuffixBuf
468 1 : }
469 : }
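: // An illustrative sketch of the replacement (hypothetical key and a Split
: // that treats "@<ts>" as the suffix): for UserKey "a@2" and SyntheticSuffix
: // "@7", the prefix "a" is copied into a fresh buffer and "@7" is appended,
: // so the returned key becomes "a@7".
: //
: //	prefixLen := split([]byte("a@2"))                         // 1, i.e. the prefix "a"
: //	buf := append([]byte(nil), []byte("a@2")[:prefixLen]...)  // "a"
: //	buf = append(buf, []byte("@7")...)                        // "a@7"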
470 :
471 1 : func (i *blockIter) clearCache() {
472 1 : i.cached = i.cached[:0]
473 1 : i.cachedBuf = i.cachedBuf[:0]
474 1 : }
475 :
476 1 : func (i *blockIter) cacheEntry() {
477 1 : var valStart int32
478 1 : valSize := int32(len(i.val))
479 1 : if valSize > 0 {
480 1 : valStart = int32(uintptr(unsafe.Pointer(&i.val[0])) - uintptr(i.ptr))
481 1 : }
482 :
483 1 : i.cached = append(i.cached, blockEntry{
484 1 : offset: i.offset,
485 1 : keyStart: int32(len(i.cachedBuf)),
486 1 : keyEnd: int32(len(i.cachedBuf) + len(i.key)),
487 1 : valStart: valStart,
488 1 : valSize: valSize,
489 1 : })
490 1 : i.cachedBuf = append(i.cachedBuf, i.key...)
491 : }
492 :
493 1 : func (i *blockIter) getFirstUserKey() []byte {
494 1 : return i.firstUserKey
495 1 : }
496 :
497 : // SeekGE implements internalIterator.SeekGE, as documented in the pebble
498 : // package.
499 1 : func (i *blockIter) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, base.LazyValue) {
500 1 : if invariants.Enabled && i.isDataInvalidated() {
501 0 : panic(errors.AssertionFailedf("invalidated blockIter used"))
502 : }
503 1 : searchKey := key
504 1 : if i.transforms.SyntheticPrefix != nil {
505 1 : // If the seek key does not start with SyntheticPrefix, it sorts either
506 1 : // before or after the entire block of prefixed keys. To determine which,
507 1 : // we compare it against firstUserKey, which has the synthetic prefix.
508 1 : if !bytes.HasPrefix(key, i.transforms.SyntheticPrefix) {
509 1 : if i.cmp(i.firstUserKey, key) >= 0 {
510 1 : return i.First()
511 1 : }
512 : // Set the offset to the end of the block to mimic the offset of an
513 : // invalid iterator. This ensures a subsequent i.Prev() returns a valid
514 : // result.
515 1 : i.offset = i.restarts
516 1 : i.nextOffset = i.restarts
517 1 : return nil, base.LazyValue{}
518 : }
519 1 : searchKey = key[len(i.transforms.SyntheticPrefix):]
520 : }
521 :
522 1 : i.clearCache()
523 1 : // Find the index of the smallest restart point whose key is > the key
524 1 : // sought; index will be numRestarts if there is no such restart point.
525 1 : i.offset = 0
526 1 : var index int32
527 1 :
528 1 : {
529 1 : // NB: manually inlined sort.Seach is ~5% faster.
530 1 : //
531 1 : // Define f(-1) == false and f(n) == true.
532 1 : // Invariant: f(index-1) == false, f(upper) == true.
533 1 : upper := i.numRestarts
534 1 : for index < upper {
535 1 : h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
536 1 : // index ≤ h < upper
537 1 : offset := decodeRestart(i.data[i.restarts+4*h:])
538 1 : // For a restart point, there are 0 bytes shared with the previous key.
539 1 : // The varint encoding of 0 occupies 1 byte.
540 1 : ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(offset+1))
541 1 :
542 1 : // Decode the key at that restart point, and compare it to the key
543 1 : // sought. See the comment in readEntry for why we manually inline the
544 1 : // varint decoding.
545 1 : var v1 uint32
546 1 : if a := *((*uint8)(ptr)); a < 128 {
547 1 : v1 = uint32(a)
548 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
549 1 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
550 0 : v1 = uint32(b)<<7 | uint32(a)
551 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
552 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
553 0 : v1 = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
554 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
555 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
556 0 : v1 = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
557 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
558 0 : } else {
559 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
560 0 : v1 = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
561 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
562 0 : }
563 :
564 1 : if *((*uint8)(ptr)) < 128 {
565 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
566 1 : } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))) < 128 {
567 1 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
568 1 : } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))) < 128 {
569 1 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
570 1 : } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))) < 128 {
571 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
572 0 : } else {
573 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
574 0 : }
575 :
576 : // Manually inlining part of base.DecodeInternalKey provides a 5-10%
577 : // speedup on BlockIter benchmarks.
578 1 : s := getBytes(ptr, int(v1))
579 1 : var k []byte
580 1 : if n := len(s) - 8; n >= 0 {
581 1 : k = s[:n:n]
582 1 : }
583 : // Else k is invalid, and left as nil
584 :
585 1 : if i.cmp(searchKey, k) > 0 {
586 1 : // The search key is greater than the user key at this restart point.
587 1 : // Search beyond this restart point, since we are trying to find the
588 1 : // first restart point with a user key >= the search key.
589 1 : index = h + 1 // preserves f(i-1) == false
590 1 : } else {
591 1 : // k >= search key, so prune everything after index (since index
592 1 : // satisfies the property we are looking for).
593 1 : upper = h // preserves f(j) == true
594 1 : }
595 : }
596 : // index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
597 : // => answer is index.
598 : }
599 :
600 : // index is the first restart point with key >= search key. Define the keys
601 : // between a restart point and the next restart point as belonging to that
602 : // restart point.
603 : //
604 : // Since keys are strictly increasing, if index > 0 then the restart point
605 : // at index-1 will be the first one that has some keys belonging to it that
606 : // could be equal to the search key. If index == 0, then all keys in this
607 : // block are larger than the key sought, and offset remains at zero.
608 1 : if index > 0 {
609 1 : i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
610 1 : }
611 1 : i.readEntry()
612 1 : hiddenPoint := i.decodeInternalKey(i.key)
613 1 :
614 1 : // Iterate from that restart point to somewhere >= the key sought.
615 1 : if !i.valid() {
616 1 : return nil, base.LazyValue{}
617 1 : }
618 :
619 : // A note on seeking in a block with a suffix replacement rule: even though
620 : // the binary search above was conducted on keys without suffix replacement,
621 : // Seek will still return the correct suffix replaced key. A binary
622 : // search without suffix replacement will land on a key that is _less_ than
623 : // the key the search would have landed on if all keys were already suffix
624 : // replaced. Since Seek then conducts forward iteration to the first suffix
625 : // replaced user key that is greater than or equal to the search key, the
626 : // correct key is still returned.
627 : //
628 : // As an example, consider the following block with a restart interval of 1,
629 : // with a replacement suffix of "4":
630 : // - Pre-suffix replacement: apple@1, banana@3
631 : // - Post-suffix replacement: apple@4, banana@4
632 : //
633 : // Suppose the client seeks with apple@3. Assuming suffixes sort in reverse
634 : // chronological order (i.e. apple@1>apple@3), the binary search without
635 : // suffix replacement would return apple@1. A binary search with suffix
636 : // replacement would return banana@4. After beginning forward iteration from
637 : // either returned restart point, forward iteration would
638 : // always return the correct key, banana@4.
639 : //
640 : // Further, if the user searched with apple@0 (i.e. a suffix less than the
641 : // pre replacement suffix) or with apple@5 (a suffix larger than the post
642 : // replacement suffix), the binary search with or without suffix replacement
643 : // would land on the same key, as we assume the following:
644 : // (1) no two keys in the sst share the same prefix.
645 : // (2) pebble.Compare(replacementSuffix,originalSuffix) > 0
646 :
647 1 : i.maybeReplaceSuffix()
648 1 :
649 1 : if !hiddenPoint && i.cmp(i.ikey.UserKey, key) >= 0 {
650 1 : // Initialize i.lazyValue
651 1 : if !i.lazyValueHandling.hasValuePrefix ||
652 1 : base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
653 1 : i.lazyValue = base.MakeInPlaceValue(i.val)
654 1 : } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
655 1 : i.lazyValue = base.MakeInPlaceValue(i.val[1:])
656 1 : } else {
657 1 : i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
658 1 : }
659 1 : return &i.ikey, i.lazyValue
660 : }
661 1 : for i.Next(); i.valid(); i.Next() {
662 1 : if i.cmp(i.ikey.UserKey, key) >= 0 {
663 1 : // i.Next() has already initialized i.lazyValue.
664 1 : return &i.ikey, i.lazyValue
665 1 : }
666 : }
667 1 : return nil, base.LazyValue{}
668 : }
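: // For readers mapping the manually inlined binary search in SeekGE back to
: // the standard library, a hedged equivalent would be the following (slower,
: // which is why it is not used; restartKeyAt is a hypothetical helper that
: // decodes the user key stored at a restart offset, and the "sort" package
: // would need to be imported):
: //
: //	index := int32(sort.Search(int(i.numRestarts), func(j int) bool {
: //		offset := decodeRestart(i.data[i.restarts+4*int32(j):])
: //		return i.cmp(searchKey, restartKeyAt(offset)) <= 0
: //	}))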
669 :
670 : // SeekPrefixGE implements internalIterator.SeekPrefixGE, as documented in the
671 : // pebble package.
672 : func (i *blockIter) SeekPrefixGE(
673 : prefix, key []byte, flags base.SeekGEFlags,
674 0 : ) (*base.InternalKey, base.LazyValue) {
675 0 : // This should never be called as prefix iteration is handled by sstable.Iterator.
676 0 : panic("pebble: SeekPrefixGE unimplemented")
677 : }
678 :
679 : // SeekLT implements internalIterator.SeekLT, as documented in the pebble
680 : // package.
681 1 : func (i *blockIter) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, base.LazyValue) {
682 1 : if invariants.Enabled && i.isDataInvalidated() {
683 0 : panic(errors.AssertionFailedf("invalidated blockIter used"))
684 : }
685 1 : searchKey := key
686 1 : if i.transforms.SyntheticPrefix != nil {
687 1 : // If the seek key does not start with SyntheticPrefix, it sorts either
688 1 : // before or after the entire block of prefixed keys. To determine which,
689 1 : // we compare it against firstUserKey, which has the synthetic prefix.
690 1 : if !bytes.HasPrefix(key, i.transforms.SyntheticPrefix) {
691 1 : if i.cmp(i.firstUserKey, key) < 0 {
692 1 : return i.Last()
693 1 : }
694 : // Set the offset to the beginning of the block to mimic an exhausted
695 : // iterator that has conducted backward interation. This ensures a
696 : // subsequent Next() call returns the first key in the block.
697 1 : i.offset = -1
698 1 : i.nextOffset = 0
699 1 : return nil, base.LazyValue{}
700 : }
701 1 : searchKey = key[len(i.transforms.SyntheticPrefix):]
702 : }
703 :
704 1 : i.clearCache()
705 1 : // Find the index of the smallest restart point whose key is >= the key
706 1 : // sought; index will be numRestarts if there is no such restart point.
707 1 : i.offset = 0
708 1 : var index int32
709 1 :
710 1 : {
711 1 : // NB: manually inlined sort.Search is ~5% faster.
712 1 : //
713 1 : // Define f(-1) == false and f(n) == true.
714 1 : // Invariant: f(index-1) == false, f(upper) == true.
715 1 : upper := i.numRestarts
716 1 : for index < upper {
717 1 : h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
718 1 : // index ≤ h < upper
719 1 : offset := decodeRestart(i.data[i.restarts+4*h:])
720 1 : // For a restart point, there are 0 bytes shared with the previous key.
721 1 : // The varint encoding of 0 occupies 1 byte.
722 1 : ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(offset+1))
723 1 :
724 1 : // Decode the key at that restart point, and compare it to the key
725 1 : // sought. See the comment in readEntry for why we manually inline the
726 1 : // varint decoding.
727 1 : var v1 uint32
728 1 : if a := *((*uint8)(ptr)); a < 128 {
729 1 : v1 = uint32(a)
730 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
731 1 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
732 0 : v1 = uint32(b)<<7 | uint32(a)
733 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
734 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
735 0 : v1 = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
736 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
737 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
738 0 : v1 = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
739 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
740 0 : } else {
741 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
742 0 : v1 = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
743 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
744 0 : }
745 :
746 1 : if *((*uint8)(ptr)) < 128 {
747 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
748 1 : } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))) < 128 {
749 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
750 1 : } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))) < 128 {
751 1 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
752 1 : } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))) < 128 {
753 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
754 0 : } else {
755 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
756 0 : }
757 :
758 : // Manually inlining part of base.DecodeInternalKey provides a 5-10%
759 : // speedup on BlockIter benchmarks.
760 1 : s := getBytes(ptr, int(v1))
761 1 : var k []byte
762 1 : if n := len(s) - 8; n >= 0 {
763 1 : k = s[:n:n]
764 1 : }
765 : // Else k is invalid, and left as nil
766 :
767 1 : if i.cmp(searchKey, k) > 0 {
768 1 : // The search key is greater than the user key at this restart point.
769 1 : // Search beyond this restart point, since we are trying to find the
770 1 : // first restart point with a user key >= the search key.
771 1 : index = h + 1 // preserves f(i-1) == false
772 1 : } else {
773 1 : // k >= search key, so prune everything after index (since index
774 1 : // satisfies the property we are looking for).
775 1 : upper = h // preserves f(j) == true
776 1 : }
777 : }
778 : // index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
779 : // => answer is index.
780 : }
781 :
782 1 : if index == 0 {
783 1 : if i.transforms.SyntheticSuffix.IsSet() {
784 1 : // The binary search was conducted on keys without suffix replacement,
785 1 : // implying the first key in the block may be less than the search key. To
786 1 : // double check, get the first key in the block with suffix replacement
787 1 : // and compare to the search key. Consider the following example: suppose
788 1 : // the user searches with a@3, the first key in the block is a@2 and the
789 1 : // block contains a suffix replacement rule of 4. Since a@3 sorts before
790 1 : // a@2, the binary search would return index==0. Without conducting the
791 1 : // suffix replacement, the SeekLT would incorrectly return nil. With
792 1 : // suffix replacement though, a@4 should be returned as a@4 sorts before
793 1 : // a@3.
794 1 : ikey, lazyVal := i.First()
795 1 : if i.cmp(ikey.UserKey, key) < 0 {
796 1 : return ikey, lazyVal
797 1 : }
798 : }
799 : // If index == 0 then all keys in this block are larger than the key
800 : // sought, so there is no match.
801 1 : i.offset = -1
802 1 : i.nextOffset = 0
803 1 : return nil, base.LazyValue{}
804 : }
805 :
806 : // INVARIANT: index > 0
807 :
808 : // Ignoring suffix replacement, index is the first restart point with key >=
809 : // search key. Define the keys between a restart point and the next restart
810 : // point as belonging to that restart point. Note that index could be equal to
811 : // i.numRestarts, i.e., we are past the last restart. Since keys are strictly
812 : // increasing, then the restart point at index-1 will be the first one that
813 : // has some keys belonging to it that are less than the search key.
814 : //
815 : // Next, we will search between the restart at index-1 and the restart point
816 : // at index, for the first key >= key, and then on finding it, return
817 : // i.Prev(). We need to know when we have hit the offset for index, since then
818 : // we can stop searching. targetOffset encodes that offset for index.
819 1 : targetOffset := i.restarts
820 1 : i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
821 1 : if index < i.numRestarts {
822 1 : targetOffset = decodeRestart(i.data[i.restarts+4*(index):])
823 1 :
824 1 : if i.transforms.SyntheticSuffix.IsSet() {
825 1 : // The binary search was conducted on keys without suffix replacement,
826 1 : // implying the returned restart point (index) may be less than the search
827 1 : // key, breaking the assumption described above.
828 1 : //
829 1 : // For example, consider this block with a replacement ts of 4 and a
830 1 : // restart interval of 1: pre-replacement the keys are a@3,b@2,c@3;
831 1 : // post-replacement they are a@4,b@4,c@4.
832 1 : //
833 1 : // Suppose the client calls SeekLT(b@3), SeekLT must return b@4.
834 1 : //
835 1 : // If the client calls SeekLT(b@3), the binary search would return b@2,
836 1 : // the lowest key geq to b@3, pre-suffix replacement. Then, SeekLT will
837 1 : // begin forward iteration from a@3, the previous restart point, to
838 1 : // b{suffix}. The iteration stops when it encounters a key geq to the
839 1 : // search key or if it reaches the upper bound. Without suffix
840 1 : // replacement, we can assume that the upper bound of this forward
841 1 : // iteration, b{suffix}, is greater than the search key, as implied by the
842 1 : // binary search.
843 1 : //
844 1 : // If we naively hold this assumption with suffix replacement, the
845 1 : // iteration would terminate at the upper bound, b@4, call i.Prev, and
846 1 : // incorrectly return a@4. To correct for this, if the original returned
847 1 : // index is less than the search key, shift our forward iteration to begin
848 1 : // at index instead of index -1. With suffix replacement the key at index
849 1 : // is guaranteed to be the highest restart point less than the seach key
850 1 : // (i.e. the same property of index-1 for a block without suffix
851 1 : // replacement). This property holds because of the invariant that a block
852 1 : // with suffix replacement will not have two keys that share the same
853 1 : // prefix. To consider the above example, binary searching with b@3 landed
854 1 : // naively at a@3, but since b@4<b@3, we shift our forward iteration to
855 1 : // begin at b@4. We never need to shift by more than one restart point
856 1 : // (i.e. to c@4) because it's impossible for the search key to be greater
857 1 : // than the key at the next restart point in the block because that
858 1 : // key will always have a different prefix. Put another way, because no
859 1 : // key in the block shares the same prefix, naive binary search should
860 1 : // always land at most 1 restart point off the correct one.
861 1 :
862 1 : naiveOffset := i.offset
863 1 : // Shift up to the original binary search result and decode the key.
864 1 : i.offset = targetOffset
865 1 : i.readEntry()
866 1 : i.decodeInternalKey(i.key)
867 1 : i.maybeReplaceSuffix()
868 1 :
869 1 : // If the binary search point is actually less than the search key, post
870 1 : // replacement, bump the target offset.
871 1 : if i.cmp(i.ikey.UserKey, key) < 0 {
872 1 : i.offset = targetOffset
873 1 : if index+1 < i.numRestarts {
874 1 : // if index+1 is within the i.data bounds, use it to find the target
875 1 : // offset.
876 1 : targetOffset = decodeRestart(i.data[i.restarts+4*(index+1):])
877 1 : } else {
878 1 : targetOffset = i.restarts
879 1 : }
880 1 : } else {
881 1 : i.offset = naiveOffset
882 1 : }
883 : }
884 : }
885 :
886 : // Init nextOffset for the forward iteration below.
887 1 : i.nextOffset = i.offset
888 1 :
889 1 : for {
890 1 : i.offset = i.nextOffset
891 1 : i.readEntry()
892 1 : // When hidden keys are common, there is additional optimization possible
893 1 : // by not caching entries that are hidden (note that some calls to
894 1 : // cacheEntry don't decode the internal key before caching, but checking
895 1 : // whether a key is hidden does not require full decoding). However, we do
896 1 : // need to use the blockEntry.offset in the cache for the first entry at
897 1 : // the reset point to do the binary search when the cache is empty -- so
898 1 : // we would need to cache that first entry (though not the key) even if
899 1 : // was hidden. Our current assumption is that if there are large numbers
900 1 : // of hidden keys we will be able to skip whole blocks (using block
901 1 : // property filters) so we don't bother optimizing.
902 1 : hiddenPoint := i.decodeInternalKey(i.key)
903 1 : i.maybeReplaceSuffix()
904 1 :
905 1 : // NB: we don't use the hiddenPoint return value of decodeInternalKey
906 1 : // since we want to stop as soon as we reach a key >= ikey.UserKey, so
907 1 : // that we can reverse.
908 1 : if i.cmp(i.ikey.UserKey, key) >= 0 {
909 1 : // The current key is greater than or equal to our search key. Back up to
910 1 : // the previous key which was less than our search key. Note that this for
911 1 : // loop will execute at least once with this if-block not being true, so
912 1 : // the key we are backing up to is the last one this loop cached.
913 1 : return i.Prev()
914 1 : }
915 :
916 1 : if i.nextOffset >= targetOffset {
917 1 : // We've reached the end of the current restart block. Return the
918 1 : // current key if not hidden, else call Prev().
919 1 : //
920 1 : // When the restart interval is 1, the first iteration of the for loop
921 1 : // will bring us here. In that case ikey is backed by the block so we
922 1 : // get the desired key stability guarantee for the lifetime of the
923 1 : // blockIter. That is, we never cache anything and therefore never
924 1 : // return a key backed by cachedBuf.
925 1 : if hiddenPoint {
926 1 : return i.Prev()
927 1 : }
928 1 : break
929 : }
930 1 : i.cacheEntry()
931 : }
932 :
933 1 : if !i.valid() {
934 1 : return nil, base.LazyValue{}
935 1 : }
936 1 : if !i.lazyValueHandling.hasValuePrefix ||
937 1 : base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
938 1 : i.lazyValue = base.MakeInPlaceValue(i.val)
939 1 : } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
940 1 : i.lazyValue = base.MakeInPlaceValue(i.val[1:])
941 1 : } else {
942 1 : i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
943 1 : }
944 1 : return &i.ikey, i.lazyValue
945 : }
946 :
947 : // First implements internalIterator.First, as documented in the pebble
948 : // package.
949 1 : func (i *blockIter) First() (*InternalKey, base.LazyValue) {
950 1 : if invariants.Enabled && i.isDataInvalidated() {
951 0 : panic(errors.AssertionFailedf("invalidated blockIter used"))
952 : }
953 :
954 1 : i.offset = 0
955 1 : if !i.valid() {
956 1 : return nil, base.LazyValue{}
957 1 : }
958 1 : i.clearCache()
959 1 : i.readEntry()
960 1 : hiddenPoint := i.decodeInternalKey(i.key)
961 1 : if hiddenPoint {
962 1 : return i.Next()
963 1 : }
964 1 : i.maybeReplaceSuffix()
965 1 : if !i.lazyValueHandling.hasValuePrefix ||
966 1 : base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
967 1 : i.lazyValue = base.MakeInPlaceValue(i.val)
968 1 : } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
969 1 : i.lazyValue = base.MakeInPlaceValue(i.val[1:])
970 1 : } else {
971 1 : i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
972 1 : }
973 1 : return &i.ikey, i.lazyValue
974 : }
975 :
976 1 : func decodeRestart(b []byte) int32 {
977 1 : _ = b[3] // bounds check hint to compiler; see golang.org/issue/14808
978 1 : return int32(uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 |
979 1 : uint32(b[3]&restartMaskLittleEndianHighByteWithoutSetHasSamePrefix)<<24)
980 1 : }
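: // A sketch of the writer-side encoding decodeRestart reverses (illustrative
: // only, not the writer used by this package): the restart offset is written
: // as a little-endian uint32 whose most significant bit can carry the
: // set-has-same-prefix flag, which the mask above strips off.
: //
: //	func encodeRestart(dst []byte, offset int32, setHasSamePrefix bool) {
: //		v := uint32(offset)
: //		if setHasSamePrefix {
: //			v |= 1 << 31 // flag lives in the top bit of the high byte
: //		}
: //		binary.LittleEndian.PutUint32(dst, v)
: //	}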
981 :
982 : // Last implements internalIterator.Last, as documented in the pebble package.
983 1 : func (i *blockIter) Last() (*InternalKey, base.LazyValue) {
984 1 : if invariants.Enabled && i.isDataInvalidated() {
985 0 : panic(errors.AssertionFailedf("invalidated blockIter used"))
986 : }
987 :
988 : // Seek forward from the last restart point.
989 1 : i.offset = decodeRestart(i.data[i.restarts+4*(i.numRestarts-1):])
990 1 : if !i.valid() {
991 1 : return nil, base.LazyValue{}
992 1 : }
993 :
994 1 : i.readEntry()
995 1 : i.clearCache()
996 1 :
997 1 : for i.nextOffset < i.restarts {
998 1 : i.cacheEntry()
999 1 : i.offset = i.nextOffset
1000 1 : i.readEntry()
1001 1 : }
1002 :
1003 1 : hiddenPoint := i.decodeInternalKey(i.key)
1004 1 : if hiddenPoint {
1005 1 : return i.Prev()
1006 1 : }
1007 1 : i.maybeReplaceSuffix()
1008 1 : if !i.lazyValueHandling.hasValuePrefix ||
1009 1 : base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
1010 1 : i.lazyValue = base.MakeInPlaceValue(i.val)
1011 1 : } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
1012 1 : i.lazyValue = base.MakeInPlaceValue(i.val[1:])
1013 1 : } else {
1014 1 : i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
1015 1 : }
1016 1 : return &i.ikey, i.lazyValue
1017 : }
1018 :
1019 : // Next implements internalIterator.Next, as documented in the pebble
1020 : // package.
1021 1 : func (i *blockIter) Next() (*InternalKey, base.LazyValue) {
1022 1 : if len(i.cachedBuf) > 0 {
1023 1 : // We're switching from reverse iteration to forward iteration. We need to
1024 1 : // populate i.fullKey with the current key we're positioned at so that
1025 1 : // readEntry() can use i.fullKey for key prefix decompression. Note that we
1026 1 : // don't know whether i.key is backed by i.cachedBuf or i.fullKey (if
1027 1 : // SeekLT was the previous call, i.key may be backed by i.fullKey), but
1028 1 : // copying into i.fullKey works for both cases.
1029 1 : //
1030 1 : // TODO(peter): Rather than clearing the cache, we could instead use the
1031 1 : // cache until it is exhausted. This would likely be faster than falling
1032 1 : // through to the normal forward iteration code below.
1033 1 : i.fullKey = append(i.fullKey[:0], i.key...)
1034 1 : i.clearCache()
1035 1 : }
1036 :
1037 : start:
1038 1 : i.offset = i.nextOffset
1039 1 : if !i.valid() {
1040 1 : return nil, base.LazyValue{}
1041 1 : }
1042 1 : i.readEntry()
1043 1 : // Manually inlined version of i.decodeInternalKey(i.key).
1044 1 : if n := len(i.key) - 8; n >= 0 {
1045 1 : trailer := binary.LittleEndian.Uint64(i.key[n:])
1046 1 : hiddenPoint := i.transforms.HideObsoletePoints &&
1047 1 : (trailer&trailerObsoleteBit != 0)
1048 1 : i.ikey.Trailer = trailer & trailerObsoleteMask
1049 1 : i.ikey.UserKey = i.key[:n:n]
1050 1 : if n := i.transforms.SyntheticSeqNum; n != 0 {
1051 1 : i.ikey.SetSeqNum(uint64(n))
1052 1 : }
1053 1 : if hiddenPoint {
1054 1 : goto start
1055 : }
1056 1 : if i.transforms.SyntheticSuffix.IsSet() {
1057 1 : // Inlined version of i.maybeReplaceSuffix()
1058 1 : prefixLen := i.split(i.ikey.UserKey)
1059 1 : i.synthSuffixBuf = append(i.synthSuffixBuf[:0], i.ikey.UserKey[:prefixLen]...)
1060 1 : i.synthSuffixBuf = append(i.synthSuffixBuf, i.transforms.SyntheticSuffix...)
1061 1 : i.ikey.UserKey = i.synthSuffixBuf
1062 1 : }
1063 0 : } else {
1064 0 : i.ikey.Trailer = uint64(InternalKeyKindInvalid)
1065 0 : i.ikey.UserKey = nil
1066 0 : }
1067 1 : if !i.lazyValueHandling.hasValuePrefix ||
1068 1 : base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
1069 1 : i.lazyValue = base.MakeInPlaceValue(i.val)
1070 1 : } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
1071 1 : i.lazyValue = base.MakeInPlaceValue(i.val[1:])
1072 1 : } else {
1073 1 : i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
1074 1 : }
1075 1 : return &i.ikey, i.lazyValue
1076 : }
1077 :
1078 : // NextPrefix implements (base.InternalIterator).NextPrefix.
1079 1 : func (i *blockIter) NextPrefix(succKey []byte) (*InternalKey, base.LazyValue) {
1080 1 : if i.lazyValueHandling.hasValuePrefix {
1081 1 : return i.nextPrefixV3(succKey)
1082 1 : }
1083 1 : const nextsBeforeSeek = 3
1084 1 : k, v := i.Next()
1085 1 : for j := 1; k != nil && i.cmp(k.UserKey, succKey) < 0; j++ {
1086 1 : if j >= nextsBeforeSeek {
1087 1 : return i.SeekGE(succKey, base.SeekGEFlagsNone)
1088 1 : }
1089 1 : k, v = i.Next()
1090 : }
1091 1 : return k, v
1092 : }
1093 :
1094 1 : func (i *blockIter) nextPrefixV3(succKey []byte) (*InternalKey, base.LazyValue) {
1095 1 : // Doing nexts that involve a key comparison can be expensive (and the cost
1096 1 : // depends on the key length), so we use the same threshold of 3 that we use
1097 1 : // for TableFormatPebblev2 in blockIter.nextPrefix above. The next fast path
1098 1 : // that looks at setHasSamePrefix takes ~5ns per key, which is ~150x faster
1099 1 : // than doing a SeekGE within the block, so we do this 16 times
1100 1 : // (~5ns*16=80ns), and then switch to looking at restarts. Doing the binary
1101 1 : // search for the restart consumes > 100ns. If the number of versions is >
1102 1 : // 17, we will increment nextFastCount to 17, then do a binary search, and
1103 1 : // on average need to find a key between two restarts, so another 8 steps
1104 1 : // corresponding to nextFastCount, for a mean total of 17 + 8 = 25 such
1105 1 : // steps.
1106 1 : //
1107 1 : // TODO(sumeer): use the configured restartInterval for the sstable when it
1108 1 : // was written (which we don't currently store) instead of the default value
1109 1 : // of 16.
1110 1 : const nextCmpThresholdBeforeSeek = 3
1111 1 : const nextFastThresholdBeforeRestarts = 16
1112 1 : nextCmpCount := 0
1113 1 : nextFastCount := 0
1114 1 : usedRestarts := false
1115 1 : // INVARIANT: blockIter is valid.
1116 1 : if invariants.Enabled && !i.valid() {
1117 0 : panic(errors.AssertionFailedf("nextPrefixV3 called on invalid blockIter"))
1118 : }
1119 1 : prevKeyIsSet := i.ikey.Kind() == InternalKeyKindSet
1120 1 : for {
1121 1 : i.offset = i.nextOffset
1122 1 : if !i.valid() {
1123 1 : return nil, base.LazyValue{}
1124 1 : }
1125 : // Need to decode the length integers, so we can compute nextOffset.
1126 1 : ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(i.offset))
1127 1 : // This is an ugly performance hack. Reading entries from blocks is one of
1128 1 : // the inner-most routines and decoding the 3 varints per-entry takes
1129 1 : // significant time. Neither go1.11 nor go1.12 will inline decodeVarint for
1130 1 : // us, so we do it manually. This provides a 10-15% performance improvement
1131 1 : // on blockIter benchmarks on both go1.11 and go1.12.
1132 1 : //
1133 1 : // TODO(peter): remove this hack if go:inline is ever supported.
1134 1 :
1135 1 : // Decode the shared key length integer.
1136 1 : var shared uint32
1137 1 : if a := *((*uint8)(ptr)); a < 128 {
1138 1 : shared = uint32(a)
1139 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
1140 1 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
1141 0 : shared = uint32(b)<<7 | uint32(a)
1142 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
1143 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
1144 0 : shared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1145 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
1146 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
1147 0 : shared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1148 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
1149 0 : } else {
1150 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
1151 0 : shared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1152 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
1153 0 : }
1154 : // Decode the unshared key length integer.
1155 1 : var unshared uint32
1156 1 : if a := *((*uint8)(ptr)); a < 128 {
1157 1 : unshared = uint32(a)
1158 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
1159 1 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
1160 0 : unshared = uint32(b)<<7 | uint32(a)
1161 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
1162 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
1163 0 : unshared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1164 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
1165 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
1166 0 : unshared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1167 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
1168 0 : } else {
1169 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
1170 0 : unshared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1171 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
1172 0 : }
1173 : // Decode the value length integer.
1174 1 : var value uint32
1175 1 : if a := *((*uint8)(ptr)); a < 128 {
1176 1 : value = uint32(a)
1177 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
1178 1 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
1179 0 : value = uint32(b)<<7 | uint32(a)
1180 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
1181 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
1182 0 : value = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1183 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
1184 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
1185 0 : value = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1186 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
1187 0 : } else {
1188 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
1189 0 : value = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1190 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
1191 0 : }
1192 1 : if i.transforms.SyntheticPrefix != nil {
1193 0 : shared += uint32(len(i.transforms.SyntheticPrefix))
1194 0 : }
1195 : // The starting position of the value.
1196 1 : valuePtr := unsafe.Pointer(uintptr(ptr) + uintptr(unshared))
1197 1 : i.nextOffset = int32(uintptr(valuePtr)-uintptr(i.ptr)) + int32(value)
1198 1 : if invariants.Enabled && unshared < 8 {
1199 0 : // This should not happen since only the key prefix is shared, so even
1200 0 : // if the prefix length is the same as the user key length, the unshared
1201 0 : // will include the trailer.
1202 0 : panic(errors.AssertionFailedf("unshared %d is too small", unshared))
1203 : }
1204 : // The trailer is written in little endian, so the key kind is the first
1205 : // byte in the trailer that is encoded in the slice [unshared-8:unshared].
1206 1 : keyKind := InternalKeyKind((*[manual.MaxArrayLen]byte)(ptr)[unshared-8])
1207 1 : keyKind = keyKind & base.InternalKeyKindSSTableInternalObsoleteMask
1208 1 : prefixChanged := false
1209 1 : if keyKind == InternalKeyKindSet {
1210 1 : if invariants.Enabled && value == 0 {
1211 0 : panic(errors.AssertionFailedf("value is of length 0, but we expect a valuePrefix"))
1212 : }
1213 1 : valPrefix := *((*valuePrefix)(valuePtr))
1214 1 : if setHasSamePrefix(valPrefix) {
1215 1 : // Fast-path. No need to assemble i.fullKey, or update i.key. We know
1216 1 : // that subsequent keys will not have a shared length that is greater
1217 1 : // than the prefix of the current key, which is also the prefix of
1218 1 : // i.key. Since we are continuing to iterate, we don't need to
1219 1 : // initialize i.ikey and i.lazyValue (these are initialized before
1220 1 : // returning).
1221 1 : nextFastCount++
1222 1 : if nextFastCount > nextFastThresholdBeforeRestarts {
1223 1 : if usedRestarts {
1224 0 : // Exhausted iteration budget. This will never happen unless
1225 0 : // someone is using a restart interval > 16. It is just to guard
1226 0 : // against long restart intervals causing too much iteration.
1227 0 : break
1228 : }
1229 : // Haven't used restarts yet, so find the first restart at or beyond
1230 : // the current offset.
1231 1 : targetOffset := i.offset
1232 1 : var index int32
1233 1 : {
1234 1 : // NB: manually inlined sort.Search is ~5% faster.
1235 1 : //
1236 1 : // f defined for a restart point is true iff the offset >=
1237 1 : // targetOffset.
1238 1 : // Define f(-1) == false and f(i.numRestarts) == true.
1239 1 : // Invariant: f(index-1) == false, f(upper) == true.
1240 1 : upper := i.numRestarts
1241 1 : for index < upper {
1242 1 : h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
1243 1 : // index ≤ h < upper
1244 1 : offset := decodeRestart(i.data[i.restarts+4*h:])
1245 1 : if offset < targetOffset {
1246 1 : index = h + 1 // preserves f(index-1) == false
1247 1 : } else {
1248 1 : upper = h // preserves f(upper) == true
1249 1 : }
1250 : }
1251 : // index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
1252 : // => answer is index.
1253 : }
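// The manually inlined loop above is equivalent to the following
// sort.Search call from the standard library; it is unrolled here because
// that is measurably faster, per the NB above:
//
//	index = int32(sort.Search(int(i.numRestarts), func(j int) bool {
//		return decodeRestart(i.data[i.restarts+4*int32(j):]) >= targetOffset
//	}))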
1254 1 : usedRestarts = true
1255 1 : nextFastCount = 0
1256 1 : if index == i.numRestarts {
1257 1 : // Already past the last real restart, so iterate a bit more until
1258 1 : // we are done with the block.
1259 1 : continue
1260 : }
1261 : // Have some real restarts after index. NB: index is the first
1262 : // restart at or beyond the current offset.
1263 1 : startingIndex := index
1264 1 : for index != i.numRestarts &&
1265 1 : // The restart at index is 4 bytes written in little endian format
1266 1 : // starting at i.restarts+4*index. The 0th byte is the least
1267 1 : // significant and the 3rd byte is the most significant. Since the
1268 1 : // most significant bit of the 3rd byte is what we use for
1269 1 : // encoding the set-has-same-prefix information, the indexing
1270 1 : // below has +3.
1271 1 : i.data[i.restarts+4*index+3]&restartMaskLittleEndianHighByteOnlySetHasSamePrefix != 0 {
1272 1 : // We still have the same prefix, so move to the next restart.
1273 1 : index++
1274 1 : }
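// Conceptually, each 4-byte restart entry packs the entry's offset in its
// low 31 bits and the set-has-same-prefix flag in its top bit (which
// decodeRestart strips when returning the offset). Reading both pieces
// explicitly would look roughly like:
//
//	word := binary.LittleEndian.Uint32(i.data[i.restarts+4*index:])
//	setHasSamePrefix := word&(1<<31) != 0
//	restartOffset := int32(word &^ (1 << 31))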
1275 : // index is the first restart that did not have the same prefix.
1276 1 : if index != startingIndex {
1277 1 : // Managed to skip past at least one restart. Resume iteration
1278 1 : // from index-1. Since nextFastCount has been reset to 0, we
1279 1 : // should be able to iterate to the next prefix.
1280 1 : i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
1281 1 : i.readEntry()
1282 1 : }
1283 : // Else, unable to skip past any restart. Resume iteration. Since
1284 : // nextFastCount has been reset to 0, we should be able to iterate
1285 : // to the next prefix.
1286 1 : continue
1287 : }
1288 1 : continue
1289 1 : } else if prevKeyIsSet {
1290 1 : prefixChanged = true
1291 1 : }
1292 1 : } else {
1293 1 : prevKeyIsSet = false
1294 1 : }
1295 : // Slow-path cases:
1296 : // - (Likely) The prefix has changed.
1297 : // - (Unlikely) The prefix has not changed.
1298 : // We assemble the key etc. under the assumption that it is the likely
1299 : // case.
1300 1 : unsharedKey := getBytes(ptr, int(unshared))
1301 1 : // TODO(sumeer): move this into the else block below. This is a bit tricky
1302 1 : // since the current logic assumes we have always copied the latest key
1303 1 : // into fullKey, which is why, when we get to the next key, we can (a)
1304 1 : // access i.fullKey[:shared], and (b) append only the unsharedKey to
1305 1 : // i.fullKey. For (a), we could instead access i.key[:shared] since that
1306 1 : // memory is valid (even if unshared). For (b), we would need to remember
1307 1 : // whether i.key refers to i.fullKey or not: we can append the unsharedKey
1308 1 : // only in the former case; in the latter case we need to copy the shared
1309 1 : // part too. This same comment applies to the other place where we can do
1310 1 : // this optimization, in readEntry().
1311 1 : i.fullKey = append(i.fullKey[:shared], unsharedKey...)
1312 1 : i.val = getBytes(valuePtr, int(value))
1313 1 : if shared == 0 {
1314 1 : // Provide stability for the key across positioning calls if the key
1315 1 : // doesn't share a prefix with the previous key. This avoids requiring the
1316 1 : // key to be copied when the caller knows the block has a restart interval
1317 1 : // of 1. An important example of this is range-del blocks.
1318 1 : i.key = unsharedKey
1319 1 : } else {
1320 1 : i.key = i.fullKey
1321 1 : }
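// As a concrete illustration of undoing the prefix compression above
// (ignoring the 8-byte trailer that the unshared bytes also carry): if the
// previous key was "blockiter" and the current entry stores shared=5 with
// unshared bytes "chain", the reconstructed key is "block" + "chain":
//
//	prev := []byte("blockiter")
//	shared, unsharedKey := 5, []byte("chain")
//	fullKey := append(append([]byte(nil), prev[:shared]...), unsharedKey...)
//	// fullKey == []byte("blockchain")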
1322 : // Manually inlined version of i.decodeInternalKey(i.key).
1323 1 : hiddenPoint := false
1324 1 : if n := len(i.key) - 8; n >= 0 {
1325 1 : trailer := binary.LittleEndian.Uint64(i.key[n:])
1326 1 : hiddenPoint = i.transforms.HideObsoletePoints &&
1327 1 : (trailer&trailerObsoleteBit != 0)
1328 1 : i.ikey.Trailer = trailer & trailerObsoleteMask
1329 1 : i.ikey.UserKey = i.key[:n:n]
1330 1 : if n := i.transforms.SyntheticSeqNum; n != 0 {
1331 0 : i.ikey.SetSeqNum(uint64(n))
1332 0 : }
1333 1 : if i.transforms.SyntheticSuffix.IsSet() {
1334 0 : // Inlined version of i.maybeReplaceSuffix()
1335 0 : prefixLen := i.split(i.ikey.UserKey)
1336 0 : i.synthSuffixBuf = append(i.synthSuffixBuf[:0], i.ikey.UserKey[:prefixLen]...)
1337 0 : i.synthSuffixBuf = append(i.synthSuffixBuf, i.transforms.SyntheticSuffix...)
1338 0 : i.ikey.UserKey = i.synthSuffixBuf
1339 0 : }
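// Suffix replacement rewrites the part of the user key that follows its
// prefix (as determined by i.split) with the configured synthetic suffix.
// With an illustrative split function that treats everything from '@'
// onwards as the suffix, and a synthetic suffix of "@9":
//
//	userKey := []byte("row1@7")
//	prefixLen := 4 // split("row1@7") == len("row1")
//	buf := append(append([]byte(nil), userKey[:prefixLen]...), "@9"...)
//	// buf == []byte("row1@9")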
1340 0 : } else {
1341 0 : i.ikey.Trailer = uint64(InternalKeyKindInvalid)
1342 0 : i.ikey.UserKey = nil
1343 0 : }
1344 1 : nextCmpCount++
1345 1 : if invariants.Enabled && prefixChanged && i.cmp(i.ikey.UserKey, succKey) < 0 {
1346 0 : panic(errors.AssertionFailedf("prefix should have changed but %x < %x",
1347 0 : i.ikey.UserKey, succKey))
1348 : }
1349 1 : if prefixChanged || i.cmp(i.ikey.UserKey, succKey) >= 0 {
1350 1 : // Prefix has changed.
1351 1 : if hiddenPoint {
1352 0 : return i.Next()
1353 0 : }
1354 1 : if invariants.Enabled && !i.lazyValueHandling.hasValuePrefix {
1355 0 : panic(errors.AssertionFailedf("nextPrefixV3 being run for non-v3 sstable"))
1356 : }
1357 1 : if base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
1358 1 : i.lazyValue = base.MakeInPlaceValue(i.val)
1359 1 : } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
1360 1 : i.lazyValue = base.MakeInPlaceValue(i.val[1:])
1361 1 : } else {
1362 0 : i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
1363 0 : }
1364 1 : return &i.ikey, i.lazyValue
1365 : }
1366 : // Else prefix has not changed.
1367 :
1368 1 : if nextCmpCount >= nextCmpThresholdBeforeSeek {
1369 1 : break
1370 : }
1371 : }
1372 1 : return i.SeekGE(succKey, base.SeekGEFlagsNone)
1373 : }
1374 :
1375 : // Prev implements internalIterator.Prev, as documented in the pebble
1376 : // package.
1377 1 : func (i *blockIter) Prev() (*InternalKey, base.LazyValue) {
1378 1 : start:
1379 1 : for n := len(i.cached) - 1; n >= 0; n-- {
1380 1 : i.nextOffset = i.offset
1381 1 : e := &i.cached[n]
1382 1 : i.offset = e.offset
1383 1 : i.val = getBytes(unsafe.Pointer(uintptr(i.ptr)+uintptr(e.valStart)), int(e.valSize))
1384 1 : // Manually inlined version of i.decodeInternalKey(i.key).
1385 1 : i.key = i.cachedBuf[e.keyStart:e.keyEnd]
1386 1 : if n := len(i.key) - 8; n >= 0 {
1387 1 : trailer := binary.LittleEndian.Uint64(i.key[n:])
1388 1 : hiddenPoint := i.transforms.HideObsoletePoints &&
1389 1 : (trailer&trailerObsoleteBit != 0)
1390 1 : if hiddenPoint {
1391 1 : continue
1392 : }
1393 1 : i.ikey.Trailer = trailer & trailerObsoleteMask
1394 1 : i.ikey.UserKey = i.key[:n:n]
1395 1 : if n := i.transforms.SyntheticSeqNum; n != 0 {
1396 1 : i.ikey.SetSeqNum(uint64(n))
1397 1 : }
1398 1 : if i.transforms.SyntheticSuffix.IsSet() {
1399 1 : // Inlined version of i.maybeReplaceSuffix()
1400 1 : prefixLen := i.split(i.ikey.UserKey)
1401 1 : // If ikey is cached or may get cached, we must de-reference
1402 1 : // UserKey before suffix replacement.
1403 1 : i.synthSuffixBuf = append(i.synthSuffixBuf[:0], i.ikey.UserKey[:prefixLen]...)
1404 1 : i.synthSuffixBuf = append(i.synthSuffixBuf, i.transforms.SyntheticSuffix...)
1405 1 : i.ikey.UserKey = i.synthSuffixBuf
1406 1 : }
1407 0 : } else {
1408 0 : i.ikey.Trailer = uint64(InternalKeyKindInvalid)
1409 0 : i.ikey.UserKey = nil
1410 0 : }
1411 1 : i.cached = i.cached[:n]
1412 1 : if !i.lazyValueHandling.hasValuePrefix ||
1413 1 : base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
1414 1 : i.lazyValue = base.MakeInPlaceValue(i.val)
1415 1 : } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
1416 1 : i.lazyValue = base.MakeInPlaceValue(i.val[1:])
1417 1 : } else {
1418 1 : i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
1419 1 : }
1420 1 : return &i.ikey, i.lazyValue
1421 : }
1422 :
1423 1 : i.clearCache()
1424 1 : if i.offset <= 0 {
1425 1 : i.offset = -1
1426 1 : i.nextOffset = 0
1427 1 : return nil, base.LazyValue{}
1428 1 : }
1429 :
1430 1 : targetOffset := i.offset
1431 1 : var index int32
1432 1 :
1433 1 : {
1434 1 : // NB: manually inlined sort.Search is ~5% faster.
1435 1 : //
1436 1 : // Define f(-1) == false and f(i.numRestarts) == true.
1437 1 : // Invariant: f(index-1) == false, f(upper) == true.
1438 1 : upper := i.numRestarts
1439 1 : for index < upper {
1440 1 : h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
1441 1 : // index ≤ h < upper
1442 1 : offset := decodeRestart(i.data[i.restarts+4*h:])
1443 1 : if offset < targetOffset {
1444 1 : // Looking for the first restart that has offset >= targetOffset, so
1445 1 : // ignore h and earlier.
1446 1 : index = h + 1 // preserves f(index-1) == false
1447 1 : } else {
1448 1 : upper = h // preserves f(upper) == true
1449 1 : }
1450 : }
1451 : // index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
1452 : // => answer is index.
1453 : }
1454 :
1455 : // index is the first restart with offset >= targetOffset. Note that
1456 : // targetOffset may not be at a restart point since one can call Prev()
1457 : // after Next() (so the cache was not populated) and targetOffset refers to
1458 : // the current entry. index-1 must have an offset < targetOffset (it can't
1459 : // be equal to targetOffset since the binary search would have selected that
1460 : // as the index).
1461 1 : i.offset = 0
1462 1 : if index > 0 {
1463 1 : i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
1464 1 : }
1465 : // TODO(sumeer): why is the else case not an error, given that targetOffset
1466 : // is a valid offset?
1467 :
1468 1 : i.readEntry()
1469 1 :
1470 1 : // We stop when i.nextOffset == targetOffset since targetOffset is the
1471 1 : // offset of the entry we are stepping back from; we don't need to cache
1472 1 : // the entry before it, since it is the candidate to return.
1473 1 : for i.nextOffset < targetOffset {
1474 1 : i.cacheEntry()
1475 1 : i.offset = i.nextOffset
1476 1 : i.readEntry()
1477 1 : }
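// The overall strategy when the cache is empty: binary search for the
// restart point preceding the current entry, then scan forward from it,
// caching every entry between the restart and the entry being returned so
// that subsequent Prev calls are served from the cache. A simplified
// sketch of that forward scan, where restartBefore and entryEndOffset are
// illustrative stand-ins for the binary search and readEntry steps above:
//
//	offset := restartBefore(targetOffset) // greatest restart offset < targetOffset
//	for {
//		next := entryEndOffset(offset) // offset just past the entry at `offset`
//		if next >= targetOffset {
//			break // the entry at `offset` is the one to return
//		}
//		cacheEntry(offset)
//		offset = next
//	}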
1478 :
1479 1 : hiddenPoint := i.decodeInternalKey(i.key)
1480 1 : if hiddenPoint {
1481 1 : // Use the cache.
1482 1 : goto start
1483 : }
1484 1 : if i.transforms.SyntheticSuffix.IsSet() {
1485 1 : // Inlined version of i.maybeReplaceSuffix()
1486 1 : prefixLen := i.split(i.ikey.UserKey)
1487 1 : // If ikey is cached or may get cached, we must de-reference
1488 1 : // UserKey before suffix replacement.
1489 1 : i.synthSuffixBuf = append(i.synthSuffixBuf[:0], i.ikey.UserKey[:prefixLen]...)
1490 1 : i.synthSuffixBuf = append(i.synthSuffixBuf, i.transforms.SyntheticSuffix...)
1491 1 : i.ikey.UserKey = i.synthSuffixBuf
1492 1 : }
1493 1 : if !i.lazyValueHandling.hasValuePrefix ||
1494 1 : base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
1495 1 : i.lazyValue = base.MakeInPlaceValue(i.val)
1496 1 : } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
1497 1 : i.lazyValue = base.MakeInPlaceValue(i.val[1:])
1498 1 : } else {
1499 1 : i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
1500 1 : }
1501 1 : return &i.ikey, i.lazyValue
1502 : }
1503 :
1504 : // Key implements internalIterator.Key, as documented in the pebble package.
1505 1 : func (i *blockIter) Key() *InternalKey {
1506 1 : return &i.ikey
1507 1 : }
1508 :
1509 1 : func (i *blockIter) value() base.LazyValue {
1510 1 : return i.lazyValue
1511 1 : }
1512 :
1513 : // Error implements internalIterator.Error, as documented in the pebble
1514 : // package.
1515 1 : func (i *blockIter) Error() error {
1516 1 : return nil // infallible
1517 1 : }
1518 :
1519 : // Close implements internalIterator.Close, as documented in the pebble
1520 : // package.
1521 1 : func (i *blockIter) Close() error {
1522 1 : i.handle.Release()
1523 1 : i.handle = bufferHandle{}
1524 1 : i.val = nil
1525 1 : i.lazyValue = base.LazyValue{}
1526 1 : i.lazyValueHandling.vbr = nil
1527 1 : return nil
1528 1 : }
1529 :
1530 0 : func (i *blockIter) SetBounds(lower, upper []byte) {
1531 0 : // This should never be called as bounds are handled by sstable.Iterator.
1532 0 : panic("pebble: SetBounds unimplemented")
1533 : }
1534 :
1535 0 : func (i *blockIter) SetContext(_ context.Context) {}
1536 :
1537 1 : func (i *blockIter) valid() bool {
1538 1 : return i.offset >= 0 && i.offset < i.restarts
1539 1 : }
|