Line data Source code
1 : // Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package rowblk
6 :
7 : import (
8 : "bytes"
9 : "context"
10 : "encoding/binary"
11 : "io"
12 : "slices"
13 : "sort"
14 : "unsafe"
15 :
16 : "github.com/cockroachdb/errors"
17 : "github.com/cockroachdb/pebble/internal/base"
18 : "github.com/cockroachdb/pebble/internal/invariants"
19 : "github.com/cockroachdb/pebble/internal/manual"
20 : "github.com/cockroachdb/pebble/internal/treeprinter"
21 : "github.com/cockroachdb/pebble/sstable/block"
22 : )
23 :
24 : // Iter is an iterator over a single block of data.
25 : //
26 : // An Iter provides an additional guarantee around key stability when a block
27 : // has a restart interval of 1 (i.e. when there is no prefix compression). Key
28 : // stability refers to whether the InternalKey.UserKey bytes returned by a
29 : // positioning call will remain stable after a subsequent positioning call. The
30 : // normal case is that a positioning call will invalidate any previously
31 : // returned InternalKey.UserKey. If a block has a restart interval of 1 (no
32 : // prefix compression), Iter guarantees that InternalKey.UserKey will point to
33 : // the key as stored in the block itself which will remain valid until the Iter
34 : // is closed. The key stability guarantee is used by the range tombstone and
35 : // range key code, which knows that the respective blocks are always encoded
36 : // with a restart interval of 1. This per-block key stability guarantee is
37 : // sufficient for range tombstones and range keys as they are always encoded
38 : // in a single block. Note: this stability guarantee no longer holds for a block
39 : // iter with synthetic prefix/suffix replacement, but we don't use the synthetic
40 : // suffix/prefix functionality of Iter for range keys.
41 : //
42 : // An Iter also provides a value stability guarantee for range deletions and
43 : // range keys since there is only a single range deletion and range key block
44 : // per sstable and the Iter will not release the bytes for the block until it is
45 : // closed.
46 : //
47 : // Note on why Iter knows about lazyValueHandling:
48 : //
49 : // Iter's positioning functions (that return a LazyValue) are too
50 : // complex to inline even prior to lazyValueHandling. Iter.Next and
51 : // Iter.First were by far the cheapest and had costs 195 and 180
52 : // respectively, which exceed the budget of 80. We initially tried to keep
53 : // the lazyValueHandling logic out of Iter by wrapping it with a
54 : // lazyValueDataBlockIter. singleLevelIter and twoLevelIter would use this
55 : // wrapped iter. The functions in lazyValueDataBlockIter were simple, in that
56 : // they called the corresponding Iter func and then decided whether the
57 : // value was in fact in-place (so return immediately) or needed further
58 : // handling. But these also turned out too costly for mid-stack inlining since
59 : // simple calls like the following have a high cost that is barely under the
60 : // budget of 80
61 : //
62 : // k, v := i.data.SeekGE(key, flags) // cost 74
63 : // k, v := i.data.Next() // cost 72
64 : //
65 : // We have 2 options for minimizing performance regressions:
66 : // - Include the lazyValueHandling logic in the already non-inlineable
67 : // Iter functions: Since most of the time is spent in data block iters,
68 : // it is acceptable to take the small hit of unnecessary branching (which
69 : // hopefully branch prediction will predict correctly) for other kinds of
70 : // blocks.
71 : // - Duplicate the logic of singleLevelIterator and twoLevelIterator for the
72 : // v3 sstable and only use the aforementioned lazyValueDataBlockIter for a
73 : // v3 sstable. We would want to manage these copies via code generation.
74 : //
75 : // We have picked the first option here.
76 : type Iter struct {
77 : cmp base.Compare
78 : split base.Split
79 :
80 : // Iterator transforms.
81 : //
82 : // SyntheticSuffix, if set, will replace the decoded ikey.UserKey suffix
83 : // before the key is returned to the user. A sequence of iter operations on a
84 : // block with a syntheticSuffix rule should return keys as if those operations
85 : // ran on a block with keys that all had the syntheticSuffix. As an example:
86 : // any sequence of block iter cmds should return the same keys for the
87 : // following two blocks:
88 : //
89 : // blockA: a@3,b@3,c@3
90 : // blockB: a@1,b@2,c@1 with syntheticSuffix=3
91 : //
92 : // To ensure this, suffix replacement will not change the ordering of keys in
93 : // the block because the iter assumes that no two keys in the block share the
94 : // same prefix. Furthermore, during SeekGE and SeekLT operations, the block
95 : // iterator handles "off by one" errors (explained in more detail in those
96 : // functions) when, for a given key, originalSuffix < searchSuffix <
97 : // replacementSuffix, with integer comparison. To handle these cases, the
98 : // iterator assumes:
99 : //
100 : // pebble.Compare(keyPrefix{replacementSuffix},keyPrefix{originalSuffix}) < 0
101 : // for keys with a suffix.
102 : //
103 : // NB: it is possible for a block iter to add a synthetic suffix on a key
104 : // without a suffix, which implies
105 : // pebble.Compare(keyPrefix{replacementSuffix},keyPrefix{noSuffix}) > 0;
106 : // however, the iterator would never need to handle an off by one error in
107 : // this case since originalSuffix (empty) > searchSuffix (non empty), with
108 : // integer comparison.
109 : //
110 : //
111 : // In addition, we also assume that any block with rangekeys will not contain
112 : // a synthetic suffix.
113 : transforms block.IterTransforms
114 :
115 : // offset is the byte index that marks where the current key/value is
116 : // encoded in the block.
117 : offset int32
118 : // nextOffset is the byte index where the next key/value is encoded in the
119 : // block.
120 : nextOffset int32
121 : // A "restart point" in a block is a point where the full key is encoded,
122 : // instead of just having a suffix of the key encoded. See readEntry() for
123 : // how prefix compression of keys works. Keys in between two restart points
124 : // only have a suffix encoded in the block. When restart interval is 1, no
125 : // prefix compression of keys happens. This is the case with range tombstone
126 : // blocks.
127 : //
128 : // All restart offsets are listed in increasing order in
129 : // i.ptr[i.restarts:len(block)-4], while numRestarts is encoded in the last
130 : // 4 bytes of the block as a uint32 (i.ptr[len(block)-4:]). i.restarts can
131 : // therefore be seen as the point where data in the block ends, and a list
132 : // of offsets of all restart points begins.
133 : restarts int32
134 : // Number of restart points in this block. Encoded at the end of the block
135 : // as a uint32.
136 : numRestarts int32
137 : ptr unsafe.Pointer
138 : data []byte
139 : // key contains the raw key the iterator is currently pointed at. This may
140 : // point directly to data stored in the block (for a key which has no prefix
141 : // compression), to fullKey (for a prefix compressed key), or to a slice of
142 : // data stored in cachedBuf (during reverse iteration).
143 : //
144 : // NB: In general, key contains the same logical content as ikey
145 : // (i.e. ikey = decode(key)), but if the iterator contains a synthetic suffix
146 : // replacement rule, this will not be the case. Therefore, key should never
147 : // be used after ikey is set.
148 : key []byte
149 : // fullKey is a buffer used for key prefix decompression. Note that if
150 : // transforms.SyntheticPrefix is not nil, fullKey always starts with that
151 : // prefix.
152 : fullKey []byte
153 : // val contains the value the iterator is currently pointed at. If non-nil,
154 : // this points to a slice of the block data.
155 : val []byte
156 : // ikv contains the decoded internal KV the iterator is currently positioned
157 : // at.
158 : //
159 : // ikv.InternalKey contains the decoded InternalKey the iterator is
160 : // currently pointed at. Note that the memory backing ikv.UserKey is either
161 : // data stored directly in the block, fullKey, or cachedBuf. The key
162 : // stability guarantee for blocks built with a restart interval of 1 is
163 : // achieved by having ikv.UserKey always point to data stored directly in
164 : // the block.
165 : //
166 : // ikv.LazyValue is val turned into a LazyValue, whenever a positioning
167 : // method returns a non-nil key-value pair.
168 : ikv base.InternalKV
169 : // cached and cachedBuf are used during reverse iteration. They are needed
170 : // because we can't perform prefix decoding in reverse, only in the forward
171 : // direction. In order to iterate in reverse, we decode and cache the entries
172 : // between two restart points.
173 : //
174 : // Note that cached[len(cached)-1] contains the previous entry to the one the
175 : // blockIter is currently pointed at. As usual, nextOffset will contain the
176 : // offset of the next entry. During reverse iteration, nextOffset will be
177 : // updated to point to offset, and we'll set the blockIter to point at the
178 : // entry cached[len(cached)-1]. See Prev() for more details.
179 : //
180 : // For a block encoded with a restart interval of 1, cached and cachedBuf
181 : // will not be used as there are no prefix compressed entries between the
182 : // restart points.
183 : cached []blockEntry
184 : cachedBuf []byte
185 : handle block.BufferHandle
186 : // for block iteration for already loaded blocks.
187 : firstUserKey []byte
188 : lazyValueHandling struct {
189 : getValue block.GetLazyValueForPrefixAndValueHandler
190 : hasValuePrefix bool
191 : }
192 : synthSuffixBuf []byte
193 : firstUserKeyWithPrefixBuf []byte
194 : }
195 :
196 : type blockEntry struct {
197 : offset int32
198 : keyStart int32
199 : keyEnd int32
200 : valStart int32
201 : valSize int32
202 : }
203 :
204 : // *Iter implements the block.DataBlockIterator interface.
205 : var _ block.DataBlockIterator = (*Iter)(nil)
206 :
207 : // NewIter constructs a new row-oriented block iterator over the provided serialized block.
208 : func NewIter(
209 : cmp base.Compare,
210 : suffixCmp base.ComparePointSuffixes,
211 : split base.Split,
212 : block []byte,
213 : transforms block.IterTransforms,
214 1 : ) (*Iter, error) {
215 1 : i := &Iter{}
216 1 : return i, i.Init(cmp, suffixCmp, split, block, transforms)
217 1 : }
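A minimal usage sketch (illustrative, not part of the instrumented source above): constructing an iterator over an already-serialized row block and scanning it forward. Here blockData stands in for a valid serialized block, process is a hypothetical consumer, and the zero block.IterTransforms value means no synthetic prefix/suffix and no hidden points.

    it, err := rowblk.NewIter(
        base.DefaultComparer.Compare,
        base.DefaultComparer.ComparePointSuffixes,
        base.DefaultComparer.Split,
        blockData,              // assumed: a valid serialized row block
        block.IterTransforms{}, // no transforms
    )
    if err != nil {
        return err
    }
    for kv := it.First(); kv != nil; kv = it.Next() {
        // kv.K is the decoded InternalKey; kv.V holds the (possibly lazy) value.
        process(kv)
    }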
218 :
219 : // String implements fmt.Stringer.
220 0 : func (i *Iter) String() string {
221 0 : return "block"
222 0 : }
223 :
224 : // Init initializes the block iterator from the provided block.
225 : func (i *Iter) Init(
226 : cmp base.Compare,
227 : suffixCmp base.ComparePointSuffixes,
228 : split base.Split,
229 : blk []byte,
230 : transforms block.IterTransforms,
231 2 : ) error {
232 2 : numRestarts := int32(binary.LittleEndian.Uint32(blk[len(blk)-4:]))
233 2 : if numRestarts == 0 {
234 0 : return base.CorruptionErrorf("pebble/table: invalid table (block has no restart points)")
235 0 : }
236 2 : i.transforms = transforms
237 2 : i.synthSuffixBuf = i.synthSuffixBuf[:0]
238 2 : i.split = split
239 2 : i.cmp = cmp
240 2 : i.restarts = int32(len(blk)) - 4*(1+numRestarts)
241 2 : i.numRestarts = numRestarts
242 2 : i.ptr = unsafe.Pointer(&blk[0])
243 2 : i.data = blk
244 2 : if i.transforms.HasSyntheticPrefix() {
245 2 : i.fullKey = append(i.fullKey[:0], i.transforms.SyntheticPrefix()...)
246 2 : } else {
247 2 : i.fullKey = i.fullKey[:0]
248 2 : }
249 2 : i.val = nil
250 2 : i.clearCache()
251 2 : if i.restarts > 0 {
252 2 : if err := i.readFirstKey(); err != nil {
253 0 : return err
254 0 : }
255 2 : } else {
256 2 : // Block is empty.
257 2 : i.firstUserKey = nil
258 2 : }
259 2 : return nil
260 : }
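The trailer arithmetic in Init can be restated as a tiny standalone sketch (illustrative only): the last 4 bytes of a serialized block hold numRestarts as a little-endian uint32, immediately preceded by numRestarts 4-byte restart offsets; everything before that is entry data.

    // blockLayout mirrors the computation in Init: it returns where the restart
    // array begins and how many restart points the block has.
    func blockLayout(blk []byte) (restartsStart, numRestarts int32) {
        numRestarts = int32(binary.LittleEndian.Uint32(blk[len(blk)-4:]))
        restartsStart = int32(len(blk)) - 4*(1+numRestarts)
        return restartsStart, numRestarts
    }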
261 :
262 : // InitHandle initializes an iterator from the provided block handle.
263 : // NB: two cases of hideObsoletePoints:
264 : // - Local sstable iteration: syntheticSeqNum will be set iff the sstable was
265 : // ingested.
266 : // - Foreign sstable iteration: syntheticSeqNum is always set.
267 : func (i *Iter) InitHandle(
268 : comparer *base.Comparer, block block.BufferHandle, transforms block.IterTransforms,
269 2 : ) error {
270 2 : i.handle.Release()
271 2 : i.handle = block
272 2 : return i.Init(
273 2 : comparer.Compare,
274 2 : comparer.ComparePointSuffixes,
275 2 : comparer.Split,
276 2 : block.BlockData(),
277 2 : transforms)
278 2 : }
279 :
280 : // SetHasValuePrefix sets whether or not the block iterator should expect values
281 : // corresponding to Set keys to have a prefix byte.
282 2 : func (i *Iter) SetHasValuePrefix(hasValuePrefix bool) {
283 2 : i.lazyValueHandling.hasValuePrefix = hasValuePrefix
284 2 : }
285 :
286 : // SetGetLazyValuer sets the value block reader the iterator should use to get
287 : // lazy values when the value encodes a value prefix.
288 2 : func (i *Iter) SetGetLazyValuer(g block.GetLazyValueForPrefixAndValueHandler) {
289 2 : i.lazyValueHandling.getValue = g
290 2 :
291 2 : }
292 :
293 : // Handle returns the underlying block buffer handle, if the iterator was
294 : // initialized with one.
295 2 : func (i *Iter) Handle() block.BufferHandle {
296 2 : return i.handle
297 2 : }
298 :
299 : // Invalidate invalidates the block iterator, removing references to the block
300 : // it was initialized with.
301 2 : func (i *Iter) Invalidate() {
302 2 : i.clearCache()
303 2 : i.offset = 0
304 2 : i.nextOffset = 0
305 2 : i.restarts = 0
306 2 : i.numRestarts = 0
307 2 : i.data = nil
308 2 : }
309 :
310 : // IsDataInvalidated returns true when the blockIter has been invalidated
311 : // using an invalidate call. NB: this is different from blockIter.Valid
312 : // which is part of the InternalIterator implementation.
313 2 : func (i *Iter) IsDataInvalidated() bool {
314 2 : return i.data == nil
315 2 : }
316 :
317 2 : func (i *Iter) readEntry() {
318 2 : ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(i.offset))
319 2 :
320 2 : // This is an ugly performance hack. Reading entries from blocks is one of
321 2 : // the inner-most routines and decoding the 3 varints per-entry takes
322 2 : // significant time. Neither go1.11 nor go1.12 will inline decodeVarint for
323 2 : // us, so we do it manually. This provides a 10-15% performance improvement
324 2 : // on blockIter benchmarks on both go1.11 and go1.12.
325 2 : //
326 2 : // TODO(peter): remove this hack if go:inline is ever supported.
327 2 :
328 2 : var shared uint32
329 2 : if a := *((*uint8)(ptr)); a < 128 {
330 2 : shared = uint32(a)
331 2 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
332 2 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
333 0 : shared = uint32(b)<<7 | uint32(a)
334 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
335 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
336 0 : shared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
337 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
338 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
339 0 : shared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
340 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
341 0 : } else {
342 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
343 0 : shared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
344 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
345 0 : }
346 :
347 2 : var unshared uint32
348 2 : if a := *((*uint8)(ptr)); a < 128 {
349 2 : unshared = uint32(a)
350 2 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
351 2 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
352 0 : unshared = uint32(b)<<7 | uint32(a)
353 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
354 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
355 0 : unshared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
356 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
357 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
358 0 : unshared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
359 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
360 0 : } else {
361 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
362 0 : unshared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
363 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
364 0 : }
365 :
366 2 : var value uint32
367 2 : if a := *((*uint8)(ptr)); a < 128 {
368 2 : value = uint32(a)
369 2 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
370 2 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
371 2 : value = uint32(b)<<7 | uint32(a)
372 2 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
373 2 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
374 1 : value = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
375 1 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
376 1 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
377 1 : value = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
378 1 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
379 1 : } else {
380 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
381 0 : value = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
382 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
383 0 : }
384 2 : shared += i.transforms.SyntheticPrefixAndSuffix.PrefixLen()
385 2 : unsharedKey := getBytes(ptr, int(unshared))
386 2 : // TODO(sumeer): move this into the else block below.
387 2 : i.fullKey = append(i.fullKey[:shared], unsharedKey...)
388 2 : if shared == 0 {
389 2 : // Provide stability for the key across positioning calls if the key
390 2 : // doesn't share a prefix with the previous key. This removes requiring the
391 2 : // key to be copied if the caller knows the block has a restart interval of
392 2 : // 1. An important example of this is range-del blocks.
393 2 : i.key = unsharedKey
394 2 : } else {
395 2 : i.key = i.fullKey
396 2 : }
397 2 : ptr = unsafe.Pointer(uintptr(ptr) + uintptr(unshared))
398 2 : i.val = getBytes(ptr, int(value))
399 2 : i.nextOffset = int32(uintptr(ptr)-uintptr(i.ptr)) + int32(value)
400 : }
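For readability, the hand-unrolled varint decoding in readEntry is equivalent to the following sketch using encoding/binary (illustrative only, and without the synthetic-prefix adjustment): each entry begins with three varints (shared key length, unshared key length, value length), followed by the unshared key bytes and then the value bytes.

    // decodeEntryHeader is a non-inlined equivalent of readEntry's header
    // decoding. hdrLen is the number of header bytes consumed; the unshared key
    // bytes start at offset+hdrLen and the value follows them.
    func decodeEntryHeader(data []byte, offset int) (shared, unshared, valueLen uint32, hdrLen int) {
        s, n1 := binary.Uvarint(data[offset:])
        u, n2 := binary.Uvarint(data[offset+n1:])
        v, n3 := binary.Uvarint(data[offset+n1+n2:])
        return uint32(s), uint32(u), uint32(v), n1 + n2 + n3
    }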
401 :
402 2 : func (i *Iter) readFirstKey() error {
403 2 : ptr := i.ptr
404 2 :
405 2 : // This is an ugly performance hack. Reading entries from blocks is one of
406 2 : // the inner-most routines and decoding the 3 varints per-entry takes
407 2 : // significant time. Neither go1.11 nor go1.12 will inline decodeVarint for
408 2 : // us, so we do it manually. This provides a 10-15% performance improvement
409 2 : // on blockIter benchmarks on both go1.11 and go1.12.
410 2 : //
411 2 : // TODO(peter): remove this hack if go:inline is ever supported.
412 2 :
413 2 : if shared := *((*uint8)(ptr)); shared == 0 {
414 2 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
415 2 : } else {
416 0 : // The shared length is != 0, which is invalid.
417 0 : panic("first key in block must have zero shared length")
418 : }
419 :
420 2 : var unshared uint32
421 2 : if a := *((*uint8)(ptr)); a < 128 {
422 2 : unshared = uint32(a)
423 2 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
424 2 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
425 0 : unshared = uint32(b)<<7 | uint32(a)
426 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
427 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
428 0 : unshared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
429 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
430 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
431 0 : unshared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
432 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
433 0 : } else {
434 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
435 0 : unshared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
436 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
437 0 : }
438 :
439 : // Skip the value length.
440 2 : if a := *((*uint8)(ptr)); a < 128 {
441 2 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
442 2 : } else if a := *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); a < 128 {
443 2 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
444 2 : } else if a := *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); a < 128 {
445 1 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
446 1 : } else if a := *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); a < 128 {
447 1 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
448 1 : } else {
449 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
450 0 : }
451 :
452 2 : firstKey := getBytes(ptr, int(unshared))
453 2 : // Manually inlining base.DecodeInternalKey provides a 5-10% speedup on
454 2 : // BlockIter benchmarks.
455 2 : if n := len(firstKey) - 8; n >= 0 {
456 2 : i.firstUserKey = firstKey[:n:n]
457 2 : } else {
458 0 : i.firstUserKey = nil
459 0 : return base.CorruptionErrorf("pebble/table: invalid firstKey in block")
460 0 : }
461 2 : if i.transforms.HasSyntheticPrefix() {
462 2 : syntheticPrefix := i.transforms.SyntheticPrefix()
463 2 : i.firstUserKeyWithPrefixBuf = slices.Grow(i.firstUserKeyWithPrefixBuf[:0], len(syntheticPrefix)+len(i.firstUserKey))
464 2 : i.firstUserKeyWithPrefixBuf = append(i.firstUserKeyWithPrefixBuf, syntheticPrefix...)
465 2 : i.firstUserKeyWithPrefixBuf = append(i.firstUserKeyWithPrefixBuf, i.firstUserKey...)
466 2 : i.firstUserKey = i.firstUserKeyWithPrefixBuf
467 2 : }
468 2 : return nil
469 : }
470 :
471 2 : func (i *Iter) decodeInternalKey(key []byte) (hiddenPoint bool) {
472 2 : // Manually inlining base.DecodeInternalKey provides a 5-10% speedup on
473 2 : // BlockIter benchmarks.
474 2 : if n := len(key) - 8; n >= 0 {
475 2 : trailer := base.InternalKeyTrailer(binary.LittleEndian.Uint64(key[n:]))
476 2 : hiddenPoint = i.transforms.HideObsoletePoints &&
477 2 : (trailer&TrailerObsoleteBit != 0)
478 2 : i.ikv.K.Trailer = trailer & TrailerObsoleteMask
479 2 : i.ikv.K.UserKey = key[:n:n]
480 2 : if n := i.transforms.SyntheticSeqNum; n != 0 {
481 2 : i.ikv.K.SetSeqNum(base.SeqNum(n))
482 2 : }
483 2 : } else {
484 2 : i.ikv.K.Trailer = base.InternalKeyTrailer(base.InternalKeyKindInvalid)
485 2 : i.ikv.K.UserKey = nil
486 2 : }
487 2 : return hiddenPoint
488 : }
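decodeInternalKey undoes the standard internal key encoding: the user key bytes followed by an 8-byte little-endian trailer that packs the sequence number and kind. A standalone sketch of that split (illustrative only):

    // splitInternalKey separates an encoded internal key into its user key and
    // trailer, mirroring the manual inlining in decodeInternalKey above.
    func splitInternalKey(encoded []byte) (userKey []byte, trailer uint64, ok bool) {
        n := len(encoded) - 8
        if n < 0 {
            return nil, 0, false // too short to contain a trailer
        }
        return encoded[:n:n], binary.LittleEndian.Uint64(encoded[n:]), true
    }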
489 :
490 : // maybeReplaceSuffix replaces the suffix in i.ikey.UserKey with
491 : // i.transforms.syntheticSuffix.
492 2 : func (i *Iter) maybeReplaceSuffix() {
493 2 : if i.transforms.HasSyntheticSuffix() && i.ikv.K.UserKey != nil {
494 2 : prefixLen := i.split(i.ikv.K.UserKey)
495 2 : // If ikey is cached or may get cached, we must copy
496 2 : // UserKey to a new buffer before suffix replacement.
497 2 : i.synthSuffixBuf = append(i.synthSuffixBuf[:0], i.ikv.K.UserKey[:prefixLen]...)
498 2 : i.synthSuffixBuf = append(i.synthSuffixBuf, i.transforms.SyntheticSuffix()...)
499 2 : i.ikv.K.UserKey = i.synthSuffixBuf
500 2 : }
501 : }
502 :
503 2 : func (i *Iter) clearCache() {
504 2 : i.cached = i.cached[:0]
505 2 : i.cachedBuf = i.cachedBuf[:0]
506 2 : }
507 :
508 2 : func (i *Iter) cacheEntry() {
509 2 : var valStart int32
510 2 : valSize := int32(len(i.val))
511 2 : if valSize > 0 {
512 2 : valStart = int32(uintptr(unsafe.Pointer(&i.val[0])) - uintptr(i.ptr))
513 2 : }
514 :
515 2 : i.cached = append(i.cached, blockEntry{
516 2 : offset: i.offset,
517 2 : keyStart: int32(len(i.cachedBuf)),
518 2 : keyEnd: int32(len(i.cachedBuf) + len(i.key)),
519 2 : valStart: valStart,
520 2 : valSize: valSize,
521 2 : })
522 2 : i.cachedBuf = append(i.cachedBuf, i.key...)
523 : }
524 :
525 : // IsLowerBound implements the block.DataBlockIterator interface.
526 2 : func (i *Iter) IsLowerBound(k []byte) bool {
527 2 : // Note: we ignore HideObsoletePoints, but false negatives are allowed.
528 2 : return i.cmp(i.firstUserKey, k) >= 0
529 2 : }
530 :
531 : // SeekGE implements internalIterator.SeekGE, as documented in the pebble
532 : // package.
533 2 : func (i *Iter) SeekGE(key []byte, flags base.SeekGEFlags) *base.InternalKV {
534 2 : if invariants.Enabled && i.IsDataInvalidated() {
535 0 : panic(errors.AssertionFailedf("invalidated blockIter used"))
536 : }
537 2 : searchKey := key
538 2 : if i.transforms.HasSyntheticPrefix() {
539 2 : syntheticPrefix := i.transforms.SyntheticPrefix()
540 2 : if !bytes.HasPrefix(key, syntheticPrefix) {
541 1 : // The seek key is before or after the entire block of keys that start
542 1 : // with SyntheticPrefix. To determine which, we need to compare against a
543 1 : // valid key in the block. We use firstUserKey which has the synthetic
544 1 : // prefix.
545 1 : if i.cmp(i.firstUserKey, key) >= 0 {
546 1 : return i.First()
547 1 : }
548 : // Set the offset to the end of the block to mimic the offset of an
549 : // invalid iterator. This ensures a subsequent i.Prev() returns a valid
550 : // result.
551 1 : i.offset = i.restarts
552 1 : i.nextOffset = i.restarts
553 1 : return nil
554 : }
555 2 : searchKey = key[len(syntheticPrefix):]
556 : }
557 :
558 2 : i.clearCache()
559 2 : // Find the index of the smallest restart point whose key is >= the key
560 2 : // sought; index will be numRestarts if there is no such restart point.
561 2 : i.offset = 0
562 2 : var index int32
563 2 :
564 2 : {
565 2 : // NB: manually inlined sort.Search is ~5% faster.
566 2 : //
567 2 : // Define f(-1) == false and f(n) == true.
568 2 : // Invariant: f(index-1) == false, f(upper) == true.
569 2 : upper := i.numRestarts
570 2 : for index < upper {
571 2 : h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
572 2 : // index ≤ h < upper
573 2 : offset := decodeRestart(i.data[i.restarts+4*h:])
574 2 : // For a restart point, there are 0 bytes shared with the previous key.
575 2 : // The varint encoding of 0 occupies 1 byte.
576 2 : ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(offset+1))
577 2 :
578 2 : // Decode the key at that restart point, and compare it to the key
579 2 : // sought. See the comment in readEntry for why we manually inline the
580 2 : // varint decoding.
581 2 : var v1 uint32
582 2 : if a := *((*uint8)(ptr)); a < 128 {
583 2 : v1 = uint32(a)
584 2 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
585 2 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
586 0 : v1 = uint32(b)<<7 | uint32(a)
587 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
588 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
589 0 : v1 = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
590 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
591 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
592 0 : v1 = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
593 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
594 0 : } else {
595 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
596 0 : v1 = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
597 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
598 0 : }
599 :
600 2 : if *((*uint8)(ptr)) < 128 {
601 2 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
602 2 : } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))) < 128 {
603 1 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
604 1 : } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))) < 128 {
605 1 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
606 1 : } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))) < 128 {
607 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
608 0 : } else {
609 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
610 0 : }
611 :
612 : // Manually inlining part of base.DecodeInternalKey provides a 5-10%
613 : // speedup on BlockIter benchmarks.
614 2 : s := getBytes(ptr, int(v1))
615 2 : var k []byte
616 2 : if n := len(s) - 8; n >= 0 {
617 2 : k = s[:n:n]
618 2 : }
619 : // Else k is invalid, and left as nil
620 :
621 2 : if i.cmp(searchKey, k) > 0 {
622 2 : // The search key is greater than the user key at this restart point.
623 2 : // Search beyond this restart point, since we are trying to find the
624 2 : // first restart point with a user key >= the search key.
625 2 : index = h + 1 // preserves f(i-1) == false
626 2 : } else {
627 2 : // k >= search key, so prune everything after index (since index
628 2 : // satisfies the property we are looking for).
629 2 : upper = h // preserves f(j) == true
630 2 : }
631 : }
632 : // index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
633 : // => answer is index.
634 : }
635 :
636 : // index is the first restart point with key >= search key. Define the keys
637 : // between a restart point and the next restart point as belonging to that
638 : // restart point.
639 : //
640 : // Since keys are strictly increasing, if index > 0 then the restart point
641 : // at index-1 will be the first one that has some keys belonging to it that
642 : // could be equal to the search key. If index == 0, then all keys in this
643 : // block are larger than the key sought, and offset remains at zero.
644 2 : if index > 0 {
645 2 : i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
646 2 : }
647 2 : i.readEntry()
648 2 : hiddenPoint := i.decodeInternalKey(i.key)
649 2 :
650 2 : // Iterate from that restart point to somewhere >= the key sought.
651 2 : if !i.Valid() {
652 1 : return nil
653 1 : }
654 :
655 : // A note on seeking in a block with a suffix replacement rule: even though
656 : // the binary search above was conducted on keys without suffix replacement,
657 : // Seek will still return the correct suffix replaced key. A binary
658 : // search without suffix replacement will land on a key that is _less_ than
659 : // the key the search would have landed on if all keys were already suffix
660 : // replaced. Since Seek then conducts forward iteration to the first suffix
661 : // replaced user key that is greater than or equal to the search key, the
662 : // correct key is still returned.
663 : //
664 : // As an example, consider the following block with a restart interval of 1,
665 : // with a replacement suffix of "4":
666 : // - Pre-suffix replacement: apple@1, banana@3
667 : // - Post-suffix replacement: apple@4, banana@4
668 : //
669 : // Suppose the client seeks with apple@3. Assuming suffixes sort in reverse
670 : // chronological order (i.e. apple@1>apple@3), the binary search without
671 : // suffix replacement would return apple@1. A binary search with suffix
672 : // replacement would return banana@4. After beginning forward iteration from
673 : // either returned restart point, forward iteration would
674 : // always return the correct key, banana@4.
675 : //
676 : // Further, if the user searched with apple@0 (i.e. a suffix less than the
677 : // pre replacement suffix) or with apple@5 (a suffix larger than the post
678 : // replacement suffix), the binary search with or without suffix replacement
679 : // would land on the same key, as we assume the following:
680 : // (1) no two keys in the sst share the same prefix.
681 : // (2) pebble.Compare(replacementSuffix,originalSuffix) > 0
682 :
683 2 : i.maybeReplaceSuffix()
684 2 :
685 2 : if !hiddenPoint && i.cmp(i.ikv.K.UserKey, key) >= 0 {
686 2 : // Initialize i.lazyValue
687 2 : if !i.lazyValueHandling.hasValuePrefix ||
688 2 : i.ikv.K.Kind() != base.InternalKeyKindSet {
689 2 : i.ikv.V = base.MakeInPlaceValue(i.val)
690 2 : } else if i.lazyValueHandling.getValue == nil || !block.ValuePrefix(i.val[0]).IsValueHandle() {
691 2 : i.ikv.V = base.MakeInPlaceValue(i.val[1:])
692 2 : } else {
693 2 : i.ikv.V = i.lazyValueHandling.getValue.GetLazyValueForPrefixAndValueHandle(i.val)
694 2 : }
695 2 : return &i.ikv
696 : }
697 2 : for i.Next(); i.Valid(); i.Next() {
698 2 : if i.cmp(i.ikv.K.UserKey, key) >= 0 {
699 2 : // i.Next() has already initialized i.ikv.LazyValue.
700 2 : return &i.ikv
701 2 : }
702 : }
703 2 : return nil
704 : }
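For reference, the manually inlined binary search in SeekGE (and its twin in SeekLT) corresponds roughly to the following sort.Search call, where restartKey is a hypothetical helper standing in for the inlined decoding of the user key at restart point h (sketch only):

    index := int32(sort.Search(int(i.numRestarts), func(h int) bool {
        // f(h): the user key at restart point h is >= the search key.
        return i.cmp(searchKey, restartKey(h)) <= 0
    }))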
705 :
706 : // SeekPrefixGE implements internalIterator.SeekPrefixGE, as documented in the
707 : // pebble package.
708 0 : func (i *Iter) SeekPrefixGE(prefix, key []byte, flags base.SeekGEFlags) *base.InternalKV {
709 0 : // This should never be called as prefix iteration is handled by sstable.Iterator.
710 0 : panic("pebble: SeekPrefixGE unimplemented")
711 : }
712 :
713 : // SeekLT implements internalIterator.SeekLT, as documented in the pebble
714 : // package.
715 2 : func (i *Iter) SeekLT(key []byte, flags base.SeekLTFlags) *base.InternalKV {
716 2 : if invariants.Enabled && i.IsDataInvalidated() {
717 0 : panic(errors.AssertionFailedf("invalidated blockIter used"))
718 : }
719 2 : searchKey := key
720 2 : if i.transforms.HasSyntheticPrefix() {
721 2 : syntheticPrefix := i.transforms.SyntheticPrefix()
722 2 : if !bytes.HasPrefix(key, syntheticPrefix) {
723 2 : // The seek key is before or after the entire block of keys that start
724 2 : // with SyntheticPrefix. To determine which, we need to compare against a
725 2 : // valid key in the block. We use firstUserKey which has the synthetic
726 2 : // prefix.
727 2 : if i.cmp(i.firstUserKey, key) < 0 {
728 2 : return i.Last()
729 2 : }
730 : // Set the offset to the beginning of the block to mimic an exhausted
731 : // iterator that has conducted backward iteration. This ensures a
732 : // subsequent Next() call returns the first key in the block.
733 2 : i.offset = -1
734 2 : i.nextOffset = 0
735 2 : return nil
736 : }
737 2 : searchKey = key[len(syntheticPrefix):]
738 : }
739 :
740 2 : i.clearCache()
741 2 : // Find the index of the smallest restart point whose key is >= the key
742 2 : // sought; index will be numRestarts if there is no such restart point.
743 2 : i.offset = 0
744 2 : var index int32
745 2 :
746 2 : {
747 2 : // NB: manually inlined sort.Search is ~5% faster.
748 2 : //
749 2 : // Define f(-1) == false and f(n) == true.
750 2 : // Invariant: f(index-1) == false, f(upper) == true.
751 2 : upper := i.numRestarts
752 2 : for index < upper {
753 2 : h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
754 2 : // index ≤ h < upper
755 2 : offset := decodeRestart(i.data[i.restarts+4*h:])
756 2 : // For a restart point, there are 0 bytes shared with the previous key.
757 2 : // The varint encoding of 0 occupies 1 byte.
758 2 : ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(offset+1))
759 2 :
760 2 : // Decode the key at that restart point, and compare it to the key
761 2 : // sought. See the comment in readEntry for why we manually inline the
762 2 : // varint decoding.
763 2 : var v1 uint32
764 2 : if a := *((*uint8)(ptr)); a < 128 {
765 2 : v1 = uint32(a)
766 2 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
767 2 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
768 0 : v1 = uint32(b)<<7 | uint32(a)
769 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
770 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
771 0 : v1 = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
772 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
773 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
774 0 : v1 = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
775 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
776 0 : } else {
777 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
778 0 : v1 = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
779 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
780 0 : }
781 :
782 2 : if *((*uint8)(ptr)) < 128 {
783 2 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
784 2 : } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))) < 128 {
785 1 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
786 1 : } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))) < 128 {
787 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
788 0 : } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))) < 128 {
789 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
790 0 : } else {
791 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
792 0 : }
793 :
794 : // Manually inlining part of base.DecodeInternalKey provides a 5-10%
795 : // speedup on BlockIter benchmarks.
796 2 : s := getBytes(ptr, int(v1))
797 2 : var k []byte
798 2 : if n := len(s) - 8; n >= 0 {
799 2 : k = s[:n:n]
800 2 : }
801 : // Else k is invalid, and left as nil
802 :
803 2 : if i.cmp(searchKey, k) > 0 {
804 2 : // The search key is greater than the user key at this restart point.
805 2 : // Search beyond this restart point, since we are trying to find the
806 2 : // first restart point with a user key >= the search key.
807 2 : index = h + 1 // preserves f(i-1) == false
808 2 : } else {
809 2 : // k >= search key, so prune everything after index (since index
810 2 : // satisfies the property we are looking for).
811 2 : upper = h // preserves f(j) == true
812 2 : }
813 : }
814 : // index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
815 : // => answer is index.
816 : }
817 :
818 2 : if index == 0 {
819 2 : if i.transforms.HasSyntheticSuffix() {
820 2 : // The binary search was conducted on keys without suffix replacement,
821 2 : // implying the first key in the block may be less than the search key. To
822 2 : // double check, get the first key in the block with suffix replacement
823 2 : // and compare to the search key. Consider the following example: suppose
824 2 : // the user searches with a@3, the first key in the block is a@2 and the
825 2 : // block contains a suffix replacement rule of 4. Since a@3 sorts before
826 2 : // a@2, the binary search would return index==0. Without conducting the
827 2 : // suffix replacement, the SeekLT would incorrectly return nil. With
828 2 : // suffix replacement though, a@4 should be returned as a@4 sorts before
829 2 : // a@3.
830 2 : ikv := i.First()
831 2 : if i.cmp(ikv.K.UserKey, key) < 0 {
832 2 : return ikv
833 2 : }
834 : }
835 : // If index == 0 then all keys in this block are larger than the key
836 : // sought, so there is no match.
837 2 : i.offset = -1
838 2 : i.nextOffset = 0
839 2 : return nil
840 : }
841 :
842 : // INVARIANT: index > 0
843 :
844 : // Ignoring suffix replacement, index is the first restart point with key >=
845 : // search key. Define the keys between a restart point and the next restart
846 : // point as belonging to that restart point. Note that index could be equal to
847 : // i.numRestarts, i.e., we are past the last restart. Since keys are strictly
848 : // increasing, then the restart point at index-1 will be the first one that
849 : // has some keys belonging to it that are less than the search key.
850 : //
851 : // Next, we will search between the restart at index-1 and the restart point
852 : // at index, for the first key >= key, and then on finding it, return
853 : // i.Prev(). We need to know when we have hit the offset for index, since then
854 : // we can stop searching. targetOffset encodes that offset for index.
855 2 : targetOffset := i.restarts
856 2 : i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
857 2 : if index < i.numRestarts {
858 2 : targetOffset = decodeRestart(i.data[i.restarts+4*(index):])
859 2 :
860 2 : if i.transforms.HasSyntheticSuffix() {
861 2 : // The binary search was conducted on keys without suffix replacement,
862 2 : // implying the returned restart point (index) may be less than the search
863 2 : // key, breaking the assumption described above.
864 2 : //
865 2 : // For example, consider this block with a replacement ts of 4 and a
866 2 : // restart interval of 1: pre-replacement a@3,b@2,c@3; post-replacement
867 2 : // a@4,b@4,c@4.
868 2 : //
869 2 : // Suppose the client calls SeekLT(b@3), SeekLT must return b@4.
870 2 : //
871 2 : // If the client calls SeekLT(b@3), the binary search would return b@2,
872 2 : // the lowest key geq to b@3, pre-suffix replacement. Then, SeekLT will
873 2 : // begin forward iteration from a@3, the previous restart point, to
874 2 : // b{suffix}. The iteration stops when it encounters a key geq to the
875 2 : // search key or if it reaches the upper bound. Without suffix
876 2 : // replacement, we can assume that the upper bound of this forward
877 2 : // iteration, b{suffix}, is greater than the search key, as implied by the
878 2 : // binary search.
879 2 : //
880 2 : // If we naively hold this assumption with suffix replacement, the
881 2 : // iteration would terminate at the upper bound, b@4, call i.Prev, and
882 2 : // incorrectly return a@4. To correct for this, if the original returned
883 2 : // index is less than the search key, shift our forward iteration to begin
884 2 : // at index instead of index -1. With suffix replacement the key at index
885 2 : // is guaranteed to be the highest restart point less than the search key
886 2 : // (i.e. the same property of index-1 for a block without suffix
887 2 : // replacement). This property holds because of the invariant that a block
888 2 : // with suffix replacement will not have two keys that share the same
889 2 : // prefix. To consider the above example, binary searching with b@3 landed
890 2 : // naively at a@3, but since b@4<b@3, we shift our forward iteration to
891 2 : // begin at b@4. We never need to shift by more than one restart point
892 2 : // (i.e. to c@4) because it's impossible for the search key to be greater
893 2 : // than the key at the next restart point in the block because that
894 2 : // key will always have a different prefix. Put another way, because no
895 2 : // key in the block shares the same prefix, naive binary search should
896 2 : // always land at most 1 restart point off the correct one.
897 2 :
898 2 : naiveOffset := i.offset
899 2 : // Shift up to the original binary search result and decode the key.
900 2 : i.offset = targetOffset
901 2 : i.readEntry()
902 2 : i.decodeInternalKey(i.key)
903 2 : i.maybeReplaceSuffix()
904 2 :
905 2 : // If the binary search point is actually less than the search key, post
906 2 : // replacement, bump the target offset.
907 2 : if i.cmp(i.ikv.K.UserKey, key) < 0 {
908 2 : i.offset = targetOffset
909 2 : if index+1 < i.numRestarts {
910 1 : // if index+1 is within the i.data bounds, use it to find the target
911 1 : // offset.
912 1 : targetOffset = decodeRestart(i.data[i.restarts+4*(index+1):])
913 2 : } else {
914 2 : targetOffset = i.restarts
915 2 : }
916 2 : } else {
917 2 : i.offset = naiveOffset
918 2 : }
919 : }
920 : }
921 :
922 : // Init nextOffset for the forward iteration below.
923 2 : i.nextOffset = i.offset
924 2 :
925 2 : for {
926 2 : i.offset = i.nextOffset
927 2 : i.readEntry()
928 2 : // When hidden keys are common, there is additional optimization possible
929 2 : // by not caching entries that are hidden (note that some calls to
930 2 : // cacheEntry don't decode the internal key before caching, but checking
931 2 : // whether a key is hidden does not require full decoding). However, we do
932 2 : // need to use the blockEntry.offset in the cache for the first entry at
933 2 : // the restart point to do the binary search when the cache is empty -- so
934 2 : // we would need to cache that first entry (though not the key) even if
935 2 : // it was hidden. Our current assumption is that if there are large numbers
936 2 : // of hidden keys we will be able to skip whole blocks (using block
937 2 : // property filters) so we don't bother optimizing.
938 2 : hiddenPoint := i.decodeInternalKey(i.key)
939 2 : i.maybeReplaceSuffix()
940 2 :
941 2 : // NB: we don't use the hiddenPoint return value of decodeInternalKey
942 2 : // since we want to stop as soon as we reach a key >= ikey.UserKey, so
943 2 : // that we can reverse.
944 2 : if i.cmp(i.ikv.K.UserKey, key) >= 0 {
945 2 : // The current key is greater than or equal to our search key. Back up to
946 2 : // the previous key which was less than our search key. Note that this for
947 2 : // loop will execute at least once with this if-block not being true, so
948 2 : // the key we are backing up to is the last one this loop cached.
949 2 : return i.Prev()
950 2 : }
951 :
952 2 : if i.nextOffset >= targetOffset {
953 2 : // We've reached the end of the current restart block. Return the
954 2 : // current key if not hidden, else call Prev().
955 2 : //
956 2 : // When the restart interval is 1, the first iteration of the for loop
957 2 : // will bring us here. In that case ikey is backed by the block so we
958 2 : // get the desired key stability guarantee for the lifetime of the
959 2 : // blockIter. That is, we never cache anything and therefore never
960 2 : // return a key backed by cachedBuf.
961 2 : if hiddenPoint {
962 2 : return i.Prev()
963 2 : }
964 2 : break
965 : }
966 2 : i.cacheEntry()
967 : }
968 :
969 2 : if !i.Valid() {
970 2 : return nil
971 2 : }
972 2 : if !i.lazyValueHandling.hasValuePrefix ||
973 2 : i.ikv.K.Kind() != base.InternalKeyKindSet {
974 2 : i.ikv.V = base.MakeInPlaceValue(i.val)
975 2 : } else if i.lazyValueHandling.getValue == nil || !block.ValuePrefix(i.val[0]).IsValueHandle() {
976 2 : i.ikv.V = base.MakeInPlaceValue(i.val[1:])
977 2 : } else {
978 2 : i.ikv.V = i.lazyValueHandling.getValue.GetLazyValueForPrefixAndValueHandle(i.val)
979 2 : }
980 2 : return &i.ikv
981 : }
982 :
983 : // First implements internalIterator.First, as documented in the pebble
984 : // package.
985 2 : func (i *Iter) First() *base.InternalKV {
986 2 : if invariants.Enabled && i.IsDataInvalidated() {
987 0 : panic(errors.AssertionFailedf("invalidated blockIter used"))
988 : }
989 :
990 2 : i.offset = 0
991 2 : if !i.Valid() {
992 2 : return nil
993 2 : }
994 2 : i.clearCache()
995 2 : i.readEntry()
996 2 : hiddenPoint := i.decodeInternalKey(i.key)
997 2 : if hiddenPoint {
998 2 : return i.Next()
999 2 : }
1000 2 : i.maybeReplaceSuffix()
1001 2 : if !i.lazyValueHandling.hasValuePrefix ||
1002 2 : i.ikv.K.Kind() != base.InternalKeyKindSet {
1003 2 : i.ikv.V = base.MakeInPlaceValue(i.val)
1004 2 : } else if i.lazyValueHandling.getValue == nil || !block.ValuePrefix(i.val[0]).IsValueHandle() {
1005 2 : i.ikv.V = base.MakeInPlaceValue(i.val[1:])
1006 2 : } else {
1007 2 : i.ikv.V = i.lazyValueHandling.getValue.GetLazyValueForPrefixAndValueHandle(i.val)
1008 2 : }
1009 2 : return &i.ikv
1010 : }
1011 :
1012 : const restartMaskLittleEndianHighByteWithoutSetHasSamePrefix byte = 0b0111_1111
1013 : const restartMaskLittleEndianHighByteOnlySetHasSamePrefix byte = 0b1000_0000
1014 :
1015 2 : func decodeRestart(b []byte) int32 {
1016 2 : _ = b[3] // bounds check hint to compiler; see golang.org/issue/14808
1017 2 : return int32(uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 |
1018 2 : uint32(b[3]&restartMaskLittleEndianHighByteWithoutSetHasSamePrefix)<<24)
1019 2 : }
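decodeRestart masks off the top bit of the fourth byte because that bit is repurposed to store the set-has-same-prefix flag consulted by nextPrefixV3 further down. A companion sketch (illustrative only) for reading that flag from a 4-byte restart entry:

    // restartSetHasSamePrefix reports whether the flag bit is set: it lives in
    // the most significant bit of the little-endian uint32, i.e. the high bit
    // of b[3].
    func restartSetHasSamePrefix(b []byte) bool {
        return b[3]&restartMaskLittleEndianHighByteOnlySetHasSamePrefix != 0
    }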
1020 :
1021 : // Last implements internalIterator.Last, as documented in the pebble package.
1022 2 : func (i *Iter) Last() *base.InternalKV {
1023 2 : if invariants.Enabled && i.IsDataInvalidated() {
1024 0 : panic(errors.AssertionFailedf("invalidated blockIter used"))
1025 : }
1026 :
1027 : // Seek forward from the last restart point.
1028 2 : i.offset = decodeRestart(i.data[i.restarts+4*(i.numRestarts-1):])
1029 2 : if !i.Valid() {
1030 2 : return nil
1031 2 : }
1032 :
1033 2 : i.readEntry()
1034 2 : i.clearCache()
1035 2 :
1036 2 : for i.nextOffset < i.restarts {
1037 2 : i.cacheEntry()
1038 2 : i.offset = i.nextOffset
1039 2 : i.readEntry()
1040 2 : }
1041 :
1042 2 : hiddenPoint := i.decodeInternalKey(i.key)
1043 2 : if hiddenPoint {
1044 2 : return i.Prev()
1045 2 : }
1046 2 : i.maybeReplaceSuffix()
1047 2 : if !i.lazyValueHandling.hasValuePrefix ||
1048 2 : i.ikv.K.Kind() != base.InternalKeyKindSet {
1049 2 : i.ikv.V = base.MakeInPlaceValue(i.val)
1050 2 : } else if i.lazyValueHandling.getValue == nil || !block.ValuePrefix(i.val[0]).IsValueHandle() {
1051 2 : i.ikv.V = base.MakeInPlaceValue(i.val[1:])
1052 2 : } else {
1053 2 : i.ikv.V = i.lazyValueHandling.getValue.GetLazyValueForPrefixAndValueHandle(i.val)
1054 2 : }
1055 2 : return &i.ikv
1056 : }
1057 :
1058 : // Next implements internalIterator.Next, as documented in the pebble
1059 : // package.
1060 2 : func (i *Iter) Next() *base.InternalKV {
1061 2 : if len(i.cachedBuf) > 0 {
1062 2 : // We're switching from reverse iteration to forward iteration. We need to
1063 2 : // populate i.fullKey with the current key we're positioned at so that
1064 2 : // readEntry() can use i.fullKey for key prefix decompression. Note that we
1065 2 : // don't know whether i.key is backed by i.cachedBuf or i.fullKey (if
1066 2 : // SeekLT was the previous call, i.key may be backed by i.fullKey), but
1067 2 : // copying into i.fullKey works for both cases.
1068 2 : //
1069 2 : // TODO(peter): Rather than clearing the cache, we could instead use the
1070 2 : // cache until it is exhausted. This would likely be faster than falling
1071 2 : // through to the normal forward iteration code below.
1072 2 : i.fullKey = append(i.fullKey[:0], i.key...)
1073 2 : i.clearCache()
1074 2 : }
1075 :
1076 : start:
1077 2 : i.offset = i.nextOffset
1078 2 : if !i.Valid() {
1079 2 : return nil
1080 2 : }
1081 2 : i.readEntry()
1082 2 : // Manually inlined version of i.decodeInternalKey(i.key).
1083 2 : if n := len(i.key) - 8; n >= 0 {
1084 2 : trailer := base.InternalKeyTrailer(binary.LittleEndian.Uint64(i.key[n:]))
1085 2 : hiddenPoint := i.transforms.HideObsoletePoints &&
1086 2 : (trailer&TrailerObsoleteBit != 0)
1087 2 : i.ikv.K.Trailer = trailer & TrailerObsoleteMask
1088 2 : i.ikv.K.UserKey = i.key[:n:n]
1089 2 : if n := i.transforms.SyntheticSeqNum; n != 0 {
1090 2 : i.ikv.K.SetSeqNum(base.SeqNum(n))
1091 2 : }
1092 2 : if hiddenPoint {
1093 2 : goto start
1094 : }
1095 2 : if i.transforms.HasSyntheticSuffix() {
1096 2 : // Inlined version of i.maybeReplaceSuffix()
1097 2 : prefixLen := i.split(i.ikv.K.UserKey)
1098 2 : i.synthSuffixBuf = append(i.synthSuffixBuf[:0], i.ikv.K.UserKey[:prefixLen]...)
1099 2 : i.synthSuffixBuf = append(i.synthSuffixBuf, i.transforms.SyntheticSuffix()...)
1100 2 : i.ikv.K.UserKey = i.synthSuffixBuf
1101 2 : }
1102 0 : } else {
1103 0 : i.ikv.K.Trailer = base.InternalKeyTrailer(base.InternalKeyKindInvalid)
1104 0 : i.ikv.K.UserKey = nil
1105 0 : }
1106 2 : if !i.lazyValueHandling.hasValuePrefix ||
1107 2 : i.ikv.K.Kind() != base.InternalKeyKindSet {
1108 2 : i.ikv.V = base.MakeInPlaceValue(i.val)
1109 2 : } else if i.lazyValueHandling.getValue == nil || !block.ValuePrefix(i.val[0]).IsValueHandle() {
1110 2 : i.ikv.V = base.MakeInPlaceValue(i.val[1:])
1111 2 : } else {
1112 2 : i.ikv.V = i.lazyValueHandling.getValue.GetLazyValueForPrefixAndValueHandle(i.val)
1113 2 : }
1114 2 : return &i.ikv
1115 : }
1116 :
1117 : // NextPrefix implements (base.InternalIterator).NextPrefix.
1118 2 : func (i *Iter) NextPrefix(succKey []byte) *base.InternalKV {
1119 2 : if i.lazyValueHandling.hasValuePrefix {
1120 2 : return i.nextPrefixV3(succKey)
1121 2 : }
1122 2 : const nextsBeforeSeek = 3
1123 2 : kv := i.Next()
1124 2 : for j := 1; kv != nil && i.cmp(kv.K.UserKey, succKey) < 0; j++ {
1125 2 : if j >= nextsBeforeSeek {
1126 2 : return i.SeekGE(succKey, base.SeekGEFlagsNone)
1127 2 : }
1128 2 : kv = i.Next()
1129 : }
1130 2 : return kv
1131 : }
1132 :
1133 2 : func (i *Iter) nextPrefixV3(succKey []byte) *base.InternalKV {
1134 2 : // Doing nexts that involve a key comparison can be expensive (and the cost
1135 2 : // depends on the key length), so we use the same threshold of 3 that we use
1136 2 : // for TableFormatPebblev2 in blockIter.nextPrefix above. The next fast path
1137 2 : // that looks at setHasSamePrefix takes ~5ns per key, which is ~150x faster
1138 2 : // than doing a SeekGE within the block, so we do this 16 times
1139 2 : // (~5ns*16=80ns), and then switch to looking at restarts. Doing the binary
1140 2 : // search for the restart consumes > 100ns. If the number of versions is >
1141 2 : // 17, we will increment nextFastCount to 17, then do a binary search, and
1142 2 : // on average need to find a key between two restarts, so another 8 steps
1143 2 : // corresponding to nextFastCount, for a mean total of 17 + 8 = 25 such
1144 2 : // steps.
1145 2 : //
1146 2 : // TODO(sumeer): use the configured restartInterval for the sstable when it
1147 2 : // was written (which we don't currently store) instead of the default value
1148 2 : // of 16.
1149 2 : const nextCmpThresholdBeforeSeek = 3
1150 2 : const nextFastThresholdBeforeRestarts = 16
1151 2 : nextCmpCount := 0
1152 2 : nextFastCount := 0
1153 2 : usedRestarts := false
1154 2 : // INVARIANT: blockIter is valid.
1155 2 : if invariants.Enabled && !i.Valid() {
1156 0 : panic(errors.AssertionFailedf("nextPrefixV3 called on invalid blockIter"))
1157 : }
1158 2 : prevKeyIsSet := i.ikv.Kind() == base.InternalKeyKindSet
1159 2 : for {
1160 2 : i.offset = i.nextOffset
1161 2 : if !i.Valid() {
1162 2 : return nil
1163 2 : }
1164 : // Need to decode the length integers, so we can compute nextOffset.
1165 2 : ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(i.offset))
1166 2 : // This is an ugly performance hack. Reading entries from blocks is one of
1167 2 : // the inner-most routines and decoding the 3 varints per-entry takes
1168 2 : // significant time. Neither go1.11 nor go1.12 will inline decodeVarint for
1169 2 : // us, so we do it manually. This provides a 10-15% performance improvement
1170 2 : // on blockIter benchmarks on both go1.11 and go1.12.
1171 2 : //
1172 2 : // TODO(peter): remove this hack if go:inline is ever supported.
1173 2 :
1174 2 : // Decode the shared key length integer.
1175 2 : var shared uint32
1176 2 : if a := *((*uint8)(ptr)); a < 128 {
1177 2 : shared = uint32(a)
1178 2 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
1179 2 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
1180 0 : shared = uint32(b)<<7 | uint32(a)
1181 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
1182 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
1183 0 : shared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1184 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
1185 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
1186 0 : shared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1187 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
1188 0 : } else {
1189 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
1190 0 : shared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1191 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
1192 0 : }
1193 : // Decode the unshared key length integer.
1194 2 : var unshared uint32
1195 2 : if a := *((*uint8)(ptr)); a < 128 {
1196 2 : unshared = uint32(a)
1197 2 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
1198 2 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
1199 0 : unshared = uint32(b)<<7 | uint32(a)
1200 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
1201 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
1202 0 : unshared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1203 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
1204 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
1205 0 : unshared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1206 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
1207 0 : } else {
1208 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
1209 0 : unshared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1210 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
1211 0 : }
1212 : // Decode the value length integer.
1213 2 : var value uint32
1214 2 : if a := *((*uint8)(ptr)); a < 128 {
1215 2 : value = uint32(a)
1216 2 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
1217 2 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
1218 0 : value = uint32(b)<<7 | uint32(a)
1219 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
1220 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
1221 0 : value = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1222 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
1223 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
1224 0 : value = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1225 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
1226 0 : } else {
1227 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
1228 0 : value = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1229 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
1230 0 : }
1231 2 : shared += i.transforms.SyntheticPrefixAndSuffix.PrefixLen()
1232 2 : // The starting position of the value.
1233 2 : valuePtr := unsafe.Pointer(uintptr(ptr) + uintptr(unshared))
1234 2 : i.nextOffset = int32(uintptr(valuePtr)-uintptr(i.ptr)) + int32(value)
1235 2 : if invariants.Enabled && unshared < 8 {
1236 0 : // This should not happen since only the key prefix is shared, so even
1237 0 : // if the prefix length is the same as the user key length, the unshared
1238 0 : // length will still include the 8-byte trailer.
1239 0 : panic(errors.AssertionFailedf("unshared %d is too small", unshared))
1240 : }
1241 : // The trailer is written in little endian, so the key kind is the first byte
1242 : // of the trailer, which occupies the slice [unshared-8:unshared].
1243 2 : keyKind := base.InternalKeyKind((*[manual.MaxArrayLen]byte)(ptr)[unshared-8])
1244 2 : keyKind = keyKind & base.InternalKeyKindSSTableInternalObsoleteMask
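// (Hedged note, not part of the original source.) The 8-byte trailer packs
// the sequence number and kind as seqNum<<8 | kind; because it is written
// little endian, its least significant byte (the kind) is the first byte of
// the trailer slice read above.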
1245 2 : prefixChanged := false
1246 2 : if keyKind == base.InternalKeyKindSet {
1247 2 : if invariants.Enabled && value == 0 {
1248 0 : panic(errors.AssertionFailedf("value is of length 0, but we expect a valuePrefix"))
1249 : }
1250 2 : valPrefix := *((*block.ValuePrefix)(valuePtr))
1251 2 : if valPrefix.SetHasSamePrefix() {
1252 2 : // Fast-path. No need to assemble i.fullKey, or update i.key. We know
1253 2 : // that subsequent keys will not have a shared length that is greater
1254 2 : // than the prefix of the current key, which is also the prefix of
1255 2 : // i.key. Since we are continuing to iterate, we don't need to
1256 2 : // initialize i.ikey and i.lazyValue (these are initialized before
1257 2 : // returning).
1258 2 : nextFastCount++
1259 2 : if nextFastCount > nextFastThresholdBeforeRestarts {
1260 1 : if usedRestarts {
1261 0 : // Exhausted iteration budget. This will never happen unless
1262 0 : // someone is using a restart interval > 16. It is just to guard
1263 0 : // against long restart intervals causing too much iteration.
1264 0 : break
1265 : }
1266 : // Haven't used restarts yet, so find the first restart at or beyond
1267 : // the current offset.
1268 1 : targetOffset := i.offset
1269 1 : var index int32
1270 1 : {
1271 1 : // NB: manually inlined sort.Sort is ~5% faster.
1272 1 : //
1273 1 : // f defined for a restart point is true iff the offset >=
1274 1 : // targetOffset.
1275 1 : // Define f(-1) == false and f(i.numRestarts) == true.
1276 1 : // Invariant: f(index-1) == false, f(upper) == true.
1277 1 : upper := i.numRestarts
1278 1 : for index < upper {
1279 1 : h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
1280 1 : // index ≤ h < upper
1281 1 : offset := decodeRestart(i.data[i.restarts+4*h:])
1282 1 : if offset < targetOffset {
1283 1 : index = h + 1 // preserves f(index-1) == false
1284 1 : } else {
1285 1 : upper = h // preserves f(upper) == true
1286 1 : }
1287 : }
1288 : // index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
1289 : // => answer is index.
1290 : }
1291 1 : usedRestarts = true
1292 1 : nextFastCount = 0
1293 1 : if index == i.numRestarts {
1294 1 : // Already past the last real restart, so iterate a bit more until
1295 1 : // we are done with the block.
1296 1 : continue
1297 : }
1298 : // Have some real restarts after index. NB: index is the first
1299 : // restart at or beyond the current offset.
1300 1 : startingIndex := index
1301 1 : for index != i.numRestarts &&
1302 1 : // The restart at index is 4 bytes written in little endian format
1303 1 : // starting at i.restart+4*index. The 0th byte is the least
1304 1 : // significant and the 3rd byte is the most significant. Since the
1305 1 : // most significant bit of the 3rd byte is what we use for
1306 1 : // encoding the set-has-same-prefix information, the indexing
1307 1 : // below has +3.
1308 1 : i.data[i.restarts+4*index+3]&restartMaskLittleEndianHighByteOnlySetHasSamePrefix != 0 {
1309 1 : // We still have the same prefix, so move to the next restart.
1310 1 : index++
1311 1 : }
1312 : // index is the first restart that did not have the same prefix.
1313 1 : if index != startingIndex {
1314 1 : // Managed to skip past at least one restart. Resume iteration
1315 1 : // from index-1. Since nextFastCount has been reset to 0, we
1316 1 : // should be able to iterate to the next prefix.
1317 1 : i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
1318 1 : i.readEntry()
1319 1 : }
1320 : // Else, unable to skip past any restart. Resume iteration. Since
1321 : // nextFastCount has been reset to 0, we should be able to iterate
1322 : // to the next prefix.
1323 1 : continue
1324 : }
1325 2 : continue
1326 2 : } else if prevKeyIsSet {
1327 2 : prefixChanged = true
1328 2 : }
1329 2 : } else {
1330 2 : prevKeyIsSet = false
1331 2 : }
1332 : // Slow-path cases:
1333 : // - (Likely) The prefix has changed.
1334 : // - (Unlikely) The prefix has not changed.
1335 : // We assemble the key etc. under the assumption that it is the likely
1336 : // case.
1337 2 : unsharedKey := getBytes(ptr, int(unshared))
1338 2 : // TODO(sumeer): move this into the else block below. This is a bit tricky
1339 2 : // since the current logic assumes we have always copied the latest key
1340 2 : // into fullKey, which is why when we get to the next key we can (a)
1341 2 : // access i.fullKey[:shared], (b) append only the unsharedKey to
1342 2 : // i.fullKey. For (a), we can access i.key[:shared] since that memory is
1343 2 : // valid (even if unshared). For (b), we will need to remember whether
1344 2 : // i.key refers to i.fullKey or not, and can append the unsharedKey only
1345 2 : // in the former case and for the latter case need to copy the shared part
1346 2 : // too. This same comment applies to the other place where we can do this
1347 2 : // optimization, in readEntry().
1348 2 : i.fullKey = append(i.fullKey[:shared], unsharedKey...)
1349 2 : i.val = getBytes(valuePtr, int(value))
1350 2 : if shared == 0 {
1351 2 : // Provide stability for the key across positioning calls if the key
1352 2 : // doesn't share a prefix with the previous key. This avoids the need to
1353 2 : // copy the key when the caller knows the block has a restart interval of
1354 2 : // 1. An important example of this is range-del blocks.
1355 2 : i.key = unsharedKey
1356 2 : } else {
1357 2 : i.key = i.fullKey
1358 2 : }
1359 : // Manually inlined version of i.decodeInternalKey(i.key).
1360 2 : hiddenPoint := false
1361 2 : if n := len(i.key) - 8; n >= 0 {
1362 2 : trailer := base.InternalKeyTrailer(binary.LittleEndian.Uint64(i.key[n:]))
1363 2 : hiddenPoint = i.transforms.HideObsoletePoints &&
1364 2 : (trailer&TrailerObsoleteBit != 0)
1365 2 : i.ikv.K = base.InternalKey{
1366 2 : Trailer: trailer & TrailerObsoleteMask,
1367 2 : UserKey: i.key[:n:n],
1368 2 : }
1369 2 : if n := i.transforms.SyntheticSeqNum; n != 0 {
1370 1 : i.ikv.K.SetSeqNum(base.SeqNum(n))
1371 1 : }
1372 2 : if i.transforms.HasSyntheticSuffix() {
1373 0 : // Inlined version of i.maybeReplaceSuffix()
1374 0 : prefixLen := i.split(i.ikv.K.UserKey)
1375 0 : i.synthSuffixBuf = append(i.synthSuffixBuf[:0], i.ikv.K.UserKey[:prefixLen]...)
1376 0 : i.synthSuffixBuf = append(i.synthSuffixBuf, i.transforms.SyntheticSuffix()...)
1377 0 : i.ikv.K.UserKey = i.synthSuffixBuf
1378 0 : }
1379 0 : } else {
1380 0 : i.ikv.K.Trailer = base.InternalKeyTrailer(base.InternalKeyKindInvalid)
1381 0 : i.ikv.K.UserKey = nil
1382 0 : }
1383 2 : nextCmpCount++
1384 2 : if invariants.Enabled && prefixChanged && i.cmp(i.ikv.K.UserKey, succKey) < 0 {
1385 0 : panic(errors.AssertionFailedf("prefix should have changed but %x < %x",
1386 0 : i.ikv.K.UserKey, succKey))
1387 : }
1388 2 : if prefixChanged || i.cmp(i.ikv.K.UserKey, succKey) >= 0 {
1389 2 : // Prefix has changed.
1390 2 : if hiddenPoint {
1391 1 : return i.Next()
1392 1 : }
1393 2 : if invariants.Enabled && !i.lazyValueHandling.hasValuePrefix {
1394 0 : panic(errors.AssertionFailedf("nextPrefixV3 being run for non-v3 sstable"))
1395 : }
1396 2 : if i.ikv.K.Kind() != base.InternalKeyKindSet {
1397 2 : i.ikv.V = base.MakeInPlaceValue(i.val)
1398 2 : } else if i.lazyValueHandling.getValue == nil || !block.ValuePrefix(i.val[0]).IsValueHandle() {
1399 2 : i.ikv.V = base.MakeInPlaceValue(i.val[1:])
1400 2 : } else {
1401 0 : i.ikv.V = i.lazyValueHandling.getValue.GetLazyValueForPrefixAndValueHandle(i.val)
1402 0 : }
1403 2 : return &i.ikv
1404 : }
1405 : // Else prefix has not changed.
1406 :
1407 2 : if nextCmpCount >= nextCmpThresholdBeforeSeek {
1408 1 : break
1409 : }
1410 : }
1411 1 : return i.SeekGE(succKey, base.SeekGEFlagsNone)
1412 : }
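// The following is a hedged, illustrative sketch and not part of the original
// source. The manually inlined binary searches above (and in Prev below) are
// equivalent to this sort.Search formulation, which returns the index of the
// first restart point whose offset is >= targetOffset, or numRestarts if
// there is none. exampleRestartSearch and exampleRestartSetHasSamePrefix are
// hypothetical helper names; they assume decodeRestart and the restart mask
// behave as the surrounding code uses them.
func exampleRestartSearch(data []byte, restarts, numRestarts, targetOffset int32) int32 {
	return int32(sort.Search(int(numRestarts), func(j int) bool {
		// decodeRestart reads the little-endian restart offset at entry j.
		return decodeRestart(data[restarts+4*int32(j):]) >= targetOffset
	}))
}

// The most significant bit of a restart entry's high (fourth) byte carries
// the set-has-same-prefix flag consulted by the fast path above.
func exampleRestartSetHasSamePrefix(data []byte, restarts, index int32) bool {
	return data[restarts+4*index+3]&restartMaskLittleEndianHighByteOnlySetHasSamePrefix != 0
}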
1413 :
1414 : // Prev implements internalIterator.Prev, as documented in the pebble
1415 : // package.
1416 2 : func (i *Iter) Prev() *base.InternalKV {
1417 2 : start:
1418 2 : for n := len(i.cached) - 1; n >= 0; n-- {
1419 2 : i.nextOffset = i.offset
1420 2 : e := &i.cached[n]
1421 2 : i.offset = e.offset
1422 2 : i.val = getBytes(unsafe.Pointer(uintptr(i.ptr)+uintptr(e.valStart)), int(e.valSize))
1423 2 : // Manually inlined version of i.decodeInternalKey(i.key).
1424 2 : i.key = i.cachedBuf[e.keyStart:e.keyEnd]
1425 2 : if n := len(i.key) - 8; n >= 0 {
1426 2 : trailer := base.InternalKeyTrailer(binary.LittleEndian.Uint64(i.key[n:]))
1427 2 : hiddenPoint := i.transforms.HideObsoletePoints &&
1428 2 : (trailer&TrailerObsoleteBit != 0)
1429 2 : if hiddenPoint {
1430 2 : continue
1431 : }
1432 2 : i.ikv.K = base.InternalKey{
1433 2 : Trailer: trailer & TrailerObsoleteMask,
1434 2 : UserKey: i.key[:n:n],
1435 2 : }
1436 2 : if n := i.transforms.SyntheticSeqNum; n != 0 {
1437 2 : i.ikv.K.SetSeqNum(base.SeqNum(n))
1438 2 : }
1439 2 : if i.transforms.HasSyntheticSuffix() {
1440 2 : // Inlined version of i.maybeReplaceSuffix()
1441 2 : prefixLen := i.split(i.ikv.K.UserKey)
1442 2 : // If ikey is cached or may get cached, we must de-reference
1443 2 : // UserKey before suffix replacement.
1444 2 : i.synthSuffixBuf = append(i.synthSuffixBuf[:0], i.ikv.K.UserKey[:prefixLen]...)
1445 2 : i.synthSuffixBuf = append(i.synthSuffixBuf, i.transforms.SyntheticSuffix()...)
1446 2 : i.ikv.K.UserKey = i.synthSuffixBuf
1447 2 : }
1448 0 : } else {
1449 0 : i.ikv.K.Trailer = base.InternalKeyTrailer(base.InternalKeyKindInvalid)
1450 0 : i.ikv.K.UserKey = nil
1451 0 : }
1452 2 : i.cached = i.cached[:n]
1453 2 : if !i.lazyValueHandling.hasValuePrefix ||
1454 2 : i.ikv.K.Kind() != base.InternalKeyKindSet {
1455 2 : i.ikv.V = base.MakeInPlaceValue(i.val)
1456 2 : } else if i.lazyValueHandling.getValue == nil || !block.ValuePrefix(i.val[0]).IsValueHandle() {
1457 2 : i.ikv.V = base.MakeInPlaceValue(i.val[1:])
1458 2 : } else {
1459 2 : i.ikv.V = i.lazyValueHandling.getValue.GetLazyValueForPrefixAndValueHandle(i.val)
1460 2 : }
1461 2 : return &i.ikv
1462 : }
1463 :
1464 2 : i.clearCache()
1465 2 : if i.offset <= 0 {
1466 2 : i.offset = -1
1467 2 : i.nextOffset = 0
1468 2 : return nil
1469 2 : }
1470 :
1471 2 : targetOffset := i.offset
1472 2 : var index int32
1473 2 :
1474 2 : {
1475 2 : // NB: manually inlined sort.Sort is ~5% faster.
1476 2 : //
1477 2 : // Define f(-1) == false and f(i.numRestarts) == true.
1478 2 : // Invariant: f(index-1) == false, f(upper) == true.
1479 2 : upper := i.numRestarts
1480 2 : for index < upper {
1481 2 : h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
1482 2 : // index ≤ h < upper
1483 2 : offset := decodeRestart(i.data[i.restarts+4*h:])
1484 2 : if offset < targetOffset {
1485 2 : // Looking for the first restart that has offset >= targetOffset, so
1486 2 : // ignore h and earlier.
1487 2 : index = h + 1 // preserves f(index-1) == false
1488 2 : } else {
1489 2 : upper = h // preserves f(upper) == true
1490 2 : }
1491 : }
1492 : // index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
1493 : // => answer is index.
1494 : }
1495 :
1496 : // index is first restart with offset >= targetOffset. Note that
1497 : // targetOffset may not be at a restart point since one can call Prev()
1498 : // after Next() (so the cache was not populated) and targetOffset refers to
1499 : // the current entry. index-1 must have an offset < targetOffset (it can't
1500 : // be equal to targetOffset since the binary search would have selected that
1501 : // as the index).
1502 2 : i.offset = 0
1503 2 : if index > 0 {
1504 2 : i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
1505 2 : }
1506 : // TODO(sumeer): why is the else case not an error, given that targetOffset
1507 : // is a valid offset?
1508 :
1509 2 : i.readEntry()
1510 2 :
1511 2 : // We stop when i.nextOffset == targetOffset since the targetOffset is the
1512 2 : // entry we are stepping back from, and we don't need to cache the entry
1513 2 : // before it, since it is the candidate to return.
1514 2 : for i.nextOffset < targetOffset {
1515 2 : i.cacheEntry()
1516 2 : i.offset = i.nextOffset
1517 2 : i.readEntry()
1518 2 : }
1519 :
1520 2 : hiddenPoint := i.decodeInternalKey(i.key)
1521 2 : if hiddenPoint {
1522 2 : // Use the cache.
1523 2 : goto start
1524 : }
1525 2 : if i.transforms.HasSyntheticSuffix() {
1526 2 : // Inlined version of i.maybeReplaceSuffix()
1527 2 : prefixLen := i.split(i.ikv.K.UserKey)
1528 2 : // If ikey is cached or may get cached, we must de-reference
1529 2 : // UserKey before suffix replacement.
1530 2 : i.synthSuffixBuf = append(i.synthSuffixBuf[:0], i.ikv.K.UserKey[:prefixLen]...)
1531 2 : i.synthSuffixBuf = append(i.synthSuffixBuf, i.transforms.SyntheticSuffix()...)
1532 2 : i.ikv.K.UserKey = i.synthSuffixBuf
1533 2 : }
1534 2 : if !i.lazyValueHandling.hasValuePrefix ||
1535 2 : i.ikv.K.Kind() != base.InternalKeyKindSet {
1536 2 : i.ikv.V = base.MakeInPlaceValue(i.val)
1537 2 : } else if i.lazyValueHandling.getValue == nil || !block.ValuePrefix(i.val[0]).IsValueHandle() {
1538 2 : i.ikv.V = base.MakeInPlaceValue(i.val[1:])
1539 2 : } else {
1540 2 : i.ikv.V = i.lazyValueHandling.getValue.GetLazyValueForPrefixAndValueHandle(i.val)
1541 2 : }
1542 2 : return &i.ikv
1543 : }
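// Hedged usage sketch, not part of the original source: reverse iteration is
// served from the cache populated by replaying entries forward from the
// nearest restart point. Assuming it is a positioned *Iter:
//
//	for kv := it.Prev(); kv != nil; kv = it.Prev() {
//		_ = kv.K.UserKey // only stable under the guarantees documented on Iter
//	}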
1544 :
1545 : // Key returns the internal key at the current iterator position.
1546 0 : func (i *Iter) Key() *base.InternalKey {
1547 0 : return &i.ikv.K
1548 0 : }
1549 :
1550 : // KV returns the internal KV at the current iterator position.
1551 2 : func (i *Iter) KV() *base.InternalKV {
1552 2 : return &i.ikv
1553 2 : }
1554 :
1555 : // Value returns the value at the current iterator position.
1556 0 : func (i *Iter) Value() base.LazyValue {
1557 0 : return i.ikv.V
1558 0 : }
1559 :
1560 : // Error implements internalIterator.Error, as documented in the pebble
1561 : // package.
1562 2 : func (i *Iter) Error() error {
1563 2 : return nil // infallible
1564 2 : }
1565 :
1566 : // Close implements internalIterator.Close, as documented in the pebble
1567 : // package.
1568 2 : func (i *Iter) Close() error {
1569 2 : i.handle.Release()
1570 2 : fullKey := i.fullKey[:0]
1571 2 : cached := i.cached[:0]
1572 2 : cachedBuf := i.cachedBuf[:0]
1573 2 : firstUserKeyWithPrefixBuf := i.firstUserKeyWithPrefixBuf[:0]
1574 2 : *i = Iter{
1575 2 : fullKey: fullKey,
1576 2 : cached: cached,
1577 2 : cachedBuf: cachedBuf,
1578 2 : firstUserKeyWithPrefixBuf: firstUserKeyWithPrefixBuf,
1579 2 : }
1580 2 : return nil
1581 2 : }
1582 :
1583 : // SetBounds implements base.InternalIterator. It panics, as bounds should
1584 : // always be handled by the parent sstable iterator.
1585 0 : func (i *Iter) SetBounds(lower, upper []byte) {
1586 0 : // This should never be called as bounds are handled by sstable.Iterator.
1587 0 : panic("pebble: SetBounds unimplemented")
1588 : }
1589 :
1590 : // SetContext implements base.InternalIterator.
1591 0 : func (i *Iter) SetContext(_ context.Context) {}
1592 :
1593 : // Valid returns true if the iterator is currently positioned at a valid KV.
1594 2 : func (i *Iter) Valid() bool {
1595 2 : return i.offset >= 0 && i.offset < i.restarts
1596 2 : }
1597 :
1598 : // DebugTree is part of the InternalIterator interface.
1599 0 : func (i *Iter) DebugTree(tp treeprinter.Node) {
1600 0 : tp.Childf("%T(%p)", i, i)
1601 0 : }
1602 :
1603 1 : func (i *Iter) getRestart(idx int) int32 {
1604 1 : return int32(binary.LittleEndian.Uint32(i.data[i.restarts+4*int32(idx):]))
1605 1 : }
1606 :
1607 1 : func (i *Iter) isRestartPoint() bool {
1608 1 : j := sort.Search(int(i.numRestarts), func(j int) bool {
1609 1 : return i.getRestart(j) >= i.offset
1610 1 : })
1611 1 : return j < int(i.numRestarts) && i.getRestart(j) == i.offset
1612 : }
1613 :
1614 : // DescribeKV is a function that formats a key-value pair, writing the
1615 : // description to w.
1616 : type DescribeKV func(w io.Writer, key *base.InternalKey, val []byte, enc KVEncoding)
1617 :
1618 : // KVEncoding describes the encoding of a key-value pair within the block.
1619 : type KVEncoding struct {
1620 : // IsRestart is true if the key is a restart point.
1621 : IsRestart bool
1622 : // Offset is the position within the block at which the key-value pair is
1623 : // encoded.
1624 : Offset int32
1625 : // Length is the total length of the KV pair as it is encoded in the block
1626 : // format.
1627 : Length int32
1628 : // KeyShared is the number of bytes this KV's user key shared with its predecessor.
1629 : KeyShared uint32
1630 : // KeyUnshared is the number of bytes this KV's user key did not share with
1631 : // its predecessor.
1632 : KeyUnshared uint32
1633 : // ValueLen is the length of the internal value.
1634 : ValueLen uint32
1635 : }
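// Hedged sketch, not part of the original source, of the per-KV layout that
// the KVEncoding fields above describe: three varints (shared key length,
// unshared key length, value length) followed by the unshared key suffix and
// the value. exampleAppendKV is a hypothetical helper, not the block writer
// Pebble actually uses.
func exampleAppendKV(buf []byte, shared int, key, value []byte) []byte {
	buf = binary.AppendUvarint(buf, uint64(shared))          // KeyShared
	buf = binary.AppendUvarint(buf, uint64(len(key)-shared)) // KeyUnshared
	buf = binary.AppendUvarint(buf, uint64(len(value)))      // ValueLen
	buf = append(buf, key[shared:]...)
	buf = append(buf, value...)
	return buf
}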
1636 :
1637 : // Describe describes the contents of a block, writing the description to w.
1638 : // It invokes fmtKV to describe each key-value pair.
1639 1 : func (i *Iter) Describe(tp treeprinter.Node, fmtKV DescribeKV) {
1640 1 : var buf bytes.Buffer
1641 1 : for kv := i.First(); kv != nil; kv = i.Next() {
1642 1 : enc := KVEncoding{
1643 1 : IsRestart: i.isRestartPoint(),
1644 1 : Offset: i.offset,
1645 1 : Length: int32(i.nextOffset - i.offset),
1646 1 : }
1647 1 : ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(i.offset))
1648 1 : enc.KeyShared, ptr = decodeVarint(ptr)
1649 1 : enc.KeyUnshared, ptr = decodeVarint(ptr)
1650 1 : enc.ValueLen, _ = decodeVarint(ptr)
1651 1 : buf.Reset()
1652 1 : fmtKV(&buf, &kv.K, kv.V.ValueOrHandle, enc)
1653 1 : tp.Child(buf.String())
1654 1 : }
1656 1 : n := tp.Child("restart points")
1657 1 : // Format the restart points.
1658 1 : for j := 0; j < int(i.numRestarts); j++ {
1659 1 : offset := i.getRestart(j)
1660 1 : n.Childf("%05d [restart %d]", uint64(i.restarts+4*int32(j)), offset)
1661 1 : }
1662 : }
1663 :
1664 : // RawIter is an iterator over a single block of data. Unlike Iter,
1665 : // keys are stored in "raw" format (i.e. not as internal keys). Note that there
1666 : // is significant similarity between this code and the code in Iter, yet
1667 : // reducing the duplication is difficult because Iter is performance
1668 : // critical. RawIter must only be used for blocks where the value is
1669 : // stored together with the key.
1670 : type RawIter struct {
1671 : cmp base.Compare
1672 : offset int32
1673 : nextOffset int32
1674 : restarts int32
1675 : numRestarts int32
1676 : ptr unsafe.Pointer
1677 : data []byte
1678 : key, val []byte
1679 : ikey base.InternalKey
1680 : cached []blockEntry
1681 : cachedBuf []byte
1682 : }
1683 :
1684 : // NewRawIter constructs a new raw block iterator.
1685 2 : func NewRawIter(cmp base.Compare, block []byte) (*RawIter, error) {
1686 2 : i := &RawIter{}
1687 2 : return i, i.Init(cmp, block)
1688 2 : }
1689 :
1690 : // Init initializes the raw block iterator.
1691 2 : func (i *RawIter) Init(cmp base.Compare, blk []byte) error {
1692 2 : numRestarts := int32(binary.LittleEndian.Uint32(blk[len(blk)-4:]))
1693 2 : if numRestarts == 0 {
1694 0 : return base.CorruptionErrorf("pebble/table: invalid table (block has no restart points)")
1695 0 : }
1696 2 : i.cmp = cmp
1697 2 : i.restarts = int32(len(blk)) - 4*(1+numRestarts)
1698 2 : i.numRestarts = numRestarts
1699 2 : i.ptr = unsafe.Pointer(&blk[0])
1700 2 : i.data = blk
1701 2 : if i.key == nil {
1702 2 : i.key = make([]byte, 0, 256)
1703 2 : } else {
1704 0 : i.key = i.key[:0]
1705 0 : }
1706 2 : i.val = nil
1707 2 : i.clearCache()
1708 2 : return nil
1709 : }
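// Hedged sketch, not part of the original source, of the block trailer that
// Init parses: the final 4 bytes hold the restart count, preceded by one
// little-endian uint32 offset per restart point. exampleBlockTrailer is a
// hypothetical helper that mirrors the arithmetic in Init.
func exampleBlockTrailer(blk []byte) (restartsOffset, numRestarts int32) {
	numRestarts = int32(binary.LittleEndian.Uint32(blk[len(blk)-4:]))
	restartsOffset = int32(len(blk)) - 4*(1+numRestarts)
	return restartsOffset, numRestarts
}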
1710 :
1711 2 : func (i *RawIter) readEntry() {
1712 2 : ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(i.offset))
1713 2 : shared, ptr := decodeVarint(ptr)
1714 2 : unshared, ptr := decodeVarint(ptr)
1715 2 : value, ptr := decodeVarint(ptr)
1716 2 : i.key = append(i.key[:shared], getBytes(ptr, int(unshared))...)
1717 2 : i.key = i.key[:len(i.key):len(i.key)]
1718 2 : ptr = unsafe.Pointer(uintptr(ptr) + uintptr(unshared))
1719 2 : i.val = getBytes(ptr, int(value))
1720 2 : i.nextOffset = int32(uintptr(ptr)-uintptr(i.ptr)) + int32(value)
1721 2 : }
1722 :
1723 2 : func (i *RawIter) loadEntry() {
1724 2 : i.readEntry()
1725 2 : i.ikey.UserKey = i.key
1726 2 : }
1727 :
1728 2 : func (i *RawIter) clearCache() {
1729 2 : i.cached = i.cached[:0]
1730 2 : i.cachedBuf = i.cachedBuf[:0]
1731 2 : }
1732 :
1733 1 : func (i *RawIter) cacheEntry() {
1734 1 : var valStart int32
1735 1 : valSize := int32(len(i.val))
1736 1 : if valSize > 0 {
1737 0 : valStart = int32(uintptr(unsafe.Pointer(&i.val[0])) - uintptr(i.ptr))
1738 0 : }
1739 :
1740 1 : i.cached = append(i.cached, blockEntry{
1741 1 : offset: i.offset,
1742 1 : keyStart: int32(len(i.cachedBuf)),
1743 1 : keyEnd: int32(len(i.cachedBuf) + len(i.key)),
1744 1 : valStart: valStart,
1745 1 : valSize: valSize,
1746 1 : })
1747 1 : i.cachedBuf = append(i.cachedBuf, i.key...)
1748 : }
1749 :
1750 : // SeekGE implements internalIterator.SeekGE, as documented in the pebble
1751 : // package.
1752 1 : func (i *RawIter) SeekGE(key []byte) bool {
1753 1 : // Find the index of the smallest restart point whose key is > the key
1754 1 : // sought; index will be numRestarts if there is no such restart point.
1755 1 : i.offset = 0
1756 1 : index := sort.Search(int(i.numRestarts), func(j int) bool {
1757 1 : offset := int32(binary.LittleEndian.Uint32(i.data[int(i.restarts)+4*j:]))
1758 1 : // For a restart point, there are 0 bytes shared with the previous key.
1759 1 : // The varint encoding of 0 occupies 1 byte.
1760 1 : ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(offset+1))
1761 1 : // Decode the key at that restart point, and compare it to the key sought.
1762 1 : v1, ptr := decodeVarint(ptr)
1763 1 : _, ptr = decodeVarint(ptr)
1764 1 : s := getBytes(ptr, int(v1))
1765 1 : return i.cmp(key, s) < 0
1766 1 : })
1767 :
1768 : // Since keys are strictly increasing, if index > 0 then the restart point at
1769 : // index-1 will be the largest whose key is <= the key sought. If index ==
1770 : // 0, then all keys in this block are larger than the key sought, and offset
1771 : // remains at zero.
1772 1 : if index > 0 {
1773 1 : i.offset = int32(binary.LittleEndian.Uint32(i.data[int(i.restarts)+4*(index-1):]))
1774 1 : }
1775 1 : i.loadEntry()
1776 1 :
1777 1 : // Iterate from that restart point to somewhere >= the key sought.
1778 1 : for valid := i.Valid(); valid; valid = i.Next() {
1779 1 : if i.cmp(key, i.key) <= 0 {
1780 1 : break
1781 : }
1782 : }
1783 1 : return i.Valid()
1784 : }
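// Hedged sketch, not part of the original source: decoding the user key
// stored at a restart point, as the search predicate above does. At a restart
// the shared length is always zero and its varint encoding occupies exactly
// one byte, so it can be skipped. exampleKeyAtRestart is a hypothetical
// helper.
func exampleKeyAtRestart(blk []byte, restartOffset int32) []byte {
	ptr := unsafe.Pointer(uintptr(unsafe.Pointer(&blk[0])) + uintptr(restartOffset+1))
	unshared, ptr := decodeVarint(ptr)
	_, ptr = decodeVarint(ptr) // skip the value length
	return getBytes(ptr, int(unshared))
}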
1785 :
1786 : // First implements internalIterator.First, as documented in the pebble
1787 : // package.
1788 2 : func (i *RawIter) First() bool {
1789 2 : i.offset = 0
1790 2 : i.loadEntry()
1791 2 : return i.Valid()
1792 2 : }
1793 :
1794 : // Last implements internalIterator.Last, as documented in the pebble package.
1795 1 : func (i *RawIter) Last() bool {
1796 1 : // Seek forward from the last restart point.
1797 1 : i.offset = int32(binary.LittleEndian.Uint32(i.data[i.restarts+4*(i.numRestarts-1):]))
1798 1 :
1799 1 : i.readEntry()
1800 1 : i.clearCache()
1801 1 : i.cacheEntry()
1802 1 :
1803 1 : for i.nextOffset < i.restarts {
1804 1 : i.offset = i.nextOffset
1805 1 : i.readEntry()
1806 1 : i.cacheEntry()
1807 1 : }
1808 :
1809 1 : i.ikey.UserKey = i.key
1810 1 : return i.Valid()
1811 : }
1812 :
1813 : // Next implements internalIterator.Next, as documented in the pebble
1814 : // package.
1815 2 : func (i *RawIter) Next() bool {
1816 2 : i.offset = i.nextOffset
1817 2 : if !i.Valid() {
1818 2 : return false
1819 2 : }
1820 2 : i.loadEntry()
1821 2 : return true
1822 : }
1823 :
1824 : // Prev implements internalIterator.Prev, as documented in the pebble
1825 : // package.
1826 1 : func (i *RawIter) Prev() bool {
1827 1 : if n := len(i.cached) - 1; n > 0 && i.cached[n].offset == i.offset {
1828 1 : i.nextOffset = i.offset
1829 1 : e := &i.cached[n-1]
1830 1 : i.offset = e.offset
1831 1 : i.val = getBytes(unsafe.Pointer(uintptr(i.ptr)+uintptr(e.valStart)), int(e.valSize))
1832 1 : i.ikey.UserKey = i.cachedBuf[e.keyStart:e.keyEnd]
1833 1 : i.cached = i.cached[:n]
1834 1 : return true
1835 1 : }
1836 :
1837 1 : if i.offset == 0 {
1838 1 : i.offset = -1
1839 1 : i.nextOffset = 0
1840 1 : return false
1841 1 : }
1842 :
1843 0 : targetOffset := i.offset
1844 0 : index := sort.Search(int(i.numRestarts), func(j int) bool {
1845 0 : offset := int32(binary.LittleEndian.Uint32(i.data[int(i.restarts)+4*j:]))
1846 0 : return offset >= targetOffset
1847 0 : })
1848 0 : i.offset = 0
1849 0 : if index > 0 {
1850 0 : i.offset = int32(binary.LittleEndian.Uint32(i.data[int(i.restarts)+4*(index-1):]))
1851 0 : }
1852 :
1853 0 : i.readEntry()
1854 0 : i.clearCache()
1855 0 : i.cacheEntry()
1856 0 :
1857 0 : for i.nextOffset < targetOffset {
1858 0 : i.offset = i.nextOffset
1859 0 : i.readEntry()
1860 0 : i.cacheEntry()
1861 0 : }
1862 :
1863 0 : i.ikey.UserKey = i.key
1864 0 : return true
1865 : }
1866 :
1867 : // Key implements internalIterator.Key, as documented in the pebble package.
1868 2 : func (i *RawIter) Key() base.InternalKey {
1869 2 : return i.ikey
1870 2 : }
1871 :
1872 : // Value implements internalIterator.Value, as documented in the pebble
1873 : // package.
1874 2 : func (i *RawIter) Value() []byte {
1875 2 : return i.val
1876 2 : }
1877 :
1878 : // Valid implements internalIterator.Valid, as documented in the pebble
1879 : // package.
1880 2 : func (i *RawIter) Valid() bool {
1881 2 : return i.offset >= 0 && i.offset < i.restarts
1882 2 : }
1883 :
1884 : // Error implements internalIterator.Error, as documented in the pebble
1885 : // package.
1886 0 : func (i *RawIter) Error() error {
1887 0 : return nil
1888 0 : }
1889 :
1890 : // Close implements internalIterator.Close, as documented in the pebble
1891 : // package.
1892 2 : func (i *RawIter) Close() error {
1893 2 : i.val = nil
1894 2 : return nil
1895 2 : }
1896 :
1897 : // DebugTree is part of the InternalIterator interface.
1898 0 : func (i *RawIter) DebugTree(tp treeprinter.Node) {
1899 0 : tp.Childf("%T(%p)", i, i)
1900 0 : }
1901 :
1902 1 : func (i *RawIter) getRestart(idx int) int32 {
1903 1 : return int32(binary.LittleEndian.Uint32(i.data[i.restarts+4*int32(idx):]))
1904 1 : }
1905 :
1906 1 : func (i *RawIter) isRestartPoint() bool {
1907 1 : j := sort.Search(int(i.numRestarts), func(j int) bool {
1908 1 : return i.getRestart(j) >= i.offset
1909 1 : })
1910 1 : return j < int(i.numRestarts) && i.getRestart(j) == i.offset
1911 : }
1912 :
1913 : // Describe describes the contents of a block, writing the description to w.
1914 : // It invokes fmtKV to describe each key-value pair.
1915 1 : func (i *RawIter) Describe(tp treeprinter.Node, fmtKV DescribeKV) {
1916 1 : var buf bytes.Buffer
1917 1 : for valid := i.First(); valid; valid = i.Next() {
1918 1 : enc := KVEncoding{
1919 1 : IsRestart: i.isRestartPoint(),
1920 1 : Offset: i.offset,
1921 1 : Length: int32(i.nextOffset - i.offset),
1922 1 : }
1923 1 : ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(i.offset))
1924 1 : enc.KeyShared, ptr = decodeVarint(ptr)
1925 1 : enc.KeyUnshared, ptr = decodeVarint(ptr)
1926 1 : enc.ValueLen, _ = decodeVarint(ptr)
1927 1 : buf.Reset()
1928 1 : fmtKV(&buf, &i.ikey, i.val, enc)
1929 1 : if i.isRestartPoint() {
1930 1 : buf.WriteString(" [restart]")
1931 1 : }
1932 1 : tp.Child(buf.String())
1933 : }
1934 1 : n := tp.Child("restart points")
1935 1 : // Format the restart points.
1936 1 : for j := 0; j < int(i.numRestarts); j++ {
1937 1 : offset := i.getRestart(j)
1938 1 : n.Childf("%05d [restart %d]", uint64(i.restarts+4*int32(j)), offset)
1939 1 : }
1940 : }
1941 :
1942 2 : func getBytes(ptr unsafe.Pointer, length int) []byte {
1943 2 : return (*[manual.MaxArrayLen]byte)(ptr)[:length:length]
1944 2 : }
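// Hedged note, not part of the original source: on Go 1.17 and later the
// array cast above is equivalent to unsafe.Slice, which also yields a slice
// with len == cap == length. exampleGetBytes is a hypothetical name.
func exampleGetBytes(ptr unsafe.Pointer, length int) []byte {
	return unsafe.Slice((*byte)(ptr), length)
}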
1945 :
1946 2 : func decodeVarint(ptr unsafe.Pointer) (uint32, unsafe.Pointer) {
1947 2 : if a := *((*uint8)(ptr)); a < 128 {
1948 2 : return uint32(a),
1949 2 : unsafe.Pointer(uintptr(ptr) + 1)
1950 2 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
1951 1 : return uint32(b)<<7 | uint32(a),
1952 1 : unsafe.Pointer(uintptr(ptr) + 2)
1953 1 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
1954 1 : return uint32(c)<<14 | uint32(b)<<7 | uint32(a),
1955 1 : unsafe.Pointer(uintptr(ptr) + 3)
1956 1 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
1957 1 : return uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a),
1958 1 : unsafe.Pointer(uintptr(ptr) + 4)
1959 1 : } else {
1960 1 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
1961 1 : return uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a),
1962 1 : unsafe.Pointer(uintptr(ptr) + 5)
1963 1 : }
1964 : }
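// Hedged sketch, not part of the original source: for well-formed input that
// fits in 32 bits, decodeVarint agrees with the standard library decoder; the
// pointer-based, manually unrolled version above trades that generality for
// fewer bounds checks. exampleDecodeVarint is a hypothetical helper.
func exampleDecodeVarint(b []byte) (uint32, int) {
	v, n := binary.Uvarint(b)
	return uint32(v), n
}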