Line data Source code
1 : // Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package rowblk
6 :
7 : import (
8 : "bytes"
9 : "context"
10 : "encoding/binary"
11 : "fmt"
12 : "io"
13 : "slices"
14 : "sort"
15 : "unsafe"
16 :
17 : "github.com/cockroachdb/errors"
18 : "github.com/cockroachdb/pebble/internal/base"
19 : "github.com/cockroachdb/pebble/internal/invariants"
20 : "github.com/cockroachdb/pebble/internal/manual"
21 : "github.com/cockroachdb/pebble/internal/treeprinter"
22 : "github.com/cockroachdb/pebble/sstable/block"
23 : )
24 :
25 : // Iter is an iterator over a single block of data.
26 : //
27 : // An Iter provides an additional guarantee around key stability when a block
28 : // has a restart interval of 1 (i.e. when there is no prefix compression). Key
29 : // stability refers to whether the InternalKey.UserKey bytes returned by a
30 : // positioning call will remain stable after a subsequent positioning call. The
31 : // normal case is that a positioning call will invalidate any previously
32 : // returned InternalKey.UserKey. If a block has a restart interval of 1 (no
33 : // prefix compression), Iter guarantees that InternalKey.UserKey will point to
34 : // the key as stored in the block itself which will remain valid until the Iter
35 : // is closed. The key stability guarantee is used by the range tombstone and
36 : // range key code, which knows that the respective blocks are always encoded
37 : // with a restart interval of 1. This per-block key stability guarantee is
38 : // sufficient for range tombstones and range keys as they are always encoded
39 : // in a single block. Note: this stability guarantee no longer holds for a block
40 : // iter with synthetic prefix/suffix replacement, but we don't use the synthetic
41 : // suffix/prefix functionality of Iter for range keys.
42 : //
43 : // An Iter also provides a value stability guarantee for range deletions and
44 : // range keys since there is only a single range deletion and range key block
45 : // per sstable and the Iter will not release the bytes for the block until it is
46 : // closed.
47 : //
48 : // Note on why Iter knows about lazyValueHandling:
49 : //
50 : // Iter's positioning functions (that return a LazyValue) are too
51 : // complex to inline even prior to lazyValueHandling. Iter.Next and
52 : // Iter.First were by far the cheapest and had costs 195 and 180
53 : // respectively, which exceed the budget of 80. We initially tried to keep
54 : // the lazyValueHandling logic out of Iter by wrapping it with a
55 : // lazyValueDataBlockIter. singleLevelIter and twoLevelIter would use this
56 : // wrapped iter. The functions in lazyValueDataBlockIter were simple, in that
57 : // they called the corresponding Iter func and then decided whether the
58 : // value was in fact in-place (so return immediately) or needed further
59 : // handling. But these also turned out too costly for mid-stack inlining since
60 : // simple calls like the following have a high cost that is barely under the
61 : // budget of 80
62 : //
63 : // k, v := i.data.SeekGE(key, flags) // cost 74
64 : // k, v := i.data.Next() // cost 72
65 : //
66 : // We have 2 options for minimizing performance regressions:
67 : // - Include the lazyValueHandling logic in the already non-inlineable
68 : // Iter functions: Since most of the time is spent in data block iters,
69 : // it is acceptable to take the small hit of unnecessary branching (which
70 : // hopefully branch prediction will predict correctly) for other kinds of
71 : // blocks.
72 : // - Duplicate the logic of singleLevelIterator and twoLevelIterator for the
73 : // v3 sstable and only use the aforementioned lazyValueDataBlockIter for a
74 : // v3 sstable. We would want to manage these copies via code generation.
75 : //
76 : // We have picked the first option here.
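//
// A minimal usage sketch (illustrative only; it assumes base.DefaultComparer,
// a zero-value block.IterTransforms, and a hypothetical serializedBlock, and
// elides error handling):
//
//	it, err := NewIter(base.DefaultComparer.Compare, base.DefaultComparer.Split,
//		serializedBlock, block.IterTransforms{})
//	if err != nil {
//		// handle block corruption
//	}
//	for kv := it.First(); kv != nil; kv = it.Next() {
//		// kv.K.UserKey is only guaranteed to remain valid across positioning
//		// calls when the block was written with a restart interval of 1.
//	}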
77 : type Iter struct {
78 : cmp base.Compare
79 : split base.Split
80 :
81 : // Iterator transforms.
82 : //
83 : // SyntheticSuffix, if set, will replace the decoded ikey.UserKey suffix
84 : // before the key is returned to the user. A sequence of iter operations on a
85 : // block with a syntheticSuffix rule should return keys as if those operations
86 : // ran on a block with keys that all had the syntheticSuffix. As an example:
87 : // any sequence of block iter cmds should return the same keys for the
88 : // following two blocks:
89 : //
90 : // blockA: a@3,b@3,c@3
91 : // blockB: a@1,b@2,c@1 with syntheticSuffix=3
92 : //
93 : // To ensure this, suffix replacement will not change the ordering of keys in
94 : // the block because the iter assumes that no two keys in the block share the
95 : // same prefix. Furthermore, during SeekGE and SeekLT operations, the block
96 : // iterator handles "off by one" errors (explained in more detail in those
97 : // functions) when, for a given key, originalSuffix < searchSuffix <
98 : // replacementSuffix, with integer comparison. To handle these cases, the
99 : // iterator assumes:
100 : //
101 : // pebble.Compare(keyPrefix{replacementSuffix},keyPrefix{originalSuffix}) < 0
102 : // for keys with a suffix.
103 : //
104 : // NB: it is possible for a block iter to add a synthetic suffix on a key
105 : // without a suffix, which implies
106 : // pebble.Compare(keyPrefix{replacementSuffix},keyPrefix{noSuffix}) > 0 ,
107 : // however, the iterator would never need to handle an off by one error in
108 : // this case since originalSuffix (empty) > searchSuffix (non empty), with
109 : // integer comparison.
110 : //
111 : //
112 : // In addition, we also assume that any block with rangekeys will not contain
113 : // a synthetic suffix.
114 : transforms block.IterTransforms
115 :
116 : // offset is the byte index that marks where the current key/value is
117 : // encoded in the block.
118 : offset int32
119 : // nextOffset is the byte index where the next key/value is encoded in the
120 : // block.
121 : nextOffset int32
122 : // A "restart point" in a block is a point where the full key is encoded,
123 : // instead of just having a suffix of the key encoded. See readEntry() for
124 : // how prefix compression of keys works. Keys in between two restart points
125 : // only have a suffix encoded in the block. When restart interval is 1, no
126 : // prefix compression of keys happens. This is the case with range tombstone
127 : // blocks.
128 : //
129 : // All restart offsets are listed in increasing order in
130 : // i.ptr[i.restarts:len(block)-4], while numRestarts is encoded in the last
131 : // 4 bytes of the block as a uint32 (i.ptr[len(block)-4:]). i.restarts can
132 : // therefore be seen as the point where data in the block ends, and a list
133 : // of offsets of all restart points begins.
134 : restarts int32
135 : // Number of restart points in this block. Encoded at the end of the block
136 : // as a uint32.
137 : numRestarts int32
138 : ptr unsafe.Pointer
139 : data []byte
140 : // key contains the raw key the iterator is currently pointed at. This may
141 : // point directly to data stored in the block (for a key which has no prefix
142 : // compression), to fullKey (for a prefix compressed key), or to a slice of
143 : // data stored in cachedBuf (during reverse iteration).
144 : //
145 : // NB: In general, key contains the same logical content as ikey
146 : // (i.e. ikey = decode(key)), but if the iterator contains a synthetic suffix
147 : // replacement rule, this will not be the case. Therefore, key should never
148 : // be used after ikey is set.
149 : key []byte
150 : // fullKey is a buffer used for key prefix decompression. Note that if
151 : // transforms.SyntheticPrefix is not nil, fullKey always starts with that
152 : // prefix.
153 : fullKey []byte
154 : // val contains the value the iterator is currently pointed at. If non-nil,
155 : // this points to a slice of the block data.
156 : val []byte
157 : // ikv contains the decoded internal KV the iterator is currently positioned
158 : // at.
159 : //
160 : // ikv.InternalKey contains the decoded InternalKey the iterator is
161 : // currently pointed at. Note that the memory backing ikv.UserKey is either
162 : // data stored directly in the block, fullKey, or cachedBuf. The key
163 : // stability guarantee for blocks built with a restart interval of 1 is
164 : // achieved by having ikv.UserKey always point to data stored directly in
165 : // the block.
166 : //
167 : // ikv.LazyValue is val turned into a LazyValue, whenever a positioning
168 : // method returns a non-nil key-value pair.
169 : ikv base.InternalKV
170 : // cached and cachedBuf are used during reverse iteration. They are needed
171 : // because we can't perform prefix decoding in reverse, only in the forward
172 : // direction. In order to iterate in reverse, we decode and cache the entries
173 : // between two restart points.
174 : //
175 : // Note that cached[len(cached)-1] contains the previous entry to the one the
176 : // blockIter is currently pointed at. As usual, nextOffset will contain the
177 : // offset of the next entry. During reverse iteration, nextOffset will be
178 : // updated to point to offset, and we'll set the blockIter to point at the
179 : // entry cached[len(cached)-1]. See Prev() for more details.
180 : //
181 : // For a block encoded with a restart interval of 1, cached and cachedBuf
182 : // will not be used as there are no prefix compressed entries between the
183 : // restart points.
184 : cached []blockEntry
185 : cachedBuf []byte
186 : handle block.BufferHandle
187 : // firstUserKey is the first user key in the block, retained for block iteration over already-loaded blocks.
188 : firstUserKey []byte
189 : lazyValueHandling struct {
190 : getValue block.GetLazyValueForPrefixAndValueHandler
191 : hasValuePrefix bool
192 : }
193 : synthSuffixBuf []byte
194 : firstUserKeyWithPrefixBuf []byte
195 : }
196 :
197 : type blockEntry struct {
198 : offset int32
199 : keyStart int32
200 : keyEnd int32
201 : valStart int32
202 : valSize int32
203 : }
204 :
205 : // Iter implements the base.InternalIterator interface.
206 : var _ base.InternalIterator = (*Iter)(nil)
207 :
208 : // NewIter constructs a new row-oriented block iterator over the provided serialized block.
209 : func NewIter(
210 : cmp base.Compare, split base.Split, block []byte, transforms block.IterTransforms,
211 1 : ) (*Iter, error) {
212 1 : i := &Iter{}
213 1 : return i, i.Init(cmp, split, block, transforms)
214 1 : }
215 :
216 : // String implements fmt.Stringer.
217 0 : func (i *Iter) String() string {
218 0 : return "block"
219 0 : }
220 :
221 : // Init initializes the block iterator from the provided block.
222 : func (i *Iter) Init(
223 : cmp base.Compare, split base.Split, blk []byte, transforms block.IterTransforms,
224 1 : ) error {
225 1 : numRestarts := int32(binary.LittleEndian.Uint32(blk[len(blk)-4:]))
226 1 : if numRestarts == 0 {
227 0 : return base.CorruptionErrorf("pebble/table: invalid table (block has no restart points)")
228 0 : }
229 1 : i.transforms = transforms
230 1 : i.synthSuffixBuf = i.synthSuffixBuf[:0]
231 1 : i.split = split
232 1 : i.cmp = cmp
233 1 : i.restarts = int32(len(blk)) - 4*(1+numRestarts)
234 1 : i.numRestarts = numRestarts
235 1 : i.ptr = unsafe.Pointer(&blk[0])
236 1 : i.data = blk
237 1 : if i.transforms.SyntheticPrefix.IsSet() {
238 1 : i.fullKey = append(i.fullKey[:0], i.transforms.SyntheticPrefix...)
239 1 : } else {
240 1 : i.fullKey = i.fullKey[:0]
241 1 : }
242 1 : i.val = nil
243 1 : i.clearCache()
244 1 : if i.restarts > 0 {
245 1 : if err := i.readFirstKey(); err != nil {
246 0 : return err
247 0 : }
248 1 : } else {
249 1 : // Block is empty.
250 1 : i.firstUserKey = nil
251 1 : }
252 1 : return nil
253 : }
254 :
255 : // InitHandle initializes an iterator from the provided block handle.
256 : // NB: two cases of hideObsoletePoints:
257 : // - Local sstable iteration: syntheticSeqNum will be set iff the sstable was
258 : // ingested.
259 : // - Foreign sstable iteration: syntheticSeqNum is always set.
260 : func (i *Iter) InitHandle(
261 : cmp base.Compare, split base.Split, block block.BufferHandle, transforms block.IterTransforms,
262 1 : ) error {
263 1 : i.handle.Release()
264 1 : i.handle = block
265 1 : return i.Init(cmp, split, block.Get(), transforms)
266 1 : }
267 :
268 : // SetHasValuePrefix sets whether or not the block iterator should expect values
269 : // corresponding to Set keys to have a prefix byte.
270 1 : func (i *Iter) SetHasValuePrefix(hasValuePrefix bool) {
271 1 : i.lazyValueHandling.hasValuePrefix = hasValuePrefix
272 1 : }
273 :
274 : // SetGetLazyValuer sets the value block reader the iterator should use to get
275 : // lazy values when the value encodes a value prefix.
276 1 : func (i *Iter) SetGetLazyValuer(g block.GetLazyValueForPrefixAndValueHandler) {
277 1 : i.lazyValueHandling.getValue = g
278 1 :
279 1 : }
280 :
281 : // Handle returns the underlying block buffer handle, if the iterator was
282 : // initialized with one.
283 1 : func (i *Iter) Handle() block.BufferHandle {
284 1 : return i.handle
285 1 : }
286 :
287 : // Invalidate invalidates the block iterator, removing references to the block
288 : // it was initialized with.
289 1 : func (i *Iter) Invalidate() {
290 1 : i.clearCache()
291 1 : i.offset = 0
292 1 : i.nextOffset = 0
293 1 : i.restarts = 0
294 1 : i.numRestarts = 0
295 1 : i.data = nil
296 1 : }
297 :
298 : // IsDataInvalidated returns true when the blockIter has been invalidated
299 : // using an Invalidate call. NB: this is different from blockIter.Valid
300 : // which is part of the InternalIterator implementation.
301 1 : func (i *Iter) IsDataInvalidated() bool {
302 1 : return i.data == nil
303 1 : }
304 :
305 : // ResetForReuse resets the blockIter for reuse, retaining buffers to avoid
306 : // future allocations.
307 1 : func (i *Iter) ResetForReuse() Iter {
308 1 : return Iter{
309 1 : fullKey: i.fullKey[:0],
310 1 : cached: i.cached[:0],
311 1 : cachedBuf: i.cachedBuf[:0],
312 1 : firstUserKeyWithPrefixBuf: i.firstUserKeyWithPrefixBuf[:0],
313 1 : data: nil,
314 1 : }
315 1 : }
316 :
317 1 : func (i *Iter) readEntry() {
318 1 : ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(i.offset))
319 1 :
320 1 : // This is an ugly performance hack. Reading entries from blocks is one of
321 1 : // the inner-most routines and decoding the 3 varints per-entry takes
322 1 : // significant time. Neither go1.11 nor go1.12 will inline decodeVarint for
323 1 : // us, so we do it manually. This provides a 10-15% performance improvement
324 1 : // on blockIter benchmarks on both go1.11 and go1.12.
325 1 : //
326 1 : // TODO(peter): remove this hack if go:inline is ever supported.
327 1 :
328 1 : var shared uint32
329 1 : if a := *((*uint8)(ptr)); a < 128 {
330 1 : shared = uint32(a)
331 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
332 1 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
333 0 : shared = uint32(b)<<7 | uint32(a)
334 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
335 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
336 0 : shared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
337 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
338 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
339 0 : shared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
340 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
341 0 : } else {
342 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
343 0 : shared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
344 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
345 0 : }
346 :
347 1 : var unshared uint32
348 1 : if a := *((*uint8)(ptr)); a < 128 {
349 1 : unshared = uint32(a)
350 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
351 1 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
352 0 : unshared = uint32(b)<<7 | uint32(a)
353 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
354 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
355 0 : unshared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
356 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
357 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
358 0 : unshared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
359 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
360 0 : } else {
361 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
362 0 : unshared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
363 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
364 0 : }
365 :
366 1 : var value uint32
367 1 : if a := *((*uint8)(ptr)); a < 128 {
368 1 : value = uint32(a)
369 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
370 1 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
371 1 : value = uint32(b)<<7 | uint32(a)
372 1 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
373 1 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
374 1 : value = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
375 1 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
376 1 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
377 1 : value = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
378 1 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
379 1 : } else {
380 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
381 0 : value = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
382 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
383 0 : }
384 1 : shared += uint32(len(i.transforms.SyntheticPrefix))
385 1 : unsharedKey := getBytes(ptr, int(unshared))
386 1 : // TODO(sumeer): move this into the else block below.
387 1 : i.fullKey = append(i.fullKey[:shared], unsharedKey...)
388 1 : if shared == 0 {
389 1 : // Provide stability for the key across positioning calls if the key
390 1 : // doesn't share a prefix with the previous key. This avoids requiring the
391 1 : // key to be copied if the caller knows the block has a restart interval of
392 1 : // 1. An important example of this is range-del blocks.
393 1 : i.key = unsharedKey
394 1 : } else {
395 1 : i.key = i.fullKey
396 1 : }
397 1 : ptr = unsafe.Pointer(uintptr(ptr) + uintptr(unshared))
398 1 : i.val = getBytes(ptr, int(value))
399 1 : i.nextOffset = int32(uintptr(ptr)-uintptr(i.ptr)) + int32(value)
400 : }
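// For reference, each block entry that readEntry parses is laid out as three
// varints (shared key length, unshared key length, value length) followed by
// the unshared key bytes and then the value bytes. A non-inlined sketch of the
// same decoding using encoding/binary (illustrative only; the manual version
// above exists purely to avoid the cost of the non-inlined varint calls):
//
//	shared, n1 := binary.Uvarint(i.data[i.offset:])
//	unshared, n2 := binary.Uvarint(i.data[int(i.offset)+n1:])
//	valueLen, n3 := binary.Uvarint(i.data[int(i.offset)+n1+n2:])
//	keyStart := int(i.offset) + n1 + n2 + n3
//	keySuffix := i.data[keyStart : keyStart+int(unshared)]
//	value := i.data[keyStart+int(unshared) : keyStart+int(unshared)+int(valueLen)]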
401 :
402 1 : func (i *Iter) readFirstKey() error {
403 1 : ptr := i.ptr
404 1 :
405 1 : // This is an ugly performance hack. Reading entries from blocks is one of
406 1 : // the inner-most routines and decoding the 3 varints per-entry takes
407 1 : // significant time. Neither go1.11 nor go1.12 will inline decodeVarint for
408 1 : // us, so we do it manually. This provides a 10-15% performance improvement
409 1 : // on blockIter benchmarks on both go1.11 and go1.12.
410 1 : //
411 1 : // TODO(peter): remove this hack if go:inline is ever supported.
412 1 :
413 1 : if shared := *((*uint8)(ptr)); shared == 0 {
414 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
415 1 : } else {
416 0 : // The shared length is != 0, which is invalid.
417 0 : panic("first key in block must have zero shared length")
418 : }
419 :
420 1 : var unshared uint32
421 1 : if a := *((*uint8)(ptr)); a < 128 {
422 1 : unshared = uint32(a)
423 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
424 1 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
425 0 : unshared = uint32(b)<<7 | uint32(a)
426 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
427 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
428 0 : unshared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
429 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
430 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
431 0 : unshared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
432 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
433 0 : } else {
434 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
435 0 : unshared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
436 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
437 0 : }
438 :
439 : // Skip the value length.
440 1 : if a := *((*uint8)(ptr)); a < 128 {
441 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
442 1 : } else if a := *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); a < 128 {
443 1 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
444 1 : } else if a := *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); a < 128 {
445 1 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
446 1 : } else if a := *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); a < 128 {
447 1 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
448 1 : } else {
449 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
450 0 : }
451 :
452 1 : firstKey := getBytes(ptr, int(unshared))
453 1 : // Manually inlining base.DecodeInternalKey provides a 5-10% speedup on
454 1 : // BlockIter benchmarks.
455 1 : if n := len(firstKey) - 8; n >= 0 {
456 1 : i.firstUserKey = firstKey[:n:n]
457 1 : } else {
458 0 : i.firstUserKey = nil
459 0 : return base.CorruptionErrorf("pebble/table: invalid firstKey in block")
460 0 : }
461 1 : if i.transforms.SyntheticPrefix != nil {
462 1 : i.firstUserKeyWithPrefixBuf = slices.Grow(i.firstUserKeyWithPrefixBuf[:0], len(i.transforms.SyntheticPrefix)+len(i.firstUserKey))
463 1 : i.firstUserKeyWithPrefixBuf = append(i.firstUserKeyWithPrefixBuf, i.transforms.SyntheticPrefix...)
464 1 : i.firstUserKeyWithPrefixBuf = append(i.firstUserKeyWithPrefixBuf, i.firstUserKey...)
465 1 : i.firstUserKey = i.firstUserKeyWithPrefixBuf
466 1 : }
467 1 : return nil
468 : }
469 :
470 1 : func (i *Iter) decodeInternalKey(key []byte) (hiddenPoint bool) {
471 1 : // Manually inlining base.DecodeInternalKey provides a 5-10% speedup on
472 1 : // BlockIter benchmarks.
473 1 : if n := len(key) - 8; n >= 0 {
474 1 : trailer := base.InternalKeyTrailer(binary.LittleEndian.Uint64(key[n:]))
475 1 : hiddenPoint = i.transforms.HideObsoletePoints &&
476 1 : (trailer&TrailerObsoleteBit != 0)
477 1 : i.ikv.K.Trailer = trailer & TrailerObsoleteMask
478 1 : i.ikv.K.UserKey = key[:n:n]
479 1 : if n := i.transforms.SyntheticSeqNum; n != 0 {
480 1 : i.ikv.K.SetSeqNum(base.SeqNum(n))
481 1 : }
482 1 : } else {
483 1 : i.ikv.K.Trailer = base.InternalKeyTrailer(base.InternalKeyKindInvalid)
484 1 : i.ikv.K.UserKey = nil
485 1 : }
486 1 : return hiddenPoint
487 : }
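// For reference, the manual decoding above mirrors base.DecodeInternalKey: an
// encoded internal key is the user key followed by an 8-byte little-endian
// trailer packing the sequence number and kind. A non-inlined sketch of the
// equivalent logic (illustrative only):
//
//	ik := base.DecodeInternalKey(key)
//	hiddenPoint := i.transforms.HideObsoletePoints && ik.Trailer&TrailerObsoleteBit != 0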
488 :
489 : // maybeReplaceSuffix replaces the suffix in i.ikey.UserKey with
490 : // i.transforms.syntheticSuffix.
491 1 : func (i *Iter) maybeReplaceSuffix() {
492 1 : if i.transforms.SyntheticSuffix.IsSet() && i.ikv.K.UserKey != nil {
493 1 : prefixLen := i.split(i.ikv.K.UserKey)
494 1 : // If ikey is cached or may get cached, we must copy
495 1 : // UserKey to a new buffer before suffix replacement.
496 1 : i.synthSuffixBuf = append(i.synthSuffixBuf[:0], i.ikv.K.UserKey[:prefixLen]...)
497 1 : i.synthSuffixBuf = append(i.synthSuffixBuf, i.transforms.SyntheticSuffix...)
498 1 : i.ikv.K.UserKey = i.synthSuffixBuf
499 1 : }
500 : }
501 :
502 1 : func (i *Iter) clearCache() {
503 1 : i.cached = i.cached[:0]
504 1 : i.cachedBuf = i.cachedBuf[:0]
505 1 : }
506 :
507 1 : func (i *Iter) cacheEntry() {
508 1 : var valStart int32
509 1 : valSize := int32(len(i.val))
510 1 : if valSize > 0 {
511 1 : valStart = int32(uintptr(unsafe.Pointer(&i.val[0])) - uintptr(i.ptr))
512 1 : }
513 :
514 1 : i.cached = append(i.cached, blockEntry{
515 1 : offset: i.offset,
516 1 : keyStart: int32(len(i.cachedBuf)),
517 1 : keyEnd: int32(len(i.cachedBuf) + len(i.key)),
518 1 : valStart: valStart,
519 1 : valSize: valSize,
520 1 : })
521 1 : i.cachedBuf = append(i.cachedBuf, i.key...)
522 : }
523 :
524 : // FirstUserKey returns the user key of the first key in the block.
525 1 : func (i *Iter) FirstUserKey() []byte {
526 1 : return i.firstUserKey
527 1 : }
528 :
529 : // SeekGE implements internalIterator.SeekGE, as documented in the pebble
530 : // package.
531 1 : func (i *Iter) SeekGE(key []byte, flags base.SeekGEFlags) *base.InternalKV {
532 1 : if invariants.Enabled && i.IsDataInvalidated() {
533 0 : panic(errors.AssertionFailedf("invalidated blockIter used"))
534 : }
535 1 : searchKey := key
536 1 : if i.transforms.SyntheticPrefix != nil {
537 1 : if !bytes.HasPrefix(key, i.transforms.SyntheticPrefix) {
538 1 : // The seek key is before or after the entire block of keys that start
539 1 : // with SyntheticPrefix. To determine which, we need to compare against a
540 1 : // valid key in the block. We use firstUserKey which has the synthetic
541 1 : // prefix.
542 1 : if i.cmp(i.firstUserKey, key) >= 0 {
543 1 : return i.First()
544 1 : }
545 : // Set the offset to the end of the block to mimic the offset of an
546 : // invalid iterator. This ensures a subsequent i.Prev() returns a valid
547 : // result.
548 1 : i.offset = i.restarts
549 1 : i.nextOffset = i.restarts
550 1 : return nil
551 : }
552 1 : searchKey = key[len(i.transforms.SyntheticPrefix):]
553 : }
554 :
555 1 : i.clearCache()
556 1 : // Find the index of the smallest restart point whose key is >= the key
557 1 : // sought; index will be numRestarts if there is no such restart point.
558 1 : i.offset = 0
559 1 : var index int32
560 1 :
561 1 : {
562 1 : // NB: manually inlined sort.Search is ~5% faster.
563 1 : //
564 1 : // Define f(-1) == false and f(n) == true.
565 1 : // Invariant: f(index-1) == false, f(upper) == true.
566 1 : upper := i.numRestarts
567 1 : for index < upper {
568 1 : h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
569 1 : // index ≤ h < upper
570 1 : offset := decodeRestart(i.data[i.restarts+4*h:])
571 1 : // For a restart point, there are 0 bytes shared with the previous key.
572 1 : // The varint encoding of 0 occupies 1 byte.
573 1 : ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(offset+1))
574 1 :
575 1 : // Decode the key at that restart point, and compare it to the key
576 1 : // sought. See the comment in readEntry for why we manually inline the
577 1 : // varint decoding.
578 1 : var v1 uint32
579 1 : if a := *((*uint8)(ptr)); a < 128 {
580 1 : v1 = uint32(a)
581 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
582 1 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
583 0 : v1 = uint32(b)<<7 | uint32(a)
584 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
585 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
586 0 : v1 = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
587 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
588 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
589 0 : v1 = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
590 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
591 0 : } else {
592 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
593 0 : v1 = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
594 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
595 0 : }
596 :
597 1 : if *((*uint8)(ptr)) < 128 {
598 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
599 1 : } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))) < 128 {
600 1 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
601 1 : } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))) < 128 {
602 1 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
603 1 : } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))) < 128 {
604 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
605 0 : } else {
606 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
607 0 : }
608 :
609 : // Manually inlining part of base.DecodeInternalKey provides a 5-10%
610 : // speedup on BlockIter benchmarks.
611 1 : s := getBytes(ptr, int(v1))
612 1 : var k []byte
613 1 : if n := len(s) - 8; n >= 0 {
614 1 : k = s[:n:n]
615 1 : }
616 : // Else k is invalid, and left as nil
617 :
618 1 : if i.cmp(searchKey, k) > 0 {
619 1 : // The search key is greater than the user key at this restart point.
620 1 : // Search beyond this restart point, since we are trying to find the
621 1 : // first restart point with a user key >= the search key.
622 1 : index = h + 1 // preserves f(i-1) == false
623 1 : } else {
624 1 : // k >= search key, so prune everything after index (since index
625 1 : // satisfies the property we are looking for).
626 1 : upper = h // preserves f(j) == true
627 1 : }
628 : }
629 : // index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
630 : // => answer is index.
631 : }
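// The manually inlined binary search above is equivalent to the following
// sort.Search formulation (sketch only; decodeRestartUserKey stands in for the
// inline restart-point key decoding and is not a real helper):
//
//	index = int32(sort.Search(int(i.numRestarts), func(j int) bool {
//		return i.cmp(searchKey, decodeRestartUserKey(j)) <= 0
//	}))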
632 :
633 : // index is the first restart point with key >= search key. Define the keys
634 : // between a restart point and the next restart point as belonging to that
635 : // restart point.
636 : //
637 : // Since keys are strictly increasing, if index > 0 then the restart point
638 : // at index-1 will be the first one that has some keys belonging to it that
639 : // could be equal to the search key. If index == 0, then all keys in this
640 : // block are larger than the key sought, and offset remains at zero.
641 1 : if index > 0 {
642 1 : i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
643 1 : }
644 1 : i.readEntry()
645 1 : hiddenPoint := i.decodeInternalKey(i.key)
646 1 :
647 1 : // Iterate from that restart point to somewhere >= the key sought.
648 1 : if !i.Valid() {
649 1 : return nil
650 1 : }
651 :
652 : // A note on seeking in a block with a suffix replacement rule: even though
653 : // the binary search above was conducted on keys without suffix replacement,
654 : // Seek will still return the correct suffix replaced key. A binary
655 : // search without suffix replacement will land on a key that is _less_ than
656 : // the key the search would have landed on if all keys were already suffix
657 : // replaced. Since Seek then conducts forward iteration to the first suffix
658 : // replaced user key that is greater than or equal to the search key, the
659 : // correct key is still returned.
660 : //
661 : // As an example, consider the following block with a restart interval of 1,
662 : // with a replacement suffix of "4":
663 : // - Pre-suffix replacement: apple@1, banana@3
664 : // - Post-suffix replacement: apple@4, banana@4
665 : //
666 : // Suppose the client seeks with apple@3. Assuming suffixes sort in reverse
667 : // chronological order (i.e. apple@1>apple@3), the binary search without
668 : // suffix replacement would return apple@1. A binary search with suffix
669 : // replacement would return banana@4. After beginning forward iteration from
670 : // either returned restart point, forward iteration would
671 : // always return the correct key, banana@4.
672 : //
673 : // Further, if the user searched with apple@0 (i.e. a suffix less than the
674 : // pre replacement suffix) or with apple@5 (a suffix larger than the post
675 : // replacement suffix), the binary search with or without suffix replacement
676 : // would land on the same key, as we assume the following:
677 : // (1) no two keys in the sst share the same prefix.
678 : // (2) pebble.Compare(replacementSuffix,originalSuffix) > 0
679 :
680 1 : i.maybeReplaceSuffix()
681 1 :
682 1 : if !hiddenPoint && i.cmp(i.ikv.K.UserKey, key) >= 0 {
683 1 : // Initialize i.lazyValue
684 1 : if !i.lazyValueHandling.hasValuePrefix ||
685 1 : i.ikv.K.Kind() != base.InternalKeyKindSet {
686 1 : i.ikv.V = base.MakeInPlaceValue(i.val)
687 1 : } else if i.lazyValueHandling.getValue == nil || !block.ValuePrefix(i.val[0]).IsValueHandle() {
688 1 : i.ikv.V = base.MakeInPlaceValue(i.val[1:])
689 1 : } else {
690 1 : i.ikv.V = i.lazyValueHandling.getValue.GetLazyValueForPrefixAndValueHandle(i.val)
691 1 : }
692 1 : return &i.ikv
693 : }
694 1 : for i.Next(); i.Valid(); i.Next() {
695 1 : if i.cmp(i.ikv.K.UserKey, key) >= 0 {
696 1 : // i.Next() has already initialized i.ikv.LazyValue.
697 1 : return &i.ikv
698 1 : }
699 : }
700 1 : return nil
701 : }
702 :
703 : // SeekPrefixGE implements internalIterator.SeekPrefixGE, as documented in the
704 : // pebble package.
705 0 : func (i *Iter) SeekPrefixGE(prefix, key []byte, flags base.SeekGEFlags) *base.InternalKV {
706 0 : // This should never be called as prefix iteration is handled by sstable.Iterator.
707 0 : panic("pebble: SeekPrefixGE unimplemented")
708 : }
709 :
710 : // SeekLT implements internalIterator.SeekLT, as documented in the pebble
711 : // package.
712 1 : func (i *Iter) SeekLT(key []byte, flags base.SeekLTFlags) *base.InternalKV {
713 1 : if invariants.Enabled && i.IsDataInvalidated() {
714 0 : panic(errors.AssertionFailedf("invalidated blockIter used"))
715 : }
716 1 : searchKey := key
717 1 : if i.transforms.SyntheticPrefix != nil {
718 1 : if !bytes.HasPrefix(key, i.transforms.SyntheticPrefix) {
719 1 : // The seek key is before or after the entire block of keys that start
720 1 : // with SyntheticPrefix. To determine which, we need to compare against a
721 1 : // valid key in the block. We use firstUserKey which has the synthetic
722 1 : // prefix.
723 1 : if i.cmp(i.firstUserKey, key) < 0 {
724 1 : return i.Last()
725 1 : }
726 : // Set the offset to the beginning of the block to mimic an exhausted
727 : // iterator that has conducted backward iteration. This ensures a
728 : // subsequent Next() call returns the first key in the block.
729 1 : i.offset = -1
730 1 : i.nextOffset = 0
731 1 : return nil
732 : }
733 1 : searchKey = key[len(i.transforms.SyntheticPrefix):]
734 : }
735 :
736 1 : i.clearCache()
737 1 : // Find the index of the smallest restart point whose key is >= the key
738 1 : // sought; index will be numRestarts if there is no such restart point.
739 1 : i.offset = 0
740 1 : var index int32
741 1 :
742 1 : {
743 1 : // NB: manually inlined sort.Search is ~5% faster.
744 1 : //
745 1 : // Define f(-1) == false and f(n) == true.
746 1 : // Invariant: f(index-1) == false, f(upper) == true.
747 1 : upper := i.numRestarts
748 1 : for index < upper {
749 1 : h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
750 1 : // index ≤ h < upper
751 1 : offset := decodeRestart(i.data[i.restarts+4*h:])
752 1 : // For a restart point, there are 0 bytes shared with the previous key.
753 1 : // The varint encoding of 0 occupies 1 byte.
754 1 : ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(offset+1))
755 1 :
756 1 : // Decode the key at that restart point, and compare it to the key
757 1 : // sought. See the comment in readEntry for why we manually inline the
758 1 : // varint decoding.
759 1 : var v1 uint32
760 1 : if a := *((*uint8)(ptr)); a < 128 {
761 1 : v1 = uint32(a)
762 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
763 1 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
764 0 : v1 = uint32(b)<<7 | uint32(a)
765 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
766 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
767 0 : v1 = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
768 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
769 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
770 0 : v1 = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
771 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
772 0 : } else {
773 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
774 0 : v1 = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
775 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
776 0 : }
777 :
778 1 : if *((*uint8)(ptr)) < 128 {
779 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
780 1 : } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))) < 128 {
781 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
782 1 : } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))) < 128 {
783 1 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
784 1 : } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))) < 128 {
785 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
786 0 : } else {
787 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
788 0 : }
789 :
790 : // Manually inlining part of base.DecodeInternalKey provides a 5-10%
791 : // speedup on BlockIter benchmarks.
792 1 : s := getBytes(ptr, int(v1))
793 1 : var k []byte
794 1 : if n := len(s) - 8; n >= 0 {
795 1 : k = s[:n:n]
796 1 : }
797 : // Else k is invalid, and left as nil
798 :
799 1 : if i.cmp(searchKey, k) > 0 {
800 1 : // The search key is greater than the user key at this restart point.
801 1 : // Search beyond this restart point, since we are trying to find the
802 1 : // first restart point with a user key >= the search key.
803 1 : index = h + 1 // preserves f(i-1) == false
804 1 : } else {
805 1 : // k >= search key, so prune everything after index (since index
806 1 : // satisfies the property we are looking for).
807 1 : upper = h // preserves f(j) == true
808 1 : }
809 : }
810 : // index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
811 : // => answer is index.
812 : }
813 :
814 1 : if index == 0 {
815 1 : if i.transforms.SyntheticSuffix.IsSet() {
816 1 : // The binary search was conducted on keys without suffix replacement,
817 1 : // implying the first key in the block may be less than the search key. To
818 1 : // double check, get the first key in the block with suffix replacement
819 1 : // and compare to the search key. Consider the following example: suppose
820 1 : // the user searches with a@3, the first key in the block is a@2 and the
821 1 : // block contains a suffix replacement rule of 4. Since a@3 sorts before
822 1 : // a@2, the binary search would return index==0. Without conducting the
823 1 : // suffix replacement, the SeekLT would incorrectly return nil. With
824 1 : // suffix replacement though, a@4 should be returned as a@4 sorts before
825 1 : // a@3.
826 1 : ikv := i.First()
827 1 : if i.cmp(ikv.K.UserKey, key) < 0 {
828 1 : return ikv
829 1 : }
830 : }
831 : // If index == 0 then all keys in this block are larger than the key
832 : // sought, so there is no match.
833 1 : i.offset = -1
834 1 : i.nextOffset = 0
835 1 : return nil
836 : }
837 :
838 : // INVARIANT: index > 0
839 :
840 : // Ignoring suffix replacement, index is the first restart point with key >=
841 : // search key. Define the keys between a restart point and the next restart
842 : // point as belonging to that restart point. Note that index could be equal to
843 : // i.numRestarts, i.e., we are past the last restart. Since keys are strictly
844 : // increasing, then the restart point at index-1 will be the first one that
845 : // has some keys belonging to it that are less than the search key.
846 : //
847 : // Next, we will search between the restart at index-1 and the restart point
848 : // at index, for the first key >= key, and then on finding it, return
849 : // i.Prev(). We need to know when we have hit the offset for index, since then
850 : // we can stop searching. targetOffset encodes that offset for index.
851 1 : targetOffset := i.restarts
852 1 : i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
853 1 : if index < i.numRestarts {
854 1 : targetOffset = decodeRestart(i.data[i.restarts+4*(index):])
855 1 :
856 1 : if i.transforms.SyntheticSuffix.IsSet() {
857 1 : // The binary search was conducted on keys without suffix replacement,
858 1 : // implying the returned restart point (index) may be less than the search
859 1 : // key, breaking the assumption described above.
860 1 : //
861 1 : // For example: consider this block with a replacement ts of 4, and
862 1 : // restart interval of 1: - pre replacement: a@3,b@2,c@3 - post
863 1 : // replacement: a@4,b@4,c@4
864 1 : //
865 1 : // Suppose the client calls SeekLT(b@3), SeekLT must return b@4.
866 1 : //
867 1 : // If the client calls SeekLT(b@3), the binary search would return b@2,
868 1 : // the lowest key geq to b@3, pre-suffix replacement. Then, SeekLT will
869 1 : // begin forward iteration from a@3, the previous restart point, to
870 1 : // b{suffix}. The iteration stops when it encounters a key geq to the
871 1 : // search key or if it reaches the upper bound. Without suffix
872 1 : // replacement, we can assume that the upper bound of this forward
873 1 : // iteration, b{suffix}, is greater than the search key, as implied by the
874 1 : // binary search.
875 1 : //
876 1 : // If we naively hold this assumption with suffix replacement, the
877 1 : // iteration would terminate at the upper bound, b@4, call i.Prev, and
878 1 : // incorrectly return a@4. To correct for this, if the original returned
879 1 : // index is less than the search key, shift our forward iteration to begin
880 1 : // at index instead of index -1. With suffix replacement the key at index
881 1 : // is guaranteed to be the highest restart point less than the search key
882 1 : // (i.e. the same property of index-1 for a block without suffix
883 1 : // replacement). This property holds because of the invariant that a block
884 1 : // with suffix replacement will not have two keys that share the same
885 1 : // prefix. To consider the above example, binary searching with b@3 landed
886 1 : // naively at a@3, but since b@4<b@3, we shift our forward iteration to
887 1 : // begin at b@4. We never need to shift by more than one restart point
888 1 : // (i.e. to c@4) because it's impossible for the search key to be greater
889 1 : // than the key at the next restart point in the block because that
890 1 : // key will always have a different prefix. Put another way, because no
891 1 : // key in the block shares the same prefix, naive binary search should
892 1 : // always land at most 1 restart point off the correct one.
893 1 :
894 1 : naiveOffset := i.offset
895 1 : // Shift up to the original binary search result and decode the key.
896 1 : i.offset = targetOffset
897 1 : i.readEntry()
898 1 : i.decodeInternalKey(i.key)
899 1 : i.maybeReplaceSuffix()
900 1 :
901 1 : // If the binary search point is actually less than the search key, post
902 1 : // replacement, bump the target offset.
903 1 : if i.cmp(i.ikv.K.UserKey, key) < 0 {
904 1 : i.offset = targetOffset
905 1 : if index+1 < i.numRestarts {
906 1 : // if index+1 is within the i.data bounds, use it to find the target
907 1 : // offset.
908 1 : targetOffset = decodeRestart(i.data[i.restarts+4*(index+1):])
909 1 : } else {
910 1 : targetOffset = i.restarts
911 1 : }
912 1 : } else {
913 1 : i.offset = naiveOffset
914 1 : }
915 : }
916 : }
917 :
918 : // Init nextOffset for the forward iteration below.
919 1 : i.nextOffset = i.offset
920 1 :
921 1 : for {
922 1 : i.offset = i.nextOffset
923 1 : i.readEntry()
924 1 : // When hidden keys are common, there is additional optimization possible
925 1 : // by not caching entries that are hidden (note that some calls to
926 1 : // cacheEntry don't decode the internal key before caching, but checking
927 1 : // whether a key is hidden does not require full decoding). However, we do
928 1 : // need to use the blockEntry.offset in the cache for the first entry at
929 1 : // the restart point to do the binary search when the cache is empty -- so
930 1 : // we would need to cache that first entry (though not the key) even if it
931 1 : // was hidden. Our current assumption is that if there are large numbers
932 1 : // of hidden keys we will be able to skip whole blocks (using block
933 1 : // property filters) so we don't bother optimizing.
934 1 : hiddenPoint := i.decodeInternalKey(i.key)
935 1 : i.maybeReplaceSuffix()
936 1 :
937 1 : // NB: we don't use the hiddenPoint return value of decodeInternalKey
938 1 : // since we want to stop as soon as we reach a key >= ikey.UserKey, so
939 1 : // that we can reverse.
940 1 : if i.cmp(i.ikv.K.UserKey, key) >= 0 {
941 1 : // The current key is greater than or equal to our search key. Back up to
942 1 : // the previous key which was less than our search key. Note that this for
943 1 : // loop will execute at least once with this if-block not being true, so
944 1 : // the key we are backing up to is the last one this loop cached.
945 1 : return i.Prev()
946 1 : }
947 :
948 1 : if i.nextOffset >= targetOffset {
949 1 : // We've reached the end of the current restart block. Return the
950 1 : // current key if not hidden, else call Prev().
951 1 : //
952 1 : // When the restart interval is 1, the first iteration of the for loop
953 1 : // will bring us here. In that case ikey is backed by the block so we
954 1 : // get the desired key stability guarantee for the lifetime of the
955 1 : // blockIter. That is, we never cache anything and therefore never
956 1 : // return a key backed by cachedBuf.
957 1 : if hiddenPoint {
958 1 : return i.Prev()
959 1 : }
960 1 : break
961 : }
962 1 : i.cacheEntry()
963 : }
964 :
965 1 : if !i.Valid() {
966 1 : return nil
967 1 : }
968 1 : if !i.lazyValueHandling.hasValuePrefix ||
969 1 : i.ikv.K.Kind() != base.InternalKeyKindSet {
970 1 : i.ikv.V = base.MakeInPlaceValue(i.val)
971 1 : } else if i.lazyValueHandling.getValue == nil || !block.ValuePrefix(i.val[0]).IsValueHandle() {
972 1 : i.ikv.V = base.MakeInPlaceValue(i.val[1:])
973 1 : } else {
974 1 : i.ikv.V = i.lazyValueHandling.getValue.GetLazyValueForPrefixAndValueHandle(i.val)
975 1 : }
976 1 : return &i.ikv
977 : }
978 :
979 : // First implements internalIterator.First, as documented in the pebble
980 : // package.
981 1 : func (i *Iter) First() *base.InternalKV {
982 1 : if invariants.Enabled && i.IsDataInvalidated() {
983 0 : panic(errors.AssertionFailedf("invalidated blockIter used"))
984 : }
985 :
986 1 : i.offset = 0
987 1 : if !i.Valid() {
988 1 : return nil
989 1 : }
990 1 : i.clearCache()
991 1 : i.readEntry()
992 1 : hiddenPoint := i.decodeInternalKey(i.key)
993 1 : if hiddenPoint {
994 1 : return i.Next()
995 1 : }
996 1 : i.maybeReplaceSuffix()
997 1 : if !i.lazyValueHandling.hasValuePrefix ||
998 1 : i.ikv.K.Kind() != base.InternalKeyKindSet {
999 1 : i.ikv.V = base.MakeInPlaceValue(i.val)
1000 1 : } else if i.lazyValueHandling.getValue == nil || !block.ValuePrefix(i.val[0]).IsValueHandle() {
1001 1 : i.ikv.V = base.MakeInPlaceValue(i.val[1:])
1002 1 : } else {
1003 1 : i.ikv.V = i.lazyValueHandling.getValue.GetLazyValueForPrefixAndValueHandle(i.val)
1004 1 : }
1005 1 : return &i.ikv
1006 : }
1007 :
1008 : const restartMaskLittleEndianHighByteWithoutSetHasSamePrefix byte = 0b0111_1111
1009 : const restartMaskLittleEndianHighByteOnlySetHasSamePrefix byte = 0b1000_0000
1010 :
1011 1 : func decodeRestart(b []byte) int32 {
1012 1 : _ = b[3] // bounds check hint to compiler; see golang.org/issue/14808
1013 1 : return int32(uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 |
1014 1 : uint32(b[3]&restartMaskLittleEndianHighByteWithoutSetHasSamePrefix)<<24)
1015 1 : }
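// For reference, decodeRestart is a masked little-endian uint32 read: the top
// bit of the fourth byte is reserved for the set-has-same-prefix flag consumed
// by nextPrefixV3 below. A non-inlined sketch (illustrative only):
//
//	v := binary.LittleEndian.Uint32(b)
//	offset := int32(v &^ (uint32(restartMaskLittleEndianHighByteOnlySetHasSamePrefix) << 24))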
1016 :
1017 : // Last implements internalIterator.Last, as documented in the pebble package.
1018 1 : func (i *Iter) Last() *base.InternalKV {
1019 1 : if invariants.Enabled && i.IsDataInvalidated() {
1020 0 : panic(errors.AssertionFailedf("invalidated blockIter used"))
1021 : }
1022 :
1023 : // Seek forward from the last restart point.
1024 1 : i.offset = decodeRestart(i.data[i.restarts+4*(i.numRestarts-1):])
1025 1 : if !i.Valid() {
1026 1 : return nil
1027 1 : }
1028 :
1029 1 : i.readEntry()
1030 1 : i.clearCache()
1031 1 :
1032 1 : for i.nextOffset < i.restarts {
1033 1 : i.cacheEntry()
1034 1 : i.offset = i.nextOffset
1035 1 : i.readEntry()
1036 1 : }
1037 :
1038 1 : hiddenPoint := i.decodeInternalKey(i.key)
1039 1 : if hiddenPoint {
1040 1 : return i.Prev()
1041 1 : }
1042 1 : i.maybeReplaceSuffix()
1043 1 : if !i.lazyValueHandling.hasValuePrefix ||
1044 1 : i.ikv.K.Kind() != base.InternalKeyKindSet {
1045 1 : i.ikv.V = base.MakeInPlaceValue(i.val)
1046 1 : } else if i.lazyValueHandling.getValue == nil || !block.ValuePrefix(i.val[0]).IsValueHandle() {
1047 1 : i.ikv.V = base.MakeInPlaceValue(i.val[1:])
1048 1 : } else {
1049 1 : i.ikv.V = i.lazyValueHandling.getValue.GetLazyValueForPrefixAndValueHandle(i.val)
1050 1 : }
1051 1 : return &i.ikv
1052 : }
1053 :
1054 : // Next implements internalIterator.Next, as documented in the pebble
1055 : // package.
1056 1 : func (i *Iter) Next() *base.InternalKV {
1057 1 : if len(i.cachedBuf) > 0 {
1058 1 : // We're switching from reverse iteration to forward iteration. We need to
1059 1 : // populate i.fullKey with the current key we're positioned at so that
1060 1 : // readEntry() can use i.fullKey for key prefix decompression. Note that we
1061 1 : // don't know whether i.key is backed by i.cachedBuf or i.fullKey (if
1062 1 : // SeekLT was the previous call, i.key may be backed by i.fullKey), but
1063 1 : // copying into i.fullKey works for both cases.
1064 1 : //
1065 1 : // TODO(peter): Rather than clearing the cache, we could instead use the
1066 1 : // cache until it is exhausted. This would likely be faster than falling
1067 1 : // through to the normal forward iteration code below.
1068 1 : i.fullKey = append(i.fullKey[:0], i.key...)
1069 1 : i.clearCache()
1070 1 : }
1071 :
1072 : start:
1073 1 : i.offset = i.nextOffset
1074 1 : if !i.Valid() {
1075 1 : return nil
1076 1 : }
1077 1 : i.readEntry()
1078 1 : // Manually inlined version of i.decodeInternalKey(i.key).
1079 1 : if n := len(i.key) - 8; n >= 0 {
1080 1 : trailer := base.InternalKeyTrailer(binary.LittleEndian.Uint64(i.key[n:]))
1081 1 : hiddenPoint := i.transforms.HideObsoletePoints &&
1082 1 : (trailer&TrailerObsoleteBit != 0)
1083 1 : i.ikv.K.Trailer = trailer & TrailerObsoleteMask
1084 1 : i.ikv.K.UserKey = i.key[:n:n]
1085 1 : if n := i.transforms.SyntheticSeqNum; n != 0 {
1086 1 : i.ikv.K.SetSeqNum(base.SeqNum(n))
1087 1 : }
1088 1 : if hiddenPoint {
1089 1 : goto start
1090 : }
1091 1 : if i.transforms.SyntheticSuffix.IsSet() {
1092 1 : // Inlined version of i.maybeReplaceSuffix()
1093 1 : prefixLen := i.split(i.ikv.K.UserKey)
1094 1 : i.synthSuffixBuf = append(i.synthSuffixBuf[:0], i.ikv.K.UserKey[:prefixLen]...)
1095 1 : i.synthSuffixBuf = append(i.synthSuffixBuf, i.transforms.SyntheticSuffix...)
1096 1 : i.ikv.K.UserKey = i.synthSuffixBuf
1097 1 : }
1098 0 : } else {
1099 0 : i.ikv.K.Trailer = base.InternalKeyTrailer(base.InternalKeyKindInvalid)
1100 0 : i.ikv.K.UserKey = nil
1101 0 : }
1102 1 : if !i.lazyValueHandling.hasValuePrefix ||
1103 1 : i.ikv.K.Kind() != base.InternalKeyKindSet {
1104 1 : i.ikv.V = base.MakeInPlaceValue(i.val)
1105 1 : } else if i.lazyValueHandling.getValue == nil || !block.ValuePrefix(i.val[0]).IsValueHandle() {
1106 1 : i.ikv.V = base.MakeInPlaceValue(i.val[1:])
1107 1 : } else {
1108 1 : i.ikv.V = i.lazyValueHandling.getValue.GetLazyValueForPrefixAndValueHandle(i.val)
1109 1 : }
1110 1 : return &i.ikv
1111 : }
1112 :
1113 : // NextPrefix implements (base.InternalIterator).NextPrefix.
1114 1 : func (i *Iter) NextPrefix(succKey []byte) *base.InternalKV {
1115 1 : if i.lazyValueHandling.hasValuePrefix {
1116 1 : return i.nextPrefixV3(succKey)
1117 1 : }
1118 1 : const nextsBeforeSeek = 3
1119 1 : kv := i.Next()
1120 1 : for j := 1; kv != nil && i.cmp(kv.K.UserKey, succKey) < 0; j++ {
1121 1 : if j >= nextsBeforeSeek {
1122 1 : return i.SeekGE(succKey, base.SeekGEFlagsNone)
1123 1 : }
1124 1 : kv = i.Next()
1125 : }
1126 1 : return kv
1127 : }
1128 :
1129 1 : func (i *Iter) nextPrefixV3(succKey []byte) *base.InternalKV {
1130 1 : // Doing nexts that involve a key comparison can be expensive (and the cost
1131 1 : // depends on the key length), so we use the same threshold of 3 that we use
1132 1 : // for TableFormatPebblev2 in blockIter.nextPrefix above. The next fast path
1133 1 : // that looks at setHasSamePrefix takes ~5ns per key, which is ~150x faster
1134 1 : // than doing a SeekGE within the block, so we do this 16 times
1135 1 : // (~5ns*16=80ns), and then switch to looking at restarts. Doing the binary
1136 1 : // search for the restart consumes > 100ns. If the number of versions is >
1137 1 : // 17, we will increment nextFastCount to 17, then do a binary search, and
1138 1 : // on average need to find a key between two restarts, so another 8 steps
1139 1 : // corresponding to nextFastCount, for a mean total of 17 + 8 = 25 such
1140 1 : // steps.
1141 1 : //
1142 1 : // TODO(sumeer): use the configured restartInterval for the sstable when it
1143 1 : // was written (which we don't currently store) instead of the default value
1144 1 : // of 16.
1145 1 : const nextCmpThresholdBeforeSeek = 3
1146 1 : const nextFastThresholdBeforeRestarts = 16
1147 1 : nextCmpCount := 0
1148 1 : nextFastCount := 0
1149 1 : usedRestarts := false
1150 1 : // INVARIANT: blockIter is valid.
1151 1 : if invariants.Enabled && !i.Valid() {
1152 0 : panic(errors.AssertionFailedf("nextPrefixV3 called on invalid blockIter"))
1153 : }
1154 1 : prevKeyIsSet := i.ikv.Kind() == base.InternalKeyKindSet
1155 1 : for {
1156 1 : i.offset = i.nextOffset
1157 1 : if !i.Valid() {
1158 1 : return nil
1159 1 : }
1160 : // Need to decode the length integers, so we can compute nextOffset.
1161 1 : ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(i.offset))
1162 1 : // This is an ugly performance hack. Reading entries from blocks is one of
1163 1 : // the inner-most routines and decoding the 3 varints per-entry takes
1164 1 : // significant time. Neither go1.11 nor go1.12 will inline decodeVarint for
1165 1 : // us, so we do it manually. This provides a 10-15% performance improvement
1166 1 : // on blockIter benchmarks on both go1.11 and go1.12.
1167 1 : //
1168 1 : // TODO(peter): remove this hack if go:inline is ever supported.
1169 1 :
1170 1 : // Decode the shared key length integer.
1171 1 : var shared uint32
1172 1 : if a := *((*uint8)(ptr)); a < 128 {
1173 1 : shared = uint32(a)
1174 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
1175 1 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
1176 0 : shared = uint32(b)<<7 | uint32(a)
1177 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
1178 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
1179 0 : shared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1180 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
1181 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
1182 0 : shared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1183 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
1184 0 : } else {
1185 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
1186 0 : shared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1187 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
1188 0 : }
1189 : // Decode the unshared key length integer.
1190 1 : var unshared uint32
1191 1 : if a := *((*uint8)(ptr)); a < 128 {
1192 1 : unshared = uint32(a)
1193 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
1194 1 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
1195 0 : unshared = uint32(b)<<7 | uint32(a)
1196 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
1197 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
1198 0 : unshared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1199 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
1200 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
1201 0 : unshared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1202 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
1203 0 : } else {
1204 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
1205 0 : unshared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1206 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
1207 0 : }
1208 : // Decode the value length integer.
1209 1 : var value uint32
1210 1 : if a := *((*uint8)(ptr)); a < 128 {
1211 1 : value = uint32(a)
1212 1 : ptr = unsafe.Pointer(uintptr(ptr) + 1)
1213 1 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
1214 0 : value = uint32(b)<<7 | uint32(a)
1215 0 : ptr = unsafe.Pointer(uintptr(ptr) + 2)
1216 0 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
1217 0 : value = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1218 0 : ptr = unsafe.Pointer(uintptr(ptr) + 3)
1219 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
1220 0 : value = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1221 0 : ptr = unsafe.Pointer(uintptr(ptr) + 4)
1222 0 : } else {
1223 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
1224 0 : value = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
1225 0 : ptr = unsafe.Pointer(uintptr(ptr) + 5)
1226 0 : }
1227 1 : if i.transforms.SyntheticPrefix != nil {
1228 0 : shared += uint32(len(i.transforms.SyntheticPrefix))
1229 0 : }
1230 : // The starting position of the value.
1231 1 : valuePtr := unsafe.Pointer(uintptr(ptr) + uintptr(unshared))
1232 1 : i.nextOffset = int32(uintptr(valuePtr)-uintptr(i.ptr)) + int32(value)
1233 1 : if invariants.Enabled && unshared < 8 {
1234 0 : // This should not happen: only the key prefix is shared, so even if
1235 0 : // the prefix length equals the user key length, the unshared portion
1236 0 : // must still include the 8-byte trailer.
1237 0 : panic(errors.AssertionFailedf("unshared %d is too small", unshared))
1238 : }
1239 : // The trailer is written in little endian, so the key kind is the first
1240 : // byte of the trailer, which is encoded in the slice [unshared-8:unshared].
1241 1 : keyKind := base.InternalKeyKind((*[manual.MaxArrayLen]byte)(ptr)[unshared-8])
1242 1 : keyKind = keyKind & base.InternalKeyKindSSTableInternalObsoleteMask
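// Illustrative example, assuming the usual Pebble trailer encoding of
// (seqnum<<8 | kind): for seqnum=5 and kind=SET(1) the trailer is 0x501,
// stored little endian as 01 05 00 00 00 00 00 00, so the byte at index
// unshared-8 is the kind byte (with the obsolete bit, when present, ORed in
// and cleared by the mask above).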
1243 1 : prefixChanged := false
1244 1 : if keyKind == base.InternalKeyKindSet {
1245 1 : if invariants.Enabled && value == 0 {
1246 0 : panic(errors.AssertionFailedf("value is of length 0, but we expect a valuePrefix"))
1247 : }
1248 1 : valPrefix := *((*block.ValuePrefix)(valuePtr))
1249 1 : if valPrefix.SetHasSamePrefix() {
1250 1 : // Fast-path. No need to assemble i.fullKey, or update i.key. We know
1251 1 : // that subsequent keys will not have a shared length that is greater
1252 1 : // than the prefix of the current key, which is also the prefix of
1253 1 : // i.key. Since we are continuing to iterate, we don't need to
1254 1 : // initialize i.ikey and i.lazyValue (these are initialized before
1255 1 : // returning).
1256 1 : nextFastCount++
1257 1 : if nextFastCount > nextFastThresholdBeforeRestarts {
1258 1 : if usedRestarts {
1259 0 : // Exhausted iteration budget. This will never happen unless
1260 0 : // someone is using a restart interval > 16. It is just to guard
1261 0 : // against long restart intervals causing too much iteration.
1262 0 : break
1263 : }
1264 : // Haven't used restarts yet, so find the first restart at or beyond
1265 : // the current offset.
1266 1 : targetOffset := i.offset
1267 1 : var index int32
1268 1 : {
1269 1 : // NB: manually inlined sort.Search is ~5% faster.
1270 1 : //
1271 1 : // f defined for a restart point is true iff the offset >=
1272 1 : // targetOffset.
1273 1 : // Define f(-1) == false and f(i.numRestarts) == true.
1274 1 : // Invariant: f(index-1) == false, f(upper) == true.
1275 1 : upper := i.numRestarts
1276 1 : for index < upper {
1277 1 : h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
1278 1 : // index ≤ h < upper
1279 1 : offset := decodeRestart(i.data[i.restarts+4*h:])
1280 1 : if offset < targetOffset {
1281 1 : index = h + 1 // preserves f(index-1) == false
1282 1 : } else {
1283 1 : upper = h // preserves f(upper) == true
1284 1 : }
1285 : }
1286 : // index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
1287 : // => answer is index.
1288 : }
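// For reference, a non-inlined sketch of the search above (illustrative only;
// the hand-rolled loop avoids the closure and call overhead):
//
//	index = int32(sort.Search(int(i.numRestarts), func(j int) bool {
//	    return decodeRestart(i.data[i.restarts+4*int32(j):]) >= targetOffset
//	}))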
1289 1 : usedRestarts = true
1290 1 : nextFastCount = 0
1291 1 : if index == i.numRestarts {
1292 1 : // Already past the last real restart, so iterate a bit more until
1293 1 : // we are done with the block.
1294 1 : continue
1295 : }
1296 : // Have some real restarts after index. NB: index is the first
1297 : // restart at or beyond the current offset.
1298 1 : startingIndex := index
1299 1 : for index != i.numRestarts &&
1300 1 : // The restart at index is 4 bytes written in little endian format
1301 1 : // starting at i.restarts+4*index. The 0th byte is the least
1302 1 : // significant and the 3rd byte is the most significant. Since the
1303 1 : // most significant bit of the 3rd byte is what we use for
1304 1 : // encoding the set-has-same-prefix information, the indexing
1305 1 : // below has +3.
1306 1 : i.data[i.restarts+4*index+3]&restartMaskLittleEndianHighByteOnlySetHasSamePrefix != 0 {
1307 1 : // We still have the same prefix, so move to the next restart.
1308 1 : index++
1309 1 : }
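// Worked example (hypothetical offset): a restart at block offset 0x123 whose
// entry has the set-has-same-prefix bit set is stored as the little-endian
// word 0x80000123, i.e. bytes 23 01 00 80; the condition above tests the 0x80
// bit of that 4th byte, while decodeRestart strips it when recovering the
// offset.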
1310 : // index is the first restart that did not have the same prefix.
1311 1 : if index != startingIndex {
1312 1 : // Managed to skip past at least one restart. Resume iteration
1313 1 : // from index-1. Since nextFastCount has been reset to 0, we
1314 1 : // should be able to iterate to the next prefix.
1315 1 : i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
1316 1 : i.readEntry()
1317 1 : }
1318 : // Else, unable to skip past any restart. Resume iteration. Since
1319 : // nextFastCount has been reset to 0, we should be able to iterate
1320 : // to the next prefix.
1321 1 : continue
1322 : }
1323 1 : continue
1324 1 : } else if prevKeyIsSet {
1325 1 : prefixChanged = true
1326 1 : }
1327 1 : } else {
1328 1 : prevKeyIsSet = false
1329 1 : }
1330 : // Slow-path cases:
1331 : // - (Likely) The prefix has changed.
1332 : // - (Unlikely) The prefix has not changed.
1333 : // We assemble the key etc. under the assumption that it is the likely
1334 : // case.
1335 1 : unsharedKey := getBytes(ptr, int(unshared))
1336 1 : // TODO(sumeer): move this into the else block below. This is a bit tricky
1337 1 : // since the current logic assumes we have always copied the latest key
1338 1 : // into fullKey, which is why when we get to the next key we can (a)
1339 1 : // access i.fullKey[:shared], (b) append only the unsharedKey to
1340 1 : // i.fullKey. For (a), we can access i.key[:shared] since that memory is
1341 1 : // valid (even if unshared). For (b), we will need to remember whether
1342 1 : // i.key refers to i.fullKey or not, and can append the unsharedKey only
1343 1 : // in the former case and for the latter case need to copy the shared part
1344 1 : // too. This same comment applies to the other place where we can do this
1345 1 : // optimization, in readEntry().
1346 1 : i.fullKey = append(i.fullKey[:shared], unsharedKey...)
1347 1 : i.val = getBytes(valuePtr, int(value))
1348 1 : if shared == 0 {
1349 1 : // Provide stability for the key across positioning calls if the key
1350 1 : // doesn't share a prefix with the previous key. This avoids requiring the
1351 1 : // key to be copied if the caller knows the block has a restart interval of
1352 1 : // 1. An important example of this is range-del blocks.
1353 1 : i.key = unsharedKey
1354 1 : } else {
1355 1 : i.key = i.fullKey
1356 1 : }
1357 : // Manually inlined version of i.decodeInternalKey(i.key).
1358 1 : hiddenPoint := false
1359 1 : if n := len(i.key) - 8; n >= 0 {
1360 1 : trailer := base.InternalKeyTrailer(binary.LittleEndian.Uint64(i.key[n:]))
1361 1 : hiddenPoint = i.transforms.HideObsoletePoints &&
1362 1 : (trailer&TrailerObsoleteBit != 0)
1363 1 : i.ikv.K = base.InternalKey{
1364 1 : Trailer: trailer & TrailerObsoleteMask,
1365 1 : UserKey: i.key[:n:n],
1366 1 : }
1367 1 : if n := i.transforms.SyntheticSeqNum; n != 0 {
1368 0 : i.ikv.K.SetSeqNum(base.SeqNum(n))
1369 0 : }
1370 1 : if i.transforms.SyntheticSuffix.IsSet() {
1371 0 : // Inlined version of i.maybeReplaceSuffix()
1372 0 : prefixLen := i.split(i.ikv.K.UserKey)
1373 0 : i.synthSuffixBuf = append(i.synthSuffixBuf[:0], i.ikv.K.UserKey[:prefixLen]...)
1374 0 : i.synthSuffixBuf = append(i.synthSuffixBuf, i.transforms.SyntheticSuffix...)
1375 0 : i.ikv.K.UserKey = i.synthSuffixBuf
1376 0 : }
1377 0 : } else {
1378 0 : i.ikv.K.Trailer = base.InternalKeyTrailer(base.InternalKeyKindInvalid)
1379 0 : i.ikv.K.UserKey = nil
1380 0 : }
1381 1 : nextCmpCount++
1382 1 : if invariants.Enabled && prefixChanged && i.cmp(i.ikv.K.UserKey, succKey) < 0 {
1383 0 : panic(errors.AssertionFailedf("prefix should have changed but %x < %x",
1384 0 : i.ikv.K.UserKey, succKey))
1385 : }
1386 1 : if prefixChanged || i.cmp(i.ikv.K.UserKey, succKey) >= 0 {
1387 1 : // Prefix has changed.
1388 1 : if hiddenPoint {
1389 0 : return i.Next()
1390 0 : }
1391 1 : if invariants.Enabled && !i.lazyValueHandling.hasValuePrefix {
1392 0 : panic(errors.AssertionFailedf("nextPrefixV3 being run for non-v3 sstable"))
1393 : }
1394 1 : if i.ikv.K.Kind() != base.InternalKeyKindSet {
1395 1 : i.ikv.V = base.MakeInPlaceValue(i.val)
1396 1 : } else if i.lazyValueHandling.getValue == nil || !block.ValuePrefix(i.val[0]).IsValueHandle() {
1397 1 : i.ikv.V = base.MakeInPlaceValue(i.val[1:])
1398 1 : } else {
1399 0 : i.ikv.V = i.lazyValueHandling.getValue.GetLazyValueForPrefixAndValueHandle(i.val)
1400 0 : }
1401 1 : return &i.ikv
1402 : }
1403 : // Else prefix has not changed.
1404 :
1405 1 : if nextCmpCount >= nextCmpThresholdBeforeSeek {
1406 1 : break
1407 : }
1408 : }
1409 1 : return i.SeekGE(succKey, base.SeekGEFlagsNone)
1410 : }
1411 :
1412 : // Prev implements internalIterator.Prev, as documented in the pebble
1413 : // package.
1414 1 : func (i *Iter) Prev() *base.InternalKV {
1415 1 : start:
1416 1 : for n := len(i.cached) - 1; n >= 0; n-- {
1417 1 : i.nextOffset = i.offset
1418 1 : e := &i.cached[n]
1419 1 : i.offset = e.offset
1420 1 : i.val = getBytes(unsafe.Pointer(uintptr(i.ptr)+uintptr(e.valStart)), int(e.valSize))
1421 1 : // Manually inlined version of i.decodeInternalKey(i.key).
1422 1 : i.key = i.cachedBuf[e.keyStart:e.keyEnd]
1423 1 : if n := len(i.key) - 8; n >= 0 {
1424 1 : trailer := base.InternalKeyTrailer(binary.LittleEndian.Uint64(i.key[n:]))
1425 1 : hiddenPoint := i.transforms.HideObsoletePoints &&
1426 1 : (trailer&TrailerObsoleteBit != 0)
1427 1 : if hiddenPoint {
1428 1 : continue
1429 : }
1430 1 : i.ikv.K = base.InternalKey{
1431 1 : Trailer: trailer & TrailerObsoleteMask,
1432 1 : UserKey: i.key[:n:n],
1433 1 : }
1434 1 : if n := i.transforms.SyntheticSeqNum; n != 0 {
1435 1 : i.ikv.K.SetSeqNum(base.SeqNum(n))
1436 1 : }
1437 1 : if i.transforms.SyntheticSuffix.IsSet() {
1438 1 : // Inlined version of i.maybeReplaceSuffix()
1439 1 : prefixLen := i.split(i.ikv.K.UserKey)
1440 1 : // If ikey is cached or may get cached, we must de-reference
1441 1 : // UserKey before suffix replacement.
1442 1 : i.synthSuffixBuf = append(i.synthSuffixBuf[:0], i.ikv.K.UserKey[:prefixLen]...)
1443 1 : i.synthSuffixBuf = append(i.synthSuffixBuf, i.transforms.SyntheticSuffix...)
1444 1 : i.ikv.K.UserKey = i.synthSuffixBuf
1445 1 : }
1446 0 : } else {
1447 0 : i.ikv.K.Trailer = base.InternalKeyTrailer(base.InternalKeyKindInvalid)
1448 0 : i.ikv.K.UserKey = nil
1449 0 : }
1450 1 : i.cached = i.cached[:n]
1451 1 : if !i.lazyValueHandling.hasValuePrefix ||
1452 1 : i.ikv.K.Kind() != base.InternalKeyKindSet {
1453 1 : i.ikv.V = base.MakeInPlaceValue(i.val)
1454 1 : } else if i.lazyValueHandling.getValue == nil || !block.ValuePrefix(i.val[0]).IsValueHandle() {
1455 1 : i.ikv.V = base.MakeInPlaceValue(i.val[1:])
1456 1 : } else {
1457 1 : i.ikv.V = i.lazyValueHandling.getValue.GetLazyValueForPrefixAndValueHandle(i.val)
1458 1 : }
1459 1 : return &i.ikv
1460 : }
1461 :
1462 1 : i.clearCache()
1463 1 : if i.offset <= 0 {
1464 1 : i.offset = -1
1465 1 : i.nextOffset = 0
1466 1 : return nil
1467 1 : }
1468 :
1469 1 : targetOffset := i.offset
1470 1 : var index int32
1471 1 :
1472 1 : {
1473 1 : // NB: manually inlined sort.Search is ~5% faster.
1474 1 : //
1475 1 : // Define f(-1) == false and f(i.numRestarts) == true.
1476 1 : // Invariant: f(index-1) == false, f(upper) == true.
1477 1 : upper := i.numRestarts
1478 1 : for index < upper {
1479 1 : h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
1480 1 : // index ≤ h < upper
1481 1 : offset := decodeRestart(i.data[i.restarts+4*h:])
1482 1 : if offset < targetOffset {
1483 1 : // Looking for the first restart that has offset >= targetOffset, so
1484 1 : // ignore h and earlier.
1485 1 : index = h + 1 // preserves f(index-1) == false
1486 1 : } else {
1487 1 : upper = h // preserves f(upper) == true
1488 1 : }
1489 : }
1490 : // index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
1491 : // => answer is index.
1492 : }
1493 :
1494 : // index is first restart with offset >= targetOffset. Note that
1495 : // targetOffset may not be at a restart point since one can call Prev()
1496 : // after Next() (so the cache was not populated) and targetOffset refers to
1497 : // the current entry. index-1 must have an offset < targetOffset (it can't
1498 : // be equal to targetOffset since the binary search would have selected that
1499 : // as the index).
1500 1 : i.offset = 0
1501 1 : if index > 0 {
1502 1 : i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
1503 1 : }
1504 : // TODO(sumeer): why is the else case not an error given targetOffset is a
1505 : // valid offset.
1506 :
1507 1 : i.readEntry()
1508 1 :
1509 1 : // We stop when i.nextOffset == targetOffset since targetOffset is the offset
1510 1 : // of the entry we are stepping back from, and we don't need to cache the
1511 1 : // entry before it, since it is the candidate to return.
1512 1 : for i.nextOffset < targetOffset {
1513 1 : i.cacheEntry()
1514 1 : i.offset = i.nextOffset
1515 1 : i.readEntry()
1516 1 : }
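// Worked example (hypothetical offsets): with restart points at offsets 0 and
// 50, entries at offsets 0, 10, 20, 30, and targetOffset=30, the binary search
// above yields index=1, so iteration resumes at the restart at offset 0. The
// loop caches the entries at offsets 0 and 10 and stops positioned at offset
// 20, the entry immediately before targetOffset; subsequent Prev calls consume
// the cached entries.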
1517 :
1518 1 : hiddenPoint := i.decodeInternalKey(i.key)
1519 1 : if hiddenPoint {
1520 1 : // Use the cache.
1521 1 : goto start
1522 : }
1523 1 : if i.transforms.SyntheticSuffix.IsSet() {
1524 1 : // Inlined version of i.maybeReplaceSuffix()
1525 1 : prefixLen := i.split(i.ikv.K.UserKey)
1526 1 : // If ikey is cached or may get cached, we must de-reference
1527 1 : // UserKey before suffix replacement.
1528 1 : i.synthSuffixBuf = append(i.synthSuffixBuf[:0], i.ikv.K.UserKey[:prefixLen]...)
1529 1 : i.synthSuffixBuf = append(i.synthSuffixBuf, i.transforms.SyntheticSuffix...)
1530 1 : i.ikv.K.UserKey = i.synthSuffixBuf
1531 1 : }
1532 1 : if !i.lazyValueHandling.hasValuePrefix ||
1533 1 : i.ikv.K.Kind() != base.InternalKeyKindSet {
1534 1 : i.ikv.V = base.MakeInPlaceValue(i.val)
1535 1 : } else if i.lazyValueHandling.getValue == nil || !block.ValuePrefix(i.val[0]).IsValueHandle() {
1536 1 : i.ikv.V = base.MakeInPlaceValue(i.val[1:])
1537 1 : } else {
1538 1 : i.ikv.V = i.lazyValueHandling.getValue.GetLazyValueForPrefixAndValueHandle(i.val)
1539 1 : }
1540 1 : return &i.ikv
1541 : }
1542 :
1543 : // Key returns the internal key at the current iterator position.
1544 0 : func (i *Iter) Key() *base.InternalKey {
1545 0 : return &i.ikv.K
1546 0 : }
1547 :
1548 : // KV returns the internal KV at the current iterator position.
1549 1 : func (i *Iter) KV() *base.InternalKV {
1550 1 : return &i.ikv
1551 1 : }
1552 :
1553 : // Value returns the value at the current iterator position.
1554 0 : func (i *Iter) Value() base.LazyValue {
1555 0 : return i.ikv.V
1556 0 : }
1557 :
1558 : // Error implements internalIterator.Error, as documented in the pebble
1559 : // package.
1560 1 : func (i *Iter) Error() error {
1561 1 : return nil // infallible
1562 1 : }
1563 :
1564 : // Close implements internalIterator.Close, as documented in the pebble
1565 : // package.
1566 1 : func (i *Iter) Close() error {
1567 1 : i.handle.Release()
1568 1 : i.handle = block.BufferHandle{}
1569 1 : i.val = nil
1570 1 : i.ikv = base.InternalKV{}
1571 1 : i.lazyValueHandling.getValue = nil
1572 1 : return nil
1573 1 : }
1574 :
1575 : // SetBounds implements base.InternalIterator. It panics, as bounds should
1576 : // always be handled by the parent sstable iterator.
1577 0 : func (i *Iter) SetBounds(lower, upper []byte) {
1578 0 : // This should never be called as bounds are handled by sstable.Iterator.
1579 0 : panic("pebble: SetBounds unimplemented")
1580 : }
1581 :
1582 : // SetContext implements base.InternalIterator.
1583 0 : func (i *Iter) SetContext(_ context.Context) {}
1584 :
1585 : // Valid returns true if the iterator is currently positioned at a valid KV.
1586 1 : func (i *Iter) Valid() bool {
1587 1 : return i.offset >= 0 && i.offset < i.restarts
1588 1 : }
1589 :
1590 : // DebugTree is part of the InternalIterator interface.
1591 0 : func (i *Iter) DebugTree(tp treeprinter.Node) {
1592 0 : tp.Childf("%T(%p)", i, i)
1593 0 : }
1594 :
1595 1 : func (i *Iter) getRestart(idx int) int32 {
1596 1 : return int32(binary.LittleEndian.Uint32(i.data[i.restarts+4*int32(idx):]))
1597 1 : }
1598 :
1599 1 : func (i *Iter) isRestartPoint() bool {
1600 1 : j := sort.Search(int(i.numRestarts), func(j int) bool {
1601 1 : return i.getRestart(j) >= i.offset
1602 1 : })
1603 1 : return j < int(i.numRestarts) && i.getRestart(j) == i.offset
1604 : }
1605 :
1606 : // DescribeKV is a function that formats a key-value pair, writing the
1607 : // description to w.
1608 : type DescribeKV func(w io.Writer, key *base.InternalKey, val []byte, enc KVEncoding)
1609 :
1610 : // KVEncoding describes the encoding of a key-value pair within the block.
1611 : type KVEncoding struct {
1612 : // IsRestart is true if the key is a restart point.
1613 : IsRestart bool
1614 : // Offset is the position within the block at which the key-value pair is
1615 : // encoded.
1616 : Offset int32
1617 : // Length is the total length of the KV pair as it is encoded in the block
1618 : // format.
1619 : Length int32
1620 : // KeyShared is the number of bytes this KV's user key shared with its predecessor.
1621 : KeyShared uint32
1622 : // KeyUnshared is the number of bytes this KV's user key did not share with
1623 : // its predecessor.
1624 : KeyUnshared uint32
1625 : // ValueLen is the length of the internal value.
1626 : ValueLen uint32
1627 : }
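// A minimal DescribeKV sketch (dumpKV is a hypothetical name, for illustration
// only); when passed to Describe below, it is invoked once per KV in the block:
//
//	var dumpKV DescribeKV = func(w io.Writer, key *base.InternalKey, val []byte, enc KVEncoding) {
//	    fmt.Fprintf(w, "%10d  %s  shared=%d unshared=%d valueLen=%d",
//	        enc.Offset, key, enc.KeyShared, enc.KeyUnshared, enc.ValueLen)
//	}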
1628 :
1629 : // Describe describes the contents of a block, writing the description to w.
1630 : // It invokes fmtKV to describe each key-value pair.
1631 1 : func (i *Iter) Describe(w io.Writer, blkOffset uint64, fmtKV DescribeKV) {
1632 1 : for kv := i.First(); kv != nil; kv = i.Next() {
1633 1 : enc := KVEncoding{
1634 1 : IsRestart: i.isRestartPoint(),
1635 1 : Offset: i.offset,
1636 1 : Length: int32(i.nextOffset - i.offset),
1637 1 : }
1638 1 : ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(i.offset))
1639 1 : enc.KeyShared, ptr = decodeVarint(ptr)
1640 1 : enc.KeyUnshared, ptr = decodeVarint(ptr)
1641 1 : enc.ValueLen, _ = decodeVarint(ptr)
1642 1 : fmtKV(w, &kv.K, kv.V.ValueOrHandle, enc)
1643 1 : }
1644 : // Format the restart points.
1645 1 : for j := 0; j < int(i.numRestarts); j++ {
1646 1 : offset := i.getRestart(j)
1647 1 : // TODO(jackson): This formatting seems bizarre. We're taking blkOffset
1648 1 : // which is an offset in the physical, compressed file, and adding the
1649 1 : // offset of the KV pair within the uncompressed block. We should just
1650 1 : // print offsets relative to the block start.
1651 1 : fmt.Fprintf(w, "%10d [restart %d]\n",
1652 1 : blkOffset+uint64(i.restarts+4*int32(j)), blkOffset+uint64(offset))
1653 1 : }
1654 : }
1655 :
1656 : // RawIter is an iterator over a single block of data. Unlike Iter,
1657 : // keys are stored in "raw" format (i.e. not as internal keys). Note that there
1658 : // is significant similarity between this code and the code in Iter. Yet
1659 : // reducing duplication is difficult due to Iter being performance
1660 : // critical. RawIter must only be used for blocks where the value is
1661 : // stored together with the key.
1662 : type RawIter struct {
1663 : cmp base.Compare
1664 : offset int32
1665 : nextOffset int32
1666 : restarts int32
1667 : numRestarts int32
1668 : ptr unsafe.Pointer
1669 : data []byte
1670 : key, val []byte
1671 : ikey base.InternalKey
1672 : cached []blockEntry
1673 : cachedBuf []byte
1674 : }
1675 :
1676 : // NewRawIter constructs a new raw block iterator.
1677 1 : func NewRawIter(cmp base.Compare, block []byte) (*RawIter, error) {
1678 1 : i := &RawIter{}
1679 1 : return i, i.Init(cmp, block)
1680 1 : }
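// Usage sketch (illustrative; blk holds a raw-format block and cmp is the
// comparator it was written with; handle is hypothetical):
//
//	it, err := NewRawIter(cmp, blk)
//	if err != nil {
//	    return err
//	}
//	defer it.Close()
//	for valid := it.First(); valid; valid = it.Next() {
//	    handle(it.Key().UserKey, it.Value())
//	}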
1681 :
1682 : // Init initializes the raw block iterator.
1683 1 : func (i *RawIter) Init(cmp base.Compare, blk []byte) error {
1684 1 : numRestarts := int32(binary.LittleEndian.Uint32(blk[len(blk)-4:]))
1685 1 : if numRestarts == 0 {
1686 0 : return base.CorruptionErrorf("pebble/table: invalid table (block has no restart points)")
1687 0 : }
1688 1 : i.cmp = cmp
1689 1 : i.restarts = int32(len(blk)) - 4*(1+numRestarts)
1690 1 : i.numRestarts = numRestarts
1691 1 : i.ptr = unsafe.Pointer(&blk[0])
1692 1 : i.data = blk
1693 1 : if i.key == nil {
1694 1 : i.key = make([]byte, 0, 256)
1695 1 : } else {
1696 0 : i.key = i.key[:0]
1697 0 : }
1698 1 : i.val = nil
1699 1 : i.clearCache()
1700 1 : return nil
1701 : }
1702 :
1703 1 : func (i *RawIter) readEntry() {
1704 1 : ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(i.offset))
1705 1 : shared, ptr := decodeVarint(ptr)
1706 1 : unshared, ptr := decodeVarint(ptr)
1707 1 : value, ptr := decodeVarint(ptr)
1708 1 : i.key = append(i.key[:shared], getBytes(ptr, int(unshared))...)
1709 1 : i.key = i.key[:len(i.key):len(i.key)]
1710 1 : ptr = unsafe.Pointer(uintptr(ptr) + uintptr(unshared))
1711 1 : i.val = getBytes(ptr, int(value))
1712 1 : i.nextOffset = int32(uintptr(ptr)-uintptr(i.ptr)) + int32(value)
1713 1 : }
1714 :
1715 1 : func (i *RawIter) loadEntry() {
1716 1 : i.readEntry()
1717 1 : i.ikey.UserKey = i.key
1718 1 : }
1719 :
1720 1 : func (i *RawIter) clearCache() {
1721 1 : i.cached = i.cached[:0]
1722 1 : i.cachedBuf = i.cachedBuf[:0]
1723 1 : }
1724 :
1725 1 : func (i *RawIter) cacheEntry() {
1726 1 : var valStart int32
1727 1 : valSize := int32(len(i.val))
1728 1 : if valSize > 0 {
1729 0 : valStart = int32(uintptr(unsafe.Pointer(&i.val[0])) - uintptr(i.ptr))
1730 0 : }
1731 :
1732 1 : i.cached = append(i.cached, blockEntry{
1733 1 : offset: i.offset,
1734 1 : keyStart: int32(len(i.cachedBuf)),
1735 1 : keyEnd: int32(len(i.cachedBuf) + len(i.key)),
1736 1 : valStart: valStart,
1737 1 : valSize: valSize,
1738 1 : })
1739 1 : i.cachedBuf = append(i.cachedBuf, i.key...)
1740 : }
1741 :
1742 : // SeekGE implements internalIterator.SeekGE, as documented in the pebble
1743 : // package.
1744 1 : func (i *RawIter) SeekGE(key []byte) bool {
1745 1 : // Find the index of the smallest restart point whose key is > the key
1746 1 : // sought; index will be numRestarts if there is no such restart point.
1747 1 : i.offset = 0
1748 1 : index := sort.Search(int(i.numRestarts), func(j int) bool {
1749 1 : offset := int32(binary.LittleEndian.Uint32(i.data[int(i.restarts)+4*j:]))
1750 1 : // For a restart point, there are 0 bytes shared with the previous key.
1751 1 : // The varint encoding of 0 occupies 1 byte.
1752 1 : ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(offset+1))
1753 1 : // Decode the key at that restart point, and compare it to the key sought.
1754 1 : v1, ptr := decodeVarint(ptr)
1755 1 : _, ptr = decodeVarint(ptr)
1756 1 : s := getBytes(ptr, int(v1))
1757 1 : return i.cmp(key, s) < 0
1758 1 : })
1759 :
1760 : // Since keys are strictly increasing, if index > 0 then the restart point at
1761 : // index-1 will be the largest whose key is <= the key sought. If index ==
1762 : // 0, then all keys in this block are larger than the key sought, and offset
1763 : // remains at zero.
1764 1 : if index > 0 {
1765 1 : i.offset = int32(binary.LittleEndian.Uint32(i.data[int(i.restarts)+4*(index-1):]))
1766 1 : }
1767 1 : i.loadEntry()
1768 1 :
1769 1 : // Iterate from that restart point to somewhere >= the key sought.
1770 1 : for valid := i.Valid(); valid; valid = i.Next() {
1771 1 : if i.cmp(key, i.key) <= 0 {
1772 1 : break
1773 : }
1774 : }
1775 1 : return i.Valid()
1776 : }
1777 :
1778 : // First implements internalIterator.First, as documented in the pebble
1779 : // package.
1780 1 : func (i *RawIter) First() bool {
1781 1 : i.offset = 0
1782 1 : i.loadEntry()
1783 1 : return i.Valid()
1784 1 : }
1785 :
1786 : // Last implements internalIterator.Last, as documented in the pebble package.
1787 1 : func (i *RawIter) Last() bool {
1788 1 : // Seek forward from the last restart point.
1789 1 : i.offset = int32(binary.LittleEndian.Uint32(i.data[i.restarts+4*(i.numRestarts-1):]))
1790 1 :
1791 1 : i.readEntry()
1792 1 : i.clearCache()
1793 1 : i.cacheEntry()
1794 1 :
1795 1 : for i.nextOffset < i.restarts {
1796 1 : i.offset = i.nextOffset
1797 1 : i.readEntry()
1798 1 : i.cacheEntry()
1799 1 : }
1800 :
1801 1 : i.ikey.UserKey = i.key
1802 1 : return i.Valid()
1803 : }
1804 :
1805 : // Next implements internalIterator.Next, as documented in the pebble
1806 : // package.
1807 1 : func (i *RawIter) Next() bool {
1808 1 : i.offset = i.nextOffset
1809 1 : if !i.Valid() {
1810 1 : return false
1811 1 : }
1812 1 : i.loadEntry()
1813 1 : return true
1814 : }
1815 :
1816 : // Prev implements internalIterator.Prev, as documented in the pebble
1817 : // package.
1818 1 : func (i *RawIter) Prev() bool {
1819 1 : if n := len(i.cached) - 1; n > 0 && i.cached[n].offset == i.offset {
1820 1 : i.nextOffset = i.offset
1821 1 : e := &i.cached[n-1]
1822 1 : i.offset = e.offset
1823 1 : i.val = getBytes(unsafe.Pointer(uintptr(i.ptr)+uintptr(e.valStart)), int(e.valSize))
1824 1 : i.ikey.UserKey = i.cachedBuf[e.keyStart:e.keyEnd]
1825 1 : i.cached = i.cached[:n]
1826 1 : return true
1827 1 : }
1828 :
1829 1 : if i.offset == 0 {
1830 1 : i.offset = -1
1831 1 : i.nextOffset = 0
1832 1 : return false
1833 1 : }
1834 :
1835 0 : targetOffset := i.offset
1836 0 : index := sort.Search(int(i.numRestarts), func(j int) bool {
1837 0 : offset := int32(binary.LittleEndian.Uint32(i.data[int(i.restarts)+4*j:]))
1838 0 : return offset >= targetOffset
1839 0 : })
1840 0 : i.offset = 0
1841 0 : if index > 0 {
1842 0 : i.offset = int32(binary.LittleEndian.Uint32(i.data[int(i.restarts)+4*(index-1):]))
1843 0 : }
1844 :
1845 0 : i.readEntry()
1846 0 : i.clearCache()
1847 0 : i.cacheEntry()
1848 0 :
1849 0 : for i.nextOffset < targetOffset {
1850 0 : i.offset = i.nextOffset
1851 0 : i.readEntry()
1852 0 : i.cacheEntry()
1853 0 : }
1854 :
1855 0 : i.ikey.UserKey = i.key
1856 0 : return true
1857 : }
1858 :
1859 : // Key implements internalIterator.Key, as documented in the pebble package.
1860 1 : func (i *RawIter) Key() base.InternalKey {
1861 1 : return i.ikey
1862 1 : }
1863 :
1864 : // Value implements internalIterator.Value, as documented in the pebble
1865 : // package.
1866 1 : func (i *RawIter) Value() []byte {
1867 1 : return i.val
1868 1 : }
1869 :
1870 : // Valid implements internalIterator.Valid, as documented in the pebble
1871 : // package.
1872 1 : func (i *RawIter) Valid() bool {
1873 1 : return i.offset >= 0 && i.offset < i.restarts
1874 1 : }
1875 :
1876 : // Error implements internalIterator.Error, as documented in the pebble
1877 : // package.
1878 0 : func (i *RawIter) Error() error {
1879 0 : return nil
1880 0 : }
1881 :
1882 : // Close implements internalIterator.Close, as documented in the pebble
1883 : // package.
1884 1 : func (i *RawIter) Close() error {
1885 1 : i.val = nil
1886 1 : return nil
1887 1 : }
1888 :
1889 : // DebugTree is part of the InternalIterator interface.
1890 0 : func (i *RawIter) DebugTree(tp treeprinter.Node) {
1891 0 : tp.Childf("%T(%p)", i, i)
1892 0 : }
1893 :
1894 1 : func (i *RawIter) getRestart(idx int) int32 {
1895 1 : return int32(binary.LittleEndian.Uint32(i.data[i.restarts+4*int32(idx):]))
1896 1 : }
1897 :
1898 1 : func (i *RawIter) isRestartPoint() bool {
1899 1 : j := sort.Search(int(i.numRestarts), func(j int) bool {
1900 1 : return i.getRestart(j) >= i.offset
1901 1 : })
1902 1 : return j < int(i.numRestarts) && i.getRestart(j) == i.offset
1903 : }
1904 :
1905 : // Describe describes the contents of a block, writing the description to w.
1906 : // It invokes fmtKV to describe each key-value pair.
1907 1 : func (i *RawIter) Describe(w io.Writer, blkOffset uint64, fmtKV DescribeKV) {
1908 1 : for valid := i.First(); valid; valid = i.Next() {
1909 1 : enc := KVEncoding{
1910 1 : IsRestart: i.isRestartPoint(),
1911 1 : Offset: i.offset,
1912 1 : Length: int32(i.nextOffset - i.offset),
1913 1 : }
1914 1 : ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(i.offset))
1915 1 : enc.KeyShared, ptr = decodeVarint(ptr)
1916 1 : enc.KeyUnshared, ptr = decodeVarint(ptr)
1917 1 : enc.ValueLen, _ = decodeVarint(ptr)
1918 1 : fmtKV(w, &i.ikey, i.val, enc)
1919 1 : if i.isRestartPoint() {
1920 1 : fmt.Fprintf(w, " [restart]\n")
1921 1 : } else {
1922 1 : fmt.Fprintf(w, "\n")
1923 1 : }
1924 : }
1925 : // Format the restart points.
1926 1 : for j := 0; j < int(i.numRestarts); j++ {
1927 1 : offset := i.getRestart(j)
1928 1 : // TODO(jackson): This formatting seems bizarre. We're taking blkOffset
1929 1 : // which is an offset in the physical, compressed file, and adding the
1930 1 : // offset of the KV pair within the uncompressed block. We should just
1931 1 : // print offsets relative to the block start.
1932 1 : fmt.Fprintf(w, "%10d [restart %d]\n",
1933 1 : blkOffset+uint64(i.restarts+4*int32(j)), blkOffset+uint64(offset))
1934 1 : }
1935 : }
1936 :
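// getBytes returns a slice of length bytes aliasing the memory at ptr without
// copying; manual.MaxArrayLen merely bounds the array type used for the
// conversion.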
1937 1 : func getBytes(ptr unsafe.Pointer, length int) []byte {
1938 1 : return (*[manual.MaxArrayLen]byte)(ptr)[:length:length]
1939 1 : }
1940 :
1941 1 : func decodeVarint(ptr unsafe.Pointer) (uint32, unsafe.Pointer) {
1942 1 : if a := *((*uint8)(ptr)); a < 128 {
1943 1 : return uint32(a),
1944 1 : unsafe.Pointer(uintptr(ptr) + 1)
1945 1 : } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
1946 1 : return uint32(b)<<7 | uint32(a),
1947 1 : unsafe.Pointer(uintptr(ptr) + 2)
1948 1 : } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
1949 0 : return uint32(c)<<14 | uint32(b)<<7 | uint32(a),
1950 0 : unsafe.Pointer(uintptr(ptr) + 3)
1951 0 : } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
1952 0 : return uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a),
1953 0 : unsafe.Pointer(uintptr(ptr) + 4)
1954 0 : } else {
1955 0 : d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
1956 0 : return uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a),
1957 0 : unsafe.Pointer(uintptr(ptr) + 5)
1958 0 : }
1959 : }