Line data Source code
1 : // Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : package sstable
6 :
7 : import (
8 : "fmt"
9 : "os"
10 : "sync"
11 :
12 : "github.com/cockroachdb/pebble/internal/base"
13 : "github.com/cockroachdb/pebble/internal/invariants"
14 : )
15 :
16 : // Iterator iterates over an entire table of data.
17 : type Iterator interface {
18 : base.InternalIterator
19 :
20 : // NextPrefix implements (base.InternalIterator).NextPrefix.
21 : NextPrefix(succKey []byte) (*InternalKey, base.LazyValue)
22 :
23 : // MaybeFilteredKeys may be called when an iterator is exhausted to indicate
24 : // whether or not the last positioning method may have skipped any keys due
25 : // to block-property filters. This is used by the Pebble levelIter to
26 : // control when an iterator steps to the next sstable.
27 : //
28 : // MaybeFilteredKeys may return false positives; that is, it may return
29 : // true even when no keys were filtered. It should only be called when the
30 : // iterator is exhausted. It must never return a false negative when the
31 : // iterator is exhausted.
32 : MaybeFilteredKeys() bool
33 :
34 : SetCloseHook(fn func(i Iterator) error)
35 : }
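
// The function below is an illustrative, unused sketch (not part of the
// package API) of the calling convention for MaybeFilteredKeys: it is only
// meaningful once the iterator is exhausted, and a true return may be a
// false positive. The helper name is hypothetical.
func maybeFilteredAfterExhaustion(it Iterator) bool {
	if key, _ := it.Next(); key != nil {
		// Not exhausted; MaybeFilteredKeys should not be consulted yet.
		return false
	}
	// Exhausted: true means keys *may* have been skipped by block-property
	// filters; false guarantees that none were.
	return it.MaybeFilteredKeys()
}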
36 :
37 : // Iterator positioning optimizations and singleLevelIterator and
38 : // twoLevelIterator:
39 : //
40 : // An iterator is absolutely positioned using one of the Seek, First, or Last
41 : // calls. After absolute positioning, relative positioning can be done by
42 : // stepping using Prev or Next.
43 : //
44 : // We implement optimizations below where an absolute positioning call can in
45 : // some cases use the current position to do less work. To understand these,
46 : // we first define some terms. An iterator is bounds-exhausted if a bound
47 : // (upper or lower) has been reached. An iterator is data-exhausted if it has
48 : // reached the end of the data (forward or reverse) in the sstable. A
49 : // singleLevelIterator only knows a local-data-exhausted property, since when
50 : // it is used as part of a twoLevelIterator, the twoLevelIterator can step to
51 : // the next lower-level index block.
52 : //
53 : // The bounds-exhausted property is tracked by
54 : // singleLevelIterator.exhaustedBounds being +1 (upper bound reached) or -1
55 : // (lower bound reached). The same field is reused by twoLevelIterator. Either
56 : // may notice the exhaustion of the bound and set it. Note that if
57 : // singleLevelIterator sets this property, it is not a local property (since
58 : // the bound has been reached regardless of whether this is in the context of
59 : // the twoLevelIterator or not).
60 : //
61 : // The data-exhausted property is tracked in a more subtle manner. We define
62 : // two predicates:
63 : // - partial-local-data-exhausted (PLDE):
64 : // i.data.isDataInvalidated() || !i.data.valid()
65 : // - partial-global-data-exhausted (PGDE):
66 : // i.index.isDataInvalidated() || !i.index.valid() || i.data.isDataInvalidated() ||
67 : // !i.data.valid()
68 : //
69 : // PLDE is defined for a singleLevelIterator. PGDE is defined for a
70 : // twoLevelIterator. Oddly, in the code below the singleLevelIterator does not
71 : // know when it is part of a twoLevelIterator, so it does not know whether its
72 : // property is local or global.
73 : //
74 : // Now to define data-exhausted:
75 : // - Prerequisite: we must know that the iterator has been positioned and
76 : // i.err is nil.
77 : // - bounds-exhausted must not be true:
78 : // If bounds-exhausted is true, we have incomplete knowledge of
79 : // data-exhausted, since PLDE or PGDE could be true because we could have
80 : // chosen not to load the index block or data block and figured out that the
81 : // bound is exhausted (due to block-property filters filtering out index and
82 : // data blocks and going past the bound on the top-level index block). Note
83 : // that if we tried to separate the BPF case from the others, we could
84 : // develop more knowledge here.
85 : // - PGDE is true for a twoLevelIterator. PLDE is true if it is a standalone
86 : // singleLevelIterator. !PLDE or !PGDE of course implies that data-exhausted
87 : // is not true.
88 : //
89 : // An implication of the above is that if we are going to somehow utilize
90 : // knowledge of data-exhausted in an optimization, we must not forget the
91 : // existing value of bounds-exhausted since by forgetting the latter we can
92 : // erroneously think that data-exhausted is true. Bug #2036 was due to this
93 : // forgetting.
94 : //
95 : // Now to the two categories of optimizations we currently have:
96 : // - Monotonic bounds optimizations that reuse the prior iterator position
97 : // when seeking: these only work when !data-exhausted. We could make these
98 : // work with data-exhausted but have not bothered because, in the context of
99 : // a DB, if data-exhausted were true the DB would move to the next file in
100 : // the level. Note that this behavior of moving to the next file is not
101 : // necessarily true for L0 files, so there could be some future benefit in
102 : // extending this optimization. See the WARNING-data-exhausted comments if
103 : // trying to optimize this in the future.
104 : // - TrySeekUsingNext optimizations: these work regardless of exhaustion
105 : // state.
106 : //
107 : // Implementation detail: In the code PLDE only checks that
108 : // i.data.isDataInvalidated(). This narrower check is safe, since this is a
109 : // subset of the set expressed by the OR expression. Also, it is not a
110 : // de-optimization since whenever we exhaust the iterator we explicitly call
111 : // i.data.invalidate(). PGDE checks i.index.isDataInvalidated() &&
112 : // i.data.isDataInvalidated(). Again, this narrower check is safe, and not a
113 : // de-optimization since whenever we exhaust the iterator we explicitly call
114 : // i.index.invalidate() and i.data.invalidate(). The && is questionable -- for
115 : // now this is a bit of defensive code. We should seriously consider removing
116 : // it, since defensive code suggests we are not confident about our invariants
117 : // (and if we are not confident, we need more invariant assertions, not
118 : // defensive code).
119 : //
120 : // TODO(sumeer): remove the aforementioned defensive code.
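
// The two helpers below are illustrative, unused restatements of the PLDE
// and PGDE predicates defined above, written against the data/index
// blockIter fields named in that comment. They are sketches for exposition;
// the implementation uses the narrower invalidation-only checks described
// above, and the helper names are hypothetical.
func pldeSketch(i *singleLevelIterator) bool {
	// partial-local-data-exhausted for a singleLevelIterator.
	return i.data.isDataInvalidated() || !i.data.valid()
}

func pgdeSketch(i *twoLevelIterator) bool {
	// partial-global-data-exhausted for a twoLevelIterator.
	return i.index.isDataInvalidated() || !i.index.valid() ||
		i.data.isDataInvalidated() || !i.data.valid()
}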
121 :
122 : var singleLevelIterPool = sync.Pool{
123 2 : New: func() interface{} {
124 2 : i := &singleLevelIterator{}
125 2 : // Note: this is a no-op if invariants are disabled or race is enabled.
126 2 : invariants.SetFinalizer(i, checkSingleLevelIterator)
127 2 : return i
128 2 : },
129 : }
130 :
131 : var twoLevelIterPool = sync.Pool{
132 2 : New: func() interface{} {
133 2 : i := &twoLevelIterator{}
134 2 : // Note: this is a no-op if invariants are disabled or race is enabled.
135 2 : invariants.SetFinalizer(i, checkTwoLevelIterator)
136 2 : return i
137 2 : },
138 : }
139 :
140 : // TODO(jackson): rangedel fragmentBlockIters can't be pooled because of some
141 : // code paths that double Close the iters. Fix the double close and pool the
142 : // *fragmentBlockIter type directly.
143 :
144 : var rangeKeyFragmentBlockIterPool = sync.Pool{
145 2 : New: func() interface{} {
146 2 : i := &rangeKeyFragmentBlockIter{}
147 2 : // Note: this is a no-op if invariants are disabled or race is enabled.
148 2 : invariants.SetFinalizer(i, checkRangeKeyFragmentBlockIterator)
149 2 : return i
150 2 : },
151 : }
152 :
153 2 : func checkSingleLevelIterator(obj interface{}) {
154 2 : i := obj.(*singleLevelIterator)
155 2 : if p := i.data.handle.Get(); p != nil {
156 0 : fmt.Fprintf(os.Stderr, "singleLevelIterator.data.handle is not nil: %p\n", p)
157 0 : os.Exit(1)
158 0 : }
159 2 : if p := i.index.handle.Get(); p != nil {
160 0 : fmt.Fprintf(os.Stderr, "singleLevelIterator.index.handle is not nil: %p\n", p)
161 0 : os.Exit(1)
162 0 : }
163 : }
164 :
165 2 : func checkTwoLevelIterator(obj interface{}) {
166 2 : i := obj.(*twoLevelIterator)
167 2 : if p := i.data.handle.Get(); p != nil {
168 0 : fmt.Fprintf(os.Stderr, "singleLevelIterator.data.handle is not nil: %p\n", p)
169 0 : os.Exit(1)
170 0 : }
171 2 : if p := i.index.handle.Get(); p != nil {
172 0 : fmt.Fprintf(os.Stderr, "singleLevelIterator.index.handle is not nil: %p\n", p)
173 0 : os.Exit(1)
174 0 : }
175 : }
176 :
177 2 : func checkRangeKeyFragmentBlockIterator(obj interface{}) {
178 2 : i := obj.(*rangeKeyFragmentBlockIter)
179 2 : if p := i.blockIter.handle.Get(); p != nil {
180 0 : fmt.Fprintf(os.Stderr, "fragmentBlockIter.blockIter.handle is not nil: %p\n", p)
181 0 : os.Exit(1)
182 0 : }
183 : }
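
// The function below is an illustrative, unused sketch of the lifecycle the
// pools and finalizer checks above are designed around: Get an iterator from
// the pool, use it, and ensure its block handles have been released before
// it is returned with Put, so that the finalizer installed in New (a no-op
// if invariants are disabled or race is enabled) never observes a retained
// handle. The function name is hypothetical, and the real construction and
// release paths are not shown in this file.
func pooledIterLifecycleSketch() {
	i := singleLevelIterPool.Get().(*singleLevelIterator)
	// ... initialize the iterator for an sstable, position it, read from it ...
	// By the time the iterator is returned to the pool, i.data.handle and
	// i.index.handle must have been released; otherwise
	// checkSingleLevelIterator reports a leak when the finalizer runs.
	singleLevelIterPool.Put(i)
}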
184 :
185 : // compactionIterator is similar to Iterator, but it additionally tracks the
186 : // number of bytes that have been iterated through (via bytesIterated).
187 : type compactionIterator struct {
188 : *singleLevelIterator
189 : bytesIterated *uint64
190 : prevOffset uint64
191 : }
192 :
193 : // compactionIterator implements the base.InternalIterator interface.
194 : var _ base.InternalIterator = (*compactionIterator)(nil)
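
// compactionBytesSketch is an illustrative, unused sketch of how a caller
// might drive a compactionIterator and read its byte-progress counter. Only
// the First/Next loop and the bytesIterated field are taken from the types
// in this file; the helper name is hypothetical.
func compactionBytesSketch(it *compactionIterator) uint64 {
	for key, _ := it.First(); key != nil; key, _ = it.Next() {
		// Each step advances *it.bytesIterated inside skipForward as data
		// blocks are loaded and consumed; readers can poll it to report
		// compaction progress.
	}
	return *it.bytesIterated
}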
195 :
196 0 : func (i *compactionIterator) String() string {
197 0 : if i.vState != nil {
198 0 : return i.vState.fileNum.String()
199 0 : }
200 0 : return i.reader.fileNum.String()
201 : }
202 :
203 : func (i *compactionIterator) SeekGE(
204 : key []byte, flags base.SeekGEFlags,
205 0 : ) (*InternalKey, base.LazyValue) {
206 0 : panic("pebble: SeekGE unimplemented")
207 : }
208 :
209 : func (i *compactionIterator) SeekPrefixGE(
210 : prefix, key []byte, flags base.SeekGEFlags,
211 0 : ) (*base.InternalKey, base.LazyValue) {
212 0 : panic("pebble: SeekPrefixGE unimplemented")
213 : }
214 :
215 : func (i *compactionIterator) SeekLT(
216 : key []byte, flags base.SeekLTFlags,
217 0 : ) (*InternalKey, base.LazyValue) {
218 0 : panic("pebble: SeekLT unimplemented")
219 : }
220 :
221 2 : func (i *compactionIterator) First() (*InternalKey, base.LazyValue) {
222 2 : i.err = nil // clear cached iteration error
223 2 : return i.skipForward(i.singleLevelIterator.First())
224 2 : }
225 :
226 0 : func (i *compactionIterator) Last() (*InternalKey, base.LazyValue) {
227 0 : panic("pebble: Last unimplemented")
228 : }
229 :
230 : // Note: compactionIterator.Next mirrors the implementation of Iterator.Next
231 : // for performance reasons. Keep the two in sync.
232 2 : func (i *compactionIterator) Next() (*InternalKey, base.LazyValue) {
233 2 : if i.err != nil {
234 0 : return nil, base.LazyValue{}
235 0 : }
236 2 : return i.skipForward(i.data.Next())
237 : }
238 :
239 0 : func (i *compactionIterator) NextPrefix(succKey []byte) (*InternalKey, base.LazyValue) {
240 0 : panic("pebble: NextPrefix unimplemented")
241 : }
242 :
243 0 : func (i *compactionIterator) Prev() (*InternalKey, base.LazyValue) {
244 0 : panic("pebble: Prev unimplemented")
245 : }
246 :
247 : func (i *compactionIterator) skipForward(
248 : key *InternalKey, val base.LazyValue,
249 2 : ) (*InternalKey, base.LazyValue) {
250 2 : if key == nil {
251 2 : for {
252 2 : if key, _ := i.index.Next(); key == nil {
253 2 : break
254 : }
255 2 : result := i.loadBlock(+1)
256 2 : if result != loadBlockOK {
257 0 : if i.err != nil {
258 0 : break
259 : }
260 0 : switch result {
261 0 : case loadBlockFailed:
262 0 : // We checked that i.index was at a valid entry, so
263 0 : // loadBlockFailed could not have happened due to i.index
264 0 : // being exhausted, and must be due to an error.
265 0 : panic("loadBlock should not have failed with no error")
266 0 : case loadBlockIrrelevant:
267 0 : panic("compactionIter should not be using block intervals for skipping")
268 0 : default:
269 0 : panic(fmt.Sprintf("unexpected case %d", result))
270 : }
271 : }
272 : // result == loadBlockOK
273 2 : if key, val = i.data.First(); key != nil {
274 2 : break
275 : }
276 : }
277 : }
278 :
279 2 : curOffset := i.recordOffset()
280 2 : *i.bytesIterated += uint64(curOffset - i.prevOffset)
281 2 : i.prevOffset = curOffset
282 2 :
283 2 : // We have an upper bound when the table is virtual.
284 2 : if i.upper != nil && key != nil {
285 2 : cmp := i.cmp(key.UserKey, i.upper)
286 2 : if cmp > 0 || (!i.endKeyInclusive && cmp == 0) {
287 2 : return nil, base.LazyValue{}
288 2 : }
289 : }
290 :
291 2 : return key, val
292 : }