/src/rocksdb/include/rocksdb/merge_operator.h
Line | Count | Source |
1 | | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
2 | | // This source code is licensed under both the GPLv2 (found in the |
3 | | // COPYING file in the root directory) and Apache 2.0 License |
4 | | // (found in the LICENSE.Apache file in the root directory). |
5 | | |
6 | | #pragma once |
7 | | |
8 | | #include <deque> |
9 | | #include <memory> |
10 | | #include <string> |
11 | | #include <utility> |
12 | | #include <variant> |
13 | | #include <vector> |
14 | | |
15 | | #include "rocksdb/customizable.h" |
16 | | #include "rocksdb/slice.h" |
17 | | #include "rocksdb/wide_columns.h" |
18 | | |
19 | | namespace ROCKSDB_NAMESPACE { |
20 | | |
21 | | class Slice; |
22 | | class Logger; |
23 | | |
24 | | // The Merge Operator |
25 | | // |
26 | | // Essentially, a MergeOperator specifies the SEMANTICS of a merge, which only |
27 | | // client knows. It could be numeric addition, list append, string |
28 | | // concatenation, edit data structure, ... , anything. |
29 | | // The library, on the other hand, is concerned with the exercise of this |
30 | | // interface, at the right time (during get, iteration, compaction...) |
31 | | // |
32 | | // To use merge, the client needs to provide an object implementing one of |
33 | | // the following interfaces: |
34 | | // a) AssociativeMergeOperator - for most simple semantics (always take |
35 | | // two values, and merge them into one value, which is then put back |
36 | | // into rocksdb); numeric addition and string concatenation are examples; |
37 | | // |
38 | | // b) MergeOperator - the generic class for all the more abstract / complex |
39 | | // operations; one method (FullMergeV3) to merge a Put/Delete value with a |
40 | | // merge operand; and another method (PartialMerge) that merges multiple |
41 | | // operands together. this is especially useful if your key values have |
42 | | // complex structures but you would still like to support client-specific |
43 | | // incremental updates. |
44 | | // |
45 | | // AssociativeMergeOperator is simpler to implement. MergeOperator is simply |
46 | | // more powerful. |
47 | | // |
48 | | // Refer to rocksdb-merge wiki for more details and example implementations. |
49 | | // |
50 | | // Exceptions MUST NOT propagate out of overridden functions into RocksDB, |
51 | | // because RocksDB is not exception-safe. This could cause undefined behavior |
52 | | // including data loss, unreported corruption, deadlocks, and more. |
53 | | class MergeOperator : public Customizable { |
54 | | public: |
55 | 0 | virtual ~MergeOperator() {} |
56 | 16 | static const char* Type() { return "MergeOperator"; } |
57 | | static Status CreateFromString(const ConfigOptions& opts, |
58 | | const std::string& id, |
59 | | std::shared_ptr<MergeOperator>* result); |
60 | | |
61 | | // Gives the client a way to express the read -> modify -> write semantics |
62 | | // key: (IN) The key that's associated with this merge operation. |
63 | | // Client could multiplex the merge operator based on it |
64 | | // if the key space is partitioned and different subspaces |
65 | | // refer to different types of data which have different |
66 | | // merge operation semantics |
67 | | // existing: (IN) null indicates that the key does not exist before this op |
68 | | // operand_list:(IN) the sequence of merge operations to apply, front() first. |
69 | | // new_value:(OUT) Client is responsible for filling the merge result here. |
70 | | // The string that new_value is pointing to will be empty. |
71 | | // logger: (IN) Client could use this to log errors during merge. |
72 | | // |
73 | | // Return true on success. |
74 | | // All values passed in will be client-specific values. So if this method |
75 | | // returns false, it is because client specified bad data or there was |
76 | | // internal corruption. This will be treated as an error by the library. |
77 | | // |
78 | | // Also make use of the *logger for error messages. |
79 | | virtual bool FullMerge(const Slice& /*key*/, const Slice* /*existing_value*/, |
80 | | const std::deque<std::string>& /*operand_list*/, |
81 | 0 | std::string* /*new_value*/, Logger* /*logger*/) const { |
82 | | // deprecated, please use FullMergeV2() |
83 | 0 | assert(false); |
84 | 0 | return false; |
85 | 0 | } |
86 | | |
87 | | struct MergeOperationInput { |
88 | | // If user-defined timestamp is enabled, `_key` includes timestamp. |
89 | | explicit MergeOperationInput(const Slice& _key, |
90 | | const Slice* _existing_value, |
91 | | const std::vector<Slice>& _operand_list, |
92 | | Logger* _logger) |
93 | 0 | : key(_key), |
94 | 0 | existing_value(_existing_value), |
95 | 0 | operand_list(_operand_list), |
96 | 0 | logger(_logger) {} |
97 | | |
98 | | // The key associated with the merge operation. |
99 | | const Slice& key; |
100 | | // The existing value of the current key, nullptr means that the |
101 | | // value doesn't exist. |
102 | | const Slice* existing_value; |
103 | | // A list of operands to apply. |
104 | | const std::vector<Slice>& operand_list; |
105 | | // Logger could be used by client to log any errors that happen during |
106 | | // the merge operation. |
107 | | Logger* logger; |
108 | | }; |
109 | | |
110 | | enum class OpFailureScope { |
111 | | kDefault, |
112 | | kTryMerge, |
113 | | kMustMerge, |
114 | | kOpFailureScopeMax, |
115 | | }; |
116 | | |
117 | | struct MergeOperationOutput { |
118 | | explicit MergeOperationOutput(std::string& _new_value, |
119 | | Slice& _existing_operand) |
120 | 0 | : new_value(_new_value), existing_operand(_existing_operand) {} |
121 | | |
122 | | // Client is responsible for filling the merge result here. |
123 | | std::string& new_value; |
124 | | // If the merge result is one of the existing operands (or existing_value), |
125 | | // client can set this field to the operand (or existing_value) instead of |
126 | | // using new_value. |
127 | | Slice& existing_operand; |
128 | | // Indicates the blast radius of the failure. It is only meaningful to |
129 | | // provide a failure scope when returning `false` from the API populating |
130 | | // the `MergeOperationOutput`. Currently RocksDB operations handle these |
131 | | // values as follows: |
132 | | // |
133 | | // - `OpFailureScope::kDefault`: fallback to default |
134 | | // (`OpFailureScope::kTryMerge`) |
135 | | // - `OpFailureScope::kTryMerge`: operations that try to merge that key will |
136 | | // fail. This includes flush and compaction, which puts the DB in |
137 | | // read-only mode. |
138 | | // - `OpFailureScope::kMustMerge`: operations that must merge that key will |
139 | | // fail (e.g., `Get()`, `MultiGet()`, iteration). Flushes/compactions can |
140 | | // still proceed by copying the original input operands to the output. |
141 | | OpFailureScope op_failure_scope = OpFailureScope::kDefault; |
142 | | }; |
143 | | |
144 | | // This function applies a stack of merge operands in chronological order |
145 | | // on top of an existing value. There are two ways in which this method is |
146 | | // being used: |
147 | | // a) During Get() operation, it used to calculate the final value of a key |
148 | | // b) During compaction, in order to collapse some operands with the based |
149 | | // value. |
150 | | // |
151 | | // Note: The name of the method is somewhat misleading, as both in the cases |
152 | | // of Get() or compaction it may be called on a subset of operands: |
153 | | // K: 0 +1 +2 +7 +4 +5 2 +1 +2 |
154 | | // ^ |
155 | | // | |
156 | | // snapshot |
157 | | // In the example above, Get(K) operation will call FullMerge with a base |
158 | | // value of 2 and operands [+1, +2]. Compaction process might decide to |
159 | | // collapse the beginning of the history up to the snapshot by performing |
160 | | // full Merge with base value of 0 and operands [+1, +2, +7, +4]. |
161 | | virtual bool FullMergeV2(const MergeOperationInput& merge_in, |
162 | | MergeOperationOutput* merge_out) const; |
163 | | |
164 | | struct MergeOperationInputV3 { |
165 | | using ExistingValue = std::variant<std::monostate, Slice, WideColumns>; |
166 | | using OperandList = std::vector<Slice>; |
167 | | |
168 | | explicit MergeOperationInputV3(const Slice& _key, |
169 | | ExistingValue&& _existing_value, |
170 | | const OperandList& _operand_list, |
171 | | Logger* _logger) |
172 | 0 | : key(_key), |
173 | 0 | existing_value(std::move(_existing_value)), |
174 | 0 | operand_list(_operand_list), |
175 | 0 | logger(_logger) {} |
176 | | |
177 | | // The user key, including the user-defined timestamp if applicable. |
178 | | const Slice& key; |
179 | | // The base value of the merge operation. Can be one of three things (see |
180 | | // the ExistingValue variant above): no existing value, plain existing |
181 | | // value, or wide-column existing value. |
182 | | ExistingValue existing_value; |
183 | | // The list of operands to apply. |
184 | | const OperandList& operand_list; |
185 | | // The logger to use in case a failure happens during the merge operation. |
186 | | Logger* logger; |
187 | | }; |
188 | | |
189 | | struct MergeOperationOutputV3 { |
190 | | using NewColumns = std::vector<std::pair<std::string, std::string>>; |
191 | | using NewValue = std::variant<std::string, NewColumns, Slice>; |
192 | | |
193 | | // The result of the merge operation. Can be one of three things (see the |
194 | | // NewValue variant above): a new plain value, a new wide-column value, or |
195 | | // an existing merge operand. |
196 | | NewValue new_value; |
197 | | // The scope of the failure if applicable. See above for more details. |
198 | | OpFailureScope op_failure_scope = OpFailureScope::kDefault; |
199 | | }; |
200 | | |
201 | | // An extended version of FullMergeV2() that supports wide columns on both the |
202 | | // input and the output side, enabling the application to perform general |
203 | | // transformations during merges. For backward compatibility, the default |
204 | | // implementation calls FullMergeV2(). Specifically, if there is no base value |
205 | | // or the base value is a plain key-value, the default implementation falls |
206 | | // back to FullMergeV2(). If the base value is a wide-column entity, the |
207 | | // default implementation invokes FullMergeV2() to perform the merge on the |
208 | | // default column, and leaves any other columns unchanged. |
209 | | virtual bool FullMergeV3(const MergeOperationInputV3& merge_in, |
210 | | MergeOperationOutputV3* merge_out) const; |
211 | | |
212 | | // This function performs merge(left_op, right_op) |
213 | | // when both the operands are themselves merge operation types |
214 | | // that you would have passed to a DB::Merge() call in the same order |
215 | | // (i.e.: DB::Merge(key,left_op), followed by DB::Merge(key,right_op)). |
216 | | // |
217 | | // PartialMerge should combine them into a single merge operation that is |
218 | | // saved into *new_value, and then it should return true. |
219 | | // *new_value should be constructed such that a call to |
220 | | // DB::Merge(key, *new_value) would yield the same result as a call |
221 | | // to DB::Merge(key, left_op) followed by DB::Merge(key, right_op). |
222 | | // |
223 | | // The string that new_value is pointing to will be empty. |
224 | | // |
225 | | // The default implementation of PartialMergeMulti will use this function |
226 | | // as a helper, for backward compatibility. Any successor class of |
227 | | // MergeOperator should either implement PartialMerge or PartialMergeMulti, |
228 | | // although implementing PartialMergeMulti is suggested as it is in general |
229 | | // more effective to merge multiple operands at a time instead of two |
230 | | // operands at a time. |
231 | | // |
232 | | // If it is impossible or infeasible to combine the two operations, |
233 | | // leave new_value unchanged and return false. The library will |
234 | | // internally keep track of the operations, and apply them in the |
235 | | // correct order once a base-value (a Put/Delete/End-of-Database) is seen. |
236 | | // |
237 | | // TODO: Presently there is no way to differentiate between error/corruption |
238 | | // and simply "return false". For now, the client should simply return |
239 | | // false in any case it cannot perform partial-merge, regardless of reason. |
240 | | // If there is corruption in the data, handle it in the FullMergeV3() function |
241 | | // and return false there. The default implementation of PartialMerge will |
242 | | // always return false. |
243 | | virtual bool PartialMerge(const Slice& /*key*/, const Slice& /*left_operand*/, |
244 | | const Slice& /*right_operand*/, |
245 | | std::string* /*new_value*/, |
246 | 0 | Logger* /*logger*/) const { |
247 | 0 | return false; |
248 | 0 | } |
249 | | |
250 | | // This function performs merge when all the operands are themselves merge |
251 | | // operation types that you would have passed to a DB::Merge() call in the |
252 | | // same order (front() first) |
253 | | // (i.e. DB::Merge(key, operand_list[0]), followed by |
254 | | // DB::Merge(key, operand_list[1]), ...) |
255 | | // |
256 | | // PartialMergeMulti should combine them into a single merge operation that is |
257 | | // saved into *new_value, and then it should return true. *new_value should |
258 | | // be constructed such that a call to DB::Merge(key, *new_value) would yield |
259 | | // the same result as sequential individual calls to DB::Merge(key, operand) |
260 | | // for each operand in operand_list from front() to back(). |
261 | | // |
262 | | // The string that new_value is pointing to will be empty. |
263 | | // |
264 | | // The PartialMergeMulti function will be called when there are at least two |
265 | | // operands. |
266 | | // |
267 | | // In the default implementation, PartialMergeMulti will invoke PartialMerge |
268 | | // multiple times, where each time it only merges two operands. Developers |
269 | | // should either implement PartialMergeMulti, or implement PartialMerge which |
270 | | // is served as the helper function of the default PartialMergeMulti. |
271 | | virtual bool PartialMergeMulti(const Slice& key, |
272 | | const std::deque<Slice>& operand_list, |
273 | | std::string* new_value, Logger* logger) const; |
274 | | |
275 | | // The name of the MergeOperator. Used to check for MergeOperator |
276 | | // mismatches (i.e., a DB created with one MergeOperator is |
277 | | // accessed using a different MergeOperator) |
278 | | // TODO: the name is currently not stored persistently and thus |
279 | | // no checking is enforced. Client is responsible for providing |
280 | | // consistent MergeOperator between DB opens. |
281 | | const char* Name() const override = 0; |
282 | | |
283 | | // Determines whether the PartialMerge can be called with just a single |
284 | | // merge operand. |
285 | | // Override and return true for allowing a single operand. PartialMerge |
286 | | // and PartialMergeMulti should be overridden and implemented |
287 | | // correctly to properly handle a single operand. |
288 | 0 | virtual bool AllowSingleOperand() const { return false; } |
289 | | |
290 | | // Allows to control when to invoke a full merge during Get. |
291 | | // This could be used to limit the number of merge operands that are looked at |
292 | | // during a point lookup, thereby helping in limiting the number of levels to |
293 | | // read from. |
294 | | // Doesn't help with iterators. |
295 | | // |
296 | | // Note: the merge operands are passed to this function in the reversed order |
297 | | // relative to how they were merged (passed to |
298 | | // FullMerge/FullMergeV2/FullMergeV3) for performance reasons, see also: |
299 | | // https://github.com/facebook/rocksdb/issues/3865 |
300 | 0 | virtual bool ShouldMerge(const std::vector<Slice>& /*operands*/) const { |
301 | 0 | return false; |
302 | 0 | } |
303 | | }; |
304 | | |
305 | | // The simpler, associative merge operator. |
306 | | class AssociativeMergeOperator : public MergeOperator { |
307 | | public: |
308 | 0 | ~AssociativeMergeOperator() override {} |
309 | | |
310 | | // Gives the client a way to express the read -> modify -> write semantics |
311 | | // key: (IN) The key that's associated with this merge operation. |
312 | | // existing_value:(IN) null indicates the key does not exist before this op |
313 | | // value: (IN) the value to update/merge the existing_value with |
314 | | // new_value: (OUT) Client is responsible for filling the merge result |
315 | | // here. The string that new_value is pointing to will be empty. |
316 | | // logger: (IN) Client could use this to log errors during merge. |
317 | | // |
318 | | // Return true on success. |
319 | | // All values passed in will be client-specific values. So if this method |
320 | | // returns false, it is because client specified bad data or there was |
321 | | // internal corruption. The client should assume that this will be treated |
322 | | // as an error by the library. |
323 | | virtual bool Merge(const Slice& key, const Slice* existing_value, |
324 | | const Slice& value, std::string* new_value, |
325 | | Logger* logger) const = 0; |
326 | | |
327 | | private: |
328 | | // Default implementations of the MergeOperator functions |
329 | | bool FullMergeV2(const MergeOperationInput& merge_in, |
330 | | MergeOperationOutput* merge_out) const override; |
331 | | |
332 | | bool PartialMerge(const Slice& key, const Slice& left_operand, |
333 | | const Slice& right_operand, std::string* new_value, |
334 | | Logger* logger) const override; |
335 | | }; |
336 | | |
337 | | } // namespace ROCKSDB_NAMESPACE |