/src/rocksdb/db/error_handler.cc
Line | Count | Source |
1 | | // Copyright (c) 2018-present, Facebook, Inc. All rights reserved. |
2 | | // This source code is licensed under both the GPLv2 (found in the |
3 | | // COPYING file in the root directory) and Apache 2.0 License |
4 | | // (found in the LICENSE.Apache file in the root directory). |
5 | | // |
6 | | #include "db/error_handler.h" |
7 | | |
8 | | #include "db/db_impl/db_impl.h" |
9 | | #include "db/event_helpers.h" |
10 | | #include "file/sst_file_manager_impl.h" |
11 | | #include "logging/logging.h" |
12 | | #include "port/lang.h" |
13 | | |
14 | | namespace ROCKSDB_NAMESPACE { |
15 | | |
16 | | // Maps to help decide the severity of an error based on the |
17 | | // BackgroundErrorReason, Code, SubCode and whether db_options.paranoid_checks |
18 | | // is set or not. There are 3 maps, going from most specific to least specific |
19 | | (i.e. from all 4 fields in a tuple to only the BackgroundErrorReason and |
20 | | paranoid_checks). The less specific map serves as a catch-all in case we miss |
21 | | // a specific error code or subcode. |
22 | | std::map<std::tuple<BackgroundErrorReason, Status::Code, Status::SubCode, bool>, |
23 | | Status::Severity> |
24 | | ErrorSeverityMap = { |
25 | | // Errors during BG compaction |
26 | | {std::make_tuple(BackgroundErrorReason::kCompaction, |
27 | | Status::Code::kIOError, Status::SubCode::kNoSpace, |
28 | | true), |
29 | | Status::Severity::kSoftError}, |
30 | | {std::make_tuple(BackgroundErrorReason::kCompaction, |
31 | | Status::Code::kIOError, Status::SubCode::kNoSpace, |
32 | | false), |
33 | | Status::Severity::kNoError}, |
34 | | {std::make_tuple(BackgroundErrorReason::kCompaction, |
35 | | Status::Code::kIOError, Status::SubCode::kSpaceLimit, |
36 | | true), |
37 | | Status::Severity::kHardError}, |
38 | | {std::make_tuple(BackgroundErrorReason::kCompaction, |
39 | | Status::Code::kIOError, Status::SubCode::kIOFenced, |
40 | | true), |
41 | | Status::Severity::kFatalError}, |
42 | | {std::make_tuple(BackgroundErrorReason::kCompaction, |
43 | | Status::Code::kIOError, Status::SubCode::kIOFenced, |
44 | | false), |
45 | | Status::Severity::kFatalError}, |
46 | | // Errors during BG flush |
47 | | {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError, |
48 | | Status::SubCode::kNoSpace, true), |
49 | | Status::Severity::kHardError}, |
50 | | {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError, |
51 | | Status::SubCode::kNoSpace, false), |
52 | | Status::Severity::kNoError}, |
53 | | {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError, |
54 | | Status::SubCode::kSpaceLimit, true), |
55 | | Status::Severity::kHardError}, |
56 | | {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError, |
57 | | Status::SubCode::kIOFenced, true), |
58 | | Status::Severity::kFatalError}, |
59 | | {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError, |
60 | | Status::SubCode::kIOFenced, false), |
61 | | Status::Severity::kFatalError}, |
62 | | // Errors during Write |
63 | | {std::make_tuple(BackgroundErrorReason::kWriteCallback, |
64 | | Status::Code::kIOError, Status::SubCode::kNoSpace, |
65 | | true), |
66 | | Status::Severity::kHardError}, |
67 | | {std::make_tuple(BackgroundErrorReason::kWriteCallback, |
68 | | Status::Code::kIOError, Status::SubCode::kNoSpace, |
69 | | false), |
70 | | Status::Severity::kHardError}, |
71 | | {std::make_tuple(BackgroundErrorReason::kWriteCallback, |
72 | | Status::Code::kIOError, Status::SubCode::kIOFenced, |
73 | | true), |
74 | | Status::Severity::kFatalError}, |
75 | | {std::make_tuple(BackgroundErrorReason::kWriteCallback, |
76 | | Status::Code::kIOError, Status::SubCode::kIOFenced, |
77 | | false), |
78 | | Status::Severity::kFatalError}, |
79 | | // Errors during MANIFEST write |
80 | | {std::make_tuple(BackgroundErrorReason::kManifestWrite, |
81 | | Status::Code::kIOError, Status::SubCode::kNoSpace, |
82 | | true), |
83 | | Status::Severity::kHardError}, |
84 | | {std::make_tuple(BackgroundErrorReason::kManifestWrite, |
85 | | Status::Code::kIOError, Status::SubCode::kNoSpace, |
86 | | false), |
87 | | Status::Severity::kHardError}, |
88 | | {std::make_tuple(BackgroundErrorReason::kManifestWrite, |
89 | | Status::Code::kIOError, Status::SubCode::kIOFenced, |
90 | | true), |
91 | | Status::Severity::kFatalError}, |
92 | | {std::make_tuple(BackgroundErrorReason::kManifestWrite, |
93 | | Status::Code::kIOError, Status::SubCode::kIOFenced, |
94 | | false), |
95 | | Status::Severity::kFatalError}, |
96 | | // Errors during BG flush with WAL disabled |
97 | | {std::make_tuple(BackgroundErrorReason::kFlushNoWAL, |
98 | | Status::Code::kIOError, Status::SubCode::kNoSpace, |
99 | | true), |
100 | | Status::Severity::kHardError}, |
101 | | {std::make_tuple(BackgroundErrorReason::kFlushNoWAL, |
102 | | Status::Code::kIOError, Status::SubCode::kNoSpace, |
103 | | false), |
104 | | Status::Severity::kNoError}, |
105 | | {std::make_tuple(BackgroundErrorReason::kFlushNoWAL, |
106 | | Status::Code::kIOError, Status::SubCode::kSpaceLimit, |
107 | | true), |
108 | | Status::Severity::kHardError}, |
109 | | {std::make_tuple(BackgroundErrorReason::kFlushNoWAL, |
110 | | Status::Code::kIOError, Status::SubCode::kIOFenced, |
111 | | true), |
112 | | Status::Severity::kFatalError}, |
113 | | {std::make_tuple(BackgroundErrorReason::kFlushNoWAL, |
114 | | Status::Code::kIOError, Status::SubCode::kIOFenced, |
115 | | false), |
116 | | Status::Severity::kFatalError}, |
117 | | // Errors during MANIFEST write when WAL is disabled |
118 | | {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL, |
119 | | Status::Code::kIOError, Status::SubCode::kNoSpace, |
120 | | true), |
121 | | Status::Severity::kHardError}, |
122 | | {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL, |
123 | | Status::Code::kIOError, Status::SubCode::kNoSpace, |
124 | | false), |
125 | | Status::Severity::kHardError}, |
126 | | {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL, |
127 | | Status::Code::kIOError, Status::SubCode::kIOFenced, |
128 | | true), |
129 | | Status::Severity::kFatalError}, |
130 | | {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL, |
131 | | Status::Code::kIOError, Status::SubCode::kIOFenced, |
132 | | false), |
133 | | Status::Severity::kFatalError}, |
134 | | |
135 | | }; |
136 | | |
137 | | std::map<std::tuple<BackgroundErrorReason, Status::Code, bool>, |
138 | | Status::Severity> |
139 | | DefaultErrorSeverityMap = { |
140 | | // Errors during BG compaction |
141 | | {std::make_tuple(BackgroundErrorReason::kCompaction, |
142 | | Status::Code::kCorruption, true), |
143 | | Status::Severity::kUnrecoverableError}, |
144 | | {std::make_tuple(BackgroundErrorReason::kCompaction, |
145 | | Status::Code::kCorruption, false), |
146 | | Status::Severity::kNoError}, |
147 | | {std::make_tuple(BackgroundErrorReason::kCompaction, |
148 | | Status::Code::kIOError, true), |
149 | | Status::Severity::kFatalError}, |
150 | | {std::make_tuple(BackgroundErrorReason::kCompaction, |
151 | | Status::Code::kIOError, false), |
152 | | Status::Severity::kNoError}, |
153 | | // Errors during BG flush |
154 | | {std::make_tuple(BackgroundErrorReason::kFlush, |
155 | | Status::Code::kCorruption, true), |
156 | | Status::Severity::kUnrecoverableError}, |
157 | | {std::make_tuple(BackgroundErrorReason::kFlush, |
158 | | Status::Code::kCorruption, false), |
159 | | Status::Severity::kNoError}, |
160 | | {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError, |
161 | | true), |
162 | | Status::Severity::kFatalError}, |
163 | | {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError, |
164 | | false), |
165 | | Status::Severity::kNoError}, |
166 | | // Errors during Write |
167 | | {std::make_tuple(BackgroundErrorReason::kWriteCallback, |
168 | | Status::Code::kCorruption, true), |
169 | | Status::Severity::kUnrecoverableError}, |
170 | | {std::make_tuple(BackgroundErrorReason::kWriteCallback, |
171 | | Status::Code::kCorruption, false), |
172 | | Status::Severity::kNoError}, |
173 | | {std::make_tuple(BackgroundErrorReason::kWriteCallback, |
174 | | Status::Code::kIOError, true), |
175 | | Status::Severity::kFatalError}, |
176 | | {std::make_tuple(BackgroundErrorReason::kWriteCallback, |
177 | | Status::Code::kIOError, false), |
178 | | Status::Severity::kNoError}, |
179 | | {std::make_tuple(BackgroundErrorReason::kManifestWrite, |
180 | | Status::Code::kIOError, true), |
181 | | Status::Severity::kFatalError}, |
182 | | {std::make_tuple(BackgroundErrorReason::kManifestWrite, |
183 | | Status::Code::kIOError, false), |
184 | | Status::Severity::kFatalError}, |
185 | | // Errors during BG flush with WAL disabled |
186 | | {std::make_tuple(BackgroundErrorReason::kFlushNoWAL, |
187 | | Status::Code::kCorruption, true), |
188 | | Status::Severity::kUnrecoverableError}, |
189 | | {std::make_tuple(BackgroundErrorReason::kFlushNoWAL, |
190 | | Status::Code::kCorruption, false), |
191 | | Status::Severity::kNoError}, |
192 | | {std::make_tuple(BackgroundErrorReason::kFlushNoWAL, |
193 | | Status::Code::kIOError, true), |
194 | | Status::Severity::kFatalError}, |
195 | | {std::make_tuple(BackgroundErrorReason::kFlushNoWAL, |
196 | | Status::Code::kIOError, false), |
197 | | Status::Severity::kNoError}, |
198 | | {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL, |
199 | | Status::Code::kIOError, true), |
200 | | Status::Severity::kFatalError}, |
201 | | {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL, |
202 | | Status::Code::kIOError, false), |
203 | | Status::Severity::kFatalError}, |
204 | | }; |
205 | | |
206 | | std::map<std::tuple<BackgroundErrorReason, bool>, Status::Severity> |
207 | | DefaultReasonMap = { |
208 | | // Errors during BG compaction |
209 | | {std::make_tuple(BackgroundErrorReason::kCompaction, true), |
210 | | Status::Severity::kFatalError}, |
211 | | {std::make_tuple(BackgroundErrorReason::kCompaction, false), |
212 | | Status::Severity::kNoError}, |
213 | | // Errors during BG flush |
214 | | {std::make_tuple(BackgroundErrorReason::kFlush, true), |
215 | | Status::Severity::kFatalError}, |
216 | | {std::make_tuple(BackgroundErrorReason::kFlush, false), |
217 | | Status::Severity::kNoError}, |
218 | | // Errors during Write |
219 | | {std::make_tuple(BackgroundErrorReason::kWriteCallback, true), |
220 | | Status::Severity::kFatalError}, |
221 | | {std::make_tuple(BackgroundErrorReason::kWriteCallback, false), |
222 | | Status::Severity::kFatalError}, |
223 | | // Errors during Memtable update |
224 | | {std::make_tuple(BackgroundErrorReason::kMemTable, true), |
225 | | Status::Severity::kFatalError}, |
226 | | {std::make_tuple(BackgroundErrorReason::kMemTable, false), |
227 | | Status::Severity::kFatalError}, |
228 | | }; |
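For illustration, the cascading lookup over the three maps above can be summarized by the sketch below. LookupSeverity is a hypothetical helper, not part of this file; the real lookup is performed inline by HandleKnownErrors() further down.

// Sketch only: mirrors the fallback order used by HandleKnownErrors(),
// from the most specific map to the least specific one.
static Status::Severity LookupSeverity(BackgroundErrorReason reason,
                                       const Status& bg_err, bool paranoid) {
  auto it = ErrorSeverityMap.find(
      std::make_tuple(reason, bg_err.code(), bg_err.subcode(), paranoid));
  if (it != ErrorSeverityMap.end()) {
    return it->second;
  }
  auto def_it = DefaultErrorSeverityMap.find(
      std::make_tuple(reason, bg_err.code(), paranoid));
  if (def_it != DefaultErrorSeverityMap.end()) {
    return def_it->second;
  }
  auto reason_it = DefaultReasonMap.find(std::make_tuple(reason, paranoid));
  // Default matches the initial value used by HandleKnownErrors().
  return reason_it != DefaultReasonMap.end() ? reason_it->second
                                             : Status::Severity::kFatalError;
}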
229 | | |
230 | 11.0k | void ErrorHandler::CancelErrorRecovery() { |
231 | 11.0k | db_mutex_->AssertHeld(); |
232 | | |
233 | | // We'll release the lock before calling sfm, so make sure no new |
234 | | // recovery gets scheduled at that point |
235 | 11.0k | auto_recovery_ = false; |
236 | 11.0k | SstFileManagerImpl* sfm = |
237 | 11.0k | static_cast<SstFileManagerImpl*>(db_options_.sst_file_manager.get()); |
238 | 11.0k | if (sfm) { |
239 | | // This may or may not cancel a pending recovery |
240 | 11.0k | db_mutex_->Unlock(); |
241 | 11.0k | bool cancelled = sfm->CancelErrorRecovery(this); |
242 | 11.0k | db_mutex_->Lock(); |
243 | 11.0k | if (cancelled) { |
244 | 0 | recovery_in_prog_ = false; |
245 | 0 | } |
246 | 11.0k | } |
247 | | |
248 | | // If auto recovery is also running to resume from the retryable error, |
249 | | // we should wait and end the auto recovery. |
250 | 11.0k | EndAutoRecovery(); |
251 | 11.0k | } |
252 | | |
253 | | // This is the main function for looking at an error during a background |
254 | | // operation and deciding the severity, and error recovery strategy. The high |
255 | | // level algorithm is as follows - |
256 | | // 1. Classify the severity of the error based on the ErrorSeverityMap, |
257 | | // DefaultErrorSeverityMap and DefaultReasonMap defined earlier |
258 | | // 2. Call a Status code specific override function to adjust the severity |
259 | | // if needed. The reason for this is our ability to recover may depend on |
260 | | // the exact options enabled in DBOptions |
261 | | // 3. Determine if auto recovery is possible. A listener notification callback |
262 | | // is called, which can disable the auto recovery even if we decide it is |
263 | | // feasible |
264 | | // 4. For Status::NoSpace() errors, rely on SstFileManagerImpl to control |
265 | | // the actual recovery. If no sst file manager is specified in DBOptions, |
266 | | // a default one is allocated during DB::Open(), so there will always be |
267 | | // one. |
268 | | // This can also get called as part of a recovery operation. In that case, we |
269 | | // also track the error separately in recovery_error_ so we can tell in the |
270 | | // end whether recovery succeeded or not |
271 | | void ErrorHandler::HandleKnownErrors(const Status& bg_err, |
272 | 0 | BackgroundErrorReason reason) { |
273 | 0 | db_mutex_->AssertHeld(); |
274 | 0 | if (bg_err.ok()) { |
275 | 0 | return; |
276 | 0 | } |
277 | | |
278 | 0 | ROCKS_LOG_INFO(db_options_.info_log, |
279 | 0 | "ErrorHandler: Set regular background error\n"); |
280 | |
281 | 0 | bool paranoid = db_options_.paranoid_checks; |
282 | 0 | Status::Severity sev = Status::Severity::kFatalError; |
283 | 0 | Status new_bg_err; |
284 | 0 | DBRecoverContext context; |
285 | 0 | bool found = false; |
286 | |
287 | 0 | { |
288 | 0 | auto entry = ErrorSeverityMap.find( |
289 | 0 | std::make_tuple(reason, bg_err.code(), bg_err.subcode(), paranoid)); |
290 | 0 | if (entry != ErrorSeverityMap.end()) { |
291 | 0 | sev = entry->second; |
292 | 0 | found = true; |
293 | 0 | } |
294 | 0 | } |
295 | |
296 | 0 | if (!found) { |
297 | 0 | auto entry = DefaultErrorSeverityMap.find( |
298 | 0 | std::make_tuple(reason, bg_err.code(), paranoid)); |
299 | 0 | if (entry != DefaultErrorSeverityMap.end()) { |
300 | 0 | sev = entry->second; |
301 | 0 | found = true; |
302 | 0 | } |
303 | 0 | } |
304 | |
305 | 0 | if (!found) { |
306 | 0 | auto entry = DefaultReasonMap.find(std::make_tuple(reason, paranoid)); |
307 | 0 | if (entry != DefaultReasonMap.end()) { |
308 | 0 | sev = entry->second; |
309 | 0 | } |
310 | 0 | } |
311 | |
|
312 | 0 | new_bg_err = Status(bg_err, sev); |
313 | | |
314 | | // Check if recovery is currently in progress. If it is, we will save this |
315 | | // error so we can check it at the end to see if recovery succeeded or not |
316 | 0 | if (recovery_in_prog_ && recovery_error_.ok()) { |
317 | 0 | recovery_error_ = status_to_io_status(Status(new_bg_err)); |
318 | 0 | } |
319 | |
320 | 0 | bool auto_recovery = auto_recovery_; |
321 | 0 | if (new_bg_err.severity() >= Status::Severity::kFatalError && auto_recovery) { |
322 | 0 | auto_recovery = false; |
323 | 0 | } |
324 | | |
325 | | // Allow some error specific overrides |
326 | 0 | if (new_bg_err.subcode() == IOStatus::SubCode::kNoSpace || |
327 | 0 | new_bg_err.subcode() == IOStatus::SubCode::kSpaceLimit) { |
328 | 0 | new_bg_err = OverrideNoSpaceError(new_bg_err, &auto_recovery); |
329 | 0 | } |
330 | |
331 | 0 | if (!new_bg_err.ok()) { |
332 | 0 | Status s = new_bg_err; |
333 | 0 | EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason, &s, |
334 | 0 | db_mutex_, &auto_recovery); |
335 | 0 | if (!s.ok() && (s.severity() > bg_error_.severity())) { |
336 | 0 | bg_error_ = s; |
337 | 0 | } else { |
338 | | // This error is less severe than the previously encountered error. Don't |
339 | | // take any further action |
340 | 0 | return; |
341 | 0 | } |
342 | 0 | } |
343 | | |
344 | 0 | recover_context_ = context; |
345 | 0 | if (auto_recovery) { |
346 | 0 | recovery_in_prog_ = true; |
347 | | |
348 | | // Kick-off error specific recovery |
349 | 0 | if (new_bg_err.subcode() == IOStatus::SubCode::kNoSpace || |
350 | 0 | new_bg_err.subcode() == IOStatus::SubCode::kSpaceLimit) { |
351 | 0 | RecoverFromNoSpace(); |
352 | 0 | } |
353 | 0 | } |
354 | 0 | if (bg_error_.severity() >= Status::Severity::kHardError) { |
355 | 0 | is_db_stopped_.store(true, std::memory_order_release); |
356 | 0 | } |
357 | 0 | } |
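The listener notification mentioned in step 3 above is the public hook an application has into this flow. A minimal sketch, assuming the public EventListener interface from include/rocksdb/listener.h (MyErrorListener is a hypothetical name):

#include <cstdio>

#include "rocksdb/listener.h"

class MyErrorListener : public ROCKSDB_NAMESPACE::EventListener {
 public:
  void OnBackgroundError(ROCKSDB_NAMESPACE::BackgroundErrorReason /*reason*/,
                         ROCKSDB_NAMESPACE::Status* bg_error) override {
    // Called with the classified error; the listener may inspect or even
    // overwrite *bg_error before the error handler acts on it.
    std::fprintf(stderr, "bg error: %s\n", bg_error->ToString().c_str());
  }
  void OnErrorRecoveryBegin(ROCKSDB_NAMESPACE::BackgroundErrorReason /*reason*/,
                            ROCKSDB_NAMESPACE::Status /*bg_error*/,
                            bool* auto_recovery) override {
    // Setting *auto_recovery to false vetoes automatic recovery, matching
    // step 3 of the algorithm described above.
    *auto_recovery = false;
  }
};
// Installed via Options::listeners before DB::Open().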
358 | | |
359 | | // This is the main function for looking at IO related error during the |
360 | | // background operations. The main logic is: |
361 | | // A file-scope IO error is treated as a retryable IO error in the write |
362 | | // path. In RocksDB, if a file hits a write IO error at file scope, RocksDB |
363 | | // never writes to that file again; it creates a new file and rewrites the |
364 | | // whole content. Thus, the error is retryable. |
365 | | // There are three main categories of error handling: |
366 | | // 1) if the error is caused by data loss, the error is mapped to |
367 | | // unrecoverable error. Application/user must take action to handle |
368 | | // this situation (File scope case is excluded). |
369 | | // 2) if the error is a Retryable IO error (i.e., it is a file scope IO error, |
370 | | // or its retryable flag is set and not a data loss error), auto resume ( |
371 | | // DBImpl::ResumeImpl) may be called and the auto resume can be controlled |
372 | | // by resume count and resume interval options. There are three sub-cases: |
373 | | // a) if the error happens during compaction, it is mapped to a soft error. |
374 | | // the compaction thread will reschedule a new compaction. This doesn't |
375 | | // call auto resume. |
376 | | // b) if the error happens during flush and the WAL is empty, it is mapped to |
377 | | // a soft error. Note that this includes IO errors in SST or MANIFEST |
378 | | // writes during flush. Auto resume will be called. |
379 | | // c) all other errors are mapped to hard error. Auto resume will be called. |
380 | | // 3) for all other cases, HandleKnownErrors(const Status& bg_err, |
381 | | // BackgroundErrorReason reason) is called, e.g. to delegate no-space |
382 | | // errors to the SstFileManager. |
383 | | void ErrorHandler::SetBGError(const Status& bg_status, |
384 | 0 | BackgroundErrorReason reason) { |
385 | 0 | db_mutex_->AssertHeld(); |
386 | 0 | Status tmp_status = bg_status; |
387 | 0 | IOStatus bg_io_err = status_to_io_status(std::move(tmp_status)); |
388 | |
389 | 0 | if (bg_io_err.ok()) { |
390 | 0 | return; |
391 | 0 | } |
392 | 0 | ROCKS_LOG_WARN(db_options_.info_log, "Background IO error %s", |
393 | 0 | bg_io_err.ToString().c_str()); |
394 | |
395 | 0 | RecordStats({ERROR_HANDLER_BG_ERROR_COUNT, ERROR_HANDLER_BG_IO_ERROR_COUNT}, |
396 | 0 | {} /* int_histograms */); |
397 | |
398 | 0 | Status new_bg_io_err = bg_io_err; |
399 | 0 | DBRecoverContext context; |
400 | 0 | if (bg_io_err.GetScope() != IOStatus::IOErrorScope::kIOErrorScopeFile && |
401 | 0 | bg_io_err.GetDataLoss()) { |
402 | | // First, data loss (non-file-scope) is treated as an unrecoverable |
403 | | // error, so it can directly overwrite any existing bg_error_. |
404 | 0 | bool auto_recovery = false; |
405 | 0 | Status bg_err(new_bg_io_err, Status::Severity::kUnrecoverableError); |
406 | 0 | CheckAndSetRecoveryAndBGError(bg_err); |
407 | 0 | ROCKS_LOG_INFO( |
408 | 0 | db_options_.info_log, |
409 | 0 | "ErrorHandler: Set background IO error as unrecoverable error\n"); |
410 | 0 | EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason, |
411 | 0 | &bg_err, db_mutex_, &auto_recovery); |
412 | 0 | recover_context_ = context; |
413 | 0 | return; |
414 | 0 | } |
415 | 0 | if (bg_io_err.subcode() != IOStatus::SubCode::kNoSpace && |
416 | 0 | (bg_io_err.GetScope() == IOStatus::IOErrorScope::kIOErrorScopeFile || |
417 | 0 | bg_io_err.GetRetryable())) { |
418 | | // Second, check if the error is a retryable IO error (a file-scope IO |
419 | | // error is also treated as retryable in the RocksDB write path). If it |
420 | | // is a retryable error and its severity is higher than bg_error_, |
421 | | // overwrite bg_error_ with the new error. Currently, a retryable IO |
422 | | // error from compaction is treated as a soft error; in other cases the |
423 | | // retryable IO error is treated as a hard error. Note that all NoSpace |
424 | | // errors are handled by SstFileManager::StartErrorRecovery(), so whether |
425 | | // they are retryable or file scope, this logic is bypassed for them. |
426 | |
427 | 0 | RecordStats({ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT}, |
428 | 0 | {} /* int_histograms */); |
429 | 0 | ROCKS_LOG_INFO(db_options_.info_log, |
430 | 0 | "ErrorHandler: Set background retryable IO error\n"); |
431 | 0 | if (BackgroundErrorReason::kCompaction == reason) { |
432 | | // We map a retryable IO error during compaction to a soft error, since |
433 | | // compaction can reschedule itself. We do not set the BG error in this |
434 | | // case. |
435 | | // TODO: a better way to set or clean the retryable IO error which |
436 | | // happens during compaction SST file write. |
437 | 0 | RecordStats({ERROR_HANDLER_AUTORESUME_COUNT}, {} /* int_histograms */); |
438 | 0 | ROCKS_LOG_INFO( |
439 | 0 | db_options_.info_log, |
440 | 0 | "ErrorHandler: Compaction will schedule by itself to resume\n"); |
441 | 0 | bool auto_recovery = false; |
442 | 0 | EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason, |
443 | 0 | &new_bg_io_err, db_mutex_, |
444 | 0 | &auto_recovery); |
445 | | // Not used in this code path. |
446 | 0 | new_bg_io_err.PermitUncheckedError(); |
447 | 0 | return; |
448 | 0 | } |
449 | | |
450 | 0 | Status::Severity severity; |
451 | 0 | if (BackgroundErrorReason::kFlushNoWAL == reason || |
452 | 0 | BackgroundErrorReason::kManifestWriteNoWAL == reason) { |
453 | | // When the BG retryable IO error reason is a flush without WAL, |
454 | | // we map it to a soft error. At the same time, all background work |
455 | | // should be stopped except the BG work from recovery. Therefore, we |
456 | | // set soft_error_no_bg_work_ to true. Also, since the DB continues |
457 | | // to receive writes while the BG error is a soft error, to avoid |
458 | | // generating too many small memtables during auto resume, the flush |
459 | | // reason is set to kErrorRecoveryRetryFlush. |
460 | 0 | severity = Status::Severity::kSoftError; |
461 | 0 | soft_error_no_bg_work_ = true; |
462 | 0 | context.flush_reason = FlushReason::kErrorRecoveryRetryFlush; |
463 | 0 | } else { |
464 | 0 | severity = Status::Severity::kHardError; |
465 | 0 | } |
466 | 0 | Status bg_err(new_bg_io_err, severity); |
467 | 0 | CheckAndSetRecoveryAndBGError(bg_err); |
468 | 0 | recover_context_ = context; |
469 | 0 | bool auto_recovery = db_options_.max_bgerror_resume_count > 0; |
470 | 0 | EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason, |
471 | 0 | &new_bg_io_err, db_mutex_, |
472 | 0 | &auto_recovery); |
473 | 0 | StartRecoverFromRetryableBGIOError(bg_io_err); |
474 | 0 | return; |
475 | 0 | } |
476 | 0 | HandleKnownErrors(new_bg_io_err, reason); |
477 | 0 | } |
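For context, the retryable/data-loss/scope attributes tested above are carried on IOStatus. A minimal sketch of how a custom FileSystem might tag an error so that this classification treats it as retryable (assuming the setters in include/rocksdb/io_status.h):

#include "rocksdb/io_status.h"

ROCKSDB_NAMESPACE::IOStatus MakeRetryableWriteError() {
  ROCKSDB_NAMESPACE::IOStatus s =
      ROCKSDB_NAMESPACE::IOStatus::IOError("transient device error");
  s.SetRetryable(true);  // consulted via bg_io_err.GetRetryable() above
  s.SetDataLoss(false);  // data loss would instead map to an unrecoverable error
  return s;
}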
478 | | |
479 | | void ErrorHandler::AddFilesToQuarantine( |
480 | 0 | autovector<const autovector<uint64_t>*> files_to_quarantine) { |
481 | 0 | db_mutex_->AssertHeld(); |
482 | 0 | std::ostringstream quarantine_files_oss; |
483 | 0 | bool is_first_one = true; |
484 | 0 | for (const auto* files : files_to_quarantine) { |
485 | 0 | assert(files); |
486 | 0 | for (uint64_t file_number : *files) { |
487 | 0 | files_to_quarantine_.push_back(file_number); |
488 | 0 | quarantine_files_oss << (is_first_one ? "" : ", ") << file_number; |
489 | 0 | is_first_one = false; |
490 | 0 | } |
491 | 0 | } |
492 | 0 | ROCKS_LOG_INFO(db_options_.info_log, |
493 | 0 | "ErrorHandler: added file numbers %s to quarantine.\n", |
494 | 0 | quarantine_files_oss.str().c_str()); |
495 | 0 | } |
496 | | |
497 | 0 | void ErrorHandler::ClearFilesToQuarantine() { |
498 | 0 | db_mutex_->AssertHeld(); |
499 | 0 | files_to_quarantine_.clear(); |
500 | 0 | ROCKS_LOG_INFO(db_options_.info_log, |
501 | 0 | "ErrorHandler: cleared files in quarantine.\n"); |
502 | 0 | } |
503 | | |
504 | | Status ErrorHandler::OverrideNoSpaceError(const Status& bg_error, |
505 | 0 | bool* auto_recovery) { |
506 | 0 | if (bg_error.severity() >= Status::Severity::kFatalError) { |
507 | 0 | return bg_error; |
508 | 0 | } |
509 | | |
510 | 0 | if (db_options_.sst_file_manager.get() == nullptr) { |
511 | | // We rely on SFM to poll for enough disk space and recover |
512 | 0 | *auto_recovery = false; |
513 | 0 | return bg_error; |
514 | 0 | } |
515 | | |
516 | 0 | if (db_options_.allow_2pc && |
517 | 0 | (bg_error.severity() <= Status::Severity::kSoftError)) { |
518 | | // Don't know how to recover, as the contents of the current WAL file may |
519 | | // be inconsistent, and it may be needed for 2PC. If 2PC is not enabled, |
520 | | // we can just flush the memtable and discard the log |
521 | 0 | *auto_recovery = false; |
522 | 0 | return Status(bg_error, Status::Severity::kFatalError); |
523 | 0 | } |
524 | | |
525 | 0 | { |
526 | 0 | uint64_t free_space; |
527 | 0 | if (db_options_.env->GetFreeSpace(db_options_.db_paths[0].path, |
528 | 0 | &free_space) == Status::NotSupported()) { |
529 | 0 | *auto_recovery = false; |
530 | 0 | } |
531 | 0 | } |
532 | |
533 | 0 | return bg_error; |
534 | 0 | } |
535 | | |
536 | 0 | void ErrorHandler::RecoverFromNoSpace() { |
537 | 0 | SstFileManagerImpl* sfm = |
538 | 0 | static_cast<SstFileManagerImpl*>(db_options_.sst_file_manager.get()); |
539 | | |
540 | | // Inform SFM of the error, so it can kick-off the recovery |
541 | 0 | if (sfm) { |
542 | 0 | sfm->StartErrorRecovery(this, bg_error_); |
543 | 0 | } |
544 | 0 | } |
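Recovery from NoSpace is delegated to the SstFileManager, which polls for free space and calls back into the error handler. A minimal sketch of supplying one explicitly (public API in include/rocksdb/sst_file_manager.h); if none is supplied, DB::Open() installs a default one, as noted in the HandleKnownErrors() comment above:

#include "rocksdb/env.h"
#include "rocksdb/options.h"
#include "rocksdb/sst_file_manager.h"

ROCKSDB_NAMESPACE::Options MakeOptionsWithSstFileManager() {
  ROCKSDB_NAMESPACE::Options options;
  // An explicit SstFileManager lets the application tune deletion rate
  // limiting as well; the default from DB::Open() works for recovery too.
  options.sst_file_manager.reset(
      ROCKSDB_NAMESPACE::NewSstFileManager(ROCKSDB_NAMESPACE::Env::Default()));
  return options;
}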
545 | | |
546 | 0 | Status ErrorHandler::ClearBGError() { |
547 | 0 | db_mutex_->AssertHeld(); |
548 | | |
549 | | // Signal that recovery succeeded |
550 | 0 | if (recovery_error_.ok()) { |
551 | 0 | assert(files_to_quarantine_.empty()); |
552 | 0 | Status old_bg_error = bg_error_; |
553 | | // old_bg_error is only for notifying listeners, so may not be checked |
554 | 0 | old_bg_error.PermitUncheckedError(); |
555 | | // Clear and check the recovery IO and BG error |
556 | 0 | is_db_stopped_.store(false, std::memory_order_release); |
557 | 0 | bg_error_ = Status::OK(); |
558 | 0 | recovery_error_ = IOStatus::OK(); |
559 | 0 | bg_error_.PermitUncheckedError(); |
560 | 0 | recovery_error_.PermitUncheckedError(); |
561 | 0 | recovery_in_prog_ = false; |
562 | 0 | soft_error_no_bg_work_ = false; |
563 | 0 | EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, old_bg_error, |
564 | 0 | bg_error_, db_mutex_); |
565 | 0 | } |
566 | 0 | return recovery_error_; |
567 | 0 | } |
568 | | |
569 | 0 | Status ErrorHandler::RecoverFromBGError(bool is_manual) { |
570 | 0 | InstrumentedMutexLock l(db_mutex_); |
571 | 0 | bool no_bg_work_original_flag = soft_error_no_bg_work_; |
572 | 0 | if (is_manual) { |
573 | | // If it's a manual recovery and there's a background recovery in |
574 | | // progress, return busy status. |
575 | 0 | if (recovery_in_prog_) { |
576 | 0 | return Status::Busy("Recovery already in progress"); |
577 | 0 | } |
578 | 0 | recovery_in_prog_ = true; |
579 | | |
580 | | // In manual resume, we allow the bg work to run. If it is an auto |
581 | | // resume, the bg work should follow this flag. |
582 | 0 | soft_error_no_bg_work_ = false; |
583 | | |
584 | | // In manual resume, if the bg error is a soft error that also requires |
585 | | // no bg work, the error must be recovered by calling flush with |
586 | | // flush reason kErrorRecoveryRetryFlush. Otherwise, the flush |
587 | | // reason is set to kErrorRecovery. |
588 | 0 | if (no_bg_work_original_flag) { |
589 | 0 | recover_context_.flush_reason = FlushReason::kErrorRecoveryRetryFlush; |
590 | 0 | } else { |
591 | 0 | recover_context_.flush_reason = FlushReason::kErrorRecovery; |
592 | 0 | } |
593 | 0 | } |
594 | | |
595 | 0 | if (bg_error_.severity() == Status::Severity::kSoftError && |
596 | 0 | recover_context_.flush_reason == FlushReason::kErrorRecovery) { |
597 | | // Simply clear the background error and return |
598 | 0 | recovery_error_ = IOStatus::OK(); |
599 | 0 | return ClearBGError(); |
600 | 0 | } |
601 | | |
602 | | // Reset recovery_error_. We will use this to record any errors that happen |
603 | | // during the recovery process. While recovering, the only operations that |
604 | | // can generate background errors should be the flush operations |
605 | 0 | recovery_error_ = IOStatus::OK(); |
606 | 0 | recovery_error_.PermitUncheckedError(); |
607 | 0 | Status s = db_->ResumeImpl(recover_context_); |
608 | 0 | if (s.ok()) { |
609 | 0 | soft_error_no_bg_work_ = false; |
610 | 0 | } else { |
611 | 0 | soft_error_no_bg_work_ = no_bg_work_original_flag; |
612 | 0 | } |
613 | | |
614 | | // For manual recovery, shutdown, and fatal error cases, set |
615 | | // recovery_in_prog_ to false. For automatic background recovery, leave it |
616 | | // as is regardless of success or failure, as it will be retried. |
617 | 0 | if (is_manual || s.IsShutdownInProgress() || |
618 | 0 | bg_error_.severity() >= Status::Severity::kFatalError) { |
619 | 0 | recovery_in_prog_ = false; |
620 | 0 | } |
621 | 0 | return s; |
622 | 0 | } |
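From the application side, manual recovery is triggered through DB::Resume(), which is expected to reach RecoverFromBGError(true /*is_manual*/) via DBImpl::Resume() (an assumption about the call path; it is not shown in this file). A minimal sketch:

#include "rocksdb/db.h"

ROCKSDB_NAMESPACE::Status TryManualResume(ROCKSDB_NAMESPACE::DB* db) {
  ROCKSDB_NAMESPACE::Status s = db->Resume();
  if (s.IsBusy()) {
    // Mirrors the Status::Busy("Recovery already in progress") return above:
    // a background recovery is still running, so try again later.
  }
  return s;
}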
623 | | |
624 | | void ErrorHandler::StartRecoverFromRetryableBGIOError( |
625 | 0 | const IOStatus& io_error) { |
626 | 0 | db_mutex_->AssertHeld(); |
627 | 0 | if (bg_error_.ok() || io_error.ok()) { |
628 | 0 | return; |
629 | 0 | } |
630 | 0 | if (db_options_.max_bgerror_resume_count <= 0 || recovery_in_prog_) { |
631 | | // Auto resume BG error is not enabled |
632 | 0 | return; |
633 | 0 | } |
634 | 0 | if (end_recovery_) { |
635 | | // Can temporarily release db mutex |
636 | 0 | EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_, |
637 | 0 | Status::ShutdownInProgress(), |
638 | 0 | db_mutex_); |
639 | 0 | db_mutex_->AssertHeld(); |
640 | 0 | return; |
641 | 0 | } |
642 | 0 | RecordStats({ERROR_HANDLER_AUTORESUME_COUNT}, {} /* int_histograms */); |
643 | 0 | ROCKS_LOG_INFO( |
644 | 0 | db_options_.info_log, |
645 | 0 | "ErrorHandler: Call StartRecoverFromRetryableBGIOError to resume\n"); |
646 | | // Needs to be set in the same lock hold as setting BG error, otherwise |
647 | | // intervening writes could see a BG error without a recovery and bail out. |
648 | 0 | recovery_in_prog_ = true; |
649 | |
650 | 0 | if (recovery_thread_) { |
651 | | // Ensure only one thread can execute the join(). |
652 | 0 | std::unique_ptr<port::Thread> old_recovery_thread( |
653 | 0 | std::move(recovery_thread_)); |
654 | | // In this case, if recovery_in_prog_ is false, the current thread should |
655 | | // wait for the previous recovery thread to finish and create a new thread |
656 | | // to recover from the bg error. |
657 | 0 | db_mutex_->Unlock(); |
658 | 0 | TEST_SYNC_POINT( |
659 | 0 | "StartRecoverFromRetryableBGIOError:BeforeWaitingForOtherThread"); |
660 | 0 | old_recovery_thread->join(); |
661 | 0 | TEST_SYNC_POINT( |
662 | 0 | "StartRecoverFromRetryableBGIOError:AfterWaitingForOtherThread"); |
663 | 0 | db_mutex_->Lock(); |
664 | 0 | } |
665 | |
666 | 0 | recovery_thread_.reset( |
667 | 0 | new port::Thread(&ErrorHandler::RecoverFromRetryableBGIOError, this)); |
668 | 0 | } |
669 | | |
670 | | // Automatic recovery from a retryable BG IO error. Must be called after db |
671 | | // mutex is released. |
672 | 0 | void ErrorHandler::RecoverFromRetryableBGIOError() { |
673 | 0 | assert(recovery_in_prog_); |
674 | 0 | TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeStart"); |
675 | 0 | TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeStart2"); |
676 | 0 | InstrumentedMutexLock l(db_mutex_); |
677 | 0 | if (end_recovery_) { |
678 | 0 | EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_, |
679 | 0 | Status::ShutdownInProgress(), |
680 | 0 | db_mutex_); |
681 | |
682 | 0 | recovery_in_prog_ = false; |
683 | 0 | return; |
684 | 0 | } |
685 | 0 | DBRecoverContext context = recover_context_; |
686 | 0 | context.flush_after_recovery = true; |
687 | 0 | int resume_count = db_options_.max_bgerror_resume_count; |
688 | 0 | uint64_t wait_interval = db_options_.bgerror_resume_retry_interval; |
689 | 0 | uint64_t retry_count = 0; |
690 | | // Recover from the retryable error. Create a separate thread to do it. |
691 | 0 | while (resume_count > 0) { |
692 | 0 | if (end_recovery_) { |
693 | 0 | EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_, |
694 | 0 | Status::ShutdownInProgress(), |
695 | 0 | db_mutex_); |
696 | 0 | recovery_in_prog_ = false; |
697 | 0 | return; |
698 | 0 | } |
699 | 0 | TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeResume0"); |
700 | 0 | TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeResume1"); |
701 | 0 | recovery_error_ = IOStatus::OK(); |
702 | 0 | retry_count++; |
703 | 0 | Status s = db_->ResumeImpl(context); |
704 | 0 | RecordStats({ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT}, |
705 | 0 | {} /* int_histograms */); |
706 | 0 | if (s.IsShutdownInProgress() || |
707 | 0 | bg_error_.severity() >= Status::Severity::kFatalError) { |
708 | | // If DB shutdown is in progress or the error severity is higher than |
709 | | // hard error, stop auto resume and return. |
710 | 0 | recovery_in_prog_ = false; |
711 | 0 | RecordStats({} /* ticker_types */, |
712 | 0 | {{ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count}}); |
713 | 0 | EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_, |
714 | 0 | bg_error_, db_mutex_); |
715 | 0 | return; |
716 | 0 | } |
717 | 0 | if (!recovery_error_.ok() && |
718 | 0 | recovery_error_.severity() <= Status::Severity::kHardError && |
719 | 0 | recovery_error_.GetRetryable()) { |
720 | | // If a new BG IO error happens during auto recovery and it is retryable |
721 | | // with severity of hard error or lower, auto resume sleeps for a period |
722 | | // of time and then retries, if still allowed. |
723 | 0 | TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeWait0"); |
724 | 0 | TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeWait1"); |
725 | 0 | int64_t wait_until = db_options_.clock->NowMicros() + wait_interval; |
726 | 0 | cv_.TimedWait(wait_until); |
727 | 0 | } else { |
728 | | // There are three possibilities: 1) recovery_error_ is set during resume |
729 | | // and the error is not retryable, 2) recovery is successful, 3) another |
730 | | // error happens during resume and cannot be handled here. |
731 | 0 | if (recovery_error_.ok() && s.ok()) { |
732 | | // Recovered from the retryable IO error with no other BG errors. Clear |
733 | | // the bg_error and notify the user. |
734 | 0 | TEST_SYNC_POINT("RecoverFromRetryableBGIOError:RecoverSuccess"); |
735 | 0 | RecordStats({ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT}, |
736 | 0 | {{ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count}}); |
737 | 0 | return; |
738 | 0 | } else { |
739 | | // In this case, either 1) recovery_error_ is more serious or not |
740 | | // retryable, or 2) another error happened. The auto recovery stops. |
741 | 0 | recovery_in_prog_ = false; |
742 | 0 | RecordStats({} /* ticker_types */, |
743 | 0 | {{ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count}}); |
744 | 0 | EventHelpers::NotifyOnErrorRecoveryEnd( |
745 | 0 | db_options_.listeners, bg_error_, |
746 | 0 | !recovery_error_.ok() ? recovery_error_ : s, db_mutex_); |
747 | 0 | return; |
748 | 0 | } |
749 | 0 | } |
750 | 0 | resume_count--; |
751 | 0 | } |
752 | 0 | recovery_in_prog_ = false; |
753 | 0 | EventHelpers::NotifyOnErrorRecoveryEnd( |
754 | 0 | db_options_.listeners, bg_error_, |
755 | 0 | Status::Aborted("Exceeded resume retry count"), db_mutex_); |
756 | 0 | TEST_SYNC_POINT("RecoverFromRetryableBGIOError:LoopOut"); |
757 | 0 | RecordStats({} /* ticker_types */, |
758 | 0 | {{ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count}}); |
759 | 0 | } |
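The retry loop above is bounded by the two DBOptions it reads (max_bgerror_resume_count and bgerror_resume_retry_interval). A minimal sketch of enabling auto resume; the values are illustrative, not recommendations:

#include "rocksdb/options.h"

ROCKSDB_NAMESPACE::Options MakeAutoResumeOptions() {
  ROCKSDB_NAMESPACE::Options options;
  // Maximum number of ResumeImpl() attempts; <= 0 disables auto resume.
  options.max_bgerror_resume_count = 5;
  // Wait between attempts, in microseconds (the cv_.TimedWait above).
  options.bgerror_resume_retry_interval = 1000000;
  return options;
}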
760 | | |
761 | 0 | void ErrorHandler::CheckAndSetRecoveryAndBGError(const Status& bg_err) { |
762 | 0 | if (recovery_in_prog_ && recovery_error_.ok()) { |
763 | 0 | recovery_error_ = status_to_io_status(Status(bg_err)); |
764 | 0 | } |
765 | 0 | if (bg_err.severity() > bg_error_.severity()) { |
766 | 0 | bg_error_ = bg_err; |
767 | 0 | } |
768 | 0 | if (bg_error_.severity() >= Status::Severity::kHardError) { |
769 | 0 | is_db_stopped_.store(true, std::memory_order_release); |
770 | 0 | } |
771 | 0 | } |
772 | | |
773 | 11.0k | void ErrorHandler::EndAutoRecovery() { |
774 | 11.0k | db_mutex_->AssertHeld(); |
775 | 11.0k | if (!end_recovery_) { |
776 | 11.0k | end_recovery_ = true; |
777 | 11.0k | } |
778 | 11.0k | if (recovery_thread_) { |
779 | | // Ensure only one thread can execute the join(). |
780 | 0 | std::unique_ptr<port::Thread> old_recovery_thread( |
781 | 0 | std::move(recovery_thread_)); |
782 | 0 | db_mutex_->Unlock(); |
783 | 0 | cv_.SignalAll(); |
784 | 0 | old_recovery_thread->join(); |
785 | 0 | db_mutex_->Lock(); |
786 | 0 | } |
787 | 11.0k | TEST_SYNC_POINT("PostEndAutoRecovery"); |
788 | 11.0k | } |
789 | | |
790 | | void ErrorHandler::RecordStats( |
791 | | const std::vector<Tickers>& ticker_types, |
792 | 0 | const std::vector<std::tuple<Histograms, uint64_t>>& int_histograms) { |
793 | 0 | if (bg_error_stats_ == nullptr) { |
794 | 0 | return; |
795 | 0 | } |
796 | 0 | for (const auto& ticker_type : ticker_types) { |
797 | 0 | RecordTick(bg_error_stats_.get(), ticker_type); |
798 | 0 | } |
799 | |
800 | 0 | for (const auto& hist : int_histograms) { |
801 | 0 | RecordInHistogram(bg_error_stats_.get(), std::get<0>(hist), |
802 | 0 | std::get<1>(hist)); |
803 | 0 | } |
804 | 0 | } |
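The tickers and histogram recorded here are only populated when a Statistics object is installed; bg_error_stats_ is presumably wired from DBOptions::statistics (an assumption based on error_handler.h, not shown in this file). A minimal sketch of enabling it and reading one counter:

#include "rocksdb/options.h"
#include "rocksdb/statistics.h"

ROCKSDB_NAMESPACE::Options MakeOptionsWithErrorStats() {
  ROCKSDB_NAMESPACE::Options options;
  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
  return options;
}

// Later, e.g. in monitoring code:
//   uint64_t n = options.statistics->getTickerCount(
//       ROCKSDB_NAMESPACE::ERROR_HANDLER_BG_ERROR_COUNT);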
805 | | |
806 | | } // namespace ROCKSDB_NAMESPACE |