1
#include "source/common/signal/fatal_error_handler.h"
2

            
3
#include <atomic>
4
#include <list>
5

            
6
#include "envoy/event/dispatcher.h"
7

            
8
#include "source/common/common/assert.h"
9
#include "source/common/common/macros.h"
10
#include "source/common/signal/fatal_action.h"
11

            
12
#include "absl/base/attributes.h"
13
#include "absl/synchronization/mutex.h"
14

            
15
namespace Envoy {
16
namespace FatalErrorHandler {
17

            
18
namespace {
19

            
20
// The type of Fatal Actions.
// Selects which of the FatalActionManager's action lists runFatalActions()
// will execute.
21
enum class FatalActionType {
22
  // Runs the manager's getSafeActions() list. The first thread to request
  // these records its thread id in failure_tid.
  Safe,
23
  // Runs the manager's getUnsafeActions() list. Only executed on the thread
  // whose id is already recorded in failure_tid.
  Unsafe,
24
};
25

            
26
// Held by registerFatalErrorHandler() and removeFatalErrorHandler() while they
// modify the handler list. The fatal-error paths below do not take it.
ABSL_CONST_INIT static absl::Mutex failure_mutex(absl::kConstInit);
27
// Since we can't grab the failure mutex on fatal error (snagging locks under
28
// fatal crash causing potential deadlocks) access the handler list as an atomic
29
// operation, which is async-signal-safe. If the crash handler runs at the same
30
// time as another thread tries to modify the list, one of them will get the
31
// list and the other will get nullptr instead. If the crash handler loses the
32
// race and gets nullptr, it won't run any of the registered error handlers.
33
using FailureFunctionList = std::list<const FatalErrorHandlerInterface*>;
34
ABSL_CONST_INIT std::atomic<FailureFunctionList*> fatal_error_handlers{nullptr};
35

            
36
// Use an atomic operation since on fatal error we'll consume the
37
// fatal_action_manager and don't want to have any locks as they aren't
38
// async-signal-safe.
39
ABSL_CONST_INIT std::atomic<FatalAction::FatalActionManager*> fatal_action_manager{nullptr};
40
// Id of the thread that claimed the right to run fatal actions via a
// successful compare-exchange in runFatalActions(); -1 means no thread has
// claimed it yet (also the value restored by resetFatalActionStateForTest()).
ABSL_CONST_INIT std::atomic<int64_t> failure_tid{-1};
41

            
42
// Executes the Fatal Actions provided.
43
5
void runFatalActionsInternal(const FatalAction::FatalActionPtrList& actions) {
44
  // Exchange the fatal_error_handlers pointer so other functions cannot
45
  // concurrently access the list.
46
5
  FailureFunctionList* list = fatal_error_handlers.exchange(nullptr);
47
5
  if (list == nullptr) {
48
    return;
49
  }
50

            
51
  // Get the dispatcher and its tracked object.
52
5
  for (auto* handler : *list) {
53
5
    handler->runFatalActionsOnTrackedObject(actions);
54
5
  }
55

            
56
  // Restore the fatal_error_handlers pointer so subsequent calls using the list
57
  // can succeed.
58
5
  fatal_error_handlers.store(list);
59
5
}
60

            
61
// Helper function to run exclusively either safe or unsafe actions depending on
62
// the provided action_type.
63
// Returns a FatalAction status corresponding to our attempt to run the
64
// action_type.
65
11
FatalAction::Status runFatalActions(FatalActionType action_type) {
66
  // Check that registerFatalActions has already been called.
67
11
  FatalAction::FatalActionManager* action_manager = fatal_action_manager.load();
68

            
69
11
  if (action_manager == nullptr) {
70
3
    return FatalAction::Status::ActionManagerUnset;
71
3
  }
72

            
73
8
  int64_t my_tid = action_manager->getThreadFactory().currentThreadId().getId();
74

            
75
8
  if (action_type == FatalActionType::Safe) {
76
    // Try to run safe actions
77
5
    int64_t expected_tid = -1;
78

            
79
5
    if (failure_tid.compare_exchange_strong(expected_tid, my_tid)) {
80
      // Run the actions
81
3
      runFatalActionsInternal(action_manager->getSafeActions());
82
3
      return FatalAction::Status::Success;
83
3
    } else if (expected_tid == my_tid) {
84
1
      return FatalAction::Status::AlreadyRanOnThisThread;
85
1
    }
86

            
87
5
  } else {
88
    // Try to run unsafe actions
89
3
    int64_t failing_tid = failure_tid.load();
90

            
91
3
    ASSERT(failing_tid != -1);
92

            
93
3
    if (my_tid == failing_tid) {
94
2
      runFatalActionsInternal(action_manager->getUnsafeActions());
95
2
      return FatalAction::Status::Success;
96
2
    }
97
3
  }
98

            
99
2
  return FatalAction::Status::RunningOnAnotherThread;
100
8
}
101

            
102
} // namespace
103

            
104
77815
void registerFatalErrorHandler(const FatalErrorHandlerInterface& handler) {
105
77815
#ifdef ENVOY_OBJECT_TRACE_ON_DUMP
106
77815
  absl::MutexLock l(failure_mutex);
107
77815
  FailureFunctionList* list = fatal_error_handlers.exchange(nullptr);
108
77815
  if (list == nullptr) {
109
13485
    list = new FailureFunctionList;
110
13485
  }
111
77815
  list->push_back(&handler);
112
  // Store the fatal_error_handlers pointer now that the list is updated.
113
77815
  fatal_error_handlers.store(list);
114
#else
115
  UNREFERENCED_PARAMETER(handler);
116
#endif
117
77815
}
118

            
119
77814
void removeFatalErrorHandler(const FatalErrorHandlerInterface& handler) {
120
77814
#ifdef ENVOY_OBJECT_TRACE_ON_DUMP
121
77814
  absl::MutexLock l(failure_mutex);
122
77814
  FailureFunctionList* list = fatal_error_handlers.exchange(nullptr);
123
77814
  if (list == nullptr) {
124
    // removeFatalErrorHandler() may see an empty list of fatal error handlers
125
    // if it's called at the same time as callFatalErrorHandlers(). In that case
126
    // Envoy is in the middle of crashing anyway, but don't add a segfault on
127
    // top of the crash.
128
    return;
129
  }
130
77814
  list->remove(&handler);
131
77814
  if (list->empty()) {
132
13484
    delete list;
133
65850
  } else {
134
64330
    fatal_error_handlers.store(list);
135
64330
  }
136
#else
137
  UNREFERENCED_PARAMETER(handler);
138
#endif
139
77814
}
140

            
141
3
void callFatalErrorHandlers(std::ostream& os) {
142
3
  FailureFunctionList* list = fatal_error_handlers.exchange(nullptr);
143
3
  if (list != nullptr) {
144
2
    for (const auto* handler : *list) {
145
2
      handler->onFatalError(os);
146
2
    }
147

            
148
2
    fatal_error_handlers.store(list);
149
2
  }
150
3
}
151

            
152
void registerFatalActions(FatalAction::FatalActionPtrList safe_actions,
153
                          FatalAction::FatalActionPtrList unsafe_actions,
154
10674
                          Thread::ThreadFactory& thread_factory) {
155
  // Create a FatalActionManager and store it.
156
10674
  if (!fatal_action_manager) {
157
10672
    fatal_action_manager.exchange(new FatalAction::FatalActionManager(
158
10672
        std::move(safe_actions), std::move(unsafe_actions), thread_factory));
159
10672
  }
160
10674
}
161

            
162
7
// Attempts to run the safe fatal actions on the calling thread and returns
// the resulting status.
FatalAction::Status runSafeActions() {
  constexpr FatalActionType kind = FatalActionType::Safe;
  return runFatalActions(kind);
}
163

            
164
4
// Attempts to run the unsafe fatal actions on the calling thread and returns
// the resulting status.
FatalAction::Status runUnsafeActions() {
  constexpr FatalActionType kind = FatalActionType::Unsafe;
  return runFatalActions(kind);
}
165

            
166
10691
void clearFatalActionsOnTerminate() {
167
10691
  auto* raw_ptr = fatal_action_manager.exchange(nullptr);
168
10691
  if (raw_ptr != nullptr) {
169
10667
    delete raw_ptr;
170
10667
  }
171
10691
}
172

            
173
// This resets the internal state of Fatal Action for the module.
174
// This is necessary as it allows us to have multiple test cases invoke the
175
// fatal actions without state from other tests leaking in.
176
6
void resetFatalActionStateForTest() {
177
  // Free the memory of the Fatal Action, since it's not managed by a smart
178
  // pointer. This prevents memory leaks in tests.
179
6
  auto* raw_ptr = fatal_action_manager.exchange(nullptr);
180
6
  if (raw_ptr != nullptr) {
181
5
    delete raw_ptr;
182
5
  }
183
6
  failure_tid.store(-1);
184
6
}
185

            
186
} // namespace FatalErrorHandler
187
} // namespace Envoy