/src/pacemaker/lib/common/watchdog.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright 2013-2025 the Pacemaker project contributors |
3 | | * |
4 | | * The version control history for this file may have further details. |
5 | | * |
6 | | * This source code is licensed under the GNU Lesser General Public License |
7 | | * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. |
8 | | */ |
9 | | |
10 | | #include <crm_internal.h> |
11 | | |
12 | | #include <sched.h> |
13 | | #include <stdbool.h> |
14 | | #include <sys/ioctl.h> |
15 | | #include <sys/reboot.h> |
16 | | |
17 | | #include <sys/types.h> |
18 | | #include <sys/stat.h> |
19 | | #include <unistd.h> |
20 | | #include <ctype.h> |
21 | | #include <dirent.h> |
22 | | #include <signal.h> |
23 | | |
24 | | #include <glib.h> // g_str_has_prefix() |
25 | | #include <qb/qbdefs.h> // QB_MIN(), QB_MAX() |
26 | | |
27 | | static pid_t sbd_pid = 0; |
28 | | |
29 | | /*! |
30 | | * \internal |
31 | | * \brief Tell pacemakerd to panic the local host |
32 | | * |
33 | | * \param[in] ppid Process ID of parent process |
34 | | */ |
35 | | static void |
36 | | panic_local_nonroot(pid_t ppid) |
37 | 0 | { |
38 | 0 | if (ppid > 1) { // pacemakerd is still our parent |
39 | 0 | pcmk__emerg("Escalating panic to " PCMK__SERVER_PACEMAKERD "[%lld]", |
40 | 0 | (long long) ppid); |
41 | 0 | } else { // Signal (non-parent) pacemakerd if possible |
42 | 0 | ppid = pcmk__procfs_pid_of(PCMK__SERVER_PACEMAKERD); |
43 | 0 | if (ppid > 0) { |
44 | 0 | union sigval signal_value; |
45 | |
|
46 | 0 | pcmk__emerg("Signaling " PCMK__SERVER_PACEMAKERD "[%lld] to panic", |
47 | 0 | (long long) ppid); |
48 | 0 | memset(&signal_value, 0, sizeof(signal_value)); |
49 | 0 | if (sigqueue(ppid, SIGQUIT, signal_value) < 0) { |
50 | 0 | pcmk__emerg("Exiting after signal failure: %s", |
51 | 0 | strerror(errno)); |
52 | 0 | } |
53 | 0 | } else { |
54 | 0 | pcmk__emerg("Exiting with no known " PCMK__SERVER_PACEMAKERD |
55 | 0 | "process"); |
56 | 0 | } |
57 | 0 | } |
58 | 0 | crm_exit(CRM_EX_PANIC); |
59 | 0 | } |
60 | | |
61 | | /*! |
62 | | * \internal |
63 | | * \brief Panic the local host (if root) or tell pacemakerd to do so |
64 | | */ |
65 | | static void |
66 | | panic_local(void) |
67 | 0 | { |
68 | 0 | const char *full_panic_action = pcmk__env_option(PCMK__ENV_PANIC_ACTION); |
69 | 0 | const char *panic_action = full_panic_action; |
70 | 0 | int reboot_cmd = RB_AUTOBOOT; // Default panic action is reboot |
71 | |
|
72 | 0 | if (geteuid() != 0) { // Non-root caller such as the controller |
73 | 0 | panic_local_nonroot(getppid()); |
74 | 0 | return; |
75 | 0 | } |
76 | | |
77 | 0 | if ((full_panic_action != NULL) |
78 | 0 | && g_str_has_prefix(full_panic_action, "sync-")) { |
79 | |
|
80 | 0 | panic_action += sizeof("sync-") - 1; |
81 | 0 | sync(); |
82 | 0 | } |
83 | |
|
84 | 0 | if (pcmk__str_empty(full_panic_action) |
85 | 0 | || pcmk__str_eq(panic_action, PCMK_VALUE_REBOOT, pcmk__str_none)) { |
86 | 0 | pcmk__sysrq_trigger('b'); |
87 | |
|
88 | 0 | } else if (pcmk__str_eq(panic_action, PCMK_VALUE_CRASH, pcmk__str_none)) { |
89 | 0 | pcmk__sysrq_trigger('c'); |
90 | |
|
91 | 0 | } else if (pcmk__str_eq(panic_action, PCMK_VALUE_OFF, pcmk__str_none)) { |
92 | 0 | pcmk__sysrq_trigger('o'); |
93 | 0 | #ifdef RB_POWER_OFF |
94 | 0 | reboot_cmd = RB_POWER_OFF; |
95 | | #elif defined(RB_POWEROFF) |
96 | | reboot_cmd = RB_POWEROFF; |
97 | | #endif |
98 | 0 | } else { |
99 | 0 | pcmk__warn("Using default '" PCMK_VALUE_REBOOT "' for local option " |
100 | 0 | "PCMK_" PCMK__ENV_PANIC_ACTION " because '%s' is not a " |
101 | 0 | "valid value", |
102 | 0 | full_panic_action); |
103 | 0 | pcmk__sysrq_trigger('b'); |
104 | 0 | } |
105 | | |
106 | | // sysrq failed or is not supported on this platform, so fall back to reboot |
107 | 0 | reboot(reboot_cmd); |
108 | | |
109 | | // Even reboot failed, nothing left to do but exit |
110 | 0 | pcmk__emerg("Exiting after reboot failed: %s", strerror(errno)); |
111 | 0 | if (getppid() > 1) { // pacemakerd is parent process |
112 | 0 | crm_exit(CRM_EX_PANIC); |
113 | 0 | } else { // This is pacemakerd, or an orphaned subdaemon |
114 | 0 | crm_exit(CRM_EX_FATAL); |
115 | 0 | } |
116 | 0 | } |
117 | | |
118 | | /*! |
119 | | * \internal |
120 | | * \brief Tell sbd to kill the local host, then exit |
121 | | */ |
122 | | static void |
123 | | panic_sbd(void) |
124 | 0 | { |
125 | 0 | union sigval signal_value; |
126 | 0 | pid_t ppid = getppid(); |
127 | |
|
128 | 0 | memset(&signal_value, 0, sizeof(signal_value)); |
129 | | /* TODO: Arrange for a slightly less brutal option? */ |
130 | 0 | if(sigqueue(sbd_pid, SIGKILL, signal_value) < 0) { |
131 | 0 | pcmk__emerg("Panicking directly because couldn't signal sbd"); |
132 | 0 | panic_local(); |
133 | 0 | } |
134 | |
|
135 | 0 | if(ppid > 1) { |
136 | | /* child daemon */ |
137 | 0 | crm_exit(CRM_EX_PANIC); |
138 | 0 | } else { |
139 | | /* pacemakerd or orphan child */ |
140 | 0 | crm_exit(CRM_EX_FATAL); |
141 | 0 | } |
142 | 0 | } |
143 | | |
144 | | /*! |
145 | | * \internal |
146 | | * \brief Panic the local host |
147 | | * |
148 | | * Panic the local host either by sbd (if running), directly, or by asking |
149 | | * pacemakerd. If trace logging this function, exit instead. |
150 | | * |
151 | | * \param[in] reason Why panic is needed (for logging only) |
152 | | */ |
153 | | void |
154 | | pcmk__panic(const char *reason) |
155 | 0 | { |
156 | 0 | if (pcmk__locate_sbd() > 1) { |
157 | 0 | pcmk__emerg("Signaling sbd[%lld] to panic the system: %s", |
158 | 0 | (long long) sbd_pid, reason); |
159 | 0 | panic_sbd(); |
160 | |
|
161 | 0 | } else { |
162 | 0 | pcmk__emerg("Panicking the system directly: %s", reason); |
163 | 0 | panic_local(); |
164 | 0 | } |
165 | 0 | } |
166 | | |
167 | 0 | #define PIDFILE PCMK__RUN_DIR "/sbd.pid" |
168 | | |
169 | | /*! |
170 | | * \internal |
171 | | * \brief Return the process ID of sbd (or 0 if it is not running) |
172 | | */ |
173 | | pid_t |
174 | | pcmk__locate_sbd(void) |
175 | 0 | { |
176 | 0 | gchar *contents = NULL; |
177 | 0 | long long pid_read = 0; |
178 | |
|
179 | 0 | if (sbd_pid > 1) { |
180 | 0 | return sbd_pid; |
181 | 0 | } |
182 | | |
183 | 0 | if (g_file_get_contents(PIDFILE, &contents, NULL, NULL) |
184 | 0 | && (pcmk__scan_ll(contents, &pid_read, 0) == pcmk_rc_ok) |
185 | 0 | && (pcmk__pid_active((pid_t) pid_read, |
186 | 0 | SBIN_DIR "/sbd") != ESRCH)) { |
187 | | |
188 | | /* If the pcmk__pid_active() return code is neither pcmk__rc_ok nor |
189 | | * ESRCH, then we couldn't determine whether the PID belongs to |
190 | | * SBIN_DIR "/sbd". In that case, we assume that it does. |
191 | | * |
192 | | * @TODO Make sure that's what we want to do. |
193 | | */ |
194 | 0 | pcmk__trace("SBD detected at pid %lld (via PID file " PIDFILE ")", |
195 | 0 | pid_read); |
196 | 0 | sbd_pid = (pid_t) pid_read; |
197 | |
|
198 | 0 | } else { |
199 | 0 | unlink(PIDFILE); |
200 | | |
201 | | // Fall back to /proc for systems that support it |
202 | 0 | sbd_pid = pcmk__procfs_pid_of("sbd"); |
203 | 0 | if (sbd_pid != 0) { |
204 | 0 | pcmk__trace("SBD detected at pid %lld (via procfs)", |
205 | 0 | (long long) sbd_pid); |
206 | 0 | } |
207 | 0 | } |
208 | | |
209 | 0 | if (sbd_pid <= 0) { |
210 | 0 | sbd_pid = 0; |
211 | 0 | pcmk__trace("SBD not detected"); |
212 | 0 | } |
213 | 0 | g_free(contents); |
214 | 0 | return sbd_pid; |
215 | 0 | } |
216 | | |
217 | | // 0 <= return value <= LONG_MAX |
218 | | long |
219 | | pcmk__get_sbd_watchdog_timeout(void) |
220 | 0 | { |
221 | 0 | static long sbd_timeout = -1; |
222 | |
|
223 | 0 | if (sbd_timeout == -1) { |
224 | 0 | const char *timeout = getenv("SBD_WATCHDOG_TIMEOUT"); |
225 | 0 | long long timeout_ms = 0; |
226 | |
|
227 | 0 | if ((timeout != NULL) |
228 | 0 | && (pcmk__parse_ms(timeout, &timeout_ms) == pcmk_rc_ok) |
229 | 0 | && (timeout_ms >= 0)) { |
230 | |
|
231 | 0 | sbd_timeout = (long) QB_MIN(timeout_ms, LONG_MAX); |
232 | |
|
233 | 0 | } else { |
234 | 0 | sbd_timeout = 0; |
235 | 0 | } |
236 | 0 | } |
237 | 0 | return sbd_timeout; |
238 | 0 | } |
239 | | |
240 | | bool |
241 | | pcmk__get_sbd_sync_resource_startup(void) |
242 | 0 | { |
243 | 0 | static bool sync_resource_startup = PCMK__SBD_SYNC_DEFAULT; |
244 | 0 | static bool checked_sync_resource_startup = false; |
245 | |
|
246 | 0 | if (!checked_sync_resource_startup) { |
247 | 0 | const char *sync_env = getenv("SBD_SYNC_RESOURCE_STARTUP"); |
248 | |
|
249 | 0 | if (sync_env == NULL) { |
250 | 0 | pcmk__trace("Defaulting to %sstart-up synchronization with sbd", |
251 | 0 | (PCMK__SBD_SYNC_DEFAULT? "" : "no ")); |
252 | |
|
253 | 0 | } else if (pcmk__parse_bool(sync_env, |
254 | 0 | &sync_resource_startup) != pcmk_rc_ok) { |
255 | 0 | pcmk__warn("Defaulting to %sstart-up synchronization with sbd " |
256 | 0 | "because environment value '%s' is invalid", |
257 | 0 | (PCMK__SBD_SYNC_DEFAULT? "" : "no "), sync_env); |
258 | 0 | } |
259 | 0 | checked_sync_resource_startup = true; |
260 | 0 | } |
261 | 0 | return sync_resource_startup; |
262 | 0 | } |
263 | | |
/*!
 * \internal
 * \brief Calculate the automatic fencing watchdog timeout
 *
 * The automatic value is twice the SBD watchdog timeout, saturating at
 * LONG_MAX.
 *
 * \return Timeout such that
 *         0 <= return value <= min(LONG_MAX, (2 * SBD timeout))
 */
long
pcmk__auto_fencing_watchdog_timeout(void)
{
    const long sbd_timeout = pcmk__get_sbd_watchdog_timeout();

    /* Clamp before doubling. Where long and long long have the same width,
     * doubling first (even in long long) would be signed overflow for any
     * SBD timeout above LONG_MAX / 2.
     */
    if (sbd_timeout > (LONG_MAX / 2)) {
        return LONG_MAX;
    }
    return 2 * sbd_timeout;
}
273 | | |
274 | | bool |
275 | | pcmk__valid_fencing_watchdog_timeout(const char *value) |
276 | 0 | { |
277 | | /* @COMPAT At a compatibility break, accept either negative values or a |
278 | | * specific string like "auto" (but not both) to mean "auto-calculate the |
279 | | * timeout." Reject other values that aren't parsable as timeouts. |
280 | | */ |
281 | 0 | long long st_timeout = 0; |
282 | |
|
283 | 0 | if ((value != NULL) && (pcmk__parse_ms(value, &st_timeout) == pcmk_rc_ok)) { |
284 | 0 | st_timeout = QB_MIN(st_timeout, LONG_MAX); |
285 | 0 | } |
286 | |
|
287 | 0 | if (st_timeout < 0) { |
288 | 0 | st_timeout = pcmk__auto_fencing_watchdog_timeout(); |
289 | | |
290 | | // At this point, 0 <= sbd_timeout <= st_timeout |
291 | 0 | pcmk__debug("Using calculated value %lld for " |
292 | 0 | PCMK_OPT_FENCING_WATCHDOG_TIMEOUT " (%s)", |
293 | 0 | st_timeout, value); |
294 | 0 | } |
295 | | |
296 | 0 | if (st_timeout == 0) { |
297 | 0 | pcmk__debug("Watchdog may be enabled but " |
298 | 0 | PCMK_OPT_FENCING_WATCHDOG_TIMEOUT " is disabled (%s)", |
299 | 0 | pcmk__s(value, "default")); |
300 | |
|
301 | 0 | } else if (pcmk__locate_sbd() == 0) { |
302 | 0 | pcmk__emerg("Shutting down: " PCMK_OPT_FENCING_WATCHDOG_TIMEOUT |
303 | 0 | " configured (%s) but SBD not active", |
304 | 0 | pcmk__s(value, "auto")); |
305 | 0 | crm_exit(CRM_EX_FATAL); |
306 | 0 | return false; |
307 | |
|
308 | 0 | } else { |
309 | 0 | long sbd_timeout = pcmk__get_sbd_watchdog_timeout(); |
310 | |
|
311 | 0 | if (st_timeout < sbd_timeout) { |
312 | | /* Passed-in value for PCMK_OPT_FENCING_WATCHDOG_TIMEOUT was |
313 | | * parsable, positive, and less than the SBD_WATCHDOG_TIMEOUT |
314 | | */ |
315 | 0 | pcmk__emerg("Shutting down: " PCMK_OPT_FENCING_WATCHDOG_TIMEOUT |
316 | 0 | " (%s) too short (must be >%ldms)", |
317 | 0 | value, sbd_timeout); |
318 | 0 | crm_exit(CRM_EX_FATAL); |
319 | 0 | return false; |
320 | 0 | } |
321 | 0 | pcmk__info("Watchdog configured with " PCMK_OPT_FENCING_WATCHDOG_TIMEOUT |
322 | 0 | " %s and SBD timeout %ldms", |
323 | 0 | value, sbd_timeout); |
324 | 0 | } |
325 | 0 | return true; |
326 | 0 | } |