Coverage Report

Created: 2025-12-31 06:13

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/pacemaker/lib/common/watchdog.c
Line
Count
Source
1
/*
2
 * Copyright 2013-2025 the Pacemaker project contributors
3
 *
4
 * The version control history for this file may have further details.
5
 *
6
 * This source code is licensed under the GNU Lesser General Public License
7
 * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
8
 */
9
10
#include <crm_internal.h>
11
12
#include <sched.h>
13
#include <stdbool.h>
14
#include <sys/ioctl.h>
15
#include <sys/reboot.h>
16
17
#include <sys/types.h>
18
#include <sys/stat.h>
19
#include <unistd.h>
20
#include <ctype.h>
21
#include <dirent.h>
22
#include <signal.h>
23
24
#include <glib.h>           // g_str_has_prefix()
25
#include <qb/qbdefs.h>      // QB_MIN(), QB_MAX()
26
27
// Cached process ID of sbd; 0 until pcmk__locate_sbd() finds a running sbd
static pid_t sbd_pid = 0;
28
29
/*!
30
 * \internal
31
 * \brief Tell pacemakerd to panic the local host
32
 *
33
 * \param[in] ppid  Process ID of parent process
34
 */
35
static void
36
panic_local_nonroot(pid_t ppid)
37
0
{
38
0
    if (ppid > 1) { // pacemakerd is still our parent
39
0
        pcmk__emerg("Escalating panic to " PCMK__SERVER_PACEMAKERD "[%lld]",
40
0
                    (long long) ppid);
41
0
    } else { // Signal (non-parent) pacemakerd if possible
42
0
        ppid = pcmk__procfs_pid_of(PCMK__SERVER_PACEMAKERD);
43
0
        if (ppid > 0) {
44
0
            union sigval signal_value;
45
46
0
            pcmk__emerg("Signaling " PCMK__SERVER_PACEMAKERD "[%lld] to panic",
47
0
                        (long long) ppid);
48
0
            memset(&signal_value, 0, sizeof(signal_value));
49
0
            if (sigqueue(ppid, SIGQUIT, signal_value) < 0) {
50
0
                pcmk__emerg("Exiting after signal failure: %s",
51
0
                            strerror(errno));
52
0
            }
53
0
        } else {
54
0
            pcmk__emerg("Exiting with no known " PCMK__SERVER_PACEMAKERD
55
0
                        "process");
56
0
        }
57
0
    }
58
0
    crm_exit(CRM_EX_PANIC);
59
0
}
60
61
/*!
62
 * \internal
63
 * \brief Panic the local host (if root) or tell pacemakerd to do so
64
 */
65
static void
66
panic_local(void)
67
0
{
68
0
    const char *full_panic_action = pcmk__env_option(PCMK__ENV_PANIC_ACTION);
69
0
    const char *panic_action = full_panic_action;
70
0
    int reboot_cmd = RB_AUTOBOOT; // Default panic action is reboot
71
72
0
    if (geteuid() != 0) { // Non-root caller such as the controller
73
0
        panic_local_nonroot(getppid());
74
0
        return;
75
0
    }
76
77
0
    if ((full_panic_action != NULL)
78
0
        && g_str_has_prefix(full_panic_action, "sync-")) {
79
80
0
        panic_action += sizeof("sync-") - 1;
81
0
        sync();
82
0
    }
83
84
0
    if (pcmk__str_empty(full_panic_action)
85
0
        || pcmk__str_eq(panic_action, PCMK_VALUE_REBOOT, pcmk__str_none)) {
86
0
        pcmk__sysrq_trigger('b');
87
88
0
    } else if (pcmk__str_eq(panic_action, PCMK_VALUE_CRASH, pcmk__str_none)) {
89
0
        pcmk__sysrq_trigger('c');
90
91
0
    } else if (pcmk__str_eq(panic_action, PCMK_VALUE_OFF, pcmk__str_none)) {
92
0
        pcmk__sysrq_trigger('o');
93
0
#ifdef RB_POWER_OFF
94
0
        reboot_cmd = RB_POWER_OFF;
95
#elif defined(RB_POWEROFF)
96
        reboot_cmd = RB_POWEROFF;
97
#endif
98
0
    } else {
99
0
        pcmk__warn("Using default '" PCMK_VALUE_REBOOT "' for local option "
100
0
                   "PCMK_" PCMK__ENV_PANIC_ACTION " because '%s' is not a "
101
0
                   "valid value",
102
0
                   full_panic_action);
103
0
        pcmk__sysrq_trigger('b');
104
0
    }
105
106
    // sysrq failed or is not supported on this platform, so fall back to reboot
107
0
    reboot(reboot_cmd);
108
109
    // Even reboot failed, nothing left to do but exit
110
0
    pcmk__emerg("Exiting after reboot failed: %s", strerror(errno));
111
0
    if (getppid() > 1) { // pacemakerd is parent process
112
0
        crm_exit(CRM_EX_PANIC);
113
0
    } else { // This is pacemakerd, or an orphaned subdaemon
114
0
        crm_exit(CRM_EX_FATAL);
115
0
    }
116
0
}
117
118
/*!
119
 * \internal
120
 * \brief Tell sbd to kill the local host, then exit
121
 */
122
static void
123
panic_sbd(void)
124
0
{
125
0
    union sigval signal_value;
126
0
    pid_t ppid = getppid();
127
128
0
    memset(&signal_value, 0, sizeof(signal_value));
129
    /* TODO: Arrange for a slightly less brutal option? */
130
0
    if(sigqueue(sbd_pid, SIGKILL, signal_value) < 0) {
131
0
        pcmk__emerg("Panicking directly because couldn't signal sbd");
132
0
        panic_local();
133
0
    }
134
135
0
    if(ppid > 1) {
136
        /* child daemon */
137
0
        crm_exit(CRM_EX_PANIC);
138
0
    } else {
139
        /* pacemakerd or orphan child */
140
0
        crm_exit(CRM_EX_FATAL);
141
0
    }
142
0
}
143
144
/*!
145
 * \internal
146
 * \brief Panic the local host
147
 *
148
 * Panic the local host either by sbd (if running), directly, or by asking
149
 * pacemakerd. If trace logging this function, exit instead.
150
 *
151
 * \param[in] reason  Why panic is needed (for logging only)
152
 */
153
void
154
pcmk__panic(const char *reason)
155
0
{
156
0
    if (pcmk__locate_sbd() > 1) {
157
0
        pcmk__emerg("Signaling sbd[%lld] to panic the system: %s",
158
0
                    (long long) sbd_pid, reason);
159
0
        panic_sbd();
160
161
0
    } else {
162
0
        pcmk__emerg("Panicking the system directly: %s", reason);
163
0
        panic_local();
164
0
    }
165
0
}
166
167
0
// PID file checked first by pcmk__locate_sbd() when looking for sbd
#define PIDFILE PCMK__RUN_DIR "/sbd.pid"
168
169
/*!
170
 * \internal
171
 * \brief Return the process ID of sbd (or 0 if it is not running)
172
 */
173
pid_t
174
pcmk__locate_sbd(void)
175
0
{
176
0
    gchar *contents = NULL;
177
0
    long long pid_read = 0;
178
179
0
    if (sbd_pid > 1) {
180
0
        return sbd_pid;
181
0
    }
182
183
0
    if (g_file_get_contents(PIDFILE, &contents, NULL, NULL)
184
0
        && (pcmk__scan_ll(contents, &pid_read, 0) == pcmk_rc_ok)
185
0
        && (pcmk__pid_active((pid_t) pid_read,
186
0
                             SBIN_DIR "/sbd") != ESRCH)) {
187
188
        /* If the pcmk__pid_active() return code is neither pcmk__rc_ok nor
189
         * ESRCH, then we couldn't determine whether the PID belongs to
190
         * SBIN_DIR "/sbd". In that case, we assume that it does.
191
         *
192
         * @TODO Make sure that's what we want to do.
193
         */
194
0
        pcmk__trace("SBD detected at pid %lld (via PID file " PIDFILE ")",
195
0
                    pid_read);
196
0
        sbd_pid = (pid_t) pid_read;
197
198
0
    } else {
199
0
        unlink(PIDFILE);
200
201
        // Fall back to /proc for systems that support it
202
0
        sbd_pid = pcmk__procfs_pid_of("sbd");
203
0
        if (sbd_pid != 0) {
204
0
            pcmk__trace("SBD detected at pid %lld (via procfs)",
205
0
                        (long long) sbd_pid);
206
0
        }
207
0
    }
208
209
0
    if (sbd_pid <= 0) {
210
0
        sbd_pid = 0;
211
0
        pcmk__trace("SBD not detected");
212
0
    }
213
0
    g_free(contents);
214
0
    return sbd_pid;
215
0
}
216
217
// 0 <= return value <= LONG_MAX
218
long
219
pcmk__get_sbd_watchdog_timeout(void)
220
0
{
221
0
    static long sbd_timeout = -1;
222
223
0
    if (sbd_timeout == -1) {
224
0
        const char *timeout = getenv("SBD_WATCHDOG_TIMEOUT");
225
0
        long long timeout_ms = 0;
226
227
0
        if ((timeout != NULL)
228
0
            && (pcmk__parse_ms(timeout, &timeout_ms) == pcmk_rc_ok)
229
0
            && (timeout_ms >= 0)) {
230
231
0
            sbd_timeout = (long) QB_MIN(timeout_ms, LONG_MAX);
232
233
0
        } else {
234
0
            sbd_timeout = 0;
235
0
        }
236
0
    }
237
0
    return sbd_timeout;
238
0
}
239
240
bool
241
pcmk__get_sbd_sync_resource_startup(void)
242
0
{
243
0
    static bool sync_resource_startup = PCMK__SBD_SYNC_DEFAULT;
244
0
    static bool checked_sync_resource_startup = false;
245
246
0
    if (!checked_sync_resource_startup) {
247
0
        const char *sync_env = getenv("SBD_SYNC_RESOURCE_STARTUP");
248
249
0
        if (sync_env == NULL) {
250
0
            pcmk__trace("Defaulting to %sstart-up synchronization with sbd",
251
0
                        (PCMK__SBD_SYNC_DEFAULT? "" : "no "));
252
253
0
        } else if (pcmk__parse_bool(sync_env,
254
0
                                    &sync_resource_startup) != pcmk_rc_ok) {
255
0
            pcmk__warn("Defaulting to %sstart-up synchronization with sbd "
256
0
                       "because environment value '%s' is invalid",
257
0
                       (PCMK__SBD_SYNC_DEFAULT? "" : "no "), sync_env);
258
0
        }
259
0
        checked_sync_resource_startup = true;
260
0
    }
261
0
    return sync_resource_startup;
262
0
}
263
264
/*!
 * \internal
 * \brief Calculate the automatic fencing watchdog timeout
 *
 * The result is twice the SBD watchdog timeout, clamped to \c LONG_MAX. The
 * clamp is applied before doubling, so the multiplication cannot overflow
 * even on platforms where \c long is as wide as <tt>long long</tt>.
 *
 * \return Timeout such that
 *         0 <= return value <= min(LONG_MAX, (2 * SBD timeout))
 */
long
pcmk__auto_fencing_watchdog_timeout(void)
{
    const long sbd_timeout = pcmk__get_sbd_watchdog_timeout();

    if (sbd_timeout > (LONG_MAX / 2)) {
        // Doubling would exceed LONG_MAX, so saturate instead
        return LONG_MAX;
    }
    return 2 * sbd_timeout;
}
273
274
bool
275
pcmk__valid_fencing_watchdog_timeout(const char *value)
276
0
{
277
    /* @COMPAT At a compatibility break, accept either negative values or a
278
     * specific string like "auto" (but not both) to mean "auto-calculate the
279
     * timeout." Reject other values that aren't parsable as timeouts.
280
     */
281
0
    long long st_timeout = 0;
282
283
0
    if ((value != NULL) && (pcmk__parse_ms(value, &st_timeout) == pcmk_rc_ok)) {
284
0
        st_timeout = QB_MIN(st_timeout, LONG_MAX);
285
0
    }
286
287
0
    if (st_timeout < 0) {
288
0
        st_timeout = pcmk__auto_fencing_watchdog_timeout();
289
290
        // At this point, 0 <= sbd_timeout <= st_timeout
291
0
        pcmk__debug("Using calculated value %lld for "
292
0
                    PCMK_OPT_FENCING_WATCHDOG_TIMEOUT " (%s)",
293
0
                    st_timeout, value);
294
0
    }
295
296
0
    if (st_timeout == 0) {
297
0
        pcmk__debug("Watchdog may be enabled but "
298
0
                    PCMK_OPT_FENCING_WATCHDOG_TIMEOUT " is disabled (%s)",
299
0
                    pcmk__s(value, "default"));
300
301
0
    } else if (pcmk__locate_sbd() == 0) {
302
0
        pcmk__emerg("Shutting down: " PCMK_OPT_FENCING_WATCHDOG_TIMEOUT
303
0
                    " configured (%s) but SBD not active",
304
0
                    pcmk__s(value, "auto"));
305
0
        crm_exit(CRM_EX_FATAL);
306
0
        return false;
307
308
0
    } else {
309
0
        long sbd_timeout = pcmk__get_sbd_watchdog_timeout();
310
311
0
        if (st_timeout < sbd_timeout) {
312
            /* Passed-in value for PCMK_OPT_FENCING_WATCHDOG_TIMEOUT was
313
             * parsable, positive, and less than the SBD_WATCHDOG_TIMEOUT
314
             */
315
0
            pcmk__emerg("Shutting down: " PCMK_OPT_FENCING_WATCHDOG_TIMEOUT
316
0
                        " (%s) too short (must be >%ldms)",
317
0
                        value, sbd_timeout);
318
0
            crm_exit(CRM_EX_FATAL);
319
0
            return false;
320
0
        }
321
0
        pcmk__info("Watchdog configured with " PCMK_OPT_FENCING_WATCHDOG_TIMEOUT
322
0
                   " %s and SBD timeout %ldms",
323
0
                   value, sbd_timeout);
324
0
    }
325
0
    return true;
326
0
}