Coverage Report

Created: 2026-04-01 06:50

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/perfetto/src/base/watchdog_posix.cc
Line
Count
Source
1
/*
2
 * Copyright (C) 2018 The Android Open Source Project
3
 *
4
 * Licensed under the Apache License, Version 2.0 (the "License");
5
 * you may not use this file except in compliance with the License.
6
 * You may obtain a copy of the License at
7
 *
8
 *      http://www.apache.org/licenses/LICENSE-2.0
9
 *
10
 * Unless required by applicable law or agreed to in writing, software
11
 * distributed under the License is distributed on an "AS IS" BASIS,
12
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
 * See the License for the specific language governing permissions and
14
 * limitations under the License.
15
 */
16
17
#include "perfetto/ext/base/platform.h"
18
#include "perfetto/ext/base/watchdog.h"
19
20
#if PERFETTO_BUILDFLAG(PERFETTO_WATCHDOG)
21
22
#include <fcntl.h>
23
#include <poll.h>
24
#include <signal.h>
25
#include <stdint.h>
26
#include <stdlib.h>
27
#include <sys/syscall.h>
28
#include <sys/timerfd.h>
29
#include <unistd.h>
30
31
#include <algorithm>
32
#include <cinttypes>
33
#include <fstream>
34
#include <thread>
35
36
#include "perfetto/base/build_config.h"
37
#include "perfetto/base/logging.h"
38
#include "perfetto/base/thread_utils.h"
39
#include "perfetto/base/time.h"
40
#include "perfetto/ext/base/crash_keys.h"
41
#include "perfetto/ext/base/file_utils.h"
42
#include "perfetto/ext/base/scoped_file.h"
43
#include "perfetto/ext/base/utils.h"
44
45
namespace perfetto {
46
namespace base {
47
48
namespace {
49
50
constexpr uint32_t kDefaultPollingInterval = 30 * 1000;
51
52
base::CrashKey g_crash_key_reason("wdog_reason");
53
54
836
// Returns true iff |number| is a non-zero multiple of |divisor|.
// Zero is never reported as a multiple (|number| must be >= |divisor|), and a
// zero |divisor| yields false instead of evaluating |number % 0|, which is
// undefined behavior.
bool IsMultipleOf(uint32_t number, uint32_t divisor) {
  if (divisor == 0)
    return false;
  return number >= divisor && number % divisor == 0;
}
57
58
0
// Returns the arithmetic mean of the first |size| elements of |array|, or 0
// when |size| is 0.
double MeanForArray(const uint64_t array[], size_t size) {
  if (size == 0)
    return 0.0;
  uint64_t total = 0;
  for (size_t i = 0; i < size; i++) {
    total += array[i];
  }
  // Divide in floating point: an integer division would truncate the mean
  // before the conversion to double.
  return static_cast<double>(total) / static_cast<double>(size);
}
65
66
}  //  namespace
67
68
0
bool ReadProcStat(int fd, ProcStat* out) {
69
0
  char c[512];
70
0
  size_t c_pos = 0;
71
0
  while (c_pos < sizeof(c) - 1) {
72
0
    ssize_t rd = PERFETTO_EINTR(read(fd, c + c_pos, sizeof(c) - c_pos));
73
0
    if (rd < 0) {
74
0
      PERFETTO_ELOG("Failed to read stat file to enforce resource limits.");
75
0
      return false;
76
0
    }
77
0
    if (rd == 0)
78
0
      break;
79
0
    c_pos += static_cast<size_t>(rd);
80
0
  }
81
0
  PERFETTO_CHECK(c_pos < sizeof(c));
82
0
  c[c_pos] = '\0';
83
84
0
  if (sscanf(c,
85
0
             "%*d %*s %*c %*d %*d %*d %*d %*d %*u %*u %*u %*u %*u %lu "
86
0
             "%lu %*d %*d %*d %*d %*d %*d %*u %*u %ld",
87
0
             &out->utime, &out->stime, &out->rss_pages) != 3) {
88
0
    PERFETTO_ELOG("Invalid stat format: %s", c);
89
0
    return false;
90
0
  }
91
0
  return true;
92
0
}
93
94
Watchdog::Watchdog(uint32_t polling_interval_ms)
95
2
    : polling_interval_ms_(polling_interval_ms) {}
96
97
0
// Stops the watchdog thread, if one is joinable, and waits for it to exit.
Watchdog::~Watchdog() {
  const bool thread_running = thread_.joinable();
  if (!thread_running) {
    PERFETTO_DCHECK(!enabled_);
    return;
  }
  PERFETTO_DCHECK(enabled_);
  enabled_ = false;

  // Production code never gets here (the watchdog is a never-destroyed
  // singleton); this path exists for tests. Re-arm the timerfd so that it
  // fires 1ns from now: that wakes the watchdog thread out of its poll(), at
  // which point it observes |enabled_| == false and returns.
  struct itimerspec wakeup {};
  wakeup.it_value.tv_sec = 0;
  wakeup.it_value.tv_nsec = 1;
  timerfd_settime(*timer_fd_, /*flags=*/0, &wakeup, nullptr);

  thread_.join();
}
116
117
18.8k
// Returns the shared Watchdog instance. It is heap-allocated on the first call
// using kDefaultPollingInterval and is not deleted by this function.
Watchdog* Watchdog::GetInstance() {
  static Watchdog* const instance = new Watchdog(kDefaultPollingInterval);
  return instance;
}
121
122
// Can be called from any thread.
123
Watchdog::Timer Watchdog::CreateFatalTimer(uint32_t ms,
                                           WatchdogCrashReason crash_reason) {
  // While the watchdog is not enabled, hand out a Timer with a zero timeout
  // rather than the requested one.
  const bool watchdog_enabled = enabled_.load(std::memory_order_relaxed);
  const uint32_t timeout_ms = watchdog_enabled ? ms : 0;
  return Watchdog::Timer(this, timeout_ms, crash_reason);
}
130
131
// Can be called from any thread.
132
0
// Records |timer| in |timers_| and re-arms the timerfd, all under |mutex_|.
void Watchdog::AddFatalTimer(TimerData timer) {
  std::lock_guard<std::mutex> lock(mutex_);
  timers_.push_back(std::move(timer));
  RearmTimerFd_Locked();
}
137
138
// Can be called from any thread.
139
0
// Drops one entry equal to |timer| from |timers_| (if any) and re-arms the
// timerfd, all under |mutex_|.
void Watchdog::RemoveFatalTimer(TimerData timer) {
  std::lock_guard<std::mutex> lock(mutex_);
  // If several entries compare equal, only one is removed; it doesn't matter
  // which one.
  const auto match = std::find(timers_.begin(), timers_.end(), timer);
  if (match != timers_.end())
    timers_.erase(match);
  RearmTimerFd_Locked();
}
149
150
0
void Watchdog::RearmTimerFd_Locked() {
151
0
  if (!enabled_)
152
0
    return;
153
0
  auto it = std::min_element(timers_.begin(), timers_.end());
154
155
  // We use one timerfd to handle all the oustanding |timers_|. Keep it armed
156
  // to the task expiring soonest.
157
0
  struct itimerspec ts {};
158
0
  if (it != timers_.end()) {
159
0
    ts.it_value = ToPosixTimespec(it->deadline);
160
0
  }
161
  // If |timers_| is empty (it == end()) |ts.it_value| will remain
162
  // zero-initialized and that will disarm the timer in the call below.
163
0
  int res = timerfd_settime(*timer_fd_, TFD_TIMER_ABSTIME, &ts, nullptr);
164
0
  PERFETTO_DCHECK(res == 0);
165
0
}
166
167
0
void Watchdog::Start() {
168
0
  std::lock_guard<std::mutex> guard(mutex_);
169
0
  if (thread_.joinable()) {
170
0
    PERFETTO_DCHECK(enabled_);
171
0
  } else {
172
0
    PERFETTO_DCHECK(!enabled_);
173
174
0
#if PERFETTO_BUILDFLAG(PERFETTO_OS_LINUX) || \
175
0
    PERFETTO_BUILDFLAG(PERFETTO_OS_ANDROID)
176
    // Kick the thread to start running but only on Android or Linux.
177
0
    timer_fd_.reset(
178
0
        timerfd_create(CLOCK_MONOTONIC, TFD_CLOEXEC | TFD_NONBLOCK));
179
0
    if (!timer_fd_) {
180
0
      PERFETTO_PLOG(
181
0
          "timerfd_create failed, the Perfetto watchdog is not available");
182
0
      return;
183
0
    }
184
0
    enabled_ = true;
185
0
    RearmTimerFd_Locked();  // Deal with timers created before Start().
186
0
    thread_ = std::thread(&Watchdog::ThreadMain, this);
187
0
#endif
188
0
  }
189
0
}
190
191
836
void Watchdog::SetMemoryLimit(uint64_t bytes, uint32_t window_ms) {
192
  // Update the fields under the lock.
193
836
  std::lock_guard<std::mutex> guard(mutex_);
194
195
836
  PERFETTO_CHECK(IsMultipleOf(window_ms, polling_interval_ms_) || bytes == 0);
196
197
836
  size_t size = bytes == 0 ? 0 : window_ms / polling_interval_ms_ + 1;
198
836
  memory_window_bytes_.Reset(size);
199
836
  memory_limit_bytes_ = bytes;
200
836
}
201
202
0
void Watchdog::SetCpuLimit(uint32_t percentage, uint32_t window_ms) {
203
0
  std::lock_guard<std::mutex> guard(mutex_);
204
205
0
  PERFETTO_CHECK(percentage <= 100);
206
0
  PERFETTO_CHECK(IsMultipleOf(window_ms, polling_interval_ms_) ||
207
0
                 percentage == 0);
208
209
0
  size_t size = percentage == 0 ? 0 : window_ms / polling_interval_ms_ + 1;
210
0
  cpu_window_time_ticks_.Reset(size);
211
0
  cpu_limit_percentage_ = percentage;
212
0
}
213
214
0
void Watchdog::ThreadMain() {
215
  // Register crash keys explicitly to avoid running out of slots at crash time.
216
0
  g_crash_key_reason.Register();
217
218
0
  base::ScopedFile stat_fd(base::OpenFile("/proc/self/stat", O_RDONLY));
219
0
  if (!stat_fd) {
220
0
    PERFETTO_ELOG("Failed to open stat file to enforce resource limits.");
221
0
    return;
222
0
  }
223
224
0
  PERFETTO_DCHECK(timer_fd_);
225
226
0
  constexpr uint8_t kFdCount = 1;
227
0
  struct pollfd fds[kFdCount]{};
228
0
  fds[0].fd = *timer_fd_;
229
0
  fds[0].events = POLLIN;
230
231
0
  for (;;) {
232
    // We use the poll() timeout to drive the periodic ticks for the cpu/memory
233
    // checks. The only other case when the poll() unblocks is when we crash
234
    // (or have to quit via enabled_ == false, but that happens only in tests).
235
0
    platform::BeforeMaybeBlockingSyscall();
236
0
    auto ret = poll(fds, kFdCount, static_cast<int>(polling_interval_ms_));
237
0
    platform::AfterMaybeBlockingSyscall();
238
0
    if (!enabled_)
239
0
      return;
240
0
    if (ret < 0) {
241
0
      if (errno == ENOMEM || errno == EINTR) {
242
        // Should happen extremely rarely.
243
0
        std::this_thread::sleep_for(std::chrono::milliseconds(100));
244
0
        continue;
245
0
      }
246
0
      PERFETTO_FATAL("watchdog poll() failed");
247
0
    }
248
249
    // If we get here either:
250
    // 1. poll() timed out, in which case we should process cpu/mem guardrails.
251
    // 2. A timer expired, in which case we shall crash.
252
253
0
    uint64_t expired = 0;  // Must be exactly 8 bytes.
254
0
    auto res = PERFETTO_EINTR(read(*timer_fd_, &expired, sizeof(expired)));
255
0
    PERFETTO_DCHECK((res < 0 && (errno == EAGAIN)) ||
256
0
                    (res == sizeof(expired) && expired > 0));
257
0
    const auto now = GetWallTimeMs();
258
259
    // Check if any of the timers expired.
260
0
    int tid_to_kill = 0;
261
0
    WatchdogCrashReason crash_reason{};
262
0
    {
263
0
      std::lock_guard<std::mutex> guard(mutex_);
264
0
      for (const auto& timer : timers_) {
265
0
        if (now >= timer.deadline) {
266
0
          tid_to_kill = timer.thread_id;
267
0
          crash_reason = timer.crash_reason;
268
0
          break;
269
0
        }
270
0
      }
271
0
    }
272
273
0
    if (tid_to_kill)
274
0
      SerializeLogsAndKillThread(tid_to_kill, crash_reason);
275
276
    // Check CPU and memory guardrails (if enabled).
277
0
    lseek(stat_fd.get(), 0, SEEK_SET);
278
0
    ProcStat stat;
279
0
    if (!ReadProcStat(stat_fd.get(), &stat))
280
0
      continue;
281
0
    uint64_t cpu_time = stat.utime + stat.stime;
282
0
    uint64_t rss_bytes =
283
0
        static_cast<uint64_t>(stat.rss_pages) * base::GetSysPageSize();
284
285
0
    bool threshold_exceeded = false;
286
0
    {
287
0
      std::lock_guard<std::mutex> guard(mutex_);
288
0
      if (CheckMemory_Locked(rss_bytes) && !IsSyncMemoryTaggingEnabled()) {
289
0
        threshold_exceeded = true;
290
0
        crash_reason = WatchdogCrashReason::kMemGuardrail;
291
0
      } else if (CheckCpu_Locked(cpu_time)) {
292
0
        threshold_exceeded = true;
293
0
        crash_reason = WatchdogCrashReason::kCpuGuardrail;
294
0
      }
295
0
    }
296
297
0
    if (threshold_exceeded)
298
0
      SerializeLogsAndKillThread(getpid(), crash_reason);
299
0
  }
300
0
}
301
302
void Watchdog::SerializeLogsAndKillThread(int tid,
303
0
                                          WatchdogCrashReason crash_reason) {
304
0
  g_crash_key_reason.Set(static_cast<int>(crash_reason));
305
306
  // We are about to die. Serialize the logs into the crash buffer so the
307
  // debuggerd crash handler picks them up and attaches to the bugreport.
308
  // In the case of a PERFETTO_CHECK/PERFETTO_FATAL this is done in logging.h.
309
  // But in the watchdog case, we don't hit that codepath and must do ourselves.
310
0
  MaybeSerializeLastLogsForCrashReporting();
311
312
  // Send a SIGABRT to the thread that armed the timer. This is to see the
313
  // callstack of the thread that is stuck in a long task rather than the
314
  // watchdog thread.
315
0
  if (syscall(__NR_tgkill, getpid(), tid, SIGABRT) < 0) {
316
    // At this point the process must die. If for any reason the tgkill doesn't
317
    // work (e.g. the thread has disappeared), force a crash from here.
318
0
    abort();
319
0
  }
320
321
0
  if (disable_kill_failsafe_for_testing_)
322
0
    return;
323
324
  // The tgkill() above will take some milliseconds to cause a crash, as it
325
  // involves the kernel to queue the SIGABRT on the target thread (often the
326
  // main thread, which is != watchdog thread) and do a scheduling round.
327
  // If something goes wrong though (the target thread has signals masked or
328
  // is stuck in an uninterruptible+wakekill syscall) force quit from this
329
  // thread.
330
0
  std::this_thread::sleep_for(std::chrono::seconds(10));
331
0
  abort();
332
0
}
333
334
0
bool Watchdog::CheckMemory_Locked(uint64_t rss_bytes) {
335
0
  if (memory_limit_bytes_ == 0)
336
0
    return false;
337
338
  // Add the current stat value to the ring buffer and check that the mean
339
  // remains under our threshold.
340
0
  if (memory_window_bytes_.Push(rss_bytes)) {
341
0
    if (memory_window_bytes_.Mean() >
342
0
        static_cast<double>(memory_limit_bytes_)) {
343
0
      PERFETTO_ELOG(
344
0
          "Memory watchdog trigger. Memory window of %f bytes is above the "
345
0
          "%" PRIu64 " bytes limit.",
346
0
          memory_window_bytes_.Mean(), memory_limit_bytes_);
347
0
      return true;
348
0
    }
349
0
  }
350
0
  return false;
351
0
}
352
353
0
bool Watchdog::CheckCpu_Locked(uint64_t cpu_time) {
354
0
  if (cpu_limit_percentage_ == 0)
355
0
    return false;
356
357
  // Add the cpu time to the ring buffer.
358
0
  if (cpu_window_time_ticks_.Push(cpu_time)) {
359
    // Compute the percentage over the whole window and check that it remains
360
    // under the threshold.
361
0
    uint64_t difference_ticks = cpu_window_time_ticks_.NewestWhenFull() -
362
0
                                cpu_window_time_ticks_.OldestWhenFull();
363
0
    double window_interval_ticks =
364
0
        (static_cast<double>(WindowTimeForRingBuffer(cpu_window_time_ticks_)) /
365
0
         1000.0) *
366
0
        static_cast<double>(sysconf(_SC_CLK_TCK));
367
0
    double percentage = static_cast<double>(difference_ticks) /
368
0
                        static_cast<double>(window_interval_ticks) * 100;
369
0
    if (percentage > cpu_limit_percentage_) {
370
0
      PERFETTO_ELOG("CPU watchdog trigger. %f%% CPU use is above the %" PRIu32
371
0
                    "%% CPU limit.",
372
0
                    percentage, cpu_limit_percentage_);
373
0
      return true;
374
0
    }
375
0
  }
376
0
  return false;
377
0
}
378
379
0
uint32_t Watchdog::WindowTimeForRingBuffer(const WindowedInterval& window) {
380
0
  return static_cast<uint32_t>(window.size() - 1) * polling_interval_ms_;
381
0
}
382
383
0
bool Watchdog::WindowedInterval::Push(uint64_t sample) {
384
  // Add the sample to the current position in the ring buffer.
385
0
  buffer_[position_] = sample;
386
387
  // Update the position with next one circularily.
388
0
  position_ = (position_ + 1) % size_;
389
390
  // Set the filled flag the first time we wrap.
391
0
  filled_ = filled_ || position_ == 0;
392
0
  return filled_;
393
0
}
394
395
0
double Watchdog::WindowedInterval::Mean() const {
396
0
  return MeanForArray(buffer_.get(), size_);
397
0
}
398
399
0
void Watchdog::WindowedInterval::Clear() {
400
0
  position_ = 0;
401
0
  buffer_.reset(new uint64_t[size_]());
402
0
}
403
404
836
void Watchdog::WindowedInterval::Reset(size_t new_size) {
405
836
  position_ = 0;
406
836
  size_ = new_size;
407
836
  buffer_.reset(new_size == 0 ? nullptr : new uint64_t[new_size]());
408
836
}
409
410
// Arms a fatal timer on |watchdog| expiring |ms| milliseconds from now, owned
// by the calling thread. An |ms| of 0 creates a no-op timer that is never
// registered with the watchdog.
Watchdog::Timer::Timer(Watchdog* watchdog,
                       uint32_t ms,
                       WatchdogCrashReason crash_reason)
    : watchdog_(watchdog) {
  if (ms == 0)
    return;  // No-op timer created when the watchdog is disabled.

  const auto timeout = std::chrono::milliseconds(ms);
  timer_data_.deadline = GetWallTimeMs() + timeout;
  timer_data_.thread_id = GetThreadId();
  timer_data_.crash_reason = crash_reason;
  PERFETTO_DCHECK(watchdog_);
  watchdog_->AddFatalTimer(timer_data_);
}
422
423
18.0k
// Unregisters the timer from the watchdog. Timers whose deadline is zero are
// skipped.
Watchdog::Timer::~Timer() {
  const bool has_deadline = timer_data_.deadline.count() != 0;
  if (has_deadline)
    watchdog_->RemoveFatalTimer(timer_data_);
}
427
428
0
// Move constructor: takes over |other|'s watchdog pointer and timer data, and
// leaves |other| with a null watchdog and a default-constructed TimerData.
Watchdog::Timer::Timer(Timer&& other) noexcept {
  timer_data_ = std::move(other.timer_data_);
  watchdog_ = other.watchdog_;

  other.timer_data_ = TimerData();
  other.watchdog_ = nullptr;
}
434
435
}  // namespace base
436
}  // namespace perfetto
437
438
#endif  // PERFETTO_BUILDFLAG(PERFETTO_WATCHDOG)