1
#include "source/extensions/resource_monitors/cpu_utilization/linux_cpu_stats_reader.h"
2

            
3
#include <algorithm>
4
#include <chrono>
5
#include <sstream>
6
#include <vector>
7

            
8
#include "envoy/common/exception.h"
9
#include "envoy/common/time.h"
10

            
11
#include "source/common/common/assert.h"
12
#include "source/common/common/fmt.h"
13
#include "source/common/common/thread.h"
14

            
15
#include "absl/strings/numbers.h"
16
#include "absl/strings/str_split.h"
17
#include "absl/strings/strip.h"
18

            
19
namespace Envoy {
20
namespace Extensions {
21
namespace ResourceMonitors {
22
namespace CpuUtilizationMonitor {
23

            
24
// Number of leading counters consumed from the stats line: user, nice, system and idle times.
constexpr uint64_t NUMBER_OF_CPU_TIMES_TO_PARSE = 4;
26

            
27
namespace {
28

            
29
// Counts the CPUs described by a comma-separated CPU list such as "0", "0-3", "0,2,4" or
// "0-2,4". Trailing whitespace in the input is ignored.
//
// @param effective_cpu_list raw contents of the effective CPUs file.
// @param effective_path path of that file; only used in the log message.
// @return the number of CPUs listed, or InvalidArgumentError when an entry cannot be parsed,
//         a range is negative or reversed, or the resulting count is not positive.
absl::StatusOr<int> parseEffectiveCpus(absl::string_view effective_cpu_list,
                                       const std::string& effective_path) {
  const absl::string_view trimmed = absl::StripTrailingAsciiWhitespace(effective_cpu_list);

  int total_cpus = 0;
  for (const absl::string_view entry : absl::StrSplit(trimmed, ',')) {
    const size_t dash = entry.find('-');

    if (dash != absl::string_view::npos) {
      // Inclusive range "first-last", e.g. "0-3" contributes 4 CPUs.
      int first;
      int last;
      if (!absl::SimpleAtoi(entry.substr(0, dash), &first) ||
          !absl::SimpleAtoi(entry.substr(dash + 1), &last)) {
        return absl::InvalidArgumentError("Failed to parse CPU range");
      }
      if (first < 0 || last < first) {
        return absl::InvalidArgumentError("Invalid CPU range");
      }
      total_cpus += (last - first + 1);
      continue;
    }

    // Stand-alone CPU index, e.g. "0" or "4".
    int cpu_index;
    if (!absl::SimpleAtoi(entry, &cpu_index)) {
      return absl::InvalidArgumentError("Failed to parse CPU value");
    }
    if (cpu_index < 0) {
      return absl::InvalidArgumentError("Invalid CPU value");
    }
    ++total_cpus;
  }

  if (total_cpus <= 0) {
    ENVOY_LOG_MISC(error, "No CPUs found in {}", effective_path);
    return absl::InvalidArgumentError("No CPUs found");
  }

  return total_cpus;
}
69

            
70
16
// Computes how many cores the cgroup may use from the contents of cpu.max.
//
// cpu.max holds two whitespace-separated tokens, "<quota> <period>", where the quota is either
// the literal "max" or an integer.
//
// @param cpu_max_contents raw contents of the cpu.max file.
// @param cpu_count number of CPUs available to the cgroup; upper bound for the result.
// @return cpu_count when the quota is "max", otherwise min(cpu_count, quota / period).
//         InvalidArgumentError when a token is missing, is not an integer, or is not strictly
//         positive.
absl::StatusOr<double> parseEffectiveCores(absl::string_view cpu_max_contents, int cpu_count) {
  std::istringstream max_stream{std::string(cpu_max_contents)};
  std::string quota_str;
  std::string period_str;
  max_stream >> quota_str >> period_str;

  // Both tokens must be present, even when the quota is "max".
  if (!max_stream) {
    return absl::InvalidArgumentError("Unexpected cpu.max format");
  }

  // No quota configured: the cgroup can use every CPU it was given.
  if (quota_str == "max") {
    return static_cast<double>(cpu_count);
  }

  // Parsed as 64-bit so that values beyond the 32-bit range are accepted instead of being
  // reported as malformed.
  int64_t quota = 0;
  int64_t period = 0;
  if (!absl::SimpleAtoi(quota_str, &quota) || !absl::SimpleAtoi(period_str, &period)) {
    return absl::InvalidArgumentError("Failed to parse cpu.max values");
  }
  if (period <= 0) {
    return absl::InvalidArgumentError("Invalid cpu.max period");
  }
  // A zero or negative quota would produce a non-positive core count; the utilization
  // calculation divides by that count, so reject it here rather than emit NaN/inf later.
  if (quota <= 0) {
    return absl::InvalidArgumentError("Invalid cpu.max quota");
  }

  const double quota_cores = static_cast<double>(quota) / static_cast<double>(period);
  return std::min(static_cast<double>(cpu_count), quota_cores);
}
95

            
96
} // namespace
97

            
98
// LinuxCpuStatsReader (Host-level CPU monitoring)
99
LinuxCpuStatsReader::LinuxCpuStatsReader(const std::string& cpu_stats_filename)
100
10
    : cpu_stats_filename_(cpu_stats_filename) {}
101

            
102
14
// Reads the aggregate CPU counters from the configured stats file.
//
// The file must begin with "cpu" followed by two spaces (the line without a CPU index) and then
// at least NUMBER_OF_CPU_TIMES_TO_PARSE unsigned integers.
//
// @return {true, work, total} where work = user + nice + system and total = work + idle, or
//         {false, 0, 0} (after logging an error) when the file cannot be opened or parsed.
CpuTimesBase LinuxCpuStatsReader::getCpuTimes() {
  std::ifstream stats_stream(cpu_stats_filename_);
  if (!stats_stream.is_open()) {
    ENVOY_LOG_MISC(error, "Can't open linux cpu stats file {}", cpu_stats_filename_);
    return {false, 0, 0};
  }

  // Check the 5-byte line prefix before attempting to read any counters.
  std::string prefix(5, '\0');
  stats_stream.read(prefix.data(), 5);
  if (stats_stream.fail() || prefix != "cpu  ") {
    ENVOY_LOG_MISC(error, "Unexpected format in linux cpu stats file {}", cpu_stats_filename_);
    return {false, 0, 0};
  }

  std::array<uint64_t, NUMBER_OF_CPU_TIMES_TO_PARSE> counters;
  for (uint64_t& counter : counters) {
    if (!(stats_stream >> counter)) {
      ENVOY_LOG_MISC(error, "Unexpected format in linux cpu stats file {}", cpu_stats_filename_);
      return {false, 0, 0};
    }
  }

  const uint64_t busy_time = counters[0] + counters[1] + counters[2]; // user + nice + system
  const uint64_t overall_time = busy_time + counters[3];              // plus idle
  return {true, static_cast<double>(busy_time), overall_time};
}
134

            
135
10
// Returns the fraction of CPU time spent working since the previous successful sample.
//
// The first valid sample only seeds the baseline and yields 0. The baseline is advanced only
// when a utilization value is produced; it is left untouched on every error path.
//
// @return work delta / total delta, or InvalidArgumentError when the sample cannot be read, the
//         work counter went backwards, or the total counter did not advance.
absl::StatusOr<double> LinuxCpuStatsReader::getUtilization() {
  const CpuTimesBase sample = getCpuTimes();
  if (!sample.is_valid) {
    return absl::InvalidArgumentError("Failed to read CPU times");
  }

  // Nothing to compare against yet: record the baseline and report no load.
  if (!previous_cpu_times_.is_valid) {
    previous_cpu_times_ = sample;
    return 0.0;
  }

  const double work_delta = sample.work_time - previous_cpu_times_.work_time;
  const int64_t total_delta = sample.total_time - previous_cpu_times_.total_time;

  if (work_delta < 0 || total_delta <= 0) {
    return absl::InvalidArgumentError(
        fmt::format("Erroneous CPU stats calculation. Work_over_period='{}' cannot "
                    "be a negative number and total_over_period='{}' must be a positive number.",
                    work_delta, total_delta));
  }

  // The deltas are already computed, so the baseline can move forward before returning.
  previous_cpu_times_ = sample;
  return work_delta / total_delta;
}
165

            
166
// Builds the container stats reader matching the cgroup version detected through `fs`.
//
// cgroup v2 is probed first; cgroup v1 is only probed when v2 is not detected.
//
// @throws EnvoyException carrying NoSupportedCGroupMessage when neither version is detected.
LinuxContainerCpuStatsReader::ContainerStatsReaderPtr
LinuxContainerCpuStatsReader::create(Filesystem::Instance& fs, TimeSource& time_source) {
  if (!CpuPaths::isV2(fs)) {
    if (!CpuPaths::isV1(fs)) {
      throw EnvoyException(std::string(NoSupportedCGroupMessage));
    }
    return std::make_unique<CgroupV1CpuStatsReader>(fs, time_source);
  }

  return std::make_unique<CgroupV2CpuStatsReader>(fs, time_source);
}
180

            
181
CgroupV1CpuStatsReader::CgroupV1CpuStatsReader(Filesystem::Instance& fs, TimeSource& time_source)
182
1
    : LinuxContainerCpuStatsReader(fs, time_source), shares_path_(CpuPaths::V1::getSharesPath()),
183
1
      usage_path_(CpuPaths::V1::getUsagePath()) {}
184

            
185
CgroupV1CpuStatsReader::CgroupV1CpuStatsReader(Filesystem::Instance& fs, TimeSource& time_source,
186
                                               const std::string& shares_path,
187
                                               const std::string& usage_path)
188
8
    : LinuxContainerCpuStatsReader(fs, time_source), shares_path_(shares_path),
189
8
      usage_path_(usage_path) {}
190

            
191
10
// Collects one cgroup v1 sample from the shares file and the usage file.
//
// Both files are read before either is parsed. Per the original notes, usage is expressed in
// nanoseconds and shares in millicores; work time is usage scaled by
// CONTAINER_MILLICORES_PER_CORE / shares, and total time is the monotonic clock in nanoseconds.
//
// @return {true, work_time, now_ns}, or {false, 0, 0} (after logging an error) when a file
//         cannot be read, a value is not numeric, or shares is not strictly positive.
CpuTimesBase CgroupV1CpuStatsReader::getCpuTimes() {
  // cpu.shares: the CPU allocation.
  const auto shares_contents = fs_.fileReadToEnd(shares_path_);
  if (!shares_contents.ok()) {
    ENVOY_LOG(error, "Unable to read CPU shares file at {}", shares_path_);
    return {false, 0, 0};
  }

  // cpuacct.usage: the cumulative CPU time.
  const auto usage_contents = fs_.fileReadToEnd(usage_path_);
  if (!usage_contents.ok()) {
    ENVOY_LOG(error, "Unable to read CPU usage file at {}", usage_path_);
    return {false, 0, 0};
  }

  double shares = 0;
  if (!absl::SimpleAtod(shares_contents.value(), &shares)) {
    ENVOY_LOG(error, "Failed to parse CPU shares value: {}", shares_contents.value());
    return {false, 0, 0};
  }

  double usage = 0;
  if (!absl::SimpleAtod(usage_contents.value(), &usage)) {
    ENVOY_LOG(error, "Failed to parse CPU usage value: {}", usage_contents.value());
    return {false, 0, 0};
  }

  // Shares is used as a divisor below, so it has to be strictly positive.
  if (shares <= 0) {
    ENVOY_LOG(error, "Invalid CPU shares value: {}", shares);
    return {false, 0, 0};
  }

  const auto since_epoch = time_source_.monotonicTime().time_since_epoch();
  const uint64_t now_ns =
      std::chrono::duration_cast<std::chrono::nanoseconds>(since_epoch).count();

  const double work_time = (usage * CONTAINER_MILLICORES_PER_CORE) / shares;

  ENVOY_LOG(trace, "cgroupv1 cpu_times_value: {}, cpu_allocated_value: {}, current_time: {}",
            usage, shares, now_ns);

  return {true, work_time, now_ns};
}
235

            
236
3
// Returns the cgroup v1 utilization observed since the previous successful sample.
//
// The first valid sample only seeds the baseline and yields 0. The baseline is advanced only
// when a utilization value is produced; it is left untouched on every error path.
//
// @return work delta / elapsed time delta, or InvalidArgumentError when the sample is invalid,
//         the work value decreased, or the clock did not advance.
absl::StatusOr<double> CgroupV1CpuStatsReader::getUtilization() {
  const CpuTimesBase latest = getCpuTimes();
  if (!latest.is_valid) {
    return absl::InvalidArgumentError("Failed to read CPU times");
  }

  // No earlier sample to diff against: store this one and report no load.
  if (!previous_cpu_times_.is_valid) {
    previous_cpu_times_ = latest;
    return 0.0;
  }

  const double work_diff = latest.work_time - previous_cpu_times_.work_time;
  const int64_t elapsed_diff = latest.total_time - previous_cpu_times_.total_time;

  if (work_diff < 0 || elapsed_diff <= 0) {
    return absl::InvalidArgumentError(
        fmt::format("Erroneous CPU stats calculation. Work_over_period='{}' cannot "
                    "be a negative number and total_over_period='{}' must be a positive number.",
                    work_diff, elapsed_diff));
  }

  previous_cpu_times_ = latest;
  return work_diff / elapsed_diff;
}
264

            
265
CgroupV2CpuStatsReader::CgroupV2CpuStatsReader(Filesystem::Instance& fs, TimeSource& time_source)
266
3
    : LinuxContainerCpuStatsReader(fs, time_source), stat_path_(CpuPaths::V2::getStatPath()),
267
3
      max_path_(CpuPaths::V2::getMaxPath()), effective_path_(CpuPaths::V2::getEffectiveCpusPath()) {
268
3
}
269

            
270
CgroupV2CpuStatsReader::CgroupV2CpuStatsReader(Filesystem::Instance& fs, TimeSource& time_source,
271
                                               const std::string& stat_path,
272
                                               const std::string& max_path,
273
                                               const std::string& effective_path)
274
23
    : LinuxContainerCpuStatsReader(fs, time_source), stat_path_(stat_path), max_path_(max_path),
275
23
      effective_path_(effective_path) {}
276

            
277
29
// Collects one cgroup v2 sample: cumulative usage in microseconds, a monotonic timestamp in
// nanoseconds, and the number of cores the cgroup may use.
//
// Files are read in this order: cpu.stat (usage_usec), the effective CPUs file, then cpu.max.
// Any read or parse failure is logged and reported as an invalid sample ({false, 0, 0, 0}).
CpuTimesV2 CgroupV2CpuStatsReader::getCpuTimes() {
  const auto stat_contents = fs_.fileReadToEnd(stat_path_);
  if (!stat_contents.ok()) {
    ENVOY_LOG(error, "Unable to read CPU stat file at {}", stat_path_);
    return {false, 0, 0, 0};
  }

  // Locate the first line starting with "usage_usec " and parse the text after its last space.
  // Scanning stops at that line whether or not later lines exist.
  uint64_t usage_usec = 0;
  bool usage_parsed = false;
  std::istringstream stat_lines(stat_contents.value());
  for (std::string row; std::getline(stat_lines, row);) {
    if (row.rfind("usage_usec ", 0) != 0) {
      continue;
    }
    // The matched prefix ends with a space, so a last space is always present.
    if (!absl::SimpleAtoi(row.substr(row.find_last_of(' ') + 1), &usage_usec)) {
      ENVOY_LOG(error, "Failed to parse usage_usec in cpu.stat file {}", stat_path_);
      return {false, 0, 0, 0};
    }
    usage_parsed = true;
    break;
  }

  if (!usage_parsed) {
    ENVOY_LOG(trace, "Missing usage_usec in cpu.stat file {}", stat_path_);
    return {false, 0, 0, 0};
  }

  // cpuset.cpus.effective lists CPUs as e.g. "0", "0-3", "0,2,4", "0-2,4" or "0-3,5-7".
  const auto effective_contents = fs_.fileReadToEnd(effective_path_);
  if (!effective_contents.ok()) {
    ENVOY_LOG(error, "Unable to read effective CPUs file at {}", effective_path_);
    return {false, 0, 0, 0};
  }

  const absl::StatusOr<int> available_cpus =
      parseEffectiveCpus(effective_contents.value(), effective_path_);
  if (!available_cpus.ok()) {
    ENVOY_LOG(error, "Failed to parse effective CPUs file {}: {}", effective_path_,
              available_cpus.status().message());
    return {false, 0, 0, 0};
  }

  // cpu.max may cap the usable cores below the number of available CPUs.
  const auto max_contents = fs_.fileReadToEnd(max_path_);
  if (!max_contents.ok()) {
    ENVOY_LOG(error, "Unable to read CPU max file at {}", max_path_);
    return {false, 0, 0, 0};
  }

  const absl::StatusOr<double> usable_cores =
      parseEffectiveCores(max_contents.value(), available_cpus.value());
  if (!usable_cores.ok()) {
    ENVOY_LOG(error, "Failed to parse cpu.max file {}: {}", max_path_,
              usable_cores.status().message());
    return {false, 0, 0, 0};
  }

  // Usage stays in microseconds here; unit conversion is left to the utilization calculation.
  const double usage_us = static_cast<double>(usage_usec);
  const auto since_epoch = time_source_.monotonicTime().time_since_epoch();
  const uint64_t now_ns =
      std::chrono::duration_cast<std::chrono::nanoseconds>(since_epoch).count();

  ENVOY_LOG(trace, "cgroupv2 usage_usec: {}, effective_cores: {}, current_time: {}", usage_usec,
            usable_cores.value(), now_ns);

  return {true, usage_us, now_ns, usable_cores.value()};
}
353

            
354
8
// Returns the cgroup v2 utilization since the previous successful sample, clamped to [0.0, 1.0].
//
// The first valid sample only seeds the baseline and yields 0. Work time (microseconds) and
// elapsed time (nanoseconds) are both converted to seconds, and the elapsed time is scaled by
// the effective core count of the current sample. The baseline is advanced only when a
// utilization value is produced.
//
// @return the clamped utilization, or InvalidArgumentError when the sample is invalid, usage
//         decreased, or the clock did not advance.
absl::StatusOr<double> CgroupV2CpuStatsReader::getUtilization() {
  const CpuTimesV2 snapshot = getCpuTimes();
  if (!snapshot.is_valid) {
    return absl::InvalidArgumentError("Failed to read CPU times");
  }

  // First call: keep the snapshot as the baseline and report no load.
  if (!previous_cpu_times_.is_valid) {
    previous_cpu_times_ = snapshot;
    return 0.0;
  }

  const double usage_delta_us = snapshot.work_time - previous_cpu_times_.work_time;
  const int64_t elapsed_delta_ns = snapshot.total_time - previous_cpu_times_.total_time;

  if (usage_delta_us < 0 || elapsed_delta_ns <= 0) {
    return absl::InvalidArgumentError(
        fmt::format("Erroneous CPU stats calculation. Work_over_period='{}' cannot "
                    "be a negative number and total_over_period='{}' must be a positive number.",
                    usage_delta_us, elapsed_delta_ns));
  }

  // Bring both deltas to seconds: elapsed is in nanoseconds, usage in microseconds.
  const double elapsed_seconds = elapsed_delta_ns / 1000000000.0;
  const double busy_seconds = usage_delta_us / 1000000.0;

  // Available CPU time over the period is the elapsed time multiplied by the usable cores.
  const double raw_utilization = busy_seconds / (elapsed_seconds * snapshot.effective_cores);

  previous_cpu_times_ = snapshot;
  return std::clamp(raw_utilization, 0.0, 1.0);
}
394

            
395
} // namespace CpuUtilizationMonitor
396
} // namespace ResourceMonitors
397
} // namespace Extensions
398
} // namespace Envoy