1
#if !defined(__linux__)
2
#error "Linux platform file is part of non-Linux build."
3
#endif
4

            
5
#include "cilium/privileged_service_client.h"
6

            
7
#include <asm-generic/socket.h>
8
#include <linux/capability.h>
9
#include <linux/limits.h>
10
#include <sys/socket.h>
11
#include <sys/types.h>
12
#include <unistd.h>
13

            
14
#include <cerrno>
15
#include <climits>
16
#include <cstddef>
17
#include <cstdint>
18
#include <cstring>
19
#include <string>
20

            
21
#include "envoy/api/os_sys_calls_common.h"
22

            
23
#include "source/common/common/assert.h"
24
#include "source/common/common/lock_guard.h"
25
#include "source/common/common/logger.h"
26

            
27
#include "starter/privileged_service_protocol.h"
28

            
29
namespace Envoy {
30
namespace Cilium {
31
namespace PrivilegedService {
32

            
33
ProtocolClient::ProtocolClient() : Protocol(CILIUM_PRIVILEGED_SERVICE_FD), seq_(0) {
  // Envoy itself must run unprivileged. CAP_NET_BIND_SERVICE is the single capability that is
  // tolerated (it may have been explicitly excluded from being dropped), so it is masked out
  // before checking that both the effective and the permitted sets are empty.
  const auto tolerated_caps_mask = ~(1UL << CAP_NET_BIND_SERVICE);
  RELEASE_ASSERT((getCapabilities(CAP_EFFECTIVE) & tolerated_caps_mask) == 0 &&
                     (getCapabilities(CAP_PERMITTED) & tolerated_caps_mask) == 0,
                 "cilium-envoy running with privileges, exiting");

  // Probe for the privileged service; when it does not answer, log and call close().
  // NOTE(review): the unqualified close() is presumably the Protocol member for the service
  // connection (the syscall is spelled ::close below) - confirm against the header.
  const bool service_present = checkPrivilegedService();
  if (!service_present) {
    ENVOY_LOG(warn, "Cilium privileged service not present");
    close();
  }

  // Sanity check: setting SO_MARK directly from this process must be refused with EPERM.
  const int probe_fd = ::socket(AF_INET, SOCK_STREAM, 0);
  RELEASE_ASSERT(probe_fd >= 0, "socket failed");

  uint32_t probe_mark = 12345;
  const int probe_rc =
      ::setsockopt(probe_fd, SOL_SOCKET, SO_MARK, &probe_mark, sizeof(probe_mark));
  // The call has to fail ...
  RELEASE_ASSERT(probe_rc == -1, "setsockopt");
  // ... and it has to fail specifically because of missing privileges.
  RELEASE_ASSERT(errno == EPERM, "setsockopt");

  ::close(probe_fd);
}
57

            
58
ssize_t ProtocolClient::transact(MessageHeader& req, size_t req_len, const void* data,
59
                                 size_t data_len, int* fd, Response& resp, void* buf,
60
                                 size_t buf_size) {
61
  RELEASE_ASSERT(buf_size <= RESPONSE_BUF_SIZE, "ProtocolClient::transact: invalid bufsize");
62
  uint32_t seq;
63

            
64
  // get next atomic sequence number
65
  do {
66
    seq = ++seq_;
67
  } while (seq == 0); // zero is reserved for "no response"
68
  req.msg_seq_ = seq;
69

            
70
  // Set up a waiter in the stack before we send anything so that the waiter exists as soon as it is
71
  // possible for a concurrent receiver to receive the response.
72
  Waiter waiter;
73
  insert(seq, &waiter);
74

            
75
  // send message after a waiter has been established.
76
  ssize_t size = sendFdMsg(&req, req_len, data, data_len, *fd);
77

            
78
  if (size > 0) {
79
    RELEASE_ASSERT(size_t(size) == req_len + data_len, "truncated request");
80

            
81
    // receive removes the waiter from 'waiters_' before returning
82
    receive(waiter, seq);
83
  } else {
84
    remove(seq);
85
  }
86

            
87
  return waiter.getResponse(seq, resp, buf, buf_size, fd);
88
}
89

            
90
// receive
91
// 1. Waits to become the receiver, checking for a response one each wake-up
92
// 2. Loops receiving responses when becoming the exclusive receiver,
93
//    passing resonses to other waiters until its own response is received.
94
// 3. Removes the waiter from 'waiters_' before returning.
95
void ProtocolClient::receive(Waiter& waiter, uint32_t seq) noexcept {
96
  // Loop waiting until we have a response or become the receiver.
97
  // 'mutex_' is released when exiting the loop.
98
  bool done = false;
99
  bool receiver_active;
100
  {
101
    Thread::LockGuard guard(mutex_);
102
    while (true) {
103
      // Check if we have our response.
104
      if (waiter.seq() != 0) {
105
        waiters_.erase(seq);
106
        receiver_active = is_receiver_active_;
107
        done = true;
108
        break;
109
      }
110

            
111
      // Check if we can become the receiver.
112
      if (!is_receiver_active_) {
113
        receiver_active = is_receiver_active_ = true;
114
        break;
115
      }
116

            
117
      // 'mutex_' is released for the duration of the wait.
118
      wait();
119
    }
120
  }
121

            
122
  // mutex_ not held any more
123
  // Return if done
124
  if (done) {
125
    if (!receiver_active) {
126
      // Notify another waiter (if any) to possibly become the new receiver.
127
      // This sure there always is a receiver if there are any waiters.
128
      notifyOne();
129
    }
130
    return;
131
  }
132

            
133
  // No locks are held, but we just exclusively set the is_receiver_active_ = true above.
134
  // Receiver accesses it's own waiter (the 'waiter') without locking.
135
  // Other waiters are accessed only while holding 'mutex_'.
136

            
137
  // Receive until we have a response or an error
138
  while (true) {
139
    ssize_t size = waiter.recvFdMsg(*this);
140
    if (size < 0) {
141
      ENVOY_LOG(debug, "privileged service failed with {} (errno {})", size, errno);
142
      break;
143
    }
144

            
145
    // Is the response for us?
146
    if (waiter.seq() == seq) {
147
      break;
148
    }
149

            
150
    // The response is for one of the waiters, pass it on
151
    {
152
      Thread::LockGuard guard(mutex_);
153
      auto it = waiters_.find(waiter.seq());
154
      RELEASE_ASSERT(it != waiters_.end(), fmt::format("no waiter found for seq {}", waiter.seq()));
155
      // copy received data to the found waiter
156
      *it->second = waiter;
157
      // clear the waiter of the current receiver
158
      waiter.clear();
159
    }
160

            
161
    // have to notify all waiters for the right one to be woken up from the wait.
162
    notifyAll();
163
  }
164

            
165
  // Pass receiver duties to one of the other waiters & remove the waiter from 'waiters_' while we
166
  // still have the mutex.
167
  {
168
    Thread::LockGuard guard(mutex_);
169
    is_receiver_active_ = false;
170
    waiters_.erase(seq);
171
  }
172

            
173
  // wake up one waiter to take the receiver role
174
  notifyOne();
175
}
176

            
177
bool ProtocolClient::checkPrivilegedService() {
178
  // Dump the effective capabilities of the privileged service process
179
  DumpRequest req;
180
  Response resp;
181
  uint8_t buf[RESPONSE_BUF_SIZE];
182
  int fd = -1;
183

            
184
  ssize_t size = transact(req.hdr_, sizeof(req), nullptr, 0, &fd, resp, buf, sizeof(buf));
185
  if (size < ssize_t(sizeof(resp))) {
186
    ENVOY_LOG_MISC(warn, "Cilium privileged service detection failed with return code: {}", size);
187
    return false;
188
  }
189
  std::string str(reinterpret_cast<char*>(buf), size - sizeof(resp));
190
  ENVOY_LOG_MISC(debug, "Cilium privileged service detected with following capabilities: {}", str);
191
  return true;
192
}
193

            
194
// Asks the privileged service to open the BPF object at 'path'.
// Returns {-1, EPERM} when no privileged service is available. Otherwise returns the service's
// return value and errno, with a return value of INT_MAX replaced by the descriptor that
// transact() stored in the local 'fd'.
Envoy::Api::SysCallIntResult ProtocolClient::bpfOpen(const char* path) {
  if (!haveCiliumPrivilegedService()) {
    return {-1, EPERM};
  }

  // The path is sent as the request payload (without a terminating NUL).
  const size_t path_len = strlen(path);
  RELEASE_ASSERT(path_len <= PATH_MAX, "bpf open path too long");

  BpfOpenRequest req;
  Response resp;
  int fd = -1;
  const ssize_t size = transact(req.hdr_, sizeof(req), path, path_len, &fd, resp);
  RELEASE_ASSERT(size == ssize_t(sizeof(resp)), "invalid received response size");

  // INT_MAX in the response is a placeholder for the descriptor delivered alongside it.
  if (resp.return_value_ == INT_MAX) {
    resp.return_value_ = fd;
  }
  return Envoy::Api::SysCallIntResult{resp.return_value_, resp.errno_};
}
211

            
212
// Asks the privileged service to look up 'key' (of 'key_size' bytes) in the BPF map referred to
// by 'fd'; 'value' / 'value_size' are passed to transact() as the response buffer.
// Returns {-1, EPERM} when no privileged service is available, otherwise the service's return
// value and errno.
Envoy::Api::SysCallIntResult ProtocolClient::bpfLookup(int fd, const void* key, uint32_t key_size,
                                                       void* value, uint32_t value_size) {
  if (!haveCiliumPrivilegedService()) {
    return {-1, EPERM};
  }

  BpfLookupRequest req(value_size);
  Response resp;
  const ssize_t size =
      transact(req.hdr_, sizeof(req), key, key_size, &fd, resp, value, value_size);

  // Two reply shapes are accepted: a bare Response reporting -1, or a Response followed by
  // exactly 'value_size' bytes.
  const bool bare_failure = size == ssize_t(sizeof(resp)) && resp.return_value_ == -1;
  const bool full_value = size == ssize_t(sizeof(resp) + value_size);
  RELEASE_ASSERT(bare_failure || full_value, "invalid received bpf lookup value size");

  return Envoy::Api::SysCallIntResult{resp.return_value_, resp.errno_};
}
226

            
227
// Asks the privileged service to apply a socket option to 'sockfd' on behalf of this process.
// The option ('level', 'optname', 'optval', 'optlen') travels inside the request itself and the
// socket descriptor is passed along with it. Returns {-1, EPERM} when no privileged service is
// available, otherwise the service's return value and errno.
Envoy::Api::SysCallIntResult ProtocolClient::setsockopt(int sockfd, int level, int optname,
                                                        const void* optval, socklen_t optlen) {
  if (!haveCiliumPrivilegedService()) {
    return {-1, EPERM};
  }

  SetSockOptRequest req(level, optname, optval, optlen);
  Response resp;

  // No extra payload: everything needed is already part of 'req'.
  const ssize_t reply_size = transact(req.hdr_, sizeof(req), nullptr, 0, &sockfd, resp);
  RELEASE_ASSERT(reply_size == ssize_t(sizeof(resp)), "invalid received response size");

  return Envoy::Api::SysCallIntResult{resp.return_value_, resp.errno_};
}
239

            
240
} // namespace PrivilegedService
241
} // namespace Cilium
242
} // namespace Envoy