1
#include "cilium/bpf_metadata.h"
2

            
3
#include <fmt/format.h>
4
#include <netinet/in.h>
5
#include <netinet/tcp.h>
6

            
7
#include <chrono>
8
#include <cstddef>
9
#include <cstdint>
10
#include <memory>
11
#include <string>
12
#include <utility>
13
#include <vector>
14

            
15
#include "envoy/api/io_error.h"
16
#include "envoy/common/exception.h"
17
#include "envoy/config/core/v3/socket_option.pb.h"
18
#include "envoy/network/address.h"
19
#include "envoy/network/filter.h"
20
#include "envoy/network/listen_socket.h"
21
#include "envoy/network/listener_filter_buffer.h"
22
#include "envoy/network/socket.h"
23
#include "envoy/registry/registry.h"
24
#include "envoy/server/factory_context.h"
25
#include "envoy/server/filter_config.h"
26
#include "envoy/singleton/manager.h"
27
#include "envoy/stream_info/filter_state.h"
28

            
29
#include "source/common/common/logger.h"
30
#include "source/common/common/utility.h"
31
#include "source/common/network/address_impl.h"
32
#include "source/common/network/socket_option_factory.h"
33
#include "source/common/network/socket_option_impl.h"
34
#include "source/common/network/utility.h"
35
#include "source/common/protobuf/protobuf.h"
36
#include "source/common/protobuf/utility.h"
37

            
38
#include "absl/strings/string_view.h"
39
#include "absl/types/optional.h"
40
#include "cilium/api/bpf_metadata.pb.h"
41
#include "cilium/api/bpf_metadata.pb.validate.h" // IWYU pragma: keep
42
#include "cilium/conntrack.h"
43
#include "cilium/filter_state_cilium_destination.h"
44
#include "cilium/filter_state_cilium_policy.h"
45
#include "cilium/host_map.h"
46
#include "cilium/ipcache.h"
47
#include "cilium/network_policy.h"
48
#include "cilium/policy_id.h"
49
#include "cilium/socket_option_cilium_mark.h"
50
#include "cilium/socket_option_ip_transparent.h"
51

            
52
namespace Envoy {
53
namespace Server {
54
namespace Configuration {
55

            
56
/**
 * Config registration for the bpf metadata listener filter. @see
 * NamedListenerFilterConfigFactory.
 */
60
class BpfMetadataConfigFactory : public NamedListenerFilterConfigFactory {
61
public:
62
  // NamedListenerFilterConfigFactory
63
  Network::ListenerFilterFactoryCb createListenerFilterFactoryFromProto(
64
      const Protobuf::Message& proto_config,
65
      const Network::ListenerFilterMatcherSharedPtr& listener_filter_matcher,
66
      Configuration::ListenerFactoryContext& context) override {
67

            
68
    auto config = std::make_shared<Cilium::BpfMetadata::Config>(
69
        MessageUtil::downcastAndValidate<const ::cilium::BpfMetadata&>(
70
            proto_config, context.messageValidationVisitor()),
71
        context);
72

            
73
    // Set the SO_MARK (Cilium Mark), IP_TRANSPARENT & SO_REUSEADDR for the listen socket.
74
    std::shared_ptr<Envoy::Network::Socket::Options> options =
75
        std::make_shared<Envoy::Network::Socket::Options>();
76

            
77
    // For the listener socket, the BPF datapath is only interested
78
    // in whether the proxy is ingress, egress, or if there is no proxy at all.
79
    uint32_t mark = (config->is_ingress_) ? 0x0A00 : 0x0B00;
80
    options->push_back(std::make_shared<Cilium::CiliumMarkSocketOption>(mark));
81

            
82
    options->push_back(std::make_shared<Cilium::IpTransparentSocketOption>());
83

            
84
    options->push_back(std::make_shared<Envoy::Network::SocketOptionImpl>(
85
        envoy::config::core::v3::SocketOption::STATE_PREBIND,
86
        Envoy::Network::SocketOptionName(SOL_SOCKET, SO_REUSEADDR, "SO_REUSEADDR"), 1));
87

            
88
    // SO_REUSEPORT for the listener socket is set via Envoy config
89

            
90
    context.addListenSocketOptions(options);
91

            
92
    return [listener_filter_matcher,
93
            config](Network::ListenerFilterManager& filter_manager) mutable -> void {
94
      filter_manager.addAcceptFilter(listener_filter_matcher,
95
                                     std::make_unique<Cilium::BpfMetadata::Instance>(config));
96
    };
97
  }
98

            
99
8
  ProtobufTypes::MessagePtr createEmptyConfigProto() override {
100
8
    return std::make_unique<::cilium::BpfMetadata>();
101
8
  }
102

            
103
131
  std::string name() const override { return "cilium.bpf_metadata"; }
104
};
105

            
106
/**
107
 * Static registration for the bpf metadata filter. @see RegisterFactory.
108
 * Versioning started from 1.1.0 for Cilium version 1.12.0.
109
 */
110
REGISTER_FACTORY(BpfMetadataConfigFactory,
111
                 NamedListenerFilterConfigFactory){FACTORY_VERSION(1, 1, 0, {{}})};
112

            
113
/**
114
 * Config registration for the UDP bpf metadata filter. @see
115
 * NamedUdpListenerFilterConfigFactory.
116
 */
117
class UdpBpfMetadataConfigFactory : public NamedUdpListenerFilterConfigFactory {
118
public:
119
  // NamedUdpListenerFilterConfigFactory
120
  Network::UdpListenerFilterFactoryCb
121
  createFilterFactoryFromProto(const Protobuf::Message& proto_config,
122
                               Configuration::ListenerFactoryContext& context) override {
123

            
124
    auto config = std::make_shared<Cilium::BpfMetadata::Config>(
125
        MessageUtil::downcastAndValidate<const ::cilium::BpfMetadata&>(
126
            proto_config, context.messageValidationVisitor()),
127
        context);
128

            
129
    // Set the SO_MARK (Cilium Mark), IP_TRANSPARENT & SO_REUSEADDR for the listen socket.
130
    std::shared_ptr<Envoy::Network::Socket::Options> options =
131
        std::make_shared<Envoy::Network::Socket::Options>();
132

            
133
    // For the listener socket, the BPF datapath is only interested
134
    // in whether the proxy is ingress, egress, or if there is no proxy at all.
135
    uint32_t mark = (config->is_ingress_) ? 0x0A00 : 0x0B00;
136
    options->push_back(std::make_shared<Cilium::CiliumMarkSocketOption>(mark));
137

            
138
    options->push_back(std::make_shared<Cilium::IpTransparentSocketOption>());
139

            
140
    options->push_back(std::make_shared<Envoy::Network::SocketOptionImpl>(
141
        envoy::config::core::v3::SocketOption::STATE_PREBIND,
142
        Envoy::Network::SocketOptionName(SOL_SOCKET, SO_REUSEADDR, "SO_REUSEADDR"), 1));
143

            
144
    // SO_REUSEPORT for the listener socket is set via Envoy config
145

            
146
    context.addListenSocketOptions(options);
147

            
148
    return [config](Network::UdpListenerFilterManager& udp_listener_filter_manager,
149
                    Network::UdpReadFilterCallbacks& callbacks) mutable -> void {
150
      udp_listener_filter_manager.addReadFilter(
151
          std::make_unique<Cilium::BpfMetadata::UdpInstance>(config, callbacks));
152
    };
153
  }
154

            
155
8
  ProtobufTypes::MessagePtr createEmptyConfigProto() override {
156
8
    return std::make_unique<::cilium::BpfMetadata>();
157
8
  }
158

            
159
131
  std::string name() const override { return "cilium.bpf_metadata"; }
160
};
161

            
162
/**
163
 * Static registration for the UDP bpf metadata filter. @see RegisterFactory.
164
 */
165
REGISTER_FACTORY(UdpBpfMetadataConfigFactory,
166
                 NamedUdpListenerFilterConfigFactory){FACTORY_VERSION(1, 1, 0, {{}})};
167

            
168
} // namespace Configuration
169
} // namespace Server
170

            
171
namespace Cilium {
172
namespace BpfMetadata {
173

            
174
// Singleton registration via macro defined in envoy/singleton/manager.h
175
// Singleton name under which Config's constructor fetches/creates the shared Cilium::CtMap.
SINGLETON_MANAGER_REGISTRATION(cilium_bpf_conntrack);
176
// Singleton name under which createHostMap() fetches/creates the shared Cilium::PolicyHostMap.
SINGLETON_MANAGER_REGISTRATION(cilium_host_map);
177
// Singleton name under which createPolicyMap() fetches/creates the shared
// Cilium::NetworkPolicyMap.
SINGLETON_MANAGER_REGISTRATION(cilium_network_policy);
178

            
179
namespace {
180

            
181
std::shared_ptr<const Cilium::PolicyHostMap>
182
createHostMap(Server::Configuration::ListenerFactoryContext& context) {
183
  return context.serverFactoryContext().singletonManager().getTyped<const Cilium::PolicyHostMap>(
184
      SINGLETON_MANAGER_REGISTERED_NAME(cilium_host_map), [&context] {
185
        auto map = std::make_shared<Cilium::PolicyHostMap>(context.serverFactoryContext());
186
        map->startSubscription(context.serverFactoryContext());
187
        return map;
188
      });
189
}
190

            
191
std::shared_ptr<const Cilium::NetworkPolicyMap>
192
createPolicyMap(Server::Configuration::FactoryContext& context, Cilium::CtMapSharedPtr& ct) {
193
  return context.serverFactoryContext().singletonManager().getTyped<const Cilium::NetworkPolicyMap>(
194
      SINGLETON_MANAGER_REGISTERED_NAME(cilium_network_policy),
195
      [&context, &ct] { return std::make_shared<Cilium::NetworkPolicyMap>(context, ct); });
196
}
197

            
198
} // namespace
199

            
200
/**
 * Builds the filter configuration from the proto message.
 *
 * Copies the scalar settings, parses the optional IPv4/IPv6 source addresses, and then
 * sets up the shared maps: the host map when 'use_nphds' is set, the conntrack map when a
 * bpf root is configured, the ipcache when a bpf root is configured and no host map is in
 * use, and finally the network policy map when either the ipcache or the host map exists.
 *
 * @throws EnvoyException if both is_l7lb and is_ingress are set, if a configured source
 *         address is not of the expected IP family, or if the configured bpf_root differs
 *         from the one of the already existing conntrack map singleton.
 */
Config::Config(const ::cilium::BpfMetadata& config,
               Server::Configuration::ListenerFactoryContext& context)
    : so_linger_(config.has_original_source_so_linger_time()
                     ? config.original_source_so_linger_time()
                     : -1),
      proxy_id_(config.proxy_id()), is_ingress_(config.is_ingress()),
      use_original_source_address_(config.use_original_source_address()),
      is_l7lb_(config.is_l7lb()),
      ipv4_source_address_(
          Network::Utility::parseInternetAddressNoThrow(config.ipv4_source_address())),
      ipv6_source_address_(
          Network::Utility::parseInternetAddressNoThrow(config.ipv6_source_address())),
      enforce_policy_on_l7lb_(config.enforce_policy_on_l7lb()),
      l7lb_policy_name_(config.l7lb_policy_name()),
      ipcache_entry_ttl_(
          PROTOBUF_GET_MS_OR_DEFAULT(config, cache_entry_ttl, DEFAULT_CACHE_ENTRY_TTL_MS)),
      random_(context.serverFactoryContext().api().randomGenerator()) {
  if (is_l7lb_ && is_ingress_) {
    throw EnvoyException("cilium.bpf_metadata: is_l7lb may not be set with is_ingress");
  }

  // A source address is invalid when it parsed to the wrong family, or when a non-empty
  // string failed to parse at all.
  const bool ipv4_invalid =
      ipv4_source_address_ != nullptr
          ? ipv4_source_address_->ip()->version() != Network::Address::IpVersion::v4
          : !config.ipv4_source_address().empty();
  if (ipv4_invalid) {
    throw EnvoyException(
        fmt::format("cilium.bpf_metadata: ipv4_source_address is not an IPv4 address: {}",
                    config.ipv4_source_address()));
  }

  const bool ipv6_invalid =
      ipv6_source_address_ != nullptr
          ? ipv6_source_address_->ip()->version() != Network::Address::IpVersion::v6
          : !config.ipv6_source_address().empty();
  if (ipv6_invalid) {
    throw EnvoyException(
        fmt::format("cilium.bpf_metadata: ipv6_source_address is not an IPv6 address: {}",
                    config.ipv6_source_address()));
  }

  if (config.use_nphds()) {
    hosts_ = createHostMap(context);
  }

  // Note: all instances use the bpf root of the first filter with non-empty
  // bpf_root instantiated! Only try opening bpf maps if bpf root is explicitly
  // configured
  const std::string bpf_root = config.bpf_root();
  if (!bpf_root.empty()) {
    ct_maps_ = context.serverFactoryContext().singletonManager().getTyped<Cilium::CtMap>(
        SINGLETON_MANAGER_REGISTERED_NAME(cilium_bpf_conntrack), [&bpf_root] {
          // Even if opening the global maps fail, local maps may still succeed
          // later.
          return std::make_shared<Cilium::CtMap>(bpf_root);
        });

    // bpf root may not change during runtime
    if (ct_maps_->bpfRoot() != bpf_root) {
      throw EnvoyException(fmt::format("cilium.bpf_metadata: Invalid bpf_root: {}", bpf_root));
    }

    if (hosts_ == nullptr) {
      const std::string ipcache_name = config.ipcache_name().empty()
                                           ? std::string("cilium_ipcache")
                                           : config.ipcache_name();
      const std::chrono::milliseconds gc_interval(PROTOBUF_GET_MS_OR_DEFAULT(
          config, cache_gc_interval, 10 * DEFAULT_CACHE_ENTRY_TTL_MS));
      ipcache_ = IpCache::newIpCache(context.serverFactoryContext(),
                                     bpf_root + "/tc/globals/" + ipcache_name, gc_interval);
    }
  }

  // Get the shared policy provider, or create it if not already created.
  // Note that the API config source is assumed to be the same for all filter
  // instances!
  // Only created if either ipcache_ or hosts_ map exists
  if (ipcache_ != nullptr || hosts_ != nullptr) {
    npmap_ = createPolicyMap(context, ct_maps_);
  }
}
276

            
277
167
/**
 * Resolves the security identity for 'ip'.
 *
 * Uses the host map when present, otherwise the ipcache (with the configured entry TTL
 * minus a random sub-millisecond jitter). When neither yields a non-zero identity the
 * result defaults to Cilium::ID::WORLD.
 *
 * @param ip address to resolve; dereferenced for logging, so it must not be null.
 * @return the resolved identity, never 0.
 */
uint32_t Config::resolvePolicyId(const Network::Address::Ip* ip) const {
  uint32_t id = 0;

  if (hosts_ != nullptr) {
    id = hosts_->resolve(ip);
  } else if (ipcache_ != nullptr) {
    std::chrono::microseconds ttl = ipcache_entry_ttl_;
    // subtract random jitter (0-1ms) if configured as at least 1ms
    if (ttl >= std::chrono::milliseconds(1)) {
      ttl -= std::chrono::microseconds(random_.random() % 1000);
    }
    id = ipcache_->resolve(ip, ttl);
  }

  // default destination identity to the world if needed
  if (id == 0) {
    id = Cilium::ID::WORLD;
    // The format string needs a placeholder, otherwise the address argument is dropped.
    ENVOY_LOG(trace, "bpf_metadata: Identity for IP {} defaults to WORLD",
              ip->addressAsString());
  }

  return id;
}
299

            
300
uint32_t Config::resolveSourceIdentity(const PolicyInstance& policy,
301
                                       const Network::Address::Ip* sip,
302
                                       const Network::Address::Ip* dip, bool ingress,
303
10
                                       bool is_l7_lb) {
304
10
  uint32_t source_identity = 0;
305

            
306
  // Resolve the source security ID from conntrack map, or from ip cache
307
10
  if (ct_maps_ != nullptr) {
308
    const std::string& ct_name = policy.conntrackName();
309
    if (ct_name.length() > 0) {
310
      source_identity = ct_maps_->lookupSrcIdentity(ct_name, sip, dip, ingress);
311
    } else if (is_l7_lb) {
312
      // non-local source should be in the global conntrack
313
      source_identity = ct_maps_->lookupSrcIdentity("global", sip, dip, ingress);
314
    }
315
  }
316
  // Fall back to ipcache lookup if conntrack entry can not be located
317
10
  if (source_identity == 0) {
318
10
    source_identity = resolvePolicyId(sip);
319
10
  }
320

            
321
10
  return source_identity;
322
10
}
323

            
324
// Returns a new IpAddressPair that fills the port from 'source_address'.
325
1
IpAddressPair Config::getIpAddressPairWithPort(uint16_t port, const IpAddressPair& addresses) {
326
1
  auto address_pair = IpAddressPair();
327

            
328
1
  if (addresses.ipv6_) {
329
1
    sockaddr_in6 sa6 = *reinterpret_cast<const sockaddr_in6*>(addresses.ipv6_->sockAddr());
330
1
    sa6.sin6_port = htons(port);
331
1
    address_pair.ipv6_ = std::make_shared<Network::Address::Ipv6Instance>(sa6);
332
1
  }
333
1
  if (addresses.ipv4_) {
334
1
    sockaddr_in sa4 = *reinterpret_cast<const sockaddr_in*>(addresses.ipv4_->sockAddr());
335
1
    sa4.sin_port = htons(port);
336
1
    address_pair.ipv4_ = std::make_shared<Network::Address::Ipv4Instance>(&sa4);
337
1
  }
338

            
339
1
  return address_pair;
340
1
}
341

            
342
// Returns the IP of the address in 'source_addresses' matching 'version', or nullptr when
// no address of that family is set.
const Network::Address::Ip* Config::selectIpVersion(const Network::Address::IpVersion version,
                                                    const IpAddressPair& source_addresses) {
  if (version == Network::Address::IpVersion::v4 && source_addresses.ipv4_ != nullptr) {
    return source_addresses.ipv4_->ip();
  }
  if (version == Network::Address::IpVersion::v6 && source_addresses.ipv6_ != nullptr) {
    return source_addresses.ipv6_->ip();
  }
  return nullptr;
}
359

            
360
357
// Returns the policy instance to apply for 'pod_ip'.
//
// Egress traffic is allowed without a policy only for L7 LB listeners that do not enforce
// policy on L7 LB. This is needed to allow traffic forwarded by Cilium Ingress (which is
// implemented as an egress listener!). Without a policy map (npmap_ == nullptr) the result
// is either the allow-all-egress policy or the deny-all policy.
const PolicyInstance& Config::getPolicy(const std::string& pod_ip) const {
  const bool allow_egress = is_l7lb_ && !is_ingress_ && !enforce_policy_on_l7lb_;

  if (npmap_ != nullptr) {
    return npmap_->getPolicyInstance(pod_ip, allow_egress);
  }
  if (allow_egress) {
    return NetworkPolicyMap::getAllowAllEgressPolicy();
  }
  return NetworkPolicyMap::getDenyAllPolicy();
}
373

            
374
// Returns true if the network policy map knows 'pod_ip'. The policy map is only created
// when an ipcache or host map exists, so a missing map is reported as "not found" instead
// of dereferencing a null pointer.
bool Config::exists(const std::string& pod_ip) const {
  return npmap_ != nullptr && npmap_->exists(pod_ip);
}
375

            
376
/**
 * Derives the Cilium socket metadata (mark, identities, policy names, source/destination
 * addresses, proxylib protocol) for an accepted downstream socket.
 *
 * @return the metadata, or absl::nullopt when the socket addresses are not IP addresses,
 *         when east/west L7 LB is requested for a non-local pod, or when north/south L7 LB
 *         has no usable local Ingress source address for the connection's IP family.
 */
absl::optional<Cilium::BpfMetadata::SocketMetadata>
Config::extractSocketMetadata(Network::ConnectionSocket& socket) {
  // Kept mutable: reset to nullptr below whenever the original source address is not used.
  Network::Address::InstanceConstSharedPtr src_address =
      socket.connectionInfoProvider().remoteAddress();
  const Network::Address::Ip* const sip = src_address->ip();
  const auto dst_address = THROW_OR_RETURN_VALUE(socket.ioHandle().localAddress(),
                                                 Network::Address::InstanceConstSharedPtr);
  const Network::Address::Ip* const dip = dst_address->ip();
  const auto sni = socket.requestedServerName();

  if (sip == nullptr || dip == nullptr) {
    ENVOY_LOG(debug, "Non-IP addresses: src: {} dst: {}", src_address->asString(),
              dst_address->asString());
    return absl::nullopt;
  }

  // The pod is the destination for ingress listeners and the source for egress listeners.
  std::string pod_ip = (is_ingress_ ? dip : sip)->addressAsString();
  std::string other_ip = (is_ingress_ ? sip : dip)->addressAsString();
  std::string ingress_policy_name;
  if (is_ingress_) {
    ENVOY_LOG(debug, "INGRESS POD IP: {}, source IP: {}, sni: \"{}\"", pod_ip, other_ip, sni);
  } else {
    ENVOY_LOG(debug, "EGRESS POD IP: {}, destination IP: {} sni: \"{}\"", pod_ip, other_ip, sni);
  }

  // Load the policy for the Pod that sends or receives traffic.
  const auto* policy = &getPolicy(pod_ip);

  // Resolve the source security ID from conntrack map, or from ip cache
  uint32_t source_identity = resolveSourceIdentity(*policy, sip, dip, is_ingress_, is_l7lb_);

  // Resolve the destination security ID for egress traffic
  const uint32_t destination_identity = is_ingress_ ? 0 : resolvePolicyId(dip);

  // ingress_source_identity is non-zero when the egress path l7 LB should also enforce
  // the ingress path policy using the original source identity.
  uint32_t ingress_source_identity = 0;

  // Use the configured IPv4/IPv6 Ingress IPs as starting point for the sources addresses
  IpAddressPair source_addresses(ipv4_source_address_, ipv6_source_address_);

  // NOTE: As L7 LB does not use the original destination, there is a possibility of a 5-tuple
  // collision if the same source pod is communicating with the same backends on same destination
  // port directly, maybe via some other, non-L7 LB service. We keep the original source port number
  // to not allocate random source ports for the source pod in the host networking namespace that
  // could then blackhole existing connections between the source pod and the backend. This means
  // that the L7 LB backend connection may fail in case of a 5-tuple collision that the host
  // networking namespace is aware of.

  const bool east_west_l7lb = is_l7lb_ && use_original_source_address_;
  const bool north_south_l7lb = is_l7lb_ && !use_original_source_address_;

  if (east_west_l7lb) {
    // In case of east/west, L7 LB is only used for egress, so the local
    // endpoint is the source, and the other node is the destination.
    if (policy->getEndpointID() == 0) {
      // Local pod not found. Original source address can only be used for local pods.
      ENVOY_LOG(warn,
                "cilium.bpf_metadata (east/west L7 LB): Non-local pod can not use original "
                "source address: {}",
                pod_ip);
      return absl::nullopt;
    }
    // Use original source address with L7 LB for local endpoint sources if requested, as policy
    // enforcement after the proxy depends on it (i.e., for "east/west" LB).
    source_addresses = getIpAddressPairWithPort(sip->port(), policy->getEndpointIPs());
  } else if (north_south_l7lb) {
    // North/south L7 LB, assume the source security identity of the configured source addresses,
    // if any and policy for this identity exists.

    // Pick the local ingress source address of the same family as the incoming connection
    const Network::Address::Ip* ingress_ip = selectIpVersion(sip->version(), source_addresses);
    if (ingress_ip == nullptr) {
      // IP family of the connection has no configured local ingress source address
      ENVOY_LOG(
          warn,
          "cilium.bpf_metadata (north/south L7 LB): No local Ingress IP source address configured "
          "for the family of {}",
          sip->addressAsString());
      return absl::nullopt;
    }

    // Enforce pod policy only for local pods.
    if (policy->getEndpointID() == 0) {
      pod_ip.clear(); // source is not a local pod
    }

    // Enforce Ingress policy?
    if (enforce_policy_on_l7lb_) {
      ingress_source_identity = source_identity;
      if (l7lb_policy_name_.empty()) {
        ingress_policy_name = ingress_ip->addressAsString();
      } else {
        ingress_policy_name = l7lb_policy_name_;
      }
    }

    // Resolve source identity for the Ingress address
    source_identity = resolvePolicyId(ingress_ip);
    if (source_identity == Cilium::ID::WORLD) {
      // No security ID available for the configured source IP
      ENVOY_LOG(warn,
                "cilium.bpf_metadata (north/south L7 LB): Unknown local Ingress IP source address "
                "configured: {}",
                ingress_ip->addressAsString());
      return absl::nullopt;
    }

    // Original source address is never used for north/south LB
    src_address = nullptr;
  } else if (!use_original_source_address_ || (npmap_ != nullptr && npmap_->exists(other_ip))) {
    // Otherwise only use the original source address if permitted and the destination is not
    // in the same node.
    //
    // If bpf root is not configured (npmap_ == nullptr) we assume all destinations are non-local!
    //
    // Original source address is not used
    src_address = nullptr;
  }

  // Evaluating proxylib L7 protocol for later usage in filter chain matching.
  // This requires the TLS inspector, if used, to run before us.
  // Note: This requires egress policy be known before upstream host selection,
  // so this feature only works with the original destination cluster.
  // This means that L7 LB does not work with the experimental Envoy Metadata
  // based policies (e.g., with MongoDB or MySQL filters).
  std::string proxylib_l7proto;
  const uint32_t remote_id = is_ingress_ ? source_identity : destination_identity;
  if (policy->useProxylib(is_ingress_, proxy_id_, remote_id, dip->port(), proxylib_l7proto)) {
    ENVOY_LOG(trace, "cilium.bpf_metadata: detected proxylib l7 proto: {}", proxylib_l7proto);
  }

  // Pass the metadata to an Envoy socket option we can retrieve later in other
  // Cilium filters.
  uint32_t mark = 0;
  if (east_west_l7lb) {
    // Mark with source endpoint ID for east/west l7 LB. This causes the upstream packets to be
    // processed by the source endpoint's policy enforcement in the datapath.
    mark = 0x0900 | (policy->getEndpointID() << 16);
  } else {
    // Mark with source identity
    const uint32_t cluster_id = (source_identity >> 16) & 0xFF;
    const uint32_t identity_id = (source_identity & 0xFFFF) << 16;
    const uint32_t direction = is_ingress_ ? 0x0A00 : 0x0B00;
    mark = direction | cluster_id | identity_id;
  }

  ENVOY_LOG(trace,
            "cilium.bpf_metadata: mark {}, ingress_source_identity {}, source_identity {}, "
            "is_ingress {}, is_l7lb_ {}, ingress_policy_name {}, port {}, pod_ip {}",
            mark, ingress_source_identity, source_identity, is_ingress_, is_l7lb_,
            ingress_policy_name, dip->port(), pod_ip);

  return {Cilium::BpfMetadata::SocketMetadata(
      mark, ingress_source_identity, source_identity, is_ingress_, is_l7lb_, dip->port(),
      std::move(pod_ip), std::move(ingress_policy_name), std::move(src_address),
      std::move(source_addresses.ipv4_), std::move(source_addresses.ipv6_), std::move(dst_address),
      shared_from_this(), proxy_id_, std::move(proxylib_l7proto), sni)};
}
534

            
535
119
/**
 * Listener filter entry point for a newly accepted connection.
 *
 * Extracts the Cilium socket metadata and, when available, applies it to the downstream
 * socket, publishes the Cilium policy and destination filter state, and queues the source
 * address / mark socket options. It then queues IP_TRANSPARENT (if privileged),
 * SO_REUSEADDR and SO_REUSEPORT options and enables TCP keep-alive on the accepted socket.
 *
 * @return StopIteration if any keep-alive socket option can not be set, Continue otherwise.
 */
Network::FilterStatus Instance::onAccept(Network::ListenerFilterCallbacks& cb) {
  Network::ConnectionSocket& socket = cb.socket();
  ENVOY_LOG(trace, "onAccept (socket={})", socket.ioHandle().fdDoNotUse());

  Network::Socket::OptionsSharedPtr upstream_options =
      std::make_shared<Network::Socket::Options>();

  // Cilium socket option is not set if this fails, which causes 500 response from our l7policy
  // filter. Our integration tests depend on this.
  if (auto socket_metadata = config_->extractSocketMetadata(socket);
      socket_metadata.has_value()) {
    // Setting proxy lib application protocol on downstream socket
    socket_metadata->configureProxyLibApplicationProtocol(socket);

    // Restoring original destination address on downstream socket
    socket_metadata->configureOriginalDstAddress(socket);

    // Make Cilium Policy data available to filters and upstream connection (Cilium TLS Wrapper) as
    // filter state.
    const auto policy_fs = socket_metadata->buildCiliumPolicyFilterState();
    cb.filterState().setData(
        Cilium::CiliumPolicyFilterState::key(), policy_fs,
        StreamInfo::FilterState::StateType::ReadOnly, StreamInfo::FilterState::LifeSpan::Connection,
        StreamInfo::StreamSharingMayImpactPooling::SharedWithUpstreamConnection);

    const auto dest_fs = socket_metadata->buildCiliumDestinationFilterState();
    cb.filterState().setData(
        Cilium::CiliumDestinationFilterState::key(), dest_fs,
        StreamInfo::FilterState::StateType::Mutable, StreamInfo::FilterState::LifeSpan::Connection,
        StreamInfo::StreamSharingMayImpactPooling::SharedWithUpstreamConnection);

    // Restoring original source address on the upstream socket
    upstream_options->push_back(
        socket_metadata->buildSourceAddressSocketOption(config_->so_linger_, dest_fs, policy_fs));

    if (config_->addPrivilegedSocketOptions()) {
      // adding SO_MARK (Cilium mark) on the upstream socket
      upstream_options->push_back(socket_metadata->buildCiliumMarkSocketOption());
    }
  }

  if (config_->addPrivilegedSocketOptions()) {
    // Setting IP_TRANSPARENT on upstream socket to be able to restore original source address
    upstream_options->push_back(std::make_shared<Envoy::Cilium::IpTransparentSocketOption>());
  }

  // allow reuse of the original source address by setting SO_REUSEADDR.
  // This may be needed for retries to not fail on "address already in use"
  // when using a specific source address and port.
  upstream_options->push_back(std::make_shared<Envoy::Network::SocketOptionImpl>(
      envoy::config::core::v3::SocketOption::STATE_PREBIND,
      Envoy::Network::SocketOptionName(SOL_SOCKET, SO_REUSEADDR, "SO_REUSEADDR"), 1));

  // reuse port for forwarded client connections (SO_REUSEPORT)
  Network::Socket::appendOptions(upstream_options,
                                 Network::SocketOptionFactory::buildReusePortOptions());

  // Adding SocketOptions to the downstream socket. The function `setOption` is NOT executed
  // on the downstream socket itself - it's executed later on the corresponding upstream socket!
  socket.addOptions(upstream_options);

  // set keep alive socket options on accepted connection socket
  // (SO_KEEPALIVE, TCP_KEEPINTVL, TCP_KEEPIDLE)
  int enable_keepalive = 1;
  int keepalive_secs = 5 * 60; // Five minutes

  const auto keepalive_status =
      socket.setSocketOption(SOL_SOCKET, SO_KEEPALIVE, &enable_keepalive, sizeof(enable_keepalive));
  if (keepalive_status.return_value_ < 0) {
    ENVOY_LOG(critical, "Socket option failure. Failed to set SO_KEEPALIVE: {}",
              Envoy::errorDetails(keepalive_status.errno_));
    return Network::FilterStatus::StopIteration;
  }

  const auto interval_status =
      socket.setSocketOption(IPPROTO_TCP, TCP_KEEPINTVL, &keepalive_secs, sizeof(keepalive_secs));
  if (interval_status.return_value_ < 0) {
    ENVOY_LOG(critical, "Socket option failure. Failed to set TCP_KEEPINTVL: {}",
              Envoy::errorDetails(interval_status.errno_));
    return Network::FilterStatus::StopIteration;
  }

  const auto idle_status =
      socket.setSocketOption(IPPROTO_TCP, TCP_KEEPIDLE, &keepalive_secs, sizeof(keepalive_secs));
  if (idle_status.return_value_ < 0) {
    ENVOY_LOG(critical, "Socket option failure. Failed to set TCP_KEEPIDLE: {}",
              Envoy::errorDetails(idle_status.errno_));
    return Network::FilterStatus::StopIteration;
  }

  return Network::FilterStatus::Continue;
}
625

            
626
// The buffer argument is ignored; this callback always lets filter iteration continue.
Network::FilterStatus Instance::onData(Network::ListenerFilterBuffer&) {
  return Network::FilterStatus::Continue;
}
629

            
630
// Reports a maximum read size of 0 bytes for this filter.
// NOTE(review): presumably this tells the listener that the filter does not need any
// connection payload buffered before it runs — confirm against the ListenerFilter contract.
size_t Instance::maxReadBytes() const { return 0; }
631

            
632
// The received datagram is ignored; this callback always lets filter iteration continue.
Network::FilterStatus UdpInstance::onData(Network::UdpRecvData& /*data*/) {
  return Network::FilterStatus::Continue;
}
635

            
636
// The receive error code is ignored; this callback always lets filter iteration continue.
Network::FilterStatus UdpInstance::onReceiveError(Api::IoError::IoErrorCode /*error_code*/) {
  return Network::FilterStatus::Continue;
}
640

            
641
} // namespace BpfMetadata
642
} // namespace Cilium
643
} // namespace Envoy