1
#include "source/extensions/http/header_validators/envoy_default/path_normalizer.h"
2

            
3
#include "envoy/http/header_validator_errors.h"
4

            
5
#include "source/common/http/header_utility.h"
6
#include "source/common/http/headers.h"
7
#include "source/common/runtime/runtime_features.h"
8
#include "source/extensions/http/header_validators/envoy_default/character_tables.h"
9

            
10
#include "absl/strings/match.h"
11

            
12
namespace Envoy {
13
namespace Extensions {
14
namespace Http {
15
namespace HeaderValidators {
16
namespace EnvoyDefault {
17

            
18
using ::envoy::extensions::http::header_validators::envoy_default::v3::HeaderValidatorConfig;
19
using ::envoy::extensions::http::header_validators::envoy_default::v3::
20
    HeaderValidatorConfig_UriPathNormalizationOptions;
21
using ::Envoy::Http::HeaderUtility;
22
using ::Envoy::Http::PathNormalizerResponseCodeDetail;
23
using ::Envoy::Http::RequestHeaderMap;
24
using ::Envoy::Http::testCharInTable;
25
using ::Envoy::Http::UhvResponseCodeDetail;
26

            
27
// Initializes the config_ and config_overrides_ members from the supplied validator
// configuration and override flags; the normalization passes in this file read both.
PathNormalizer::PathNormalizer(const HeaderValidatorConfig& config,
                               const ConfigOverrides& config_overrides)
    : config_(config), config_overrides_(config_overrides) {}
30

            
31
/*
 * Inspect the percent-encoded octet that starts at ``iter`` (which must point at '%') and decide
 * how the caller should treat it.
 *
 * The two hex digits following the '%' are upper-cased in place unless the
 * ``preserve_url_encoded_case_`` override is set. The result is:
 *
 *   - Invalid: ``iter`` is at ``end``, does not point at '%', or is not followed by two hex
 *     digits. Note that the first digit may already have been upper-cased in place when the
 *     second digit turns out to be invalid.
 *   - Decoded (with the octet): the value is in the UNRESERVED set, or it is a slash / backslash
 *     and the configured action is UNESCAPE_AND_FORWARD.
 *   - DecodedRedirect (with the octet): slash / backslash with UNESCAPE_AND_REDIRECT.
 *   - Reject: slash / backslash with REJECT_REQUEST.
 *   - Normalized: any other valid encoding; the octet must stay encoded.
 */
PathNormalizer::DecodedOctet
PathNormalizer::normalizeAndDecodeOctet(std::string::iterator iter,
                                        std::string::iterator end) const {
  // From RFC 3986: https://datatracker.ietf.org/doc/html/rfc3986#section-2.1
  //
  // SPELLCHECKER(off)
  // pct-encoded = "%" HEXDIG HEXDIG
  //
  // The uppercase hexadecimal digits 'A' through 'F' are equivalent to
  // the lowercase digits 'a' through 'f', respectively. If two URIs
  // differ only in the case of hexadecimal digits used in percent-encoded
  // octets, they are equivalent. For consistency, URI producers and
  // normalizers should use uppercase hexadecimal digits for all percent-
  // encodings.
  //
  // Also from RFC 3986: https://datatracker.ietf.org/doc/html/rfc3986#section-2.4
  //
  // When a URI is dereferenced, the components and subcomponents significant
  // to the scheme-specific dereferencing process (if any) must be parsed and
  // separated before the percent-encoded octets within those components can
  // be safely decoded, as otherwise the data may be mistaken for component
  // delimiters. The only exception is for percent-encoded octets corresponding
  // to characters in the unreserved set, which can be decoded at any time.
  // SPELLCHECKER(on)

  if (iter == end || *iter != '%') {
    return {PercentDecodeResult::Invalid};
  }

  const bool preserve_case = config_overrides_.preserve_url_encoded_case_;

  char ch = '\0';
  // Normalize and decode the octet
  for (int i = 0; i < 2; ++i) {
    ++iter;
    if (iter == end) {
      return {PercentDecodeResult::Invalid};
    }

    char nibble = *iter;
    // The <cctype> classification functions have undefined behavior when given a negative value
    // other than EOF. A plain char holding a byte >= 0x80 is negative on signed-char platforms,
    // so convert to unsigned char before classifying.
    if (!isxdigit(static_cast<unsigned char>(nibble))) {
      return {PercentDecodeResult::Invalid};
    }

    // normalize: hex digits 'a'..'f' differ from 'A'..'F' only in bit 0x20.
    nibble = nibble >= 'a' ? nibble ^ 0x20 : nibble;
    if (!preserve_case) {
      *iter = nibble;
    }

    // decode: the first digit is the high nibble, the second the low nibble.
    int factor = i == 0 ? 16 : 1;
    ch += factor * (nibble >= 'A' ? (nibble - 'A' + 10) : (nibble - '0'));
  }

  if (testCharInTable(kUnreservedCharTable, ch)) {
    // Based on RFC, only decode characters in the UNRESERVED set.
    return {PercentDecodeResult::Decoded, ch};
  }

  if (ch == '/' || ch == '\\') {
    // We decoded a slash character and how we handle it depends on the active configuration.
    switch (config_.uri_path_normalization_options().path_with_escaped_slashes_action()) {
    case HeaderValidatorConfig_UriPathNormalizationOptions::IMPLEMENTATION_SPECIFIC_DEFAULT:
      ABSL_FALLTHROUGH_INTENDED;
    case HeaderValidatorConfig_UriPathNormalizationOptions::KEEP_UNCHANGED:
      // default implementation: normalize the encoded octet and accept the path
      return {PercentDecodeResult::Normalized};

    case HeaderValidatorConfig_UriPathNormalizationOptions::REJECT_REQUEST:
      // Reject the entire request
      return {PercentDecodeResult::Reject};

    case HeaderValidatorConfig_UriPathNormalizationOptions::UNESCAPE_AND_FORWARD:
      // Decode the slash and accept the path.
      return {PercentDecodeResult::Decoded, ch};

    case HeaderValidatorConfig_UriPathNormalizationOptions::UNESCAPE_AND_REDIRECT:
      // Decode the slash and respond with a redirect to the normalized path.
      return {PercentDecodeResult::DecodedRedirect, ch};

    default:
      // This should never occur but it's here to make the compiler happy because of the extra
      // values added by protobuf.
      ENVOY_BUG(false, "Unexpected path_with_escaped_slashes_action");
      break;
    }
  }

  // The octet is a valid encoding but it wasn't decoded because it was outside the UNRESERVED
  // character set.
  return {PercentDecodeResult::Normalized};
}
124

            
125
/*
 * Locate where the segment preceding ``current`` begins, i.e. the first non-slash character
 * that directly follows a slash. For example:
 *
 *   path = "/hello/world/..";
 *           ^      ^    ^-- current argument
 *           |      |-- start of previous segment (return value)
 *           |-- begin argument
 *
 * Runs of slashes between ``current`` and the previous segment are skipped. For example:
 *
 * path = "/parent//child////..";
 *                  ^       ^-- current argument
 *                  |-- start of previous segment
 *
 * ``current`` is expected to point to a slash character and ``begin`` to the start of the path.
 * The character at ``begin`` itself is never inspected. ``begin`` is returned when there is no
 * previous segment, which callers treat as an error.
 */
std::string::iterator findStartOfPreviousSegment(std::string::iterator current,
                                                 std::string::iterator begin) {
  auto cursor = current;

  // Step 1: walk backwards over the slashes that separate us from the previous segment.
  while (cursor != begin && *cursor == '/') {
    --cursor;
  }

  if (cursor == begin) {
    // Only slashes were found before reaching the start of the path: no previous segment.
    return begin;
  }

  // Step 2: cursor now sits on the last character of the previous segment. Keep walking
  // backwards until the slash that precedes the segment (or the start of the path) is reached.
  while (cursor != begin && *cursor != '/') {
    --cursor;
  }

  // The segment starts one position after the slash (or after the start of the path).
  return std::next(cursor);
}
163

            
164
/*
 * Validate and normalize the :path header of ``header_map`` and write the normalized value back.
 * Returns a Reject result for an invalid path, a Redirect result when any pass asked for a
 * redirect to the normalized path, and success otherwise.
 */
PathNormalizer::PathNormalizationResult
PathNormalizer::normalizePathUri(RequestHeaderMap& header_map) const {
  // Parse and normalize the :path header and update it in the map. From RFC 9112,
  // https://www.rfc-editor.org/rfc/rfc9112.html#section-3.2:
  //
  // request-target = origin-form
  //                / absolute-form
  //                / authority-form
  //                / asterisk-form
  //
  // origin-form    = absolute-path [ "?" query ]
  // absolute-form  = absolute-URI
  // authority-form = uri-host ":" port
  // asterisk-form  = "*"
  //
  // TODO(#23887) - potentially separate path normalization into multiple independent operations.
  const auto original_path = header_map.getPathValue();

  // asterisk-form, only valid for OPTIONS request
  if (original_path == "*" &&
      header_map.getMethodValue() == ::Envoy::Http::Headers::get().MethodValues.Options) {
    return PathNormalizationResult::success();
  }

  // The :path can only be empty for standard CONNECT methods, where the request-target is in
  // authority-form for HTTP/1 requests, or :path is empty for HTTP/2 requests.
  if (HeaderUtility::isStandardConnectRequest(header_map)) {
    if (!original_path.empty()) {
      return {PathNormalizationResult::Action::Reject, UhvResponseCodeDetail::get().InvalidUrl};
    }
    return PathNormalizationResult::success();
  }

  // Every other request must carry a path that starts with a slash.
  if (!absl::StartsWith(original_path, "/")) {
    return {PathNormalizationResult::Action::Reject, UhvResponseCodeDetail::get().InvalidUrl};
  }

  // Split the path and the query parameters / fragment component.
  auto [path_component, query_and_fragment] = splitPathAndQueryParams(original_path);
  // Copy the path component into an owned string; the passes below rewrite it in place.
  std::string path(path_component.data(), path_component.length());

  // Path normalization is based on RFC 3986:
  // https://datatracker.ietf.org/doc/html/rfc3986#section-3.3
  //
  // SPELLCHECKER(off)
  // path          = path-abempty    ; begins with "/" or is empty
  //               / path-absolute   ; begins with "/" but not "//"
  //               / path-noscheme   ; begins with a non-colon segment
  //               / path-rootless   ; begins with a segment
  //               / path-empty      ; zero characters
  //
  // path-abempty  = *( "/" segment )
  // path-absolute = "/" [ segment-nz *( "/" segment ) ]
  // path-noscheme = segment-nz-nc *( "/" segment )
  // path-rootless = segment-nz *( "/" segment )
  // path-empty    = 0<pchar>
  // segment       = *pchar
  // segment-nz    = 1*pchar
  // segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )
  //               ; non-zero-length segment without any colon ":"
  //
  // pchar         = unreserved / pct-encoded / sub-delims / ":" / "@"
  // SPELLCHECKER(on)

  // Becomes true as soon as any pass requests a redirect to the normalized path.
  bool redirect = false;

  // pass 1: normalize and decode percent-encoded octets
  const auto decode_outcome = decodePass(path);
  if (decode_outcome.action() == PathNormalizationResult::Action::Reject) {
    return decode_outcome;
  }
  if (decode_outcome.action() == PathNormalizationResult::Action::Redirect) {
    redirect = true;
  }

  // The `envoy.uhv.allow_non_compliant_characters_in_path` flag allows the \ (back slash)
  // character, which legacy path normalization was changing to / (forward slash).
  if (config_overrides_.allow_non_compliant_characters_in_path_) {
    translateBackToForwardSlashes(path);
  }

  // pass 2: merge duplicate slashes (if configured to do so)
  if (!config_.uri_path_normalization_options().skip_merging_slashes()) {
    const auto merge_outcome = mergeSlashesPass(path);
    if (merge_outcome.action() == PathNormalizationResult::Action::Reject) {
      return merge_outcome;
    }
    if (merge_outcome.action() == PathNormalizationResult::Action::Redirect) {
      redirect = true;
    }
  }

  // pass 3: collapse dot and dot-dot segments
  const auto collapse_outcome = collapseDotSegmentsPass(path);
  if (collapse_outcome.action() == PathNormalizationResult::Action::Reject) {
    return collapse_outcome;
  }
  if (collapse_outcome.action() == PathNormalizationResult::Action::Redirect) {
    redirect = true;
  }

  // Update the :path header. We need to honor the normalized path and the original query/fragment
  // components.
  header_map.setPath(absl::StrCat(path, query_and_fragment));

  if (!redirect) {
    return PathNormalizationResult::success();
  }

  return {PathNormalizationResult::Action::Redirect,
          ::Envoy::Http::PathNormalizerResponseCodeDetail::get().RedirectNormalized};
}
279

            
280
67
// Rewrites every '\' (back slash) in ``path`` to '/' (forward slash), in place.
void PathNormalizer::translateBackToForwardSlashes(std::string& path) const {
  // Jump from one back slash to the next instead of inspecting each character by hand.
  for (auto pos = path.find('\\'); pos != std::string::npos; pos = path.find('\\', pos + 1)) {
    path[pos] = '/';
  }
}
287

            
288
74
/*
 * Pass 1 of path normalization: walk ``path`` (skipping its first character) and process every
 * percent-encoded octet through normalizeAndDecodeOctet(). The path is compacted in place and
 * resized to the written length. Returns Reject for a rejected or (unless tolerated by the
 * runtime flag) malformed encoding, Redirect when a decoded octet requested a redirect, and
 * success otherwise.
 */
PathNormalizer::PathNormalizationResult PathNormalizer::decodePass(std::string& path) const {
  const bool tolerate_malformed_encoding =
      Runtime::runtimeFeatureEnabled("envoy.reloadable_features.uhv_allow_malformed_url_encoding");

  const auto path_begin = path.begin();
  const auto path_end = path.end();
  // Both cursors start after the first character; the write cursor never overtakes the read one.
  auto src = std::next(path_begin);
  auto dst = std::next(path_begin);
  bool needs_redirect = false;

  while (src != path_end) {
    if (*src != '%') {
      // Ordinary character, copy it through.
      *dst++ = *src++;
      continue;
    }

    auto decoded = normalizeAndDecodeOctet(src, path_end);
    const auto outcome = decoded.result();

    // TODO(#23885) - add and honor config to not reject invalid percent-encoded octets.
    if (outcome == PercentDecodeResult::Invalid && tolerate_malformed_encoding) {
      // Write the % character that starts invalid URL encoded sequence and then continue
      // scanning from the next character.
      *dst++ = *src++;
      continue;
    }

    if (outcome == PercentDecodeResult::Invalid || outcome == PercentDecodeResult::Reject) {
      // Reject the request
      return {PathNormalizationResult::Action::Reject, UhvResponseCodeDetail::get().InvalidUrl};
    }

    if (outcome == PercentDecodeResult::Normalized) {
      // Valid encoding but outside the UNRESERVED character set. The encoding was normalized to
      // UPPERCASE and the octet must not be decoded. Copy the three characters of the normalized
      // encoding ('%' and two hex digits).
      for (int copied = 0; copied < 3; ++copied) {
        *dst++ = *src++;
      }
      continue;
    }

    if (outcome == PercentDecodeResult::DecodedRedirect) {
      // The encoding was properly decoded but, based on the config, the request should be
      // redirected to the normalized path.
      needs_redirect = true;
    }

    if (outcome == PercentDecodeResult::DecodedRedirect ||
        outcome == PercentDecodeResult::Decoded) {
      // The encoding was decoded. Store the decoded octet in the last character of the percent
      // encoding (src[2]) so it will be processed in the next iteration. We can safely advance
      // 2 positions since we know that the value was correctly decoded.
      std::advance(src, 2);
      *src = decoded.octet();
    }
  }

  path.resize(std::distance(path_begin, dst));

  if (!needs_redirect) {
    return PathNormalizationResult::success();
  }

  return {PathNormalizationResult::Action::Redirect,
          ::Envoy::Http::PathNormalizerResponseCodeDetail::get().RedirectNormalized};
}
347

            
348
66
/*
 * Pass 2 of path normalization: collapse every run of consecutive slashes in ``path`` into a
 * single slash. The path is compacted in place, its first character is always kept, and this
 * pass always reports success.
 */
PathNormalizer::PathNormalizationResult PathNormalizer::mergeSlashesPass(std::string& path) const {
  const auto path_begin = path.begin();
  const auto path_end = path.end();
  // Output cursor; starts after the first character so std::prev(out) is always valid.
  auto out = std::next(path_begin);

  for (auto in = std::next(path_begin); in != path_end; ++in) {
    // A slash is a duplicate when the last character already written is a slash as well.
    const bool duplicate_slash = *in == '/' && *std::prev(out) == '/';
    if (!duplicate_slash) {
      *out++ = *in;
    }
  }

  path.resize(std::distance(path_begin, out));
  return PathNormalizationResult::success();
}
372

            
373
/*
 * Pass 3 of path normalization: remove "." segments and resolve ".." segments against the
 * segment written before them. The path is compacted in place. Returns Reject when a ".."
 * segment has no previous segment to remove (e.g. "/.."), success otherwise.
 */
PathNormalizer::PathNormalizationResult
PathNormalizer::collapseDotSegmentsPass(std::string& path) const {
  const auto path_begin = path.begin();
  const auto path_end = path.end();
  auto src = std::next(path_begin);
  auto dst = std::next(path_begin);
  // View over the same buffer, used for cheap look-ahead at the not-yet-consumed input.
  absl::string_view unread_view{path};

  while (src != path_end) {
    // Dot segments can only begin right after a slash in the already written output.
    const bool may_start_dot_segment = *src == '.' && *std::prev(dst) == '/';
    if (!may_start_dot_segment) {
      *dst++ = *src++;
      continue;
    }

    // attempt to read ahead 2 characters to see if we are in a "./" or "../" segment.
    const auto lookahead = unread_view.substr(std::distance(path_begin, src), 3);

    if (absl::StartsWith(lookahead, "./") || lookahead == ".") {
      // This is a "/./" segment or the path is terminated by "/.", ignore it.
      // Advance the read iterator by 1 if the path ends with "." or 2 if the segment is "./"
      std::advance(src, std::min<size_t>(lookahead.size(), 2));
      continue;
    }

    if (lookahead != "../" && lookahead != "..") {
      // A regular segment that merely starts with a dot (e.g. ".hidden" or "..x").
      *dst++ = *src++;
      continue;
    }

    // This is a "/../" segment or the path is terminated by "/..", navigate one segment up.
    // Back up write 1 position to the previous slash to find the previous segment start.
    const auto parent_start = findStartOfPreviousSegment(std::prev(dst), path_begin);
    if (parent_start == path_begin) {
      // This is an invalid ".." segment, most likely the full path is "/..", which attempts
      // to go above the root.
      return {PathNormalizationResult::Action::Reject, UhvResponseCodeDetail::get().InvalidUrl};
    }

    // Set the write position to overwrite the previous segment
    dst = parent_start;
    // Advance the read iterator by 2 if the path ends with ".." or 3 if the segment is "../"
    std::advance(src, std::min<size_t>(lookahead.size(), 3));
  }

  path.resize(std::distance(path_begin, dst));
  return PathNormalizationResult::success();
}
422

            
423
std::tuple<absl::string_view, absl::string_view>
424
74
PathNormalizer::splitPathAndQueryParams(absl::string_view path_and_query_params) const {
425
  // Split on the query (?) or fragment (#) delimiter, whichever one is first.
426
  // TODO(#23886) - add and honor config option for handling the path fragment component.
427
74
  auto delim = path_and_query_params.find_first_of("?#");
428
74
  if (delim == absl::string_view::npos) {
429
    // no query/fragment component
430
62
    return std::make_tuple(path_and_query_params, "");
431
62
  }
432

            
433
12
  return std::make_tuple(path_and_query_params.substr(0, delim),
434
12
                         path_and_query_params.substr(delim));
435
74
}
436

            
437
} // namespace EnvoyDefault
438
} // namespace HeaderValidators
439
} // namespace Http
440
} // namespace Extensions
441
} // namespace Envoy