Coverage Report

Created: 2026-03-10 06:33

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/arrow/cpp/src/parquet/geospatial/util_internal.cc
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "parquet/geospatial/util_internal.h"

#include <algorithm>
#include <cstdint>
#include <sstream>

#include "arrow/util/endian.h"
#include "arrow/util/macros.h"
#include "arrow/util/ubsan.h"
#include "parquet/exception.h"
26
27
namespace parquet::geospatial {
28
29
0
std::string BoundingBox::ToString() const {
30
0
  std::stringstream ss;
31
0
  ss << "BoundingBox" << std::endl;
32
0
  ss << "  x: [" << min[0] << ", " << max[0] << "]" << std::endl;
33
0
  ss << "  y: [" << min[1] << ", " << max[1] << "]" << std::endl;
34
0
  ss << "  z: [" << min[2] << ", " << max[2] << "]" << std::endl;
35
0
  ss << "  m: [" << min[3] << ", " << max[3] << "]" << std::endl;
36
37
0
  return ss.str();
38
0
}
39
40
/// \brief Object to keep track of the low-level consumption of a well-known binary
41
/// geometry
42
///
43
/// Briefly, ISO well-known binary supported by the Parquet spec is an endian byte
44
/// (0x01 or 0x00), followed by geometry type + dimensions encoded as a (uint32_t),
45
/// followed by geometry-specific data. Coordinate sequences are represented by a
46
/// uint32_t (the number of coordinates) plus a sequence of doubles (number of coordinates
47
/// multiplied by the number of dimensions).
48
class WKBBuffer {
49
 public:
50
0
  WKBBuffer() : data_(nullptr), size_(0) {}
51
0
  WKBBuffer(const uint8_t* data, int64_t size) : data_(data), size_(size) {}
52
53
0
  uint8_t ReadUInt8() { return ReadChecked<uint8_t>(); }
54
55
0
  uint32_t ReadUInt32(bool swap) {
56
0
    auto value = ReadChecked<uint32_t>();
57
0
    if (swap) {
58
0
      return ::arrow::bit_util::ByteSwap(value);
59
0
    } else {
60
0
      return value;
61
0
    }
62
0
  }
63
64
  template <typename Coord, typename Visit>
65
0
  void ReadCoords(uint32_t n_coords, bool swap, Visit&& visit) {
66
0
    size_t total_bytes = n_coords * sizeof(Coord);
67
0
    if (size_ < total_bytes) {
68
0
      throw ParquetException("Can't read coordinate sequence of ", total_bytes,
69
0
                             " bytes from WKBBuffer with ", size_, " remaining");
70
0
    }
71
72
0
    if (swap) {
73
0
      Coord coord;
74
0
      for (uint32_t i = 0; i < n_coords; i++) {
75
0
        coord = ReadUnchecked<Coord>();
76
0
        for (auto& c : coord) {
77
0
          c = ::arrow::bit_util::ByteSwap(c);
78
0
        }
79
80
0
        visit(coord);
81
0
      }
82
0
    } else {
83
0
      for (uint32_t i = 0; i < n_coords; i++) {
84
0
        visit(ReadUnchecked<Coord>());
85
0
      }
86
0
    }
87
0
  }
Unexecuted instantiation: util_internal.cc:void parquet::geospatial::WKBBuffer::ReadCoords<std::__1::array<double, 2ul>, parquet::geospatial::WKBGeometryBounder::MergeSequence(parquet::geospatial::WKBBuffer*, parquet::geospatial::Dimensions, unsigned int, bool)::$_0>(unsigned int, bool, parquet::geospatial::WKBGeometryBounder::MergeSequence(parquet::geospatial::WKBBuffer*, parquet::geospatial::Dimensions, unsigned int, bool)::$_0&&)
Unexecuted instantiation: util_internal.cc:void parquet::geospatial::WKBBuffer::ReadCoords<std::__1::array<double, 3ul>, parquet::geospatial::WKBGeometryBounder::MergeSequence(parquet::geospatial::WKBBuffer*, parquet::geospatial::Dimensions, unsigned int, bool)::$_1>(unsigned int, bool, parquet::geospatial::WKBGeometryBounder::MergeSequence(parquet::geospatial::WKBBuffer*, parquet::geospatial::Dimensions, unsigned int, bool)::$_1&&)
Unexecuted instantiation: util_internal.cc:void parquet::geospatial::WKBBuffer::ReadCoords<std::__1::array<double, 3ul>, parquet::geospatial::WKBGeometryBounder::MergeSequence(parquet::geospatial::WKBBuffer*, parquet::geospatial::Dimensions, unsigned int, bool)::$_2>(unsigned int, bool, parquet::geospatial::WKBGeometryBounder::MergeSequence(parquet::geospatial::WKBBuffer*, parquet::geospatial::Dimensions, unsigned int, bool)::$_2&&)
Unexecuted instantiation: util_internal.cc:void parquet::geospatial::WKBBuffer::ReadCoords<std::__1::array<double, 4ul>, parquet::geospatial::WKBGeometryBounder::MergeSequence(parquet::geospatial::WKBBuffer*, parquet::geospatial::Dimensions, unsigned int, bool)::$_3>(unsigned int, bool, parquet::geospatial::WKBGeometryBounder::MergeSequence(parquet::geospatial::WKBBuffer*, parquet::geospatial::Dimensions, unsigned int, bool)::$_3&&)
88
89
0
  size_t size() { return size_; }
90
91
 private:
92
  const uint8_t* data_;
93
  size_t size_;
94
95
  template <typename T>
96
0
  T ReadChecked() {
97
0
    if (ARROW_PREDICT_FALSE(size_ < sizeof(T))) {
98
0
      throw ParquetException("Can't read ", sizeof(T), " bytes from WKBBuffer with ",
99
0
                             size_, " remaining");
100
0
    }
101
102
0
    return ReadUnchecked<T>();
103
0
  }
Unexecuted instantiation: unsigned char parquet::geospatial::WKBBuffer::ReadChecked<unsigned char>()
Unexecuted instantiation: unsigned int parquet::geospatial::WKBBuffer::ReadChecked<unsigned int>()
104
105
  template <typename T>
106
0
  T ReadUnchecked() {
107
0
    T out = ::arrow::util::SafeLoadAs<T>(data_);
108
0
    data_ += sizeof(T);
109
0
    size_ -= sizeof(T);
110
0
    return out;
111
0
  }
Unexecuted instantiation: unsigned char parquet::geospatial::WKBBuffer::ReadUnchecked<unsigned char>()
Unexecuted instantiation: unsigned int parquet::geospatial::WKBBuffer::ReadUnchecked<unsigned int>()
Unexecuted instantiation: std::__1::array<double, 2ul> parquet::geospatial::WKBBuffer::ReadUnchecked<std::__1::array<double, 2ul> >()
Unexecuted instantiation: std::__1::array<double, 3ul> parquet::geospatial::WKBBuffer::ReadUnchecked<std::__1::array<double, 3ul> >()
Unexecuted instantiation: std::__1::array<double, 4ul> parquet::geospatial::WKBBuffer::ReadUnchecked<std::__1::array<double, 4ul> >()
112
};
113
114
using GeometryTypeAndDimensions = std::pair<GeometryType, Dimensions>;
115
116
namespace {
117
118
0
/// \brief Split a WKB geometry type code into its geometry type and dimensions
///
/// Throws ParquetException if either component falls outside the
/// [kValueMin, kValueMax] range of its enum.
GeometryTypeAndDimensions ParseGeometryType(uint32_t wkb_geometry_type) {
  // WKB geometry type codes are deliberately laid out so that the last three
  // decimal digits carry the geometry type and the thousands carry the
  // dimensions (e.g., LINESTRING ZM is 3002), hence the use of 1000 below.
  const uint32_t type_code = wkb_geometry_type % 1000;
  const uint32_t dims_code = wkb_geometry_type / 1000;

  const auto lies_within = [](uint32_t value, uint32_t lowest, uint32_t highest) {
    return !(value < lowest) && !(value > highest);
  };

  const bool type_ok =
      lies_within(type_code, static_cast<uint32_t>(GeometryType::kValueMin),
                  static_cast<uint32_t>(GeometryType::kValueMax));
  const bool dims_ok =
      lies_within(dims_code, static_cast<uint32_t>(Dimensions::kValueMin),
                  static_cast<uint32_t>(Dimensions::kValueMax));

  if (!(type_ok && dims_ok)) {
    throw ParquetException("Invalid WKB geometry type: ", wkb_geometry_type);
  }

  return GeometryTypeAndDimensions{static_cast<GeometryType>(type_code),
                                   static_cast<Dimensions>(dims_code)};
}
140
141
}  // namespace
142
143
0
std::vector<int32_t> WKBGeometryBounder::GeometryTypes() const {
144
0
  std::vector<int32_t> out(geospatial_types_.begin(), geospatial_types_.end());
145
0
  std::sort(out.begin(), out.end());
146
0
  return out;
147
0
}
148
149
0
void WKBGeometryBounder::MergeGeometry(std::string_view bytes_wkb) {
150
0
  MergeGeometry(::arrow::util::span(reinterpret_cast<const uint8_t*>(bytes_wkb.data()),
151
0
                                    bytes_wkb.size()));
152
0
}
153
154
0
/// \brief Merge a single WKB geometry, which must occupy all of bytes_wkb
///
/// The geometry is consumed with record_wkb_type = true, i.e. as a top-level
/// geometry. Throws ParquetException if parsing fails or if any bytes are left over
/// once the geometry has been consumed. Note that the trailing-bytes check runs
/// after the geometry has already been merged.
void WKBGeometryBounder::MergeGeometry(::arrow::util::span<const uint8_t> bytes_wkb) {
  WKBBuffer src{bytes_wkb.data(), static_cast<int64_t>(bytes_wkb.size())};
  MergeGeometryInternal(&src, /*record_wkb_type=*/true);
  if (src.size() != 0) {
    throw ParquetException("Expected zero bytes after consuming WKB but got ",
                           src.size());
  }
}
162
163
0
/// \brief Consume one WKB geometry from src and merge its coordinates
///
/// Reads the byte-order marker and the geometry type code, then dispatches on the
/// geometry type. Multi-part geometries and collections are handled by calling this
/// function again for every part.
///
/// \param src buffer positioned at the first byte of a geometry
/// \param record_wkb_type whether the geometry type code read here should be added
/// to geospatial_types_
void WKBGeometryBounder::MergeGeometryInternal(WKBBuffer* src, bool record_wkb_type) {
  // Marker value meaning "same byte order as this build"; values read from a
  // geometry carrying any other marker are byte-swapped.
#if ARROW_LITTLE_ENDIAN
  constexpr uint8_t kNativeByteOrderMarker = 0x01;
#else
  constexpr uint8_t kNativeByteOrderMarker = 0x00;
#endif

  const uint8_t byte_order_marker = src->ReadUInt8();
  const bool swap = byte_order_marker != kNativeByteOrderMarker;

  const uint32_t wkb_geometry_type = src->ReadUInt32(swap);
  const GeometryTypeAndDimensions parsed = ParseGeometryType(wkb_geometry_type);
  const GeometryType geometry_type = parsed.first;
  const Dimensions dimensions = parsed.second;

  // Only geometries at the top level contribute to the set of encountered types
  if (record_wkb_type) {
    geospatial_types_.insert(static_cast<int32_t>(wkb_geometry_type));
  }

  switch (geometry_type) {
    case GeometryType::kPoint:
      // A point is a single coordinate with no count prefix
      MergeSequence(src, dimensions, 1, swap);
      break;

    case GeometryType::kLinestring: {
      const uint32_t coord_count = src->ReadUInt32(swap);
      MergeSequence(src, dimensions, coord_count, swap);
      break;
    }

    case GeometryType::kPolygon: {
      // A part count followed by that many count-prefixed coordinate sequences
      for (uint32_t parts_left = src->ReadUInt32(swap); parts_left > 0; --parts_left) {
        const uint32_t coord_count = src->ReadUInt32(swap);
        MergeSequence(src, dimensions, coord_count, swap);
      }
      break;
    }

    // All of the following share one WKB layout (a part count followed by that
    // many complete geometries), even though that layout would permit parts of a
    // different geometry type or with different dimensions. That is harmless for
    // bounding purposes. Parts are merged with record_wkb_type = false so that
    // only the code of the container is recorded (e.g., a MultiPoint contributes
    // the MultiPoint code and not the Point code).
    case GeometryType::kMultiPoint:
    case GeometryType::kMultiLinestring:
    case GeometryType::kMultiPolygon:
    case GeometryType::kGeometryCollection: {
      for (uint32_t parts_left = src->ReadUInt32(swap); parts_left > 0; --parts_left) {
        MergeGeometryInternal(src, /*record_wkb_type=*/false);
      }
      break;
    }
  }
}
217
218
void WKBGeometryBounder::MergeSequence(WKBBuffer* src, Dimensions dimensions,
219
0
                                       uint32_t n_coords, bool swap) {
220
0
  switch (dimensions) {
221
0
    case Dimensions::kXY:
222
0
      src->ReadCoords<BoundingBox::XY>(
223
0
          n_coords, swap, [&](BoundingBox::XY coord) { box_.UpdateXY(coord); });
224
0
      break;
225
0
    case Dimensions::kXYZ:
226
0
      src->ReadCoords<BoundingBox::XYZ>(
227
0
          n_coords, swap, [&](BoundingBox::XYZ coord) { box_.UpdateXYZ(coord); });
228
0
      break;
229
0
    case Dimensions::kXYM:
230
0
      src->ReadCoords<BoundingBox::XYM>(
231
0
          n_coords, swap, [&](BoundingBox::XYM coord) { box_.UpdateXYM(coord); });
232
0
      break;
233
0
    case Dimensions::kXYZM:
234
0
      src->ReadCoords<BoundingBox::XYZM>(
235
0
          n_coords, swap, [&](BoundingBox::XYZM coord) { box_.UpdateXYZM(coord); });
236
0
      break;
237
0
    default:
238
0
      throw ParquetException("Unknown dimensions");
239
0
  }
240
0
}
241
242
}  // namespace parquet::geospatial