/src/arrow/cpp/src/parquet/geospatial/util_internal.cc
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include "parquet/geospatial/util_internal.h" |
19 | | |
20 | | #include <sstream> |
21 | | |
22 | | #include "arrow/util/endian.h" |
23 | | #include "arrow/util/macros.h" |
24 | | #include "arrow/util/ubsan.h" |
25 | | #include "parquet/exception.h" |
26 | | |
27 | | namespace parquet::geospatial { |
28 | | |
29 | 0 | std::string BoundingBox::ToString() const { |
30 | 0 | std::stringstream ss; |
31 | 0 | ss << "BoundingBox" << std::endl; |
32 | 0 | ss << " x: [" << min[0] << ", " << max[0] << "]" << std::endl; |
33 | 0 | ss << " y: [" << min[1] << ", " << max[1] << "]" << std::endl; |
34 | 0 | ss << " z: [" << min[2] << ", " << max[2] << "]" << std::endl; |
35 | 0 | ss << " m: [" << min[3] << ", " << max[3] << "]" << std::endl; |
36 | |
|
37 | 0 | return ss.str(); |
38 | 0 | } |
39 | | |
40 | | /// \brief Object to keep track of the low-level consumption of a well-known binary |
41 | | /// geometry |
42 | | /// |
43 | | /// Briefly, ISO well-known binary supported by the Parquet spec is an endian byte |
44 | | /// (0x01 or 0x00), followed by geometry type + dimensions encoded as a (uint32_t), |
45 | | /// followed by geometry-specific data. Coordinate sequences are represented by a |
46 | | /// uint32_t (the number of coordinates) plus a sequence of doubles (number of coordinates |
47 | | /// multiplied by the number of dimensions). |
48 | | class WKBBuffer { |
49 | | public: |
50 | 0 | WKBBuffer() : data_(nullptr), size_(0) {} |
51 | 0 | WKBBuffer(const uint8_t* data, int64_t size) : data_(data), size_(size) {} |
52 | | |
53 | 0 | uint8_t ReadUInt8() { return ReadChecked<uint8_t>(); } |
54 | | |
55 | 0 | uint32_t ReadUInt32(bool swap) { |
56 | 0 | auto value = ReadChecked<uint32_t>(); |
57 | 0 | if (swap) { |
58 | 0 | return ::arrow::bit_util::ByteSwap(value); |
59 | 0 | } else { |
60 | 0 | return value; |
61 | 0 | } |
62 | 0 | } |
63 | | |
64 | | template <typename Coord, typename Visit> |
65 | 0 | void ReadCoords(uint32_t n_coords, bool swap, Visit&& visit) { |
66 | 0 | size_t total_bytes = n_coords * sizeof(Coord); |
67 | 0 | if (size_ < total_bytes) { |
68 | 0 | throw ParquetException("Can't read coordinate sequence of ", total_bytes, |
69 | 0 | " bytes from WKBBuffer with ", size_, " remaining"); |
70 | 0 | } |
71 | | |
72 | 0 | if (swap) { |
73 | 0 | Coord coord; |
74 | 0 | for (uint32_t i = 0; i < n_coords; i++) { |
75 | 0 | coord = ReadUnchecked<Coord>(); |
76 | 0 | for (auto& c : coord) { |
77 | 0 | c = ::arrow::bit_util::ByteSwap(c); |
78 | 0 | } |
79 | |
|
80 | 0 | visit(coord); |
81 | 0 | } |
82 | 0 | } else { |
83 | 0 | for (uint32_t i = 0; i < n_coords; i++) { |
84 | 0 | visit(ReadUnchecked<Coord>()); |
85 | 0 | } |
86 | 0 | } |
87 | 0 | } Unexecuted instantiation: util_internal.cc:void parquet::geospatial::WKBBuffer::ReadCoords<std::__1::array<double, 2ul>, parquet::geospatial::WKBGeometryBounder::MergeSequence(parquet::geospatial::WKBBuffer*, parquet::geospatial::Dimensions, unsigned int, bool)::$_0>(unsigned int, bool, parquet::geospatial::WKBGeometryBounder::MergeSequence(parquet::geospatial::WKBBuffer*, parquet::geospatial::Dimensions, unsigned int, bool)::$_0&&) Unexecuted instantiation: util_internal.cc:void parquet::geospatial::WKBBuffer::ReadCoords<std::__1::array<double, 3ul>, parquet::geospatial::WKBGeometryBounder::MergeSequence(parquet::geospatial::WKBBuffer*, parquet::geospatial::Dimensions, unsigned int, bool)::$_1>(unsigned int, bool, parquet::geospatial::WKBGeometryBounder::MergeSequence(parquet::geospatial::WKBBuffer*, parquet::geospatial::Dimensions, unsigned int, bool)::$_1&&) Unexecuted instantiation: util_internal.cc:void parquet::geospatial::WKBBuffer::ReadCoords<std::__1::array<double, 3ul>, parquet::geospatial::WKBGeometryBounder::MergeSequence(parquet::geospatial::WKBBuffer*, parquet::geospatial::Dimensions, unsigned int, bool)::$_2>(unsigned int, bool, parquet::geospatial::WKBGeometryBounder::MergeSequence(parquet::geospatial::WKBBuffer*, parquet::geospatial::Dimensions, unsigned int, bool)::$_2&&) Unexecuted instantiation: util_internal.cc:void parquet::geospatial::WKBBuffer::ReadCoords<std::__1::array<double, 4ul>, parquet::geospatial::WKBGeometryBounder::MergeSequence(parquet::geospatial::WKBBuffer*, parquet::geospatial::Dimensions, unsigned int, bool)::$_3>(unsigned int, bool, parquet::geospatial::WKBGeometryBounder::MergeSequence(parquet::geospatial::WKBBuffer*, parquet::geospatial::Dimensions, unsigned int, bool)::$_3&&) |
88 | | |
89 | 0 | size_t size() { return size_; } |
90 | | |
91 | | private: |
92 | | const uint8_t* data_; |
93 | | size_t size_; |
94 | | |
95 | | template <typename T> |
96 | 0 | T ReadChecked() { |
97 | 0 | if (ARROW_PREDICT_FALSE(size_ < sizeof(T))) { |
98 | 0 | throw ParquetException("Can't read ", sizeof(T), " bytes from WKBBuffer with ", |
99 | 0 | size_, " remaining"); |
100 | 0 | } |
101 | | |
102 | 0 | return ReadUnchecked<T>(); |
103 | 0 | } Unexecuted instantiation: unsigned char parquet::geospatial::WKBBuffer::ReadChecked<unsigned char>() Unexecuted instantiation: unsigned int parquet::geospatial::WKBBuffer::ReadChecked<unsigned int>() |
104 | | |
105 | | template <typename T> |
106 | 0 | T ReadUnchecked() { |
107 | 0 | T out = ::arrow::util::SafeLoadAs<T>(data_); |
108 | 0 | data_ += sizeof(T); |
109 | 0 | size_ -= sizeof(T); |
110 | 0 | return out; |
111 | 0 | } Unexecuted instantiation: unsigned char parquet::geospatial::WKBBuffer::ReadUnchecked<unsigned char>() Unexecuted instantiation: unsigned int parquet::geospatial::WKBBuffer::ReadUnchecked<unsigned int>() Unexecuted instantiation: std::__1::array<double, 2ul> parquet::geospatial::WKBBuffer::ReadUnchecked<std::__1::array<double, 2ul> >() Unexecuted instantiation: std::__1::array<double, 3ul> parquet::geospatial::WKBBuffer::ReadUnchecked<std::__1::array<double, 3ul> >() Unexecuted instantiation: std::__1::array<double, 4ul> parquet::geospatial::WKBBuffer::ReadUnchecked<std::__1::array<double, 4ul> >() |
112 | | }; |
113 | | |
114 | | using GeometryTypeAndDimensions = std::pair<GeometryType, Dimensions>; |
115 | | |
116 | | namespace { |
117 | | |
118 | 0 | GeometryTypeAndDimensions ParseGeometryType(uint32_t wkb_geometry_type) { |
119 | | // The number 1000 can be used because WKB geometry types are constructed |
120 | | // on purpose such that this relationship is true (e.g., LINESTRING ZM maps |
121 | | // to 3002). |
122 | 0 | uint32_t geometry_type_component = wkb_geometry_type % 1000; |
123 | 0 | uint32_t dimensions_component = wkb_geometry_type / 1000; |
124 | |
|
125 | 0 | auto min_geometry_type_value = static_cast<uint32_t>(GeometryType::kValueMin); |
126 | 0 | auto max_geometry_type_value = static_cast<uint32_t>(GeometryType::kValueMax); |
127 | 0 | auto min_dimension_value = static_cast<uint32_t>(Dimensions::kValueMin); |
128 | 0 | auto max_dimension_value = static_cast<uint32_t>(Dimensions::kValueMax); |
129 | |
|
130 | 0 | if (geometry_type_component < min_geometry_type_value || |
131 | 0 | geometry_type_component > max_geometry_type_value || |
132 | 0 | dimensions_component < min_dimension_value || |
133 | 0 | dimensions_component > max_dimension_value) { |
134 | 0 | throw ParquetException("Invalid WKB geometry type: ", wkb_geometry_type); |
135 | 0 | } |
136 | | |
137 | 0 | return {static_cast<GeometryType>(geometry_type_component), |
138 | 0 | static_cast<Dimensions>(dimensions_component)}; |
139 | 0 | } |
140 | | |
141 | | } // namespace |
142 | | |
143 | 0 | std::vector<int32_t> WKBGeometryBounder::GeometryTypes() const { |
144 | 0 | std::vector<int32_t> out(geospatial_types_.begin(), geospatial_types_.end()); |
145 | 0 | std::sort(out.begin(), out.end()); |
146 | 0 | return out; |
147 | 0 | } |
148 | | |
149 | 0 | void WKBGeometryBounder::MergeGeometry(std::string_view bytes_wkb) { |
150 | 0 | MergeGeometry(::arrow::util::span(reinterpret_cast<const uint8_t*>(bytes_wkb.data()), |
151 | 0 | bytes_wkb.size())); |
152 | 0 | } |
153 | | |
154 | 0 | void WKBGeometryBounder::MergeGeometry(::arrow::util::span<const uint8_t> bytes_wkb) { |
155 | 0 | WKBBuffer src{bytes_wkb.data(), static_cast<int64_t>(bytes_wkb.size())}; |
156 | 0 | MergeGeometryInternal(&src, /*record_wkb_type=*/true); |
157 | 0 | if (src.size() != 0) { |
158 | 0 | throw ParquetException("Exepcted zero bytes after consuming WKB but got ", |
159 | 0 | src.size()); |
160 | 0 | } |
161 | 0 | } |
162 | | |
163 | 0 | void WKBGeometryBounder::MergeGeometryInternal(WKBBuffer* src, bool record_wkb_type) { |
164 | 0 | uint8_t endian = src->ReadUInt8(); |
165 | 0 | #if ARROW_LITTLE_ENDIAN |
166 | 0 | bool swap = endian != 0x01; |
167 | | #else |
168 | | bool swap = endian != 0x00; |
169 | | #endif |
170 | |
|
171 | 0 | uint32_t wkb_geometry_type = src->ReadUInt32(swap); |
172 | 0 | auto geometry_type_and_dimensions = ParseGeometryType(wkb_geometry_type); |
173 | 0 | auto [geometry_type, dimensions] = geometry_type_and_dimensions; |
174 | | |
175 | | // Keep track of geometry types encountered if at the top level |
176 | 0 | if (record_wkb_type) { |
177 | 0 | geospatial_types_.insert(static_cast<int32_t>(wkb_geometry_type)); |
178 | 0 | } |
179 | |
|
180 | 0 | switch (geometry_type) { |
181 | 0 | case GeometryType::kPoint: |
182 | 0 | MergeSequence(src, dimensions, 1, swap); |
183 | 0 | break; |
184 | | |
185 | 0 | case GeometryType::kLinestring: { |
186 | 0 | uint32_t n_coords = src->ReadUInt32(swap); |
187 | 0 | MergeSequence(src, dimensions, n_coords, swap); |
188 | 0 | break; |
189 | 0 | } |
190 | 0 | case GeometryType::kPolygon: { |
191 | 0 | uint32_t n_parts = src->ReadUInt32(swap); |
192 | 0 | for (uint32_t i = 0; i < n_parts; i++) { |
193 | 0 | uint32_t n_coords = src->ReadUInt32(swap); |
194 | 0 | MergeSequence(src, dimensions, n_coords, swap); |
195 | 0 | } |
196 | 0 | break; |
197 | 0 | } |
198 | | |
199 | | // These are all encoded the same in WKB, even though this encoding would |
200 | | // allow for parts to be of a different geometry type or different dimensions. |
201 | | // For the purposes of bounding, this does not cause us problems. We pass |
202 | | // record_wkb_type = false because we do not want the child geometry to be |
203 | | // added to the geometry_types list (e.g., for a MultiPoint, we only want |
204 | | // the code for MultiPoint to be added, not the code for Point). |
205 | 0 | case GeometryType::kMultiPoint: |
206 | 0 | case GeometryType::kMultiLinestring: |
207 | 0 | case GeometryType::kMultiPolygon: |
208 | 0 | case GeometryType::kGeometryCollection: { |
209 | 0 | uint32_t n_parts = src->ReadUInt32(swap); |
210 | 0 | for (uint32_t i = 0; i < n_parts; i++) { |
211 | 0 | MergeGeometryInternal(src, /*record_wkb_type*/ false); |
212 | 0 | } |
213 | 0 | break; |
214 | 0 | } |
215 | 0 | } |
216 | 0 | } |
217 | | |
218 | | void WKBGeometryBounder::MergeSequence(WKBBuffer* src, Dimensions dimensions, |
219 | 0 | uint32_t n_coords, bool swap) { |
220 | 0 | switch (dimensions) { |
221 | 0 | case Dimensions::kXY: |
222 | 0 | src->ReadCoords<BoundingBox::XY>( |
223 | 0 | n_coords, swap, [&](BoundingBox::XY coord) { box_.UpdateXY(coord); }); |
224 | 0 | break; |
225 | 0 | case Dimensions::kXYZ: |
226 | 0 | src->ReadCoords<BoundingBox::XYZ>( |
227 | 0 | n_coords, swap, [&](BoundingBox::XYZ coord) { box_.UpdateXYZ(coord); }); |
228 | 0 | break; |
229 | 0 | case Dimensions::kXYM: |
230 | 0 | src->ReadCoords<BoundingBox::XYM>( |
231 | 0 | n_coords, swap, [&](BoundingBox::XYM coord) { box_.UpdateXYM(coord); }); |
232 | 0 | break; |
233 | 0 | case Dimensions::kXYZM: |
234 | 0 | src->ReadCoords<BoundingBox::XYZM>( |
235 | 0 | n_coords, swap, [&](BoundingBox::XYZM coord) { box_.UpdateXYZM(coord); }); |
236 | 0 | break; |
237 | 0 | default: |
238 | 0 | throw ParquetException("Unknown dimensions"); |
239 | 0 | } |
240 | 0 | } |
241 | | |
242 | | } // namespace parquet::geospatial |