Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/pyarrow/__init__.py: 24%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

156 statements  

1# Licensed to the Apache Software Foundation (ASF) under one 

2# or more contributor license agreements. See the NOTICE file 

3# distributed with this work for additional information 

4# regarding copyright ownership. The ASF licenses this file 

5# to you under the Apache License, Version 2.0 (the 

6# "License"); you may not use this file except in compliance 

7# with the License. You may obtain a copy of the License at 

8# 

9# http://www.apache.org/licenses/LICENSE-2.0 

10# 

11# Unless required by applicable law or agreed to in writing, 

12# software distributed under the License is distributed on an 

13# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 

14# KIND, either express or implied. See the License for the 

15# specific language governing permissions and limitations 

16# under the License. 

17 

18# flake8: noqa 

19 

20""" 

21PyArrow is the python implementation of Apache Arrow. 

22 

23Apache Arrow is a cross-language development platform for in-memory data. 

24It specifies a standardized language-independent columnar memory format for 

25flat and hierarchical data, organized for efficient analytic operations on 

26modern hardware. It also provides computational libraries and zero-copy 

27streaming messaging and interprocess communication. 

28 

29For more information see the official page at https://arrow.apache.org 

30""" 

31 

32import gc as _gc 

33import importlib as _importlib 

34import os as _os 

35import platform as _platform 

36import sys as _sys 

37import warnings as _warnings 

38 

try:
    from ._generated_version import version as __version__
except ImportError:
    # No generated version module (package is not installed): derive the
    # version from the git tag at runtime instead.
    try:
        import setuptools_scm
        # Code duplicated from setup.py to avoid a dependency on each other

        def parse_git(root, **kwargs):
            """
            setuptools_scm parse hook restricted to apache-arrow-<digit> tags,
            so that tags of non-C++ subprojects (e.g. apache-arrow-js-XXX)
            are ignored.
            """
            from setuptools_scm.git import parse
            describe_command = (
                "git describe --dirty --tags --long "
                "--match 'apache-arrow-[0-9]*.*'"
            )
            kwargs['describe_command'] = describe_command
            return parse(root, **kwargs)

        __version__ = setuptools_scm.get_version('../', parse=parse_git)
    except ImportError:
        # setuptools_scm is unavailable too: the version stays unknown.
        __version__ = None

60 

# ARROW-8684: Disable GC while initializing Cython extension module,
# to workaround Cython bug in https://github.com/cython/cython/issues/3603
_gc_enabled = _gc.isenabled()
_gc.disable()
try:
    import pyarrow.lib as _lib
finally:
    # Restore the collector even when the import raises: callers that catch
    # the ImportError must not be left with garbage collection switched off
    # for the rest of the process. Only re-enable it if it was on before.
    if _gc_enabled:
        _gc.enable()

68 

69from pyarrow.lib import (BuildInfo, RuntimeInfo, set_timezone_db_path, 

70 MonthDayNano, VersionInfo, cpp_build_info, 

71 cpp_version, cpp_version_info, runtime_info, 

72 cpu_count, set_cpu_count, enable_signal_handlers, 

73 io_thread_count, set_io_thread_count) 

74 

75 

def show_versions():
    """
    Print the Arrow C++ build details of this pyarrow installation (package
    kind, library version, compiler, git revision, build type), to help with
    error reporting.
    """
    def emit(name, text):
        # Left-align the label in a 26-character column.
        print(f"{name: <26}: {text: <8}")

    print("pyarrow version info\n--------------------")

    package_kind = cpp_build_info.package_kind
    if len(package_kind) > 0:
        emit("Package kind", package_kind)
    else:
        # An empty package kind means the build did not record one.
        emit("Package kind", "not indicated")

    emit("Arrow C++ library version", cpp_build_info.version)

    compiler = f"{cpp_build_info.compiler_id} {cpp_build_info.compiler_version}"
    emit("Arrow C++ compiler", compiler)

    emit("Arrow C++ compiler flags", cpp_build_info.compiler_flags)
    emit("Arrow C++ git revision", cpp_build_info.git_id)
    emit("Arrow C++ git description", cpp_build_info.git_description)
    emit("Arrow C++ build type", cpp_build_info.build_type)

94 

95 

def _module_is_available(module):
    """
    Return True if the ``pyarrow.<module>`` submodule can be imported,
    False if importing it raises ImportError.
    """
    qualified_name = f'pyarrow.{module}'
    try:
        _importlib.import_module(qualified_name)
    except ImportError:
        return False
    return True

103 

104 

def _filesystem_is_available(fs):
    """
    Return True if ``pyarrow.fs`` can be imported and exposes an attribute
    named `fs`; return False otherwise.
    """
    try:
        import pyarrow.fs as fs_module
    except ImportError:
        return False

    try:
        getattr(fs_module, fs)
        return True
    except (ImportError, AttributeError):
        # The lookup itself may raise ImportError as well as AttributeError;
        # both mean the filesystem is not usable.
        return False

117 

118 

def show_info():
    """
    Print detailed version and platform information, for error reporting:
    the output of show_versions() followed by platform, memory pool,
    optional module, filesystem and compression codec sections.
    """
    show_versions()

    # NOTE(review): the leading whitespace inside this format string is
    # reproduced as it appears in the extracted listing, which collapses runs
    # of spaces - confirm the intended indent width against the real file.
    def emit(name, text):
        print(f" {name: <20}: {text: <8}")

    def emit_status(name, available):
        # Availability rows share the layout of every other entry.
        emit(name, "Enabled" if available else "-")

    print("\nPlatform:")
    emit("OS / Arch", f"{_platform.system()} {_platform.machine()}")
    emit("SIMD Level", runtime_info().simd_level)
    emit("Detected SIMD Level", runtime_info().detected_simd_level)

    memory_pool = default_memory_pool()
    print("\nMemory:")
    emit("Default backend", memory_pool.backend_name)
    emit("Bytes allocated", f"{memory_pool.bytes_allocated()} bytes")
    emit("Max memory", f"{memory_pool.max_memory()} bytes")
    emit("Supported Backends", ', '.join(supported_memory_backends()))

    print("\nOptional modules:")
    for module_name in ("csv", "cuda", "dataset", "feather", "flight", "fs",
                        "gandiva", "json", "orc", "parquet"):
        emit_status(module_name, _module_is_available(module_name))

    print("\nFilesystems:")
    for fs_name in ("AzureFileSystem", "GcsFileSystem",
                    "HadoopFileSystem", "S3FileSystem"):
        emit_status(fs_name, _filesystem_is_available(fs_name))

    print("\nCompression Codecs:")
    for codec_name in ("brotli", "bz2", "gzip", "lz4_frame", "lz4", "snappy",
                       "zstd"):
        emit_status(codec_name, Codec.is_available(codec_name))

159 

160 

161from pyarrow.lib import (null, bool_, 

162 int8, int16, int32, int64, 

163 uint8, uint16, uint32, uint64, 

164 time32, time64, timestamp, date32, date64, duration, 

165 month_day_nano_interval, 

166 float16, float32, float64, 

167 binary, string, utf8, binary_view, string_view, 

168 large_binary, large_string, large_utf8, 

169 decimal32, decimal64, decimal128, decimal256, 

170 list_, large_list, list_view, large_list_view, 

171 map_, struct, 

172 union, sparse_union, dense_union, 

173 dictionary, 

174 run_end_encoded, 

175 bool8, fixed_shape_tensor, json_, opaque, uuid, 

176 field, 

177 type_for_alias, 

178 DataType, DictionaryType, StructType, 

179 ListType, LargeListType, FixedSizeListType, 

180 ListViewType, LargeListViewType, 

181 MapType, UnionType, SparseUnionType, DenseUnionType, 

182 TimestampType, Time32Type, Time64Type, DurationType, 

183 FixedSizeBinaryType, 

184 Decimal32Type, Decimal64Type, Decimal128Type, Decimal256Type, 

185 BaseExtensionType, ExtensionType, 

186 RunEndEncodedType, Bool8Type, FixedShapeTensorType, 

187 JsonType, OpaqueType, UuidType, 

188 PyExtensionType, UnknownExtensionType, 

189 register_extension_type, unregister_extension_type, 

190 DictionaryMemo, 

191 KeyValueMetadata, 

192 Field, 

193 Schema, 

194 schema, 

195 unify_schemas, 

196 Array, Tensor, 

197 array, chunked_array, record_batch, nulls, repeat, 

198 SparseCOOTensor, SparseCSRMatrix, SparseCSCMatrix, 

199 SparseCSFTensor, 

200 infer_type, from_numpy_dtype, 

201 NullArray, 

202 NumericArray, IntegerArray, FloatingPointArray, 

203 BooleanArray, 

204 Int8Array, UInt8Array, 

205 Int16Array, UInt16Array, 

206 Int32Array, UInt32Array, 

207 Int64Array, UInt64Array, 

208 HalfFloatArray, FloatArray, DoubleArray, 

209 ListArray, LargeListArray, FixedSizeListArray, 

210 ListViewArray, LargeListViewArray, 

211 MapArray, UnionArray, 

212 BinaryArray, StringArray, 

213 LargeBinaryArray, LargeStringArray, 

214 BinaryViewArray, StringViewArray, 

215 FixedSizeBinaryArray, 

216 DictionaryArray, 

217 Date32Array, Date64Array, TimestampArray, 

218 Time32Array, Time64Array, DurationArray, 

219 MonthDayNanoIntervalArray, 

220 Decimal32Array, Decimal64Array, Decimal128Array, Decimal256Array, 

221 StructArray, ExtensionArray, 

222 RunEndEncodedArray, Bool8Array, FixedShapeTensorArray, 

223 JsonArray, OpaqueArray, UuidArray, 

224 scalar, NA, _NULL as NULL, Scalar, 

225 NullScalar, BooleanScalar, 

226 Int8Scalar, Int16Scalar, Int32Scalar, Int64Scalar, 

227 UInt8Scalar, UInt16Scalar, UInt32Scalar, UInt64Scalar, 

228 HalfFloatScalar, FloatScalar, DoubleScalar, 

229 Decimal32Scalar, Decimal64Scalar, Decimal128Scalar, Decimal256Scalar, 

230 ListScalar, LargeListScalar, FixedSizeListScalar, 

231 ListViewScalar, LargeListViewScalar, 

232 Date32Scalar, Date64Scalar, 

233 Time32Scalar, Time64Scalar, 

234 TimestampScalar, DurationScalar, 

235 MonthDayNanoIntervalScalar, 

236 BinaryScalar, LargeBinaryScalar, BinaryViewScalar, 

237 StringScalar, LargeStringScalar, StringViewScalar, 

238 FixedSizeBinaryScalar, DictionaryScalar, 

239 MapScalar, StructScalar, UnionScalar, 

240 RunEndEncodedScalar, Bool8Scalar, ExtensionScalar, 

241 FixedShapeTensorScalar, JsonScalar, OpaqueScalar, UuidScalar) 

242 

243# Buffers, allocation 

244from pyarrow.lib import (DeviceAllocationType, Device, MemoryManager, 

245 default_cpu_memory_manager) 

246 

247from pyarrow.lib import (Buffer, ResizableBuffer, foreign_buffer, py_buffer, 

248 Codec, compress, decompress, allocate_buffer) 

249 

250from pyarrow.lib import (MemoryPool, LoggingMemoryPool, ProxyMemoryPool, 

251 total_allocated_bytes, set_memory_pool, 

252 default_memory_pool, system_memory_pool, 

253 jemalloc_memory_pool, mimalloc_memory_pool, 

254 logging_memory_pool, proxy_memory_pool, 

255 log_memory_allocations, jemalloc_set_decay_ms, 

256 supported_memory_backends) 

257 

258# I/O 

259from pyarrow.lib import (NativeFile, PythonFile, 

260 BufferedInputStream, BufferedOutputStream, CacheOptions, 

261 CompressedInputStream, CompressedOutputStream, 

262 TransformInputStream, transcoding_input_stream, 

263 FixedSizeBufferWriter, 

264 BufferReader, BufferOutputStream, 

265 OSFile, MemoryMappedFile, memory_map, 

266 create_memory_map, MockOutputStream, 

267 input_stream, output_stream, 

268 have_libhdfs) 

269 

270from pyarrow.lib import (ChunkedArray, RecordBatch, Table, table, 

271 concat_arrays, concat_tables, TableGroupBy, 

272 RecordBatchReader, concat_batches) 

273 

274# Exceptions 

275from pyarrow.lib import (ArrowCancelled, 

276 ArrowCapacityError, 

277 ArrowException, 

278 ArrowKeyError, 

279 ArrowIndexError, 

280 ArrowInvalid, 

281 ArrowIOError, 

282 ArrowMemoryError, 

283 ArrowNotImplementedError, 

284 ArrowTypeError, 

285 ArrowSerializationError) 

286 

287from pyarrow.ipc import serialize_pandas, deserialize_pandas 

288import pyarrow.ipc as ipc 

289 

290import pyarrow.types as types 

291 

292 

293# ---------------------------------------------------------------------- 

294# Deprecations 

295 

296from pyarrow.util import _deprecate_api, _deprecate_class 

297 

298 

299# TODO: Deprecate these somehow in the pyarrow namespace 

300from pyarrow.ipc import (Message, MessageReader, MetadataVersion, 

301 RecordBatchFileReader, RecordBatchFileWriter, 

302 RecordBatchStreamReader, RecordBatchStreamWriter) 

303 

304# ---------------------------------------------------------------------- 

305# Returning absolute path to the pyarrow include directory (if bundled, e.g. in 

306# wheels) 

307 

308 

def get_include():
    """
    Return the path of the ``include`` directory located next to this
    module, i.e. the directory containing the Arrow C++ include headers.
    Similar to numpy.get_include
    """
    package_dir = _os.path.dirname(__file__)
    include_dir = _os.path.join(package_dir, 'include')
    return include_dir

315 

316 

def _get_pkg_config_executable():
    """
    Return the pkg-config executable to invoke.

    The PKG_CONFIG environment variable takes precedence; when it is unset
    or empty, the default ``pkg-config`` is returned.
    """
    # `or` instead of a .get() default: an empty PKG_CONFIG value is not a
    # usable executable name, so it is treated the same as an unset variable.
    return _os.environ.get('PKG_CONFIG') or 'pkg-config'

319 

320 

def _has_pkg_config(pkgname):
    """
    Return True if ``pkg-config --exists <pkgname>`` exits with status 0.

    Returns False when the exit status is non-zero or when the pkg-config
    executable cannot be found.
    """
    import subprocess
    try:
        command = [_get_pkg_config_executable(), '--exists', pkgname]
        exit_status = subprocess.call(command)
    except FileNotFoundError:
        return False
    return exit_status == 0

328 

329 

def _read_pkg_config_variable(pkgname, cli_args):
    """
    Run pkg-config for `pkgname` with the extra arguments `cli_args` (a list)
    and return its standard output, decoded as UTF-8 with trailing
    whitespace removed.

    Raises RuntimeError, carrying pkg-config's standard error, when the
    command exits with a non-zero status.
    """
    import subprocess
    command = [_get_pkg_config_executable(), pkgname] + cli_args
    completed = subprocess.run(command, stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    if completed.returncode == 0:
        return completed.stdout.rstrip().decode('utf8')
    raise RuntimeError("pkg-config failed: " + completed.stderr.decode('utf8'))

339 

340 

def get_libraries():
    """
    Return the library names to pass as the `libraries` argument when
    building C or Cython extensions that use pyarrow. A new list is built
    on every call.
    """
    library_names = ('arrow_python', 'arrow')
    return list(library_names)

347 

348 

def create_library_symlinks():
    """
    With Linux and macOS wheels, the bundled shared libraries have an embedded
    ABI version like libarrow.so.17 or libarrow.17.dylib and so linking to them
    with -larrow won't work unless we create symlinks at locations like
    site-packages/pyarrow/libarrow.so. This unfortunate workaround addresses
    prior problems we had with shipping two copies of the shared libraries to
    permit third party projects like turbodbc to build their C++ extensions
    against the pyarrow wheels.

    This function must only be invoked once and only when the shared libraries
    are bundled with the Python package, which should only apply to wheel-based
    installs. It requires write access to the site-packages/pyarrow directory
    and so depending on your system may need to be run with root.

    Does nothing on Windows. Symlinks that already exist are left untouched;
    a PermissionError while creating one is reported on stdout instead of
    being raised.
    """
    import glob
    if _sys.platform == 'win32':
        return
    package_cwd = _os.path.dirname(__file__)

    if _sys.platform == 'linux':
        bundled_libs = glob.glob(_os.path.join(package_cwd, '*.so.*'))

        def get_symlink_path(hard_path):
            # libarrow.so.17 -> libarrow.so
            return hard_path.rsplit('.', 1)[0]
    else:
        bundled_libs = glob.glob(_os.path.join(package_cwd, '*.*.dylib'))

        def get_symlink_path(hard_path):
            # libarrow.17.dylib -> libarrow.dylib
            return '.'.join((hard_path.rsplit('.', 2)[0], 'dylib'))

    for lib_hard_path in bundled_libs:
        symlink_path = get_symlink_path(lib_hard_path)
        if _os.path.exists(symlink_path):
            continue
        try:
            _os.symlink(lib_hard_path, symlink_path)
        except PermissionError:
            # Fill in the placeholder so the message names the symlink that
            # could not be created.
            print("Tried creating symlink {}. If you need to link to "
                  "bundled shared libraries, run "
                  "pyarrow.create_library_symlinks() as root"
                  .format(symlink_path))

390 

391 

def get_library_dirs():
    """
    Return lists of directories likely to contain Arrow C++ libraries for
    linking C or Cython extensions using pyarrow

    The directory of this package always comes first. Directories reported
    by pkg-config, a ``Library/lib`` directory next to the Python executable
    on Windows (only if it contains ``arrow.lib``) and ``$ARROW_HOME/lib``
    are appended when applicable; duplicates are skipped.

    Raises ValueError if pkg-config returns a non-empty value that does not
    start with ``-L``.
    """
    package_cwd = _os.path.dirname(__file__)
    library_dirs = [package_cwd]

    def append_library_dir(library_dir):
        # Keep insertion order while skipping directories already listed.
        if library_dir not in library_dirs:
            library_dirs.append(library_dir)

    # Search library paths via pkg-config. This is necessary if the user
    # installed libarrow and the other shared libraries manually and they
    # are not shipped inside the pyarrow package (see also ARROW-2976).
    for pkgname in ["arrow", "arrow_python"]:
        if _has_pkg_config(pkgname):
            library_dir = _read_pkg_config_variable(pkgname,
                                                    ["--libs-only-L"])
            # pkg-config output could be empty if Arrow is installed
            # as a system package.
            if library_dir:
                if not library_dir.startswith("-L"):
                    raise ValueError(
                        "pkg-config --libs-only-L returned unexpected "
                        "value {!r}".format(library_dir))
                # Strip the leading "-L" to keep only the directory.
                append_library_dir(library_dir[2:])

    if _sys.platform == 'win32':
        # TODO(wesm): Is this necessary, or does setuptools within a conda
        # installation add Library\lib to the linker path for MSVC?
        python_base_install = _os.path.dirname(_sys.executable)
        library_dir = _os.path.join(python_base_install, 'Library', 'lib')

        if _os.path.exists(_os.path.join(library_dir, 'arrow.lib')):
            append_library_dir(library_dir)

    # ARROW-4074: Allow for ARROW_HOME to be set to some other directory
    if _os.environ.get('ARROW_HOME'):
        append_library_dir(_os.path.join(_os.environ['ARROW_HOME'], 'lib'))
    else:
        # Python wheels bundle the Arrow libraries in the pyarrow directory.
        append_library_dir(_os.path.dirname(_os.path.abspath(__file__)))

    return library_dirs