from __future__ import annotations

import itertools
import logging
import random
import sys
from array import array

from packaging.version import Version

from dask._compatibility import importlib_metadata
from dask.utils import Dispatch

sizeof = Dispatch(name="sizeof")

logger = logging.getLogger(__name__)
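
# `sizeof` dispatches on the type of its argument: each `@sizeof.register(cls)`
# handler below is selected by walking the argument's MRO. An illustrative
# sketch (the second byte count is CPython- and platform-dependent):
#
#     >>> sizeof(b"abc")     # handled by sizeof_bytes below
#     3
#     >>> sizeof(object())   # no specific handler -> sizeof_default
#     16                     # doctest: +SKIP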


@sizeof.register(object)
def sizeof_default(o):
    return sys.getsizeof(o)


@sizeof.register(bytes)
@sizeof.register(bytearray)
def sizeof_bytes(o):
    return len(o)


@sizeof.register(memoryview)
def sizeof_memoryview(o):
    return o.nbytes


@sizeof.register(array)
def sizeof_array(o):
    return o.itemsize * len(o)
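
# For example (a sketch; `array` stores raw C values, so the result is exact
# apart from the small object header this handler deliberately ignores):
#
#     >>> sizeof(array("d", range(10)))   # 10 doubles * 8 bytes each
#     80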


@sizeof.register(list)
@sizeof.register(tuple)
@sizeof.register(set)
@sizeof.register(frozenset)
def sizeof_python_collection(seq):
    num_items = len(seq)
    num_samples = 10
    if num_items > num_samples:
        if isinstance(seq, (set, frozenset)):
            # As of Python v3.9, it is deprecated to call random.sample() on
            # sets, but since sets are unordered anyway we can simply pick
            # the first `num_samples` items.
            samples = itertools.islice(seq, num_samples)
        else:
            samples = random.sample(seq, num_samples)
        return sys.getsizeof(seq) + int(
            num_items / num_samples * sum(map(sizeof, samples))
        )
    else:
        return sys.getsizeof(seq) + sum(map(sizeof, seq))
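
# The sampling above trades accuracy for speed: a million-element list is
# sized by inspecting 10 random elements and extrapolating. A sketch (the
# container overhead, and hence the total, is CPython-specific):
#
#     >>> data = [b"x" * 100] * 1_000_000
#     >>> sizeof(data)   # list header + 1_000_000 * sizeof(b"x" * 100)
#     108000056          # doctest: +SKIP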


class SimpleSizeof:
    """Sentinel class to mark a class to be skipped by the dispatcher. This only
    works if this sentinel mixin is first in the MRO.

    Examples
    --------

    >>> def _get_gc_overhead():
    ...     class _CustomObject:
    ...         def __sizeof__(self):
    ...             return 0
    ...
    ...     return sys.getsizeof(_CustomObject())

    >>> class TheAnswer(SimpleSizeof):
    ...     def __sizeof__(self):
    ...         # `sys.getsizeof` always adds the GC overhead of an object
    ...         return 42 - _get_gc_overhead()

    >>> sizeof(TheAnswer())
    42

    """


@sizeof.register(SimpleSizeof)
def sizeof_blocked(d):
    return sys.getsizeof(d)


@sizeof.register(dict)
def sizeof_python_dict(d):
    return (
        sys.getsizeof(d)
        + sizeof(list(d.keys()))
        + sizeof(list(d.values()))
        - 2 * sizeof(list())
    )
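
# The dict handler sizes keys and values through the list handler above, then
# subtracts the overhead of the two temporary lists it built. A rough sketch
# (header sizes vary with the CPython version):
#
#     >>> sizeof({"a": b"x" * 1000})   # dict header + key + 1000-byte value
#     1250                             # doctest: +SKIP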


@sizeof.register_lazy("cupy")
def register_cupy():
    import cupy

    @sizeof.register(cupy.ndarray)
    def sizeof_cupy_ndarray(x):
        return int(x.nbytes)
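
# `register_lazy("cupy")` defers the body above until the `cupy` module is
# first imported, so optional dependencies cost nothing when absent. A minimal
# sketch of the same pattern for a hypothetical third-party type (package and
# attribute names below are illustrative only):
#
#     @sizeof.register_lazy("somepackage")
#     def register_somepackage():
#         import somepackage
#
#         @sizeof.register(somepackage.SomeType)
#         def sizeof_sometype(obj):
#             return int(obj.nbytes)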


@sizeof.register_lazy("numba")
def register_numba():
    import numba.cuda

    @sizeof.register(numba.cuda.cudadrv.devicearray.DeviceNDArray)
    def sizeof_numba_devicendarray(x):
        return int(x.nbytes)


@sizeof.register_lazy("rmm")
def register_rmm():
    import rmm

    # `DeviceBuffer` is only included in rmm 0.11.0+
    if hasattr(rmm, "DeviceBuffer"):

        @sizeof.register(rmm.DeviceBuffer)
        def sizeof_rmm_devicebuffer(x):
            return int(x.nbytes)


@sizeof.register_lazy("numpy")
def register_numpy():
    import numpy as np

    @sizeof.register(np.ndarray)
    def sizeof_numpy_ndarray(x):
        if 0 in x.strides:
            # Broadcast arrays repeat the same memory along zero-stride axes;
            # collapse those axes so the backing buffer is counted only once.
            xs = x[tuple(slice(None) if s != 0 else slice(1) for s in x.strides)]
            return xs.nbytes
        return int(x.nbytes)
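
# For example, a broadcast view reports inflated `nbytes`, which the handler
# above corrects (a sketch; this follows directly from numpy stride semantics):
#
#     >>> import numpy as np
#     >>> x = np.broadcast_to(np.arange(3, dtype="i8"), (1_000_000, 3))
#     >>> x.nbytes     # 24 MB claimed...
#     24000000
#     >>> sizeof(x)    # ...but only one 3-element row is actually stored
#     24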


@sizeof.register_lazy("pandas")
def register_pandas():
    import numpy as np
    import pandas as pd

    OBJECT_DTYPES = (object, pd.StringDtype("python"))

    def object_size(*xs):
        if not xs:
            return 0
        ncells = sum(len(x) for x in xs)
        if not ncells:
            return 0

        # Deduplicate Series of references to the same objects,
        # e.g. as produced by read_parquet
        unique_samples = {}
        for x in xs:
            sample = np.random.choice(x, size=100, replace=True)
            for i in sample.tolist():
                unique_samples[id(i)] = i

        nsamples = 100 * len(xs)
        sample_nbytes = sum(sizeof(i) for i in unique_samples.values())
        if len(unique_samples) / nsamples > 0.5:
            # Less than half of the references are duplicated.
            # Assume that, if we were to analyze twice the amount of random
            # references, we would get twice the amount of unique objects too.
            return int(sample_nbytes * ncells / nsamples)
        else:
            # Assume we've already found all unique objects and that all
            # references that we have not yet analyzed are going to point to
            # the same data.
            return sample_nbytes
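
    # E.g. a column in which every cell references one shared string samples
    # down to a single id() and is counted once, while a column of mostly
    # distinct strings is extrapolated from the 100 sampled cells per input.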

    @sizeof.register(pd.DataFrame)
    def sizeof_pandas_dataframe(df):
        p = sizeof(df.index) + sizeof(df.columns)
        object_cols = []
        prev_dtype = None

        # Unlike df.items(), df._series will not duplicate multiple views of
        # the same column, e.g. df[["x", "x", "x"]]
        for col in df._series.values():
            if prev_dtype is None or col.dtype != prev_dtype:
                prev_dtype = col.dtype
                # Contiguous columns of the same dtype share the same overhead
                p += 1200
            p += col.memory_usage(index=False, deep=False)
            if col.dtype in OBJECT_DTYPES:
                object_cols.append(col._values)

        # Deduplicate references to the same objects appearing in different
        # Series
        p += object_size(*object_cols)

        return max(1200, p)

    @sizeof.register(pd.Series)
    def sizeof_pandas_series(s):
        # https://github.com/dask/dask/pull/9776#issuecomment-1359085962
        p = 1200 + sizeof(s.index) + s.memory_usage(index=False, deep=False)
        if s.dtype in OBJECT_DTYPES:
            p += object_size(s._values)
        return p

    @sizeof.register(pd.Index)
    def sizeof_pandas_index(i):
        p = 400 + i.memory_usage(deep=False)
        if i.dtype in OBJECT_DTYPES:
            p += object_size(i)
        return p

    @sizeof.register(pd.MultiIndex)
    def sizeof_pandas_multiindex(i):
        return sum(sizeof(l) for l in i.levels) + sum(c.nbytes for c in i.codes)
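
# Illustrative sketch of the pandas handlers (the 1200/400-byte constants are
# empirical per-object overheads; exact totals vary by pandas version):
#
#     >>> import pandas as pd
#     >>> s = pd.Series(range(1000), dtype="i8")
#     >>> sizeof(s)   # ~1200 overhead + RangeIndex + 8000 bytes of data
#     9732            # doctest: +SKIP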


@sizeof.register_lazy("scipy")
def register_spmatrix():
    import scipy
    from scipy import sparse

    if Version(scipy.__version__) < Version("1.12.0.dev0"):

        @sizeof.register(sparse.dok_matrix)
        def sizeof_spmatrix_dok(s):
            return s.__sizeof__()

    @sizeof.register(sparse.spmatrix)
    def sizeof_spmatrix(s):
        return sum(sizeof(v) for v in s.__dict__.values())


@sizeof.register_lazy("pyarrow")
def register_pyarrow():
    import pyarrow as pa

    def _get_col_size(data):
        p = 0
        if not isinstance(data, pa.ChunkedArray):
            data = data.data  # pyarrow <0.15.0
        for chunk in data.iterchunks():
            for buffer in chunk.buffers():
                if buffer:
                    p += buffer.size
        return p

    @sizeof.register(pa.Table)
    def sizeof_pyarrow_table(table):
        p = sizeof(table.schema.metadata)
        for col in table.itercolumns():
            p += _get_col_size(col)
        return int(p) + 1000

    @sizeof.register(pa.ChunkedArray)
    def sizeof_pyarrow_chunked_array(data):
        return int(_get_col_size(data)) + 1000
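
# The pyarrow handlers sum the underlying Arrow buffers (validity bitmaps,
# offsets, data) plus a flat 1000-byte overhead. A sketch:
#
#     >>> import pyarrow as pa
#     >>> arr = pa.chunked_array([pa.array(range(1000), type=pa.int64())])
#     >>> sizeof(arr)   # 8000 bytes of int64 data + 1000 overhead
#     9000              # doctest: +SKIP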


@sizeof.register_lazy("xarray")
def register_xarray():
    import sys

    import xarray as xr

    XARRAY_VERSION = Version(xr.__version__)
    XARRAY_GE_2024_02 = XARRAY_VERSION >= Version("2024.02.0")

    @sizeof.register(xr.core.utils.Frozen)
    def xarray_sizeof_frozen(obj):
        return sys.getsizeof(obj) + sizeof(obj.mapping)

    @sizeof.register(xr.DataArray)
    @sizeof.register(xr.Variable)
    def xarray_sizeof_da(obj):
        return sys.getsizeof(obj) + sizeof(obj.data)

    @sizeof.register(xr.Dataset)
    def xarray_sizeof_ds(obj):
        return sys.getsizeof(obj) + sizeof(obj.variables)

    if XARRAY_GE_2024_02:
        # xr.NamedArray was added in 2024.02; reuse the DataArray handler
        xarray_sizeof_da = sizeof.register(xr.NamedArray)(xarray_sizeof_da)

    @sizeof.register(xr.core.indexes.Indexes)
    def xarray_sizeof_indexes(obj):
        return (
            sys.getsizeof(obj)
            + sizeof(obj._index_type)
            + sizeof(obj._indexes)
            + sizeof(obj._variables)
            + sizeof(obj._dims)
        )

    @sizeof.register(xr.core.indexes.PandasIndex)
    def xarray_sizeof_pd_index(obj):
        return (
            sys.getsizeof(obj)
            + sizeof(obj.index)
            + sizeof(obj.dim)
            + sizeof(obj.coord_dtype)
        )


def _register_entry_point_plugins():
    """Register sizeof implementations exposed by the entry_point mechanism."""
    for entry_point in importlib_metadata.entry_points(group="dask.sizeof"):
        registrar = entry_point.load()
        try:
            registrar(sizeof)
        except Exception:
            logger.exception(
                f"Failed to register sizeof entry point {entry_point.name}"
            )


_register_entry_point_plugins()
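
# Third-party packages can hook into the "dask.sizeof" entry-point group to
# have their registrar called with the `sizeof` dispatcher at import time.
# A hypothetical plugin sketch (package and function names are illustrative,
# not a real project):
#
#     # pyproject.toml of the plugin package
#     # [project.entry-points."dask.sizeof"]
#     # myplugin = "myplugin:register_sizeof"
#
#     # myplugin/__init__.py
#     # def register_sizeof(sizeof):
#     #     @sizeof.register(MyType)
#     #     def sizeof_mytype(obj):
#     #         return int(obj.nbytes)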