
from __future__ import annotations

import itertools
import logging
import random
import sys
from array import array

from packaging.version import Version

from dask._compatibility import importlib_metadata
from dask.utils import Dispatch

sizeof = Dispatch(name="sizeof")

logger = logging.getLogger(__name__)


@sizeof.register(object)
def sizeof_default(o):
    return sys.getsizeof(o)
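
# Illustrative usage: the Dispatch object picks the most specific handler
# registered for type(o), falling back to the plain-object handler above.
#
#     >>> sizeof(1)            # doctest: +SKIP
#     28                       # sys.getsizeof(1) on a 64-bit CPython
#     >>> sizeof(b"x" * 1000)  # doctest: +SKIP
#     1000                     # bytes/bytearray are measured by payload length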



@sizeof.register(bytes)
@sizeof.register(bytearray)
def sizeof_bytes(o):
    return len(o)


@sizeof.register(memoryview)
def sizeof_memoryview(o):
    return o.nbytes


@sizeof.register(array)
def sizeof_array(o):
    return o.itemsize * len(o)


@sizeof.register(list)
@sizeof.register(tuple)
@sizeof.register(set)
@sizeof.register(frozenset)
def sizeof_python_collection(seq):
    num_items = len(seq)
    num_samples = 10
    if num_items > num_samples:
        if isinstance(seq, (set, frozenset)):
            # As of Python 3.9, calling random.sample() on a set is
            # deprecated, but since sets are unordered anyway we can simply
            # pick the first `num_samples` items.
            samples = itertools.islice(seq, num_samples)
        else:
            samples = random.sample(seq, num_samples)
        return sys.getsizeof(seq) + int(
            num_items / num_samples * sum(map(sizeof, samples))
        )
    else:
        return sys.getsizeof(seq) + sum(map(sizeof, seq))
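
# Example (illustrative): for a 1,000-item list only 10 random elements are
# measured and the per-item cost is extrapolated:
#
#     sys.getsizeof(seq) + int(1000 / 10 * sum_of_10_sampled_sizes)
#
# an O(1) estimate instead of an O(n) walk over every element.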



class SimpleSizeof:
    """Sentinel class to mark a class to be skipped by the dispatcher. This only
    works if this sentinel mixin is first in the MRO.

    Examples
    --------
    >>> def _get_gc_overhead():
    ...     class _CustomObject:
    ...         def __sizeof__(self):
    ...             return 0
    ...
    ...     return sys.getsizeof(_CustomObject())

    >>> class TheAnswer(SimpleSizeof):
    ...     def __sizeof__(self):
    ...         # sizeof always adds the GC overhead of an object
    ...         return 42 - _get_gc_overhead()

    >>> sizeof(TheAnswer())
    42

    """


@sizeof.register(SimpleSizeof)
def sizeof_blocked(d):
    return sys.getsizeof(d)



@sizeof.register(dict)
def sizeof_python_dict(d):
    return (
        sys.getsizeof(d)
        + sizeof(list(d.keys()))
        + sizeof(list(d.values()))
        - 2 * sizeof(list())
    )
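
# Note (illustrative): keys and values are materialized as lists so that the
# collection handler above can sample them; subtracting 2 * sizeof(list())
# removes the shells of those two temporary lists, which are not part of the
# dict being measured.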



@sizeof.register_lazy("cupy")
def register_cupy():
    import cupy

    @sizeof.register(cupy.ndarray)
    def sizeof_cupy_ndarray(x):
        return int(x.nbytes)


@sizeof.register_lazy("numba")
def register_numba():
    import numba.cuda

    @sizeof.register(numba.cuda.cudadrv.devicearray.DeviceNDArray)
    def sizeof_numba_devicendarray(x):
        return int(x.nbytes)


@sizeof.register_lazy("rmm")
def register_rmm():
    import rmm

    # Only included in 0.11.0+
    if hasattr(rmm, "DeviceBuffer"):

        @sizeof.register(rmm.DeviceBuffer)
        def sizeof_rmm_devicebuffer(x):
            return int(x.nbytes)
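
# Note: register_lazy defers the import; each callback above runs only the
# first time sizeof() encounters a type defined in the named top-level module,
# so cupy, numba, and rmm are imported on demand.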



@sizeof.register_lazy("numpy")
def register_numpy():
    import numpy as np

    @sizeof.register(np.ndarray)
    def sizeof_numpy_ndarray(x):
        if 0 in x.strides:
            xs = x[tuple(slice(None) if s != 0 else slice(1) for s in x.strides)]
            return xs.nbytes
        return int(x.nbytes)
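
# Example (illustrative): broadcast views report inflated nbytes. For
# np.broadcast_to(np.arange(3), (1000, 3)) the stride along axis 0 is zero,
# so the handler slices that axis to length 1 and counts the 3 backing
# elements rather than all 3,000 logical cells.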



@sizeof.register_lazy("pandas")
def register_pandas():
    import numpy as np
    import pandas as pd

    OBJECT_DTYPES = (object, pd.StringDtype("python"))

    def object_size(*xs):
        if not xs:
            return 0
        ncells = sum(len(x) for x in xs)
        if not ncells:
            return 0

        # Deduplicate Series of references to the same objects,
        # e.g. as produced by read_parquet
        unique_samples = {}
        for x in xs:
            sample = np.random.choice(x, size=100, replace=True)
            for i in sample.tolist():
                unique_samples[id(i)] = i

        nsamples = 100 * len(xs)
        sample_nbytes = sum(sizeof(i) for i in unique_samples.values())
        if len(unique_samples) / nsamples > 0.5:
            # Less than half of the references are duplicated.
            # Assume that, if we were to analyze twice the amount of random
            # references, we would get twice the amount of unique objects too.
            return int(sample_nbytes * ncells / nsamples)
        else:
            # Assume we've already found all unique objects and that all
            # references that we have not yet analyzed are going to point to
            # the same data.
            return sample_nbytes
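
    # Example (illustrative): a column built as pd.Series(["x"] * 1_000_000)
    # holds a million references to a single string; every sample shares one
    # id(), the unique ratio is 1/100, and the string is counted once rather
    # than a million times.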


    @sizeof.register(pd.DataFrame)
    def sizeof_pandas_dataframe(df):
        p = sizeof(df.index) + sizeof(df.columns)
        object_cols = []
        prev_dtype = None

        # Unlike df.items(), df._series will not duplicate multiple views of
        # the same column, e.g. df[["x", "x", "x"]]
        for col in df._series.values():
            if prev_dtype is None or col.dtype != prev_dtype:
                prev_dtype = col.dtype
                # Contiguous columns of the same dtype share the same overhead
                p += 1200
            p += col.memory_usage(index=False, deep=False)
            if col.dtype in OBJECT_DTYPES:
                object_cols.append(col._values)

        # Deduplicate references to the same objects appearing in different
        # Series
        p += object_size(*object_cols)

        return max(1200, p)


    @sizeof.register(pd.Series)
    def sizeof_pandas_series(s):
        # https://github.com/dask/dask/pull/9776#issuecomment-1359085962
        p = 1200 + sizeof(s.index) + s.memory_usage(index=False, deep=False)
        if s.dtype in OBJECT_DTYPES:
            p += object_size(s._values)
        return p

    @sizeof.register(pd.Index)
    def sizeof_pandas_index(i):
        p = 400 + i.memory_usage(deep=False)
        if i.dtype in OBJECT_DTYPES:
            p += object_size(i)
        return p

    @sizeof.register(pd.MultiIndex)
    def sizeof_pandas_multiindex(i):
        return sum(sizeof(level) for level in i.levels) + sum(
            c.nbytes for c in i.codes
        )



@sizeof.register_lazy("scipy")
def register_spmatrix():
    import scipy
    from scipy import sparse

    if Version(scipy.__version__) < Version("1.12.0.dev0"):

        @sizeof.register(sparse.dok_matrix)
        def sizeof_spmatrix_dok(s):
            return s.__sizeof__()

    @sizeof.register(sparse.spmatrix)
    def sizeof_spmatrix(s):
        return sum(sizeof(v) for v in s.__dict__.values())
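
# Note (illustrative): for formats such as CSR, s.__dict__ holds the data,
# indices, and indptr ndarrays, so summing sizeof() over the attributes
# captures the sparse payload through the numpy handler above.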



@sizeof.register_lazy("pyarrow")
def register_pyarrow():
    import pyarrow as pa

    def _get_col_size(data):
        p = 0
        if not isinstance(data, pa.ChunkedArray):
            data = data.data  # pyarrow <0.15.0
        for chunk in data.iterchunks():
            for buffer in chunk.buffers():
                if buffer:
                    p += buffer.size
        return p

    @sizeof.register(pa.Table)
    def sizeof_pyarrow_table(table):
        p = sizeof(table.schema.metadata)
        for col in table.itercolumns():
            p += _get_col_size(col)
        return int(p) + 1000

    @sizeof.register(pa.ChunkedArray)
    def sizeof_pyarrow_chunked_array(data):
        return int(_get_col_size(data)) + 1000
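
# Note (illustrative): each Arrow chunk is backed by buffers (validity bitmap,
# offsets, data); summing buffer.size measures the allocated memory, and the
# flat +1000 approximates the Python-side wrapper overhead.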



@sizeof.register_lazy("xarray")
def register_xarray():
    import xarray as xr

    XARRAY_VERSION = Version(xr.__version__)
    XARRAY_GE_2024_02 = XARRAY_VERSION >= Version("2024.02.0")

    @sizeof.register(xr.core.utils.Frozen)
    def xarray_sizeof_frozen(obj):
        return sys.getsizeof(obj) + sizeof(obj.mapping)

    @sizeof.register(xr.DataArray)
    @sizeof.register(xr.Variable)
    def xarray_sizeof_da(obj):
        return sys.getsizeof(obj) + sizeof(obj.data)

    @sizeof.register(xr.Dataset)
    def xarray_sizeof_ds(obj):
        return sys.getsizeof(obj) + sizeof(obj.variables)

    if XARRAY_GE_2024_02:
        xarray_sizeof_da = sizeof.register(xr.NamedArray)(xarray_sizeof_da)

    @sizeof.register(xr.core.indexes.Indexes)
    def xarray_sizeof_indexes(obj):
        return (
            sys.getsizeof(obj)
            + sizeof(obj._index_type)
            + sizeof(obj._indexes)
            + sizeof(obj._variables)
            + sizeof(obj._dims)
        )

    @sizeof.register(xr.core.indexes.PandasIndex)
    def xarray_sizeof_pd_index(obj):
        return (
            sys.getsizeof(obj)
            + sizeof(obj.index)
            + sizeof(obj.dim)
            + sizeof(obj.coord_dtype)
        )



def _register_entry_point_plugins():
    """Register sizeof implementations exposed by the entry_point mechanism."""
    for entry_point in importlib_metadata.entry_points(group="dask.sizeof"):
        registrar = entry_point.load()
        try:
            registrar(sizeof)
        except Exception:
            logger.exception(
                f"Failed to register sizeof entry point {entry_point.name}"
            )
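
# Example (illustrative; "my_sizeof_plugin" is a hypothetical package): a
# third-party distribution can hook in by exposing a callable in the
# "dask.sizeof" entry point group, e.g. in its pyproject.toml:
#
#     [project.entry-points."dask.sizeof"]
#     myplugin = "my_sizeof_plugin:register"
#
# where my_sizeof_plugin.register(sizeof) calls sizeof.register(...) for the
# plugin's own types.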



_register_entry_point_plugins()