Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/dask/datasets.py: 25%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1from __future__ import annotations
3import random
5from packaging.version import Version
7from dask.utils import import_required
def timeseries(
    start="2000-01-01",
    end="2000-01-31",
    freq="1s",
    partition_freq="1D",
    dtypes=None,
    seed=None,
    **kwargs,
):
    """Create timeseries dataframe with random data

    This is a thin wrapper: every argument is forwarded unchanged to
    ``dask.dataframe.dask_expr.datasets.timeseries``, after substituting a
    default column layout when ``dtypes`` is not given.

    Parameters
    ----------
    start : datetime (or datetime-like string)
        Start of time series
    end : datetime (or datetime-like string)
        End of time series
    freq : string
        String like '2s' or '1H' or '12W' for the time series frequency
    partition_freq : string
        String like '1M' or '2Y' to divide the dataframe into partitions
    dtypes : dict (optional)
        Mapping of column names to types.
        Valid types include {float, int, str, 'category'}.
        When omitted, the columns are
        ``{"name": str, "id": int, "x": float, "y": float}``.
    seed : int (optional)
        Randomstate seed
    kwargs:
        Keywords to pass down to individual column creation functions.
        Keywords should be prefixed by the column name and then an underscore.

    Returns
    -------
    Whatever ``dask.dataframe.dask_expr.datasets.timeseries`` returns for the
    forwarded arguments.

    Examples
    --------
    >>> import dask
    >>> df = dask.datasets.timeseries()
    >>> df.head()  # doctest: +SKIP
              timestamp    id     name         x         y
    2000-01-01 00:00:00   967    Jerry -0.031348 -0.040633
    2000-01-01 00:00:01  1066  Michael -0.262136  0.307107
    2000-01-01 00:00:02   988    Wendy -0.526331  0.128641
    2000-01-01 00:00:03  1016   Yvonne  0.620456  0.767270
    2000-01-01 00:00:04   998   Ursula  0.684902 -0.463278
    >>> df = dask.datasets.timeseries(
    ...     '2000', '2010',
    ...     freq='2h', partition_freq='1D', seed=1,  # data frequency
    ...     dtypes={'value': float, 'name': str, 'id': int},  # data types
    ...     id_lam=1000  # control number of items in id column
    ... )
    """
    # Imported inside the function; aliased so the local name does not
    # shadow this wrapper's own name.
    from dask.dataframe.dask_expr.datasets import timeseries as _expr_timeseries

    column_types = (
        {"name": str, "id": int, "x": float, "y": float} if dtypes is None else dtypes
    )

    return _expr_timeseries(
        start=start,
        end=end,
        freq=freq,
        partition_freq=partition_freq,
        seed=seed,
        dtypes=column_types,
        **kwargs,
    )
def _generate_mimesis(field, schema_description, records_per_partition, seed):
    """Generate data for a single partition of a dask bag

    Parameters
    ----------
    field: dict
        Keyword arguments passed to ``mimesis.schema.Field`` (in addition to
        ``seed``)
    schema_description: Callable[Field] -> dict
        Called with the constructed ``Field`` to describe one record
    records_per_partition: int
        Number of records to generate
    seed: int
        Seed handed to ``mimesis.schema.Field``

    Returns
    -------
    list
        ``records_per_partition`` records, each the first element of one
        ``Schema.create`` call

    See Also
    --------
    _make_mimesis
    """
    import mimesis
    from mimesis.schema import Field, Schema

    generator = Field(seed=seed, **field)

    # `iterations=` kwarg moved from `Schema.create()` to `Schema.__init__()`
    # starting with `mimesis=9`, so pass it to whichever call accepts it.
    is_legacy = Version(mimesis.__version__) < Version("9.0.0")
    init_options = {} if is_legacy else {"iterations": 1}
    create_options = {"iterations": 1} if is_legacy else {}

    schema = Schema(schema=lambda: schema_description(generator), **init_options)

    # One `create` call per record; each call yields a one-element result.
    records = []
    for _ in range(records_per_partition):
        records.append(schema.create(**create_options)[0])
    return records
def _make_mimesis(field, schema, npartitions, records_per_partition, seed=None):
    """
    Make a Dask Bag filled with data randomly generated by the mimesis project

    A graph with one task per partition is built; each task calls
    ``_generate_mimesis`` with its own seed drawn from a ``random.Random``
    initialised with ``seed``.

    Parameters
    ----------
    field: dict
        keyword arguments to pass to ``mimesis.Field``; a falsy value is
        replaced by an empty dict
    schema: Callable[Field] -> dict
        The schema to use to generate the data
    npartitions: int
        Number of partitions (one graph task each)
    records_per_partition: int
        Number of records generated by each task
    seed: int, None
        Seed for random data

    Returns
    -------
    Dask Bag

    See Also
    --------
    make_people
    """
    import dask.bag as db
    from dask.base import tokenize

    field = field or {}

    # Derive one independent seed per partition from the top-level seed.
    rng = random.Random(seed)
    partition_seeds = []
    for _ in range(npartitions):
        partition_seeds.append(rng.randint(0, 1 << 32))

    # NOTE(review): the name is derived from the caller's ``seed``, not from
    # the per-partition seeds drawn above -- confirm that is intended when
    # ``seed`` is None.
    token = tokenize(field, schema, npartitions, records_per_partition, seed)
    name = "mimesis-" + token

    graph = {}
    for index, partition_seed in enumerate(partition_seeds):
        graph[(name, index)] = (
            _generate_mimesis,
            field,
            schema,
            records_per_partition,
            partition_seed,
        )

    return db.Bag(graph, name, npartitions)
def make_people(npartitions=10, records_per_partition=1000, seed=None, locale="en"):
    """Make a dataset of random people

    This makes a Dask Bag with dictionary records of randomly generated people.
    This requires the optional library ``mimesis`` to generate records.

    Each record is a dict with the keys ``"age"``, ``"name"`` (a 2-tuple of
    given name and surname), ``"occupation"``, ``"telephone"``, ``"address"``
    (a dict with ``"address"`` and ``"city"``) and ``"credit-card"`` (a dict
    with ``"number"`` and ``"expiration-date"``).

    Parameters
    ----------
    npartitions : int
        Number of partitions
    records_per_partition : int
        Number of records in each partition
    seed : int, (optional)
        Random seed
    locale : str
        Language locale, like 'en', 'fr', 'zh', or 'ru'

    Returns
    -------
    b: Dask Bag
    """
    install_hint = (
        "The mimesis module is required for this function. Try:\n"
        " python -m pip install mimesis"
    )
    # Check for the optional dependency up front, before building the bag.
    import_required("mimesis", install_hint)

    # Description of a single record, expressed through mimesis providers.
    schema = lambda field: {
        "age": field("random.randint", a=0, b=120),
        "name": (field("person.name"), field("person.surname")),
        "occupation": field("person.occupation"),
        "telephone": field("person.telephone"),
        "address": {"address": field("address.address"), "city": field("address.city")},
        "credit-card": {
            "number": field("payment.credit_card_number"),
            "expiration-date": field("payment.credit_card_expiration_date"),
        },
    }

    field_options = {"locale": locale}
    return _make_mimesis(
        field=field_options,
        schema=schema,
        npartitions=npartitions,
        records_per_partition=records_per_partition,
        seed=seed,
    )