Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/dask/datasets.py: 25%

1from __future__ import annotations

3import random

5from packaging.version import Version

7from dask.utils import import_required

def timeseries(
    start="2000-01-01",
    end="2000-01-31",
    freq="1s",
    partition_freq="1D",
    dtypes=None,
    seed=None,
    **kwargs,
):
    """Build a timeseries dataframe filled with random data

    Parameters
    ----------
    start : datetime (or datetime-like string)
        Start of time series
    end : datetime (or datetime-like string)
        End of time series
    freq : string
        String like '2s' or '1H' or '12W' for the time series frequency
    partition_freq : string
        String like '1M' or '2Y' to divide the dataframe into partitions
    dtypes : dict (optional)
        Mapping of column names to types.
        Valid types include {float, int, str, 'category'}.
        When omitted, the columns ``name`` (str), ``id`` (int), ``x`` (float)
        and ``y`` (float) are requested.
    seed : int (optional)
        Randomstate seed
    kwargs:
        Keywords to pass down to individual column creation functions.
        Keywords should be prefixed by the column name and then an underscore.

    Examples
    --------
    >>> import dask
    >>> df = dask.datasets.timeseries()
    >>> df.head()  # doctest: +SKIP
              timestamp    id     name         x         y
    2000-01-01 00:00:00   967    Jerry -0.031348 -0.040633
    2000-01-01 00:00:01  1066  Michael -0.262136  0.307107
    2000-01-01 00:00:02   988    Wendy -0.526331  0.128641
    2000-01-01 00:00:03  1016   Yvonne  0.620456  0.767270
    2000-01-01 00:00:04   998   Ursula  0.684902 -0.463278
    >>> df = dask.datasets.timeseries(
    ...     '2000', '2010',
    ...     freq='2h', partition_freq='1D', seed=1,  # data frequency
    ...     dtypes={'value': float, 'name': str, 'id': int},  # data types
    ...     id_lam=1000  # control number of items in id column
    ... )
    """
    # Function-scope import, aliased so it does not shadow this wrapper's name.
    from dask.dataframe.dask_expr.datasets import timeseries as _build_timeseries

    column_types = (
        {"name": str, "id": int, "x": float, "y": float}
        if dtypes is None
        else dtypes
    )

    return _build_timeseries(
        start=start,
        end=end,
        freq=freq,
        partition_freq=partition_freq,
        seed=seed,
        dtypes=column_types,
        **kwargs,
    )

def _generate_mimesis(field, schema_description, records_per_partition, seed):
    """Produce the records for a single partition of a dask bag

    Parameters
    ----------
    field: dict
        Keyword arguments forwarded to ``mimesis.schema.Field``
    schema_description: callable
        Called with the ``Field`` instance; its result is the schema handed
        to ``mimesis.schema.Schema``
    records_per_partition: int
        Number of records to generate
    seed: int, None
        Seed passed to ``Field``

    Returns
    -------
    list
        One generated record per requested entry

    See Also
    --------
    _make_mimesis
    """
    import mimesis
    from mimesis.schema import Field, Schema

    provider = Field(seed=seed, **field)

    # The `iterations=` kwarg moved from `Schema.create()` to
    # `Schema.__init__()` starting with `mimesis=9`, so it is routed to
    # whichever call accepts it for the installed version.
    single_iteration = {"iterations": 1}
    if Version(mimesis.__version__) < Version("9.0.0"):
        init_kwargs, create_kwargs = {}, single_iteration
    else:
        init_kwargs, create_kwargs = single_iteration, {}

    record_schema = Schema(
        schema=lambda: schema_description(provider),
        **init_kwargs,
    )

    # Each `create()` call yields a one-element result; keep that element.
    return [
        record_schema.create(**create_kwargs)[0]
        for _ in range(records_per_partition)
    ]

def _make_mimesis(field, schema, npartitions, records_per_partition, seed=None):
    """
    Build a Dask Bag whose records are randomly generated by the mimesis project

    Parameters
    ----------
    field: dict
        keyword arguments to pass to ``mimesis.Field``; a falsy value is
        treated as an empty dict
    schema: Callable[Field] -> dict
        The schema to use to generate the data
    npartitions: int
        Number of partitions in the resulting bag
    records_per_partition: int
        Number of records generated for each partition
    seed: int, None
        Seed for random data. It seeds a ``random.Random`` from which one
        seed per partition is drawn.

    Returns
    -------
    Dask Bag

    See Also
    --------
    make_people
    """
    import dask.bag as db
    from dask.base import tokenize

    field_kwargs = field or {}

    # Draw one seed per partition from a generator seeded with `seed`.
    rng = random.Random(seed)
    partition_seeds = [rng.randint(0, 1 << 32) for _ in range(npartitions)]

    token = tokenize(field_kwargs, schema, npartitions, records_per_partition, seed)
    name = "mimesis-" + token

    # One task per partition, each carrying its own seed.
    graph = {
        (name, index): (
            _generate_mimesis,
            field_kwargs,
            schema,
            records_per_partition,
            partition_seed,
        )
        for index, partition_seed in enumerate(partition_seeds)
    }

    return db.Bag(graph, name, npartitions)

136

137

def make_people(npartitions=10, records_per_partition=1000, seed=None, locale="en"):
    """Create a dataset of random people

    The result is a Dask Bag whose records are dictionaries describing
    randomly generated people. The optional library ``mimesis`` must be
    installed to generate the records.

    Parameters
    ----------
    npartitions : int
        Number of partitions
    records_per_partition : int
        Number of records in each partition
    seed : int, (optional)
        Random seed
    locale : str
        Language locale, like 'en', 'fr', 'zh', or 'ru'

    Returns
    -------
    b: Dask Bag
    """
    missing_mimesis_message = (
        "The mimesis module is required for this function. Try:\n"
        "  python -m pip install mimesis"
    )
    import_required("mimesis", missing_mimesis_message)

    # Describes one person record in terms of mimesis field providers.
    people_schema = lambda field: {
        "age": field("random.randint", a=0, b=120),
        "name": (field("person.name"), field("person.surname")),
        "occupation": field("person.occupation"),
        "telephone": field("person.telephone"),
        "address": {
            "address": field("address.address"),
            "city": field("address.city"),
        },
        "credit-card": {
            "number": field("payment.credit_card_number"),
            "expiration-date": field("payment.credit_card_expiration_date"),
        },
    }

    field_kwargs = {"locale": locale}
    return _make_mimesis(
        field_kwargs,
        people_schema,
        npartitions,
        records_per_partition,
        seed,
    )