Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/dask/datasets.py: 25%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

32 statements  

1from __future__ import annotations 

2 

3import random 

4 

5from packaging.version import Version 

6 

7from dask.utils import import_required 

8 

9 

def timeseries(
    start="2000-01-01",
    end="2000-01-31",
    freq="1s",
    partition_freq="1D",
    dtypes=None,
    seed=None,
    **kwargs,
):
    """Build a dask dataframe holding a randomly generated time series.

    This is a thin front end: it fills in a default column layout when none
    is given and hands everything else to
    ``dask.dataframe.dask_expr.datasets.timeseries``.

    Parameters
    ----------
    start : datetime (or datetime-like string)
        First timestamp of the series.
    end : datetime (or datetime-like string)
        Last timestamp of the series.
    freq : string
        Spacing between rows, e.g. '2s' or '1H' or '12W'.
    partition_freq : string
        Span of a single partition, e.g. '1M' or '2Y'.
    dtypes : dict (optional)
        Maps column names to column types.  Valid types include
        {float, int, str, 'category'}.  When omitted, the columns are
        ``{"name": str, "id": int, "x": float, "y": float}``.
    seed : int (optional)
        Seed for the random state.
    kwargs:
        Extra keywords forwarded to the per-column generators.  Each keyword
        should be the column name, an underscore, then the option name.

    Examples
    --------
    >>> import dask
    >>> df = dask.datasets.timeseries()
    >>> df.head()  # doctest: +SKIP
              timestamp    id     name         x         y
    2000-01-01 00:00:00   967    Jerry -0.031348 -0.040633
    2000-01-01 00:00:01  1066  Michael -0.262136  0.307107
    2000-01-01 00:00:02   988    Wendy -0.526331  0.128641
    2000-01-01 00:00:03  1016   Yvonne  0.620456  0.767270
    2000-01-01 00:00:04   998   Ursula  0.684902 -0.463278
    >>> df = dask.datasets.timeseries(
    ...     '2000', '2010',
    ...     freq='2h', partition_freq='1D', seed=1,  # data frequency
    ...     dtypes={'value': float, 'name': str, 'id': int},  # data types
    ...     id_lam=1000  # control number of items in id column
    ... )
    """
    # Imported lazily and under an alias so it does not hide this wrapper's
    # own name inside the function body.
    from dask.dataframe.dask_expr.datasets import timeseries as _expr_timeseries

    column_types = (
        {"name": str, "id": int, "x": float, "y": float}
        if dtypes is None
        else dtypes
    )

    return _expr_timeseries(
        start=start,
        end=end,
        freq=freq,
        partition_freq=partition_freq,
        seed=seed,
        dtypes=column_types,
        **kwargs,
    )

72 

73 

def _generate_mimesis(field, schema_description, records_per_partition, seed):
    """Produce the records that make up one partition of a dask bag.

    Parameters
    ----------
    field : dict
        Keyword arguments forwarded to ``mimesis.schema.Field``.
    schema_description : callable
        Called with the constructed ``Field``; its return value is used as
        the mimesis schema for a single record.
    records_per_partition : int
        How many records to generate.
    seed : int
        Seed handed to ``Field``.

    Returns
    -------
    list
        One generated record per iteration, ``records_per_partition`` long.

    See Also
    --------
    _make_mimesis
    """
    import mimesis
    from mimesis.schema import Field, Schema

    provider = Field(seed=seed, **field)

    # From ``mimesis=9`` on, ``iterations=`` is accepted by
    # ``Schema.__init__()`` rather than by ``Schema.create()``, so route the
    # option to whichever call the installed version expects.
    one_iteration = {"iterations": 1}
    if Version(mimesis.__version__) < Version("9.0.0"):
        init_options, create_options = {}, one_iteration
    else:
        init_options, create_options = one_iteration, {}

    generator = Schema(schema=lambda: schema_description(provider), **init_options)

    # Every ``create`` call yields a one-element list; keep just the record.
    return [
        generator.create(**create_options)[0] for _ in range(records_per_partition)
    ]

94 

95 

def _make_mimesis(field, schema, npartitions, records_per_partition, seed=None):
    """
    Assemble a Dask Bag whose records are generated by the mimesis project

    Parameters
    ----------
    field: dict
        keyword arguments to pass to ``mimesis.Field``; a falsy value is
        replaced by an empty dict
    schema: Callable[Field] -> dict
        The schema to use to generate the data
    npartitions: int
        Number of partitions (one graph task each)
    records_per_partition: int
        Number of records generated by each task
    seed: int, None
        Seed for random data

    Returns
    -------
    Dask Bag

    See Also
    --------
    make_people
    """
    import dask.bag as db
    from dask.base import tokenize

    field = field or {}

    # Draw one seed per partition from a generator primed with the master
    # seed, so each partition task receives its own value.
    rng = random.Random(seed)
    partition_seeds = [rng.randint(0, 1 << 32) for _ in range(npartitions)]

    # The collection name is derived from the caller's arguments, including
    # the master seed (not the per-partition seeds).
    token = tokenize(field, schema, npartitions, records_per_partition, seed)
    name = "mimesis-" + token

    graph = {
        (name, index): (
            _generate_mimesis,
            field,
            schema,
            records_per_partition,
            partition_seed,
        )
        for index, partition_seed in enumerate(partition_seeds)
    }

    return db.Bag(graph, name, npartitions)

136 

137 

def make_people(npartitions=10, records_per_partition=1000, seed=None, locale="en"):
    """Create a Dask Bag of randomly generated people.

    Every record is a dictionary describing one fictional person.  The
    optional library ``mimesis`` must be installed; an error with install
    instructions is raised through ``import_required`` otherwise.

    Parameters
    ----------
    npartitions : int
        Number of partitions
    records_per_partition : int
        Number of records in each partition
    seed : int, (optional)
        Random seed
    locale : str
        Language locale, like 'en', 'fr', 'zh', or 'ru'

    Returns
    -------
    b: Dask Bag
    """
    import_required(
        "mimesis",
        "The mimesis module is required for this function. Try:\n"
        " python -m pip install mimesis",
    )

    # Layout of a single record; ``field`` is the mimesis Field supplied by
    # the partition generator.
    schema = lambda field: {
        "age": field("random.randint", a=0, b=120),
        "name": (field("person.name"), field("person.surname")),
        "occupation": field("person.occupation"),
        "telephone": field("person.telephone"),
        "address": {"address": field("address.address"), "city": field("address.city")},
        "credit-card": {
            "number": field("payment.credit_card_number"),
            "expiration-date": field("payment.credit_card_expiration_date"),
        },
    }

    return _make_mimesis(
        field={"locale": locale},
        schema=schema,
        npartitions=npartitions,
        records_per_partition=records_per_partition,
        seed=seed,
    )