Line data Source code
1 : // Copyright 2017 The Cockroach Authors.
2 : //
3 : // Licensed under the Apache License, Version 2.0 (the "License");
4 : // you may not use this file except in compliance with the License.
5 : // You may obtain a copy of the License at
6 : //
7 : // http://www.apache.org/licenses/LICENSE-2.0
8 : //
9 : // Unless required by applicable law or agreed to in writing, software
10 : // distributed under the License is distributed on an "AS IS" BASIS,
11 : // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
12 : // implied. See the License for the specific language governing
13 : // permissions and limitations under the License. See the AUTHORS file
14 : // for names of contributors.
15 : //
16 : // ZipfGenerator implements the Incrementing Zipfian Random Number Generator from
17 : // [1]: "Quickly Generating Billion-Record Synthetic Databases"
18 : // by Gray, Sundaresan, Englert, Baclawski, and Weinberger, SIGMOD 1994.
19 :
20 : package randvar
21 :
22 : import (
23 : "math"
24 : "sync"
25 :
26 : "github.com/cockroachdb/errors"
27 : "golang.org/x/exp/rand"
28 : )
29 :
30 : const (
31 : // See https://github.com/brianfrankcooper/YCSB/blob/f886c1e7988f8f4965cb88a1fe2f6bad2c61b56d/core/src/main/java/com/yahoo/ycsb/generator/ScrambledZipfianGenerator.java#L33-L35
32 : defaultMax = 10000000000
33 : defaultTheta = 0.99
34 : defaultZetaN = 26.46902820178302
35 : )
36 :
37 : // Zipf is a random number generator that generates random numbers from a Zipf
38 : // distribution. Unlike rand.Zipf, this generator supports incrementing the max
39 : // parameter without performing an expensive recomputation of the underlying
40 : // hidden parameters, which is a pattern used in [1] for efficiently generating
41 : // large volumes of Zipf-distributed records for synthetic data. Second,
42 : // rand.Zipf only supports theta <= 1, we suppose all values of theta.
43 : type Zipf struct {
44 : // Supplied constants.
45 : theta float64
46 : min uint64
47 : // Internally computed constants.
48 : alpha, zeta2 float64
49 : halfPowTheta float64
50 : // Mutable state.
51 : mu struct {
52 : sync.RWMutex
53 : max uint64
54 : eta float64
55 : zetaN float64
56 : }
57 : }
58 :
59 : // NewDefaultZipf constructs a new Zipf generator with the default parameters.
60 0 : func NewDefaultZipf() (*Zipf, error) {
61 0 : return NewZipf(1, defaultMax, defaultTheta)
62 0 : }
63 :
64 : // NewZipf constructs a new Zipf generator with the given parameters. Returns
65 : // an error if the parameters are outside the accepted range.
66 1 : func NewZipf(min, max uint64, theta float64) (*Zipf, error) {
67 1 : if min > max {
68 0 : return nil, errors.Errorf("min %d > max %d", errors.Safe(min), errors.Safe(max))
69 0 : }
70 1 : if theta < 0.0 || theta == 1.0 {
71 0 : return nil, errors.New("0 < theta, and theta != 1")
72 0 : }
73 :
74 1 : z := &Zipf{
75 1 : min: min,
76 1 : theta: theta,
77 1 : }
78 1 : z.mu.max = max
79 1 :
80 1 : // Compute hidden parameters.
81 1 : z.zeta2 = computeZetaFromScratch(2, theta)
82 1 : z.halfPowTheta = 1.0 + math.Pow(0.5, z.theta)
83 1 : z.mu.zetaN = computeZetaFromScratch(max+1-min, theta)
84 1 : z.alpha = 1.0 / (1.0 - theta)
85 1 : z.mu.eta = (1 - math.Pow(2.0/float64(z.mu.max+1-z.min), 1.0-theta)) / (1.0 - z.zeta2/z.mu.zetaN)
86 1 : return z, nil
87 : }
88 :
89 : // computeZetaIncrementally recomputes zeta(max, theta), assuming that sum =
90 : // zeta(oldMax, theta). Returns zeta(max, theta), computed incrementally.
91 1 : func computeZetaIncrementally(oldMax, max uint64, theta float64, sum float64) float64 {
92 1 : if max < oldMax {
93 0 : panic("unable to decrement max!")
94 : }
95 1 : for i := oldMax + 1; i <= max; i++ {
96 1 : sum += 1.0 / math.Pow(float64(i), theta)
97 1 : }
98 1 : return sum
99 : }
100 :
101 : // The function zeta computes the value
102 : // zeta(n, theta) = (1/1)^theta + (1/2)^theta + (1/3)^theta + ... + (1/n)^theta
103 1 : func computeZetaFromScratch(n uint64, theta float64) float64 {
104 1 : if n == defaultMax && theta == defaultTheta {
105 0 : // Precomputed value, borrowed from ScrambledZipfianGenerator.java. This is
106 0 : // quite slow to calculate from scratch due to the large n value.
107 0 : return defaultZetaN
108 0 : }
109 1 : return computeZetaIncrementally(0, n, theta, 0.0)
110 : }
111 :
112 : // IncMax increments max and recomputes the internal values that depend on
113 : // it. Returns an error if the recomputation failed.
114 0 : func (z *Zipf) IncMax(delta int) {
115 0 : z.mu.Lock()
116 0 : oldMax := z.mu.max
117 0 : z.mu.max += uint64(delta)
118 0 : z.mu.zetaN = computeZetaIncrementally(oldMax+1-z.min, z.mu.max+1-z.min, z.theta, z.mu.zetaN)
119 0 : z.mu.eta = (1 - math.Pow(2.0/float64(z.mu.max+1-z.min), 1.0-z.theta)) / (1.0 - z.zeta2/z.mu.zetaN)
120 0 : z.mu.Unlock()
121 0 : }
122 :
123 : // Max returns the max.
124 0 : func (z *Zipf) Max() uint64 {
125 0 : z.mu.Lock()
126 0 : defer z.mu.Unlock()
127 0 : return z.mu.max
128 0 : }
129 :
130 : // Uint64 draws a new value between min and max, with probabilities according
131 : // to the Zipf distribution.
132 0 : func (z *Zipf) Uint64(rng *rand.Rand) uint64 {
133 0 : u := rng.Float64()
134 0 : z.mu.RLock()
135 0 : uz := u * z.mu.zetaN
136 0 : var result uint64
137 0 : if uz < 1.0 {
138 0 : result = z.min
139 0 : } else if uz < z.halfPowTheta {
140 0 : result = z.min + 1
141 0 : } else {
142 0 : spread := float64(z.mu.max + 1 - z.min)
143 0 : result = z.min + uint64(spread*math.Pow(z.mu.eta*u-z.mu.eta+1.0, z.alpha))
144 0 : }
145 0 : z.mu.RUnlock()
146 0 : return result
147 : }
|