1"""
The :mod:`sklearn.utils.random` module includes utilities for random sampling.
3"""
4
5# Author: Hamzeh Alsalhi <ha258@cornell.edu>
6#
7# License: BSD 3 clause
8import array
9
10import numpy as np
11import scipy.sparse as sp
12
13from . import check_random_state
14from ._random import sample_without_replacement
15
16__all__ = ["sample_without_replacement"]
17
18
19def _random_choice_csc(n_samples, classes, class_probability=None, random_state=None):
20 """Generate a sparse random matrix given column class distributions
21
22 Parameters
23 ----------
24 n_samples : int,
25 Number of samples to draw in each column.
26
27 classes : list of size n_outputs of arrays of size (n_classes,)
28 List of classes for each column.
29
30 class_probability : list of size n_outputs of arrays of \
31 shape (n_classes,), default=None
32 Class distribution of each column. If None, uniform distribution is
33 assumed.
34
35 random_state : int, RandomState instance or None, default=None
36 Controls the randomness of the sampled classes.
37 See :term:`Glossary <random_state>`.
38
39 Returns
40 -------
41 random_matrix : sparse csc matrix of size (n_samples, n_outputs)
42
43 """
44 data = array.array("i")
45 indices = array.array("i")
46 indptr = array.array("i", [0])
47
48 for j in range(len(classes)):
49 classes[j] = np.asarray(classes[j])
50 if classes[j].dtype.kind != "i":
51 raise ValueError("class dtype %s is not supported" % classes[j].dtype)
52 classes[j] = classes[j].astype(np.int64, copy=False)
53
54 # use uniform distribution if no class_probability is given
55 if class_probability is None:
56 class_prob_j = np.empty(shape=classes[j].shape[0])
57 class_prob_j.fill(1 / classes[j].shape[0])
58 else:
59 class_prob_j = np.asarray(class_probability[j])
60
61 if not np.isclose(np.sum(class_prob_j), 1.0):
62 raise ValueError(
63 "Probability array at index {0} does not sum to one".format(j)
64 )
65
66 if class_prob_j.shape[0] != classes[j].shape[0]:
67 raise ValueError(
68 "classes[{0}] (length {1}) and "
69 "class_probability[{0}] (length {2}) have "
70 "different length.".format(
71 j, classes[j].shape[0], class_prob_j.shape[0]
72 )
73 )
74
75 # If 0 is not present in the classes insert it with a probability 0.0
76 if 0 not in classes[j]:
77 classes[j] = np.insert(classes[j], 0, 0)
78 class_prob_j = np.insert(class_prob_j, 0, 0.0)
79
80 # If there are nonzero classes choose randomly using class_probability
81 rng = check_random_state(random_state)
82 if classes[j].shape[0] > 1:
83 index_class_0 = np.flatnonzero(classes[j] == 0).item()
84 p_nonzero = 1 - class_prob_j[index_class_0]
85 nnz = int(n_samples * p_nonzero)
86 ind_sample = sample_without_replacement(
87 n_population=n_samples, n_samples=nnz, random_state=random_state
88 )
89 indices.extend(ind_sample)
90
91 # Normalize probabilities for the nonzero elements
92 classes_j_nonzero = classes[j] != 0
93 class_probability_nz = class_prob_j[classes_j_nonzero]
94 class_probability_nz_norm = class_probability_nz / np.sum(
95 class_probability_nz
96 )
97 classes_ind = np.searchsorted(
98 class_probability_nz_norm.cumsum(), rng.uniform(size=nnz)
99 )
100 data.extend(classes[j][classes_j_nonzero][classes_ind])
101 indptr.append(len(indices))
102
103 return sp.csc_matrix((data, indices, indptr), (n_samples, len(classes)), dtype=int)