Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scikit_learn-1.4.dev0-py3.8-linux-x86_64.egg/sklearn/utils/random.py: 5%

40 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-12-12 06:31 +0000

1""" 

2The :mod:`sklearn.utils.random` module includes utilities for random sampling.

3""" 

4 

5# Author: Hamzeh Alsalhi <ha258@cornell.edu> 

6# 

7# License: BSD 3 clause 

8import array 

9 

10import numpy as np 

11import scipy.sparse as sp 

12 

13from . import check_random_state 

14from ._random import sample_without_replacement 

15 

16__all__ = ["sample_without_replacement"] 

17 

18 

def _random_choice_csc(n_samples, classes, class_probability=None, random_state=None):
    """Generate a sparse random matrix given column class distributions.

    Each output column is sampled independently: a subset of row positions is
    chosen to hold nonzero classes, and a nonzero class is then drawn for each
    of those positions. Class 0 is never stored explicitly.

    Parameters
    ----------
    n_samples : int
        Number of samples to draw in each column.

    classes : list of size n_outputs of arrays of size (n_classes,)
        List of classes for each column. Entries must have an integer dtype.
        The list passed by the caller is only read, never modified.

    class_probability : list of size n_outputs of arrays of \
        shape (n_classes,), default=None
        Class distribution of each column. If None, uniform distribution is
        assumed.

    random_state : int, RandomState instance or None, default=None
        Controls the randomness of the sampled classes.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    random_matrix : sparse csc matrix of size (n_samples, n_outputs)
        Integer matrix whose column ``j`` holds the classes drawn for
        ``classes[j]``; only nonzero classes are stored.

    Raises
    ------
    ValueError
        If an entry of ``classes`` does not have a signed integer dtype, if a
        probability array does not sum to one, or if a probability array and
        its class array have different lengths.
    """
    data = array.array("i")
    indices = array.array("i")
    indptr = array.array("i", [0])

    for j, classes_j in enumerate(classes):
        # Work on a local array. Writing the converted array (and the 0 class
        # inserted below) back into ``classes[j]`` would silently modify the
        # caller's list.
        classes_j = np.asarray(classes_j)
        if classes_j.dtype.kind != "i":
            raise ValueError("class dtype %s is not supported" % classes_j.dtype)
        classes_j = classes_j.astype(np.int64, copy=False)
        n_classes_j = classes_j.shape[0]

        # use uniform distribution if no class_probability is given
        if class_probability is None:
            class_prob_j = np.empty(shape=n_classes_j)
            class_prob_j.fill(1 / n_classes_j)
        else:
            class_prob_j = np.asarray(class_probability[j])

        if not np.isclose(np.sum(class_prob_j), 1.0):
            raise ValueError(
                "Probability array at index {0} does not sum to one".format(j)
            )

        if class_prob_j.shape[0] != n_classes_j:
            raise ValueError(
                "classes[{0}] (length {1}) and "
                "class_probability[{0}] (length {2}) have "
                "different length.".format(j, n_classes_j, class_prob_j.shape[0])
            )

        # If 0 is not present in the classes insert it with a probability 0.0
        # (np.insert returns new arrays, so the inputs stay untouched).
        if 0 not in classes_j:
            classes_j = np.insert(classes_j, 0, 0)
            class_prob_j = np.insert(class_prob_j, 0, 0.0)

        # NOTE(review): the generator is resolved once per column, exactly as
        # before; hoisting it out of the loop could change the draws when
        # ``random_state`` is an int seed -- confirm before moving it.
        rng = check_random_state(random_state)

        # If there are nonzero classes choose randomly using class_probability
        if classes_j.shape[0] > 1:
            index_class_0 = np.flatnonzero(classes_j == 0).item()
            p_nonzero = 1 - class_prob_j[index_class_0]
            # int() truncates: the number of explicit entries is rounded down.
            nnz = int(n_samples * p_nonzero)
            ind_sample = sample_without_replacement(
                n_population=n_samples, n_samples=nnz, random_state=random_state
            )
            indices.extend(ind_sample)

            # Normalize probabilities for the nonzero elements
            nonzero_mask = classes_j != 0
            nonzero_classes = classes_j[nonzero_mask]
            class_probability_nz = class_prob_j[nonzero_mask]
            class_probability_nz_norm = class_probability_nz / np.sum(
                class_probability_nz
            )
            classes_ind = np.searchsorted(
                class_probability_nz_norm.cumsum(), rng.uniform(size=nnz)
            )
            # Floating-point rounding can leave the last cumulative value
            # marginally below 1, in which case a draw above it maps one past
            # the end; clamp so the lookup below cannot go out of bounds.
            classes_ind = np.minimum(classes_ind, nonzero_classes.shape[0] - 1)
            data.extend(nonzero_classes[classes_ind])
        indptr.append(len(indices))

    return sp.csc_matrix((data, indices, indptr), (n_samples, len(classes)), dtype=int)