Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/keras/src/layers/preprocessing/hashing.py: 27%

73 statements  

« prev     ^ index     » next       coverage.py v7.4.0, created at 2024-01-03 07:57 +0000

1# Copyright 2020 The TensorFlow Authors. All Rights Reserved. 

2# 

3# Licensed under the Apache License, Version 2.0 (the "License"); 

4# you may not use this file except in compliance with the License. 

5# You may obtain a copy of the License at 

6# 

7# http://www.apache.org/licenses/LICENSE-2.0 

8# 

9# Unless required by applicable law or agreed to in writing, software 

10# distributed under the License is distributed on an "AS IS" BASIS, 

11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

12# See the License for the specific language governing permissions and 

13# limitations under the License. 

14# ============================================================================== 

15"""Keras hashing preprocessing layer.""" 

16 

17 

18import tensorflow.compat.v2 as tf 

19 

20from keras.src import backend 

21from keras.src.engine import base_layer 

22from keras.src.engine import base_preprocessing_layer 

23from keras.src.layers.preprocessing import preprocessing_utils as utils 

24from keras.src.utils import layer_utils 

25 

26# isort: off 

27from tensorflow.python.util.tf_export import keras_export 

28 

# Output-mode identifiers, aliased from `preprocessing_utils` so the layer
# below can compare and validate `output_mode` without the module prefix.
INT = utils.INT
MULTI_HOT = utils.MULTI_HOT
ONE_HOT = utils.ONE_HOT
COUNT = utils.COUNT

33 

34 

@keras_export(
    "keras.layers.Hashing", "keras.layers.experimental.preprocessing.Hashing"
)
class Hashing(base_layer.Layer):
    """A preprocessing layer which hashes and bins categorical features.

    This layer transforms categorical inputs to hashed output. It element-wise
    converts ints or strings to ints in a fixed range. The stable hash
    function uses `tensorflow::ops::Fingerprint` to produce the same output
    consistently across all platforms.

    This layer uses [FarmHash64](https://github.com/google/farmhash) by default,
    which provides a consistent hashed output across different platforms and is
    stable across invocations, regardless of device and context, by mixing the
    input bits thoroughly.

    If you want to obfuscate the hashed output, you can also pass a random
    `salt` argument in the constructor. In that case, the layer will use the
    [SipHash64](https://github.com/google/highwayhash) hash function, with
    the `salt` value serving as additional input to the hash function.

    For an overview and full list of preprocessing layers, see the preprocessing
    [guide](https://www.tensorflow.org/guide/keras/preprocessing_layers).

    **Example (FarmHash64)**

    >>> layer = tf.keras.layers.Hashing(num_bins=3)
    >>> inp = [['A'], ['B'], ['C'], ['D'], ['E']]
    >>> layer(inp)
    <tf.Tensor: shape=(5, 1), dtype=int64, numpy=
      array([[1],
             [0],
             [1],
             [1],
             [2]])>

    **Example (FarmHash64) with a mask value**

    >>> layer = tf.keras.layers.Hashing(num_bins=3, mask_value='')
    >>> inp = [['A'], ['B'], [''], ['C'], ['D']]
    >>> layer(inp)
    <tf.Tensor: shape=(5, 1), dtype=int64, numpy=
      array([[1],
             [1],
             [0],
             [2],
             [2]])>

    **Example (SipHash64)**

    >>> layer = tf.keras.layers.Hashing(num_bins=3, salt=[133, 137])
    >>> inp = [['A'], ['B'], ['C'], ['D'], ['E']]
    >>> layer(inp)
    <tf.Tensor: shape=(5, 1), dtype=int64, numpy=
      array([[1],
             [2],
             [1],
             [0],
             [2]])>

    **Example (Siphash64 with a single integer, same as `salt=[133, 133]`)**

    >>> layer = tf.keras.layers.Hashing(num_bins=3, salt=133)
    >>> inp = [['A'], ['B'], ['C'], ['D'], ['E']]
    >>> layer(inp)
    <tf.Tensor: shape=(5, 1), dtype=int64, numpy=
      array([[0],
             [0],
             [2],
             [1],
             [0]])>

    Args:
      num_bins: Number of hash bins. Note that this includes the `mask_value`
        bin, so the effective number of bins is `(num_bins - 1)` if `mask_value`
        is set.
      mask_value: A value that represents masked inputs, which are mapped to
        index 0. `None` means no mask term will be added and the
        hashing will start at index 0. Defaults to `None`.
      salt: A single unsigned integer or None.
        If passed, the hash function used will be SipHash64, with these values
        used as an additional input (known as a "salt" in cryptography).
        These should be non-zero. If `None`, uses the FarmHash64 hash function.
        It also supports tuple/list of 2 unsigned integer numbers, see
        reference paper for details. Defaults to `None`.
      output_mode: Specification for the output of the layer. Values can be
        `"int"`, `"one_hot"`, `"multi_hot"`, or
        `"count"` configuring the layer as follows:
          - `"int"`: Return the integer bin indices directly.
          - `"one_hot"`: Encodes each individual element in the input into an
            array the same size as `num_bins`, containing a 1 at the input's bin
            index. If the last dimension is size 1, will encode on that
            dimension.  If the last dimension is not size 1, will append a new
            dimension for the encoded output.
          - `"multi_hot"`: Encodes each sample in the input into a single array
            the same size as `num_bins`, containing a 1 for each bin
            index present in the sample. Treats the last dimension as the sample
            dimension, if input shape is `(..., sample_length)`, output shape
            will be `(..., num_bins)`.
          - `"count"`: As `"multi_hot"`, but the int array contains a count of
            the number of times the bin index appeared in the sample.
        Defaults to `"int"`.
      sparse: Boolean. Only applicable to `"one_hot"`, `"multi_hot"`,
        and `"count"` output modes. If True, returns a `SparseTensor` instead of
        a dense `Tensor`. Defaults to `False`.
      **kwargs: Keyword arguments to construct a layer.

    Input shape:
      A single or list of string, int32 or int64 `Tensor`,
      `SparseTensor` or `RaggedTensor` of shape `(batch_size, ...,)`

    Output shape:
      An int64 `Tensor`, `SparseTensor` or `RaggedTensor` of shape
      `(batch_size, ...)`. If any input is `RaggedTensor` then output is
      `RaggedTensor`, otherwise if any input is `SparseTensor` then output is
      `SparseTensor`, otherwise the output is `Tensor`.

    Reference:
      - [SipHash with salt](https://www.131002.net/siphash/siphash.pdf)

    """

    def __init__(
        self,
        num_bins,
        mask_value=None,
        salt=None,
        output_mode="int",
        sparse=False,
        **kwargs,
    ):
        if num_bins is None or num_bins <= 0:
            raise ValueError(
                "The `num_bins` for `Hashing` cannot be `None` or "
                f"non-positive values. Received: num_bins={num_bins}."
            )

        # By default, output int64 when output_mode='int' and floats otherwise.
        if "dtype" not in kwargs or kwargs["dtype"] is None:
            kwargs["dtype"] = (
                tf.int64 if output_mode == INT else backend.floatx()
            )
        elif (
            output_mode == INT and not tf.as_dtype(kwargs["dtype"]).is_integer
        ):
            # Compat for when dtype was always floating and ignored by the
            # layer.
            kwargs["dtype"] = tf.int64

        super().__init__(**kwargs)
        base_preprocessing_layer.keras_kpl_gauge.get_cell("Hashing").set(True)

        # Check dtype only after base layer parses it; dtype parsing is complex.
        if (
            output_mode == INT
            and not tf.as_dtype(self.compute_dtype).is_integer
        ):
            input_dtype = kwargs["dtype"]
            raise ValueError(
                'When `output_mode="int"`, `dtype` should be an integer '
                f"type. Received: dtype={input_dtype}"
            )

        # 'output_mode' must be one of (INT, ONE_HOT, MULTI_HOT, COUNT)
        layer_utils.validate_string_arg(
            output_mode,
            allowable_strings=(INT, ONE_HOT, MULTI_HOT, COUNT),
            layer_name=self.__class__.__name__,
            arg_name="output_mode",
        )

        if sparse and output_mode == INT:
            raise ValueError(
                "`sparse` may only be true if `output_mode` is "
                '`"one_hot"`, `"multi_hot"`, or `"count"`. '
                f"Received: sparse={sparse} and "
                f"output_mode={output_mode}"
            )

        self.num_bins = num_bins
        self.mask_value = mask_value
        # A salt selects the keyed (SipHash64) hash; no salt means FarmHash64.
        self.strong_hash = salt is not None
        self.output_mode = output_mode
        self.sparse = sparse
        self.salt = None
        if salt is not None:
            if isinstance(salt, (tuple, list)) and len(salt) == 2:
                self.salt = salt
            elif isinstance(salt, int):
                # A single integer is expanded to the 2-element key form.
                self.salt = [salt, salt]
            else:
                raise ValueError(
                    "The `salt` argument for `Hashing` can only be a tuple of "
                    "size 2 integers, or a single integer. "
                    f"Received: salt={salt}."
                )

    def call(self, inputs):
        """Hashes `inputs` into bin indices and encodes them per `output_mode`.

        For a `tf.SparseTensor` only the `values` are hashed; `indices` and
        `dense_shape` are carried over unchanged. Any other input is hashed
        as a whole. The resulting indices are then passed to
        `utils.encode_categorical_inputs` with depth `num_bins`.
        """
        inputs = utils.ensure_tensor(inputs)
        if isinstance(inputs, tf.SparseTensor):
            indices = tf.SparseTensor(
                indices=inputs.indices,
                values=self._hash_values_to_bins(inputs.values),
                dense_shape=inputs.dense_shape,
            )
        else:
            indices = self._hash_values_to_bins(inputs)
        return utils.encode_categorical_inputs(
            indices,
            output_mode=self.output_mode,
            depth=self.num_bins,
            sparse=self.sparse,
            dtype=self.compute_dtype,
        )

    def _hash_values_to_bins(self, values):
        """Converts a non-sparse tensor of values to bin indices.

        When `mask_value` is set and `num_bins > 1`, bin 0 is reserved for
        masked entries: values are hashed into `num_bins - 1` buckets, shifted
        up by one, and positions equal to `mask_value` are set to 0. When
        `num_bins == 1` the mask is not applied and everything hashes into
        the single bin.
        """
        hash_bins = self.num_bins
        mask = None
        # If mask_value is set, the zeroth bin is reserved for it.
        if self.mask_value is not None and hash_bins > 1:
            hash_bins -= 1
            # Computed before the string conversion below so that integer
            # inputs are compared against `mask_value` in their own dtype.
            mask = tf.equal(values, self.mask_value)
        # Convert all values to strings before hashing.
        if values.dtype.is_integer:
            values = tf.as_string(values)
        # Hash the strings.
        if self.strong_hash:
            values = tf.strings.to_hash_bucket_strong(
                values, hash_bins, name="hash", key=self.salt
            )
        else:
            values = tf.strings.to_hash_bucket_fast(
                values, hash_bins, name="hash"
            )
        if mask is not None:
            # Shift hashed values into [1, num_bins) and send masked
            # positions to the reserved bin 0.
            values = tf.add(values, tf.ones_like(values))
            values = tf.where(mask, tf.zeros_like(values), values)
        return values

    def compute_output_shape(self, input_shape):
        """Returns `input_shape` unchanged."""
        # NOTE(review): the input shape is returned for every `output_mode`,
        # while the class docstring describes a different trailing dimension
        # for "one_hot", "multi_hot" and "count". Confirm whether static
        # shape inference should account for those modes.
        return input_shape

    def compute_output_signature(self, input_spec):
        """Builds the output spec for `input_spec`.

        Returns a `tf.SparseTensorSpec` when `input_spec` is one, otherwise a
        dense `tf.TensorSpec`; both use the shape from
        `compute_output_shape` and the layer's `compute_dtype`.
        """
        output_shape = self.compute_output_shape(input_spec.shape)
        if isinstance(input_spec, tf.SparseTensorSpec):
            return tf.SparseTensorSpec(
                shape=output_shape, dtype=self.compute_dtype
            )
        else:
            return tf.TensorSpec(shape=output_shape, dtype=self.compute_dtype)

    def get_config(self):
        """Returns the base layer config extended with this layer's arguments.

        `salt` is emitted as stored on the layer, i.e. `None` or a 2-element
        sequence (an integer salt is stored as `[salt, salt]`).
        """
        config = super().get_config()
        config.update(
            {
                "num_bins": self.num_bins,
                "salt": self.salt,
                "mask_value": self.mask_value,
                "output_mode": self.output_mode,
                "sparse": self.sparse,
            }
        )
        return config

299