Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/tensorflow/python/tpu/tpu_embedding_v2_utils.py: 32%

295 statements  

coverage.py v7.4.0, created at 2024-01-03 07:57 +0000

1# Copyright 2020 The TensorFlow Authors. All Rights Reserved. 

2# 

3# Licensed under the Apache License, Version 2.0 (the "License"); 

4# you may not use this file except in compliance with the License. 

5# You may obtain a copy of the License at 

6# 

7# http://www.apache.org/licenses/LICENSE-2.0 

8# 

9# Unless required by applicable law or agreed to in writing, software 

10# distributed under the License is distributed on an "AS IS" BASIS, 

11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

12# See the License for the specific language governing permissions and 

13# limitations under the License. 

14# ============================================================================== 

15"""Companion classes for mid level API for TPU Embeddings in TF2.""" 

16 

17import abc 

18import math 

19import typing 

20from typing import Any, Dict, Callable, Iterable, List, Optional, Text, Tuple, TypeVar, Union 

21 

22from absl import logging 

23 

24from tensorflow.core.protobuf.tpu import optimization_parameters_pb2 

25from tensorflow.core.protobuf.tpu import tpu_embedding_configuration_pb2 

26from tensorflow.python.distribute import device_util 

27from tensorflow.python.distribute import sharded_variable 

28from tensorflow.python.distribute import tpu_strategy 

29from tensorflow.python.framework import device_spec 

30from tensorflow.python.framework import ops 

31from tensorflow.python.framework.tensor_shape import TensorShape 

32from tensorflow.python.ops import init_ops_v2 

33from tensorflow.python.ops import variables as tf_variables 

34from tensorflow.python.tpu.ops import tpu_ops 

35from tensorflow.python.types import core 

36from tensorflow.python.util.tf_export import tf_export 

37 

38 

39TableVariable = TypeVar("TableVariable", sharded_variable.ShardedVariable, 

40 tf_variables.Variable) 

41SlotVarCreationFnType = Callable[ 

42 [TableVariable, List[Text], List[init_ops_v2.Initializer]], 

43 Dict[Text, TableVariable]] 

44ClipValueType = Union[Tuple[float, float], float] 

45 

46 
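# Illustrative sketch, not part of the original module: a minimal callable with
# the SlotVarCreationFnType signature above, matching the contract described
# for the `slot_variable_creation_fn` argument of the optimizers below. The
# function name and the exact tf_variables.Variable arguments are assumptions
# made for illustration only.
def _example_slot_variable_creation_fn(
    table: TableVariable,
    slot_names: List[Text],
    slot_initializers: List[init_ops_v2.Initializer],
) -> Dict[Text, TableVariable]:
  slots = {}
  for slot_name, initializer in zip(slot_names, slot_initializers):
    # Bind `initializer` through a default argument so each closure keeps its
    # own initializer rather than the last one in the loop.
    slots[slot_name] = tf_variables.Variable(
        name=slot_name,
        initial_value=lambda init=initializer: init(table.shape),
        trainable=False)
  return slots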

47class _Optimizer(metaclass=abc.ABCMeta): 

48 """Base class for all optimizers, with common parameters.""" 

49 

50 def __init__( 

51 self, 

52 learning_rate: Union[float, Callable[[], float]], 

53 use_gradient_accumulation: bool, 

54 clip_weight_min: Optional[float], 

55 clip_weight_max: Optional[float], 

56 weight_decay_factor: Optional[float], 

57 multiply_weight_decay_factor_by_learning_rate: bool, 

58 clipvalue: Optional[ClipValueType] = None, 

59 slot_variable_creation_fn: Optional[SlotVarCreationFnType] = None, 

60 low_dimensional_packing_status: bool = False, 

61 ): 

62 self.learning_rate = learning_rate 

63 self.use_gradient_accumulation = use_gradient_accumulation 

64 self.clip_weight_min = clip_weight_min 

65 self.clip_weight_max = clip_weight_max 

66 if not use_gradient_accumulation and clipvalue is not None: 

67 raise ValueError( 

68 f"When `use_gradient_accumulation` is False, gradient clipping " 

69 f"cannot be used and `clipvalue` should be left as None. " 

70 f"Received value {clipvalue} for argument `clipvalue`.") 

71 if clipvalue is None: 

72 clipvalue = (None, None) 

73 elif not isinstance(clipvalue, tuple): 

74 clipvalue = (-1. * clipvalue, clipvalue) 

75 self.clip_gradient_min, self.clip_gradient_max = clipvalue 

76 

77 self.weight_decay_factor = weight_decay_factor 

78 self.multiply_weight_decay_factor_by_learning_rate = ( 

79 multiply_weight_decay_factor_by_learning_rate) 

80 

81 if (slot_variable_creation_fn is not None and 

82 not callable(slot_variable_creation_fn)): 

83 raise ValueError( 

84 f"Argument `slot_variable_creation_fn` must be either None or a " 

85 f"callable. Received: {slot_variable_creation_fn}") 

86 self.slot_variable_creation_fn = slot_variable_creation_fn 

87 self.low_dimensional_packing_status = low_dimensional_packing_status 

88 

89 @abc.abstractmethod 

90 def _slot_names(self) -> List[Text]: 

91 """Returns the name of all the slot variables. 

92 

93 This does not include the 'parameters' variable and these names must match 

94 the names of the slot variables as used in the corresponding

95 `tpu_ops.load_tpu_embedding_*` ops. 

96 """ 

97 raise NotImplementedError 

98 

99 @abc.abstractmethod 

100 def _slot_initializers(self) -> List[init_ops_v2.Initializer]: 

101 """Returns initializers for slot variables. 

102 

103 This returns a list parallel to self._slot_names().

104 """ 

105 raise NotImplementedError 

106 

107 def _set_optimization_parameters( 

108 self, parameters: optimization_parameters_pb2.OptimizationParameters): 

109 """Sets the optimizer fields in the OptimizationParameters.""" 

110 if self.use_gradient_accumulation: 

111 parameters.gradient_accumulation_status = ( 

112 optimization_parameters_pb2.GradientAccumulationStatus.ENABLED) 

113 else: 

114 parameters.gradient_accumulation_status = ( 

115 optimization_parameters_pb2.GradientAccumulationStatus.DISABLED) 

116 

117 if self.clip_weight_min is not None: 

118 parameters.clipping_limits.lower.value = self.clip_weight_min 

119 

120 if self.clip_weight_max is not None: 

121 parameters.clipping_limits.upper.value = self.clip_weight_max 

122 

123 if self.clip_gradient_min is not None: 

124 parameters.gradient_clipping_limits.lower.value = self.clip_gradient_min 

125 

126 if self.clip_gradient_max is not None: 

127 parameters.gradient_clipping_limits.upper.value = self.clip_gradient_max 

128 

129 if self.weight_decay_factor: 

130 parameters.weight_decay_factor = self.weight_decay_factor 

131 if self.multiply_weight_decay_factor_by_learning_rate: 

132 parameters.multiply_weight_decay_factor_by_learning_rate = True 

133 

134 parameters.low_dimensional_packing_status = ( 

135 self.low_dimensional_packing_status 

136 ) 

137 

138 @abc.abstractmethod 

139 def _load(self) -> Callable[..., ops.Operation]: 

140 """Returns the load function for the optimizer.""" 

141 raise NotImplementedError 

142 

143 @abc.abstractmethod 

144 def _retrieve(self) -> Callable[..., core.Tensor]: 

145 """Returns the retrieve function for the optimizer.""" 

146 raise NotImplementedError 

147 

148 def _create_slots( 

149 self, table: "TableConfig", 

150 variable_creator: Callable[[Text, init_ops_v2.Initializer], 

151 tf_variables.Variable] 

152 ) -> Dict[Text, tf_variables.Variable]: 

153 """Creates slot variables for table. 

154 

155 Args: 

156 table: The table variable to create slots for. 

157 variable_creator: A function which creates variables. Takes parameters 

158 'name', 'initializer'. 

159 

160 Returns: 

161 A dict of variables, keyed by self._slot_names(). 

162 """ 

163 if self.slot_variable_creation_fn is not None: 

164 return self.slot_variable_creation_fn(table, self._slot_names(), 

165 self._slot_initializers()) 

166 else: 

167 slots = {} 

168 for slot, initializer in zip(self._slot_names(), 

169 self._slot_initializers()): 

170 slots[slot] = variable_creator(slot, initializer) 

171 return slots 

172 
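# Equality for optimizer configs: two configs compare equal only when they are
# instances of the same class and all of their attribute (hyperparameter)
# name/value pairs match; __hash__ is derived from the same attribute pairs.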

173 def __eq__(self, other: Any) -> Union[Any, bool]: 

174 if isinstance(other, self.__class__): 

175 return all([ 

176 attr1 == attr2 

177 for attr1, attr2 in zip(self.__dict__.items(), other.__dict__.items()) 

178 ]) 

179 else: 

180 return False 

181 

182 def __hash__(self) -> int: 

183 return hash(tuple(self.__dict__.items())) 

184 

185 
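# Illustrative sketch, not part of the original module: the `learning_rate`
# accepted by the optimizers below may be either a float or a zero-argument
# callable (used for dynamic learning rates). The helper name and the decay
# schedule here are assumptions for illustration only.
def _example_decayed_learning_rate(
    initial_rate: float, decay: float, step_fn: Callable[[], int]
) -> Callable[[], float]:
  """Returns a zero-argument callable usable as a dynamic `learning_rate`."""
  return lambda: initial_rate * (decay ** step_fn())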

186@tf_export("tpu.experimental.embedding.SGD") 

187class SGD(_Optimizer): 

188 """Optimization parameters for stochastic gradient descent for TPU embeddings. 

189 

190 Pass this to `tf.tpu.experimental.embedding.TPUEmbedding` via the `optimizer` 

191 argument to set the global optimizer and its parameters: 

192 

193 ``` 

194 embedding = tf.tpu.experimental.embedding.TPUEmbedding( 

195 ... 

196 optimizer=tf.tpu.experimental.embedding.SGD(0.1)) 

197 ``` 

198 

199 This can also be used in a `tf.tpu.experimental.embedding.TableConfig` as the 

200 optimizer parameter to set a table specific optimizer. This will override the 

201 optimizer and parameters for the global embedding optimizer defined above:

202 

203 ``` 

204 table_one = tf.tpu.experimental.embedding.TableConfig( 

205 vocabulary_size=..., 

206 dim=..., 

207 optimizer=tf.tpu.experimental.embedding.SGD(0.2)) 

208 table_two = tf.tpu.experimental.embedding.TableConfig( 

209 vocabulary_size=..., 

210 dim=...) 

211 

212 feature_config = ( 

213 tf.tpu.experimental.embedding.FeatureConfig( 

214 table=table_one), 

215 tf.tpu.experimental.embedding.FeatureConfig( 

216 table=table_two)) 

217 

218 embedding = tf.tpu.experimental.embedding.TPUEmbedding( 

219 feature_config=feature_config, 

220 batch_size=... 

221 optimizer=tf.tpu.experimental.embedding.SGD(0.1)) 

222 ``` 

223 

224 In the above example, the first feature will be looked up in a table that has 

225 a learning rate of 0.2 while the second feature will be looked up in a table 

226 that has a learning rate of 0.1. 

227 

228 See 'tensorflow/core/protobuf/tpu/optimization_parameters.proto' for a 

229 complete description of these parameters and their impacts on the optimizer 

230 algorithm. 

231 """ 

232 

233 def __init__( 

234 self, 

235 learning_rate: Union[float, Callable[[], float]] = 0.01, 

236 use_gradient_accumulation: bool = True, 

237 clip_weight_min: Optional[float] = None, 

238 clip_weight_max: Optional[float] = None, 

239 weight_decay_factor: Optional[float] = None, 

240 multiply_weight_decay_factor_by_learning_rate: bool = None, 

241 clipvalue: Optional[ClipValueType] = None, 

242 low_dimensional_packing_status: bool = False, 

243 ): 

244 """Optimization parameters for stochastic gradient descent. 

245 

246 Args: 

247 learning_rate: The learning rate. It should be a floating point value or a 

248 callable taking no arguments for a dynamic learning rate. 

249 use_gradient_accumulation: setting this to `False` makes embedding 

250 gradient calculation less accurate but faster.

251 clip_weight_min: the minimum value to clip by; None means -infinity. 

252 clip_weight_max: the maximum value to clip by; None means +infinity. 

253 weight_decay_factor: amount of weight decay to apply; None means that the 

254 weights are not decayed. Weights are decayed by multiplying the weight 

255 by this factor each step. 

256 multiply_weight_decay_factor_by_learning_rate: if true, 

257 `weight_decay_factor` is multiplied by the current learning rate. 

258 clipvalue: Controls clipping of the gradient. Set to either a single 

259 positive scalar value to get clipping or a tuple of scalar values (min,

260 max) to set a separate maximum or minimum. If one of the two entries is

261 None, then there will be no clipping in that direction. Note that if this is

262 set, you may see a decrease in performance as gradient accumulation

263 will be enabled (it is normally off for SGD as it has no effect on

264 accuracy). See 

265 'tensorflow/core/protobuf/tpu/optimization_parameters.proto' for more 

266 information on gradient accumulation and its impact on tpu embeddings. 

267 low_dimensional_packing_status: Status of the low-dimensional embedding 

268 packing optimization controls whether to optimize the packing of 

269 1-dimensional, 2-dimensional, and 4-dimensional embedding tables in 

270 memory. 

271 """ 

272 super().__init__( 

273 learning_rate, 

274 use_gradient_accumulation, 

275 clip_weight_min, 

276 clip_weight_max, 

277 weight_decay_factor, 

278 multiply_weight_decay_factor_by_learning_rate, 

279 clipvalue, 

280 None, 

281 low_dimensional_packing_status, 

282 ) 

283 

284 def _slot_names(self) -> List[Text]: 

285 return [] 

286 

287 def _slot_initializers(self) -> List[init_ops_v2.Initializer]: 

288 return [] 

289 

290 def _set_optimization_parameters( 

291 self, parameters: optimization_parameters_pb2.OptimizationParameters): 

292 super()._set_optimization_parameters(parameters) 

293 parameters.stochastic_gradient_descent.SetInParent() 

294 

295 def _load(self) -> Callable[..., ops.Operation]: 

296 return tpu_ops.load_tpu_embedding_stochastic_gradient_descent_parameters 

297 

298 def _retrieve(self) -> Callable[..., core.Tensor]: 

299 return tpu_ops.retrieve_tpu_embedding_stochastic_gradient_descent_parameters 

300 

301 

302@tf_export("tpu.experimental.embedding.Adagrad") 

303class Adagrad(_Optimizer): 

304 """Optimization parameters for Adagrad with TPU embeddings. 

305 

306 Pass this to `tf.tpu.experimental.embedding.TPUEmbedding` via the `optimizer` 

307 argument to set the global optimizer and its parameters: 

308 

309 ```python 

310 embedding = tf.tpu.experimental.embedding.TPUEmbedding( 

311 ... 

312 optimizer=tf.tpu.experimental.embedding.Adagrad(0.1)) 

313 ``` 

314 

315 This can also be used in a `tf.tpu.experimental.embedding.TableConfig` as the 

316 optimizer parameter to set a table specific optimizer. This will override the 

317 optimizer and parameters for the global embedding optimizer defined above:

318 

319 ```python 

320 table_one = tf.tpu.experimental.embedding.TableConfig( 

321 vocabulary_size=..., 

322 dim=..., 

323 optimizer=tf.tpu.experimental.embedding.Adagrad(0.2)) 

324 table_two = tf.tpu.experimental.embedding.TableConfig( 

325 vocabulary_size=..., 

326 dim=...) 

327 

328 feature_config = ( 

329 tf.tpu.experimental.embedding.FeatureConfig( 

330 table=table_one), 

331 tf.tpu.experimental.embedding.FeatureConfig( 

332 table=table_two)) 

333 

334 embedding = tf.tpu.experimental.embedding.TPUEmbedding( 

335 feature_config=feature_config, 

336 batch_size=... 

337 optimizer=tf.tpu.experimental.embedding.Adagrad(0.1)) 

338 ``` 

339 

340 In the above example, the first feature will be looked up in a table that has 

341 a learning rate of 0.2 while the second feature will be looked up in a table 

342 that has a learning rate of 0.1. 

343 

344 See 'tensorflow/core/protobuf/tpu/optimization_parameters.proto' for a 

345 complete description of these parameters and their impacts on the optimizer 

346 algorithm. 

347 """ 

348 

349 def __init__( 

350 self, 

351 learning_rate: Union[float, Callable[[], float]] = 0.001, 

352 initial_accumulator_value: float = 0.1, 

353 use_gradient_accumulation: bool = True, 

354 clip_weight_min: Optional[float] = None, 

355 clip_weight_max: Optional[float] = None, 

356 weight_decay_factor: Optional[float] = None, 

357 multiply_weight_decay_factor_by_learning_rate: bool = None, 

358 slot_variable_creation_fn: Optional[SlotVarCreationFnType] = None, 

359 clipvalue: Optional[ClipValueType] = None, 

360 low_dimensional_packing_status: bool = False, 

361 ): 

362 """Optimization parameters for Adagrad. 

363 

364 Args: 

365 learning_rate: The learning rate. It should be a floating point value or a 

366 callable taking no arguments for a dynamic learning rate. 

367 initial_accumulator_value: initial accumulator for Adagrad. 

368 use_gradient_accumulation: setting this to `False` makes embedding 

369 gradient calculation less accurate but faster.

370 clip_weight_min: the minimum value to clip by; None means -infinity. 

371 clip_weight_max: the maximum value to clip by; None means +infinity. 

372 weight_decay_factor: amount of weight decay to apply; None means that the 

373 weights are not decayed. 

374 multiply_weight_decay_factor_by_learning_rate: if true, 

375 `weight_decay_factor` is multiplied by the current learning rate. 

376 slot_variable_creation_fn: If you wish to directly control the creation of

377 the slot variables, set this to a callable taking three parameters: a 

378 table variable, a list of slot names to create for it, and a list of 

379 initializers. This function should return a dict with the slot names as 

380 keys and the created variables as values with types matching the table 

381 variable. When set to None (the default), uses the built-in variable 

382 creation. 

383 clipvalue: Controls clipping of the gradient. Set to either a single 

384 positive scalar value to get clipping or a tuple of scalar values (min, 

385 max) to set a separate maximum or minimum. If one of the two entries is 

386 None, then there will be no clipping in that direction.

387 low_dimensional_packing_status: Status of the low-dimensional embedding 

388 packing optimization controls whether to optimize the packing of 

389 1-dimensional, 2-dimensional, and 4-dimensional embedding tables in 

390 memory. 

391 """ 

392 super().__init__( 

393 learning_rate, 

394 use_gradient_accumulation, 

395 clip_weight_min, 

396 clip_weight_max, 

397 weight_decay_factor, 

398 multiply_weight_decay_factor_by_learning_rate, 

399 clipvalue, 

400 slot_variable_creation_fn, 

401 low_dimensional_packing_status, 

402 ) 

403 if initial_accumulator_value <= 0: 

404 raise ValueError( 

405 f"Argument `initial_accumulator_value` must be a positive float. " 

406 f"Received: {initial_accumulator_value}") 

407 self.initial_accumulator_value = initial_accumulator_value 

408 

409 def _slot_names(self) -> List[Text]: 

410 return ["accumulators"] 

411 

412 def _slot_initializers(self) -> List[init_ops_v2.Initializer]: 

413 return [init_ops_v2.Constant(self.initial_accumulator_value)] 

414 

415 def _set_optimization_parameters( 

416 self, parameters: optimization_parameters_pb2.OptimizationParameters): 

417 super()._set_optimization_parameters(parameters) 

418 parameters.adagrad.SetInParent() 

419 

420 def _load(self) -> Callable[..., ops.Operation]: 

421 return tpu_ops.load_tpu_embedding_adagrad_parameters 

422 

423 def _retrieve(self) -> Callable[..., core.Tensor]: 

424 return tpu_ops.retrieve_tpu_embedding_adagrad_parameters 

425 

426 

427@tf_export("tpu.experimental.embedding.AdagradMomentum") 

428class AdagradMomentum(_Optimizer): 

429 """Optimization parameters for Adagrad + Momentum with TPU embeddings. 

430 

431 Pass this to `tf.tpu.experimental.embedding.TPUEmbedding` via the `optimizer` 

432 argument to set the global optimizer and its parameters: 

433 

434 ```python 

435 embedding = tf.tpu.experimental.embedding.TPUEmbedding( 

436 ... 

437 optimizer=tf.tpu.experimental.embedding.AdagradMomentum(0.1)) 

438 ``` 

439 

440 This can also be used in a `tf.tpu.experimental.embedding.TableConfig` as the 

441 optimizer parameter to set a table specific optimizer. This will override the 

442 optimizer and parameters for the global embedding optimizer defined above:

443 

444 ```python 

445 table_one = tf.tpu.experimental.embedding.TableConfig( 

446 vocabulary_size=..., 

447 dim=..., 

448 optimizer=tf.tpu.experimental.embedding.AdagradMomentum(0.2)) 

449 table_two = tf.tpu.experimental.embedding.TableConfig( 

450 vocabulary_size=..., 

451 dim=...) 

452 

453 feature_config = ( 

454 tf.tpu.experimental.embedding.FeatureConfig( 

455 table=table_one), 

456 tf.tpu.experimental.embedding.FeatureConfig( 

457 table=table_two)) 

458 

459 embedding = tf.tpu.experimental.embedding.TPUEmbedding( 

460 feature_config=feature_config, 

461 batch_size=... 

462 optimizer=tf.tpu.experimental.embedding.AdagradMomentum(0.1)) 

463 ``` 

464 

465 In the above example, the first feature will be looked up in a table that has 

466 a learning rate of 0.2 while the second feature will be looked up in a table 

467 that has a learning rate of 0.1. 

468 

469 See 'tensorflow/core/protobuf/tpu/optimization_parameters.proto' for a 

470 complete description of these parameters and their impacts on the optimizer 

471 algorithm. 

472 """ 

473 

474 def __init__( 

475 self, 

476 learning_rate: Union[float, Callable[[], float]] = 0.001, 

477 momentum: float = 0.0, 

478 use_nesterov: bool = False, 

479 exponent: float = 2, 

480 beta2: float = 1, 

481 epsilon: float = 1e-10, 

482 use_gradient_accumulation: bool = True, 

483 clip_weight_min: Optional[float] = None, 

484 clip_weight_max: Optional[float] = None, 

485 weight_decay_factor: Optional[float] = None, 

486 multiply_weight_decay_factor_by_learning_rate: bool = None, 

487 slot_variable_creation_fn: Optional[SlotVarCreationFnType] = None, 

488 clipvalue: Optional[ClipValueType] = None, 

489 low_dimensional_packing_status: bool = False, 

490 ): 

491 """Optimization parameters for Adagrad + Momentum. 

492 

493 Args: 

494 learning_rate: The learning rate. It should be a floating point value or a 

495 callable taking no arguments for a dynamic learning rate. 

496 momentum: Moving average parameter for the momentum accumulator. 

497 use_nesterov: Whether to use the Nesterov variant of momentum. See 

498 Sutskever et al., 2013. 

499 exponent: Exponent for the Adagrad accumulator. 

500 beta2: Moving average parameter for the Adagrad accumulator. 

501 epsilon: A small constant added to the Adagrad accumulator for numerical stability.

502 use_gradient_accumulation: setting this to `False` makes embedding 

503 gradient calculation less accurate but faster.

504 clip_weight_min: the minimum value to clip by; None means -infinity. 

505 clip_weight_max: the maximum value to clip by; None means +infinity. 

506 weight_decay_factor: amount of weight decay to apply; None means that the 

507 weights are not decayed. 

508 multiply_weight_decay_factor_by_learning_rate: if true, 

509 `weight_decay_factor` is multiplied by the current learning rate. 

510 slot_variable_creation_fn: If you wish to directly control the creation of

511 the slot variables, set this to a callable taking three parameters: a 

512 table variable, a list of slot names to create for it, and a list of 

513 initializers. This function should return a dict with the slot names as 

514 keys and the created variables as values with types matching the table 

515 variable. When set to None (the default), uses the built-in variable 

516 creation. 

517 clipvalue: Controls clipping of the gradient. Set to either a single 

518 positive scalar value to get clipping or a tuple of scalar values (min, 

519 max) to set a separate maximum or minimum. If one of the two entries is 

520 None, then there will be no clipping in that direction.

521 low_dimensional_packing_status: Status of the low-dimensional embedding 

522 packing optimization controls whether to optimize the packing of 

523 1-dimensional, 2-dimensional, and 4-dimensional embedding tables in 

524 memory. 

525 """ 

526 super().__init__( 

527 learning_rate, 

528 use_gradient_accumulation, 

529 clip_weight_min, 

530 clip_weight_max, 

531 weight_decay_factor, 

532 multiply_weight_decay_factor_by_learning_rate, 

533 clipvalue, 

534 slot_variable_creation_fn, 

535 low_dimensional_packing_status, 

536 ) 

537 if epsilon <= 0: 

538 raise ValueError("Adagrad momentum: epsilon must be positive") 

539 if exponent <= 0: 

540 raise ValueError("Adagrad momentum: Precondition exponent must >0") 

541 self.momentum = momentum 

542 self.use_nesterov = use_nesterov 

543 self.exponent = exponent 

544 self.beta2 = beta2 

545 self.epsilon = epsilon 

546 

547 def _slot_names(self) -> List[Text]: 

548 return ["accumulators", "momenta"] 

549 

550 def _slot_initializers(self) -> List[init_ops_v2.Initializer]: 

551 return [init_ops_v2.Constant(), init_ops_v2.Constant()] 

552 

553 def _set_optimization_parameters( 

554 self, parameters: optimization_parameters_pb2.OptimizationParameters): 

555 super()._set_optimization_parameters(parameters) 

556 parameters.adagrad_momentum.SetInParent() 

557 parameters.adagrad_momentum.momentum = self.momentum 

558 parameters.adagrad_momentum.use_nesterov = self.use_nesterov 

559 parameters.adagrad_momentum.exponent = self.exponent 

560 parameters.adagrad_momentum.beta2 = self.beta2 

561 parameters.adagrad_momentum.epsilon = self.epsilon 

562 

563 def _load(self) -> Callable[..., ops.Operation]: 

564 return tpu_ops.load_tpu_embedding_adagrad_momentum_parameters 

565 

566 def _retrieve(self) -> Callable[..., core.Tensor]: 

567 return tpu_ops.retrieve_tpu_embedding_adagrad_momentum_parameters 

568 

569 

570@tf_export("tpu.experimental.embedding.FTRL") 

571class FTRL(_Optimizer): 

572 """Optimization parameters for FTRL with TPU embeddings. 

573 

574 See Algorithm 1 of this 

575 [paper](https://research.google.com/pubs/archive/41159.pdf). 

576 

577 Pass this to `tf.tpu.experimental.embedding.TPUEmbedding` via the `optimizer` 

578 argument to set the global optimizer and its parameters: 

579 

580 ```python 

581 embedding = tf.tpu.experimental.embedding.TPUEmbedding( 

582 ... 

583 optimizer=tf.tpu.experimental.embedding.FTRL(0.1)) 

584 ``` 

585 

586 This can also be used in a `tf.tpu.experimental.embedding.TableConfig` as the 

587 optimizer parameter to set a table specific optimizer. This will override the 

588 optimizer and parameters for the global embedding optimizer defined above:

589 

590 ```python 

591 table_one = tf.tpu.experimental.embedding.TableConfig( 

592 vocabulary_size=..., 

593 dim=..., 

594 optimizer=tf.tpu.experimental.embedding.FTRL(0.2)) 

595 table_two = tf.tpu.experimental.embedding.TableConfig( 

596 vocabulary_size=..., 

597 dim=...) 

598 

599 feature_config = ( 

600 tf.tpu.experimental.embedding.FeatureConfig( 

601 table=table_one), 

602 tf.tpu.experimental.embedding.FeatureConfig( 

603 table=table_two)) 

604 

605 embedding = tf.tpu.experimental.embedding.TPUEmbedding( 

606 feature_config=feature_config, 

607 batch_size=... 

608 optimizer=tf.tpu.experimental.embedding.FTRL(0.1)) 

609 ``` 

610 

611 In the above example, the first feature will be looked up in a table that has 

612 a learning rate of 0.2 while the second feature will be looked up in a table 

613 that has a learning rate of 0.1. 

614 

615 See 'tensorflow/core/protobuf/tpu/optimization_parameters.proto' for a 

616 complete description of these parameters and their impacts on the optimizer 

617 algorithm. 

618 """ 

619 

620 def __init__( 

621 self, 

622 learning_rate: Union[float, Callable[[], float]] = 0.001, 

623 learning_rate_power: float = -0.5, 

624 l1_regularization_strength: float = 0.0, 

625 l2_regularization_strength: float = 0.0, 

626 beta: float = 0.0, 

627 initial_accumulator_value: float = 0.1, 

628 use_gradient_accumulation: bool = True, 

629 clip_weight_min: Optional[float] = None, 

630 clip_weight_max: Optional[float] = None, 

631 weight_decay_factor: Optional[float] = None, 

632 multiply_weight_decay_factor_by_learning_rate: bool = None, 

633 slot_variable_creation_fn: Optional[SlotVarCreationFnType] = None, 

634 clipvalue: Optional[ClipValueType] = None, 

635 multiply_linear_by_learning_rate: bool = False, 

636 allow_zero_accumulator: bool = False, 

637 low_dimensional_packing_status: bool = False, 

638 ): 

639 """Optimization parameters for Adagrad. 

640 

641 Args: 

642 learning_rate: The learning rate. It should be a floating point value or a 

643 callable taking no arguments for a dynamic learning rate. 

644 learning_rate_power: A float value, must be less than or equal to zero.

645 Controls how the learning rate decreases during training. Use zero for a 

646 fixed learning rate. 

647 l1_regularization_strength: A float value, must be greater than or equal 

648 to zero. 

649 l2_regularization_strength: A float value, must be greater than or equal 

650 to zero. 

651 beta: A float value, representing the beta value from the paper. 

652 initial_accumulator_value: The starting value for accumulators. Only zero 

653 or positive values are allowed. 

654 use_gradient_accumulation: setting this to `False` makes embedding 

655 gradient calculation less accurate but faster.

656 clip_weight_min: the minimum value to clip by; None means -infinity. 

657 clip_weight_max: the maximum value to clip by; None means +infinity. 

658 weight_decay_factor: amount of weight decay to apply; None means that the 

659 weights are not decayed. 

660 multiply_weight_decay_factor_by_learning_rate: if true, 

661 `weight_decay_factor` is multiplied by the current learning rate. 

662 slot_variable_creation_fn: If you wish to directly control the creation of

663 the slot variables, set this to a callable taking three parameters: a 

664 table variable, a list of slot names to create for it, and a list of 

665 initializers. This function should return a dict with the slot names as 

666 keys and the created variables as values with types matching the table 

667 variable. When set to None (the default), uses the built-in variable 

668 creation. 

669 clipvalue: Controls clipping of the gradient. Set to either a single 

670 positive scalar value to get clipping or a tuple of scalar values (min, 

671 max) to set a separate maximum or minimum. If one of the two entries is 

672 None, then there will be no clipping in that direction.

673 multiply_linear_by_learning_rate: If set to True, a modified formula is 

674 used for FTRL that treats the "linear" accumulator as being 

675 pre-multiplied by the learning rate (i.e., the accumulator named 

676 "linear" actually stores "linear * learning_rate"). Other than 

677 checkpoint compatibility, this is mathematically equivalent for a static 

678 learning rate; for a dynamic learning rate, it is nearly the same as 

679 long as the learning rate does not change quickly. The benefit of this 

680 is that the modified formula handles zero and near-zero learning rates 

681 without producing NaNs, improving flexibility for learning rate ramp-up. 

682 allow_zero_accumulator: If set to True, changes some internal formulas to 

683 allow zero and near-zero accumulator values at the cost of some 

684 performance; this only needs to be set if you are using an initial 

685 accumulator value of zero, which is uncommon. 

686 low_dimensional_packing_status: Status of the low-dimensional embedding 

687 packing optimization controls whether to optimize the packing of 

688 1-dimensional, 2-dimensional, and 4-dimensional embedding tables in 

689 memory. 

690 """ 

691 super().__init__( 

692 learning_rate, 

693 use_gradient_accumulation, 

694 clip_weight_min, 

695 clip_weight_max, 

696 weight_decay_factor, 

697 multiply_weight_decay_factor_by_learning_rate, 

698 clipvalue, 

699 slot_variable_creation_fn, 

700 low_dimensional_packing_status, 

701 ) 

702 if initial_accumulator_value <= 0: 

703 raise ValueError( 

704 f"Argument `initial_accumulator_value` must be a positive float. " 

705 f"Received: {initial_accumulator_value}") 

706 self.initial_accumulator_value = initial_accumulator_value 

707 self.learning_rate_power = learning_rate_power 

708 self.l1_regularization_strength = l1_regularization_strength 

709 self.l2_regularization_strength = l2_regularization_strength 

710 self.beta = beta 

711 self.multiply_linear_by_learning_rate = multiply_linear_by_learning_rate 

712 self.allow_zero_accumulator = allow_zero_accumulator 

713 

714 def _slot_names(self) -> List[Text]: 

715 return ["accumulators", "linears"] 

716 

717 def _slot_initializers(self) -> List[init_ops_v2.Initializer]: 

718 return [ 

719 init_ops_v2.Constant(self.initial_accumulator_value), 

720 init_ops_v2.Constant() 

721 ] 

722 

723 def _set_optimization_parameters( 

724 self, parameters: optimization_parameters_pb2.OptimizationParameters): 

725 super()._set_optimization_parameters(parameters) 

726 ftrl = parameters.ftrl 

727 ftrl.l1 = self.l1_regularization_strength 

728 ftrl.l2 = self.l2_regularization_strength 

729 ftrl.lr_power = self.learning_rate_power 

730 ftrl.beta = self.beta 

731 ftrl.multiply_linear_by_lr = self.multiply_linear_by_learning_rate 

732 ftrl.allow_zero_accumulator = self.allow_zero_accumulator 

733 

734 def _load(self) -> Callable[..., ops.Operation]: 

735 return tpu_ops.load_tpu_embedding_ftrl_parameters 

736 

737 def _retrieve(self) -> Callable[..., core.Tensor]: 

738 return tpu_ops.retrieve_tpu_embedding_ftrl_parameters 

739 

740 

741@tf_export("tpu.experimental.embedding.Adam") 

742class Adam(_Optimizer): 

743 """Optimization parameters for Adam with TPU embeddings. 

744 

745 Pass this to `tf.tpu.experimental.embedding.TPUEmbedding` via the `optimizer` 

746 argument to set the global optimizer and its parameters: 

747 

748 NOTE: By default this optimizer is lazy, i.e. it will not apply the gradient 

749 update of zero to rows that were not looked up. You can change this behavior 

750 by setting `lazy_adam` to `False`. 

751 

752 ```python 

753 embedding = tf.tpu.experimental.embedding.TPUEmbedding( 

754 ... 

755 optimizer=tf.tpu.experimental.embedding.Adam(0.1)) 

756 ``` 

757 

758 This can also be used in a `tf.tpu.experimental.embedding.TableConfig` as the 

759 optimizer parameter to set a table specific optimizer. This will override the 

760 optimizer and parameters for the global embedding optimizer defined above:

761 

762 ```python 

763 table_one = tf.tpu.experimental.embedding.TableConfig( 

764 vocabulary_size=..., 

765 dim=..., 

766 optimizer=tf.tpu.experimental.embedding.Adam(0.2)) 

767 table_two = tf.tpu.experimental.embedding.TableConfig( 

768 vocabulary_size=..., 

769 dim=...) 

770 

771 feature_config = ( 

772 tf.tpu.experimental.embedding.FeatureConfig( 

773 table=table_one), 

774 tf.tpu.experimental.embedding.FeatureConfig( 

775 table=table_two)) 

776 

777 embedding = tf.tpu.experimental.embedding.TPUEmbedding( 

778 feature_config=feature_config, 

779 batch_size=... 

780 optimizer=tf.tpu.experimental.embedding.Adam(0.1)) 

781 ``` 

782 

783 In the above example, the first feature will be looked up in a table that has 

784 a learning rate of 0.2 while the second feature will be looked up in a table 

785 that has a learning rate of 0.1. 

786 

787 See 'tensorflow/core/protobuf/tpu/optimization_parameters.proto' for a 

788 complete description of these parameters and their impacts on the optimizer 

789 algorithm. 

790 """ 

791 

792 def __init__( 

793 self, 

794 learning_rate: Union[float, Callable[[], float]] = 0.001, 

795 beta_1: float = 0.9, 

796 beta_2: float = 0.999, 

797 epsilon: float = 1e-07, 

798 lazy_adam: bool = True, 

799 sum_inside_sqrt: bool = True, 

800 use_gradient_accumulation: bool = True, 

801 clip_weight_min: Optional[float] = None, 

802 clip_weight_max: Optional[float] = None, 

803 weight_decay_factor: Optional[float] = None, 

804 multiply_weight_decay_factor_by_learning_rate: bool = None, 

805 slot_variable_creation_fn: Optional[SlotVarCreationFnType] = None, 

806 clipvalue: Optional[ClipValueType] = None, 

807 low_dimensional_packing_status: bool = False, 

808 ): 

809 """Optimization parameters for Adam. 

810 

811 See 'tensorflow/core/protobuf/tpu/optimization_parameters.proto' for a 

812 complete description of these parameters and their impacts on the optimizer 

813 algorithm. 

814 

815 Args: 

816 learning_rate: The learning rate. It should be a floating point value or a 

817 callable taking no arguments for a dynamic learning rate. 

818 beta_1: A float value. The exponential decay rate for the 1st moment 

819 estimates. 

820 beta_2: A float value. The exponential decay rate for the 2nd moment 

821 estimates. 

822 epsilon: A small constant for numerical stability. 

823 lazy_adam: Use lazy Adam instead of Adam. Lazy Adam trains faster. 

824 sum_inside_sqrt: When this is true, the Adam update formula is changed 

825 from `m / (sqrt(v) + epsilon)` to `m / sqrt(v + epsilon**2)`. This 

826 option improves the performance of TPU training and is not expected to 

827 harm model quality. 

828 use_gradient_accumulation: Setting this to `False` makes embedding 

829 gradient calculation less accurate but faster.

830 clip_weight_min: the minimum value to clip by; None means -infinity. 

831 clip_weight_max: the maximum value to clip by; None means +infinity. 

832 weight_decay_factor: amount of weight decay to apply; None means that the 

833 weights are not decayed. 

834 multiply_weight_decay_factor_by_learning_rate: if true, 

835 `weight_decay_factor` is multiplied by the current learning rate. 

836 slot_variable_creation_fn: If you wish to directly control the creation of

837 the slot variables, set this to a callable taking three parameters: a 

838 table variable, a list of slot names to create for it, and a list of 

839 initializers. This function should return a dict with the slot names as 

840 keys and the created variables as values with types matching the table 

841 variable. When set to None (the default), uses the built-in variable 

842 creation. 

843 clipvalue: Controls clipping of the gradient. Set to either a single 

844 positive scalar value to get clipping or a tuple of scalar values (min,

845 max) to set a separate maximum or minimum. If one of the two entries is

846 None, then there will be no clipping in that direction.

847 low_dimensional_packing_status: Status of the low-dimensional embedding 

848 packing optimization controls whether to optimize the packing of 

849 1-dimensional, 2-dimensional, and 4-dimensional embedding tables in 

850 memory. 

851 """ 

852 super(Adam, self).__init__( 

853 learning_rate, 

854 use_gradient_accumulation, 

855 clip_weight_min, 

856 clip_weight_max, 

857 weight_decay_factor, 

858 multiply_weight_decay_factor_by_learning_rate, 

859 clipvalue, 

860 slot_variable_creation_fn, 

861 low_dimensional_packing_status, 

862 ) 

863 if beta_1 < 0. or beta_1 >= 1.: 

864 raise ValueError( 

865 f"Argument `beta_1` must be >= 0 and < 1. Received: {beta_1}.") 

866 if beta_2 < 0. or beta_2 >= 1.: 

867 raise ValueError( 

868 f"Argument `beta_2` must be >= 0 and < 1. Received: {beta_1}.") 

869 if epsilon <= 0.: 

870 raise ValueError("epsilon must be positive; got {}.".format(epsilon)) 

871 if not use_gradient_accumulation and not lazy_adam: 

872 raise ValueError( 

873 "When disabling lazy Adam (`lazy_adam=False`), " 

874 "gradient accumulation must be used. " 

875 "Set `use_gradient_accumulation` to False.") 

876 

877 self.beta_1 = beta_1 

878 self.beta_2 = beta_2 

879 self.epsilon = epsilon 

880 self.lazy_adam = lazy_adam 

881 self.sum_inside_sqrt = sum_inside_sqrt 

882 

883 def _slot_names(self) -> List[Text]: 

884 return ["momenta", "velocities"] 

885 

886 def _slot_initializers(self) -> List[init_ops_v2.Initializer]: 

887 return [init_ops_v2.Constant(), init_ops_v2.Constant()] 

888 

889 def _set_optimization_parameters( 

890 self, parameters: optimization_parameters_pb2.OptimizationParameters): 

891 super(Adam, self)._set_optimization_parameters(parameters) 

892 parameters.adam.beta1 = self.beta_1 

893 parameters.adam.beta2 = self.beta_2 

894 parameters.adam.epsilon = self.epsilon 

895 parameters.adam.use_non_lazy_adam = not self.lazy_adam 

896 parameters.adam.use_sum_inside_sqrt = self.sum_inside_sqrt 

897 

898 def _load(self) -> Callable[..., ops.Operation]: 

899 return tpu_ops.load_tpu_embedding_adam_parameters 

900 

901 def _retrieve(self) -> Callable[..., core.Tensor]: 

902 return tpu_ops.retrieve_tpu_embedding_adam_parameters 

903 

904 
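# Illustrative sketch, not part of the original module: the two denominator
# forms referred to by the `sum_inside_sqrt` argument of Adam above, written
# out for a scalar second-moment estimate `v` so they are easy to compare. The
# helper name is an assumption for illustration only.
def _example_adam_denominator(v: float, epsilon: float,
                              sum_inside_sqrt: bool) -> float:
  if sum_inside_sqrt:
    # `sum_inside_sqrt=True`: update uses m / sqrt(v + epsilon**2).
    return math.sqrt(v + epsilon**2)
  # `sum_inside_sqrt=False`: update uses m / (sqrt(v) + epsilon).
  return math.sqrt(v) + epsilon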

905@tf_export("tpu.experimental.embedding.QuantizationConfig") 

906class QuantizationConfig: 

907 """Settings for simulated quantization of the tpu embedding table. 

908 

909 When simulated quantization is enabled, the results of the embedding lookup 

910 are clipped and quantized according to the settings here before the combiner 

911 is applied. 

912 

913 For example, to quantize `input` the following is done: 

914 ```python 

915 if input < lower:

916   input = lower

917 if input > upper:

918   input = upper

919 quantum = (upper - lower) / (num_buckets - 1)

920 input = math.floor((input - lower) / quantum + 0.5) * quantum + lower

921 ``` 

922 

923 See tensorflow/core/protobuf/tpu/optimization_parameters.proto for more 

924 details. 

925 

926 NOTE: This does not change the storage type of the embedding table, which will

927 continue to be float32, as will the saved variable in the checkpoint. You will

928 have to quantize the variable yourself (typically with the same algorithm and

929 settings as above).

930 """ 

931 

932 def __init__(self, num_buckets: int, lower: float, upper: float): 

933 """Simulated quantizaiton configuration. 

934 

935 Args: 

936 num_buckets: The number of quantization buckets, must be at least 2.

937 lower: The lower bound for the quantization range. 

938 upper: The upper bound for the quantization range. 

939 

940 Returns: 

941 `QuantizationConfig`. 

942 

943 Raises: 

944 ValueError: if `num_buckets` is less than 2. 

945 """ 

946 if num_buckets < 2: 

947 raise ValueError(f"num_buckets is {num_buckets}, must be at least 2 for " 

948 f"simulated quantization.") 

949 

950 self.num_buckets = num_buckets 

951 self.lower = lower 

952 self.upper = upper 

953 

954 def _set_optimization_parameters( 

955 self, parameters: optimization_parameters_pb2.OptimizationParameters): 

956 parameters.simulated_quantization.enabled = True 

957 parameters.simulated_quantization.num_buckets = self.num_buckets 

958 parameters.simulated_quantization.clipping_limits.lower.value = self.lower 

959 parameters.simulated_quantization.clipping_limits.upper.value = self.upper 

960 

961 def __repr__(self): 

962 return ("QuantizationConfig(num_buckets={num_buckets!r}, lower={lower!r}, " 

963 "upper={upper!r})".format( 

964 num_buckets=self.num_buckets, 

965 lower=self.lower, 

966 upper=self.upper)) 

967 

968 
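# Illustrative sketch, not part of the original module: a pure-Python version
# of the simulated quantization formula from the QuantizationConfig docstring
# above; the helper name is an assumption for illustration only.
def _example_simulated_quantize(value: float, num_buckets: int, lower: float,
                                upper: float) -> float:
  value = min(max(value, lower), upper)
  quantum = (upper - lower) / (num_buckets - 1)
  return math.floor((value - lower) / quantum + 0.5) * quantum + lower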

969@tf_export("tpu.experimental.embedding.TableConfig") 

970class TableConfig: 

971 """Configuration data for one embedding table. 

972 

973 This class holds the configuration data for a single embedding table. It is 

974 used as the `table` parameter of a 

975 `tf.tpu.experimental.embedding.FeatureConfig`. Multiple 

976 `tf.tpu.experimental.embedding.FeatureConfig` objects can use the same 

977 `tf.tpu.experimental.embedding.TableConfig` object. In this case a shared 

978 table will be created for those feature lookups. 

979 

980 ```python 

981 table_config_one = tf.tpu.experimental.embedding.TableConfig( 

982 vocabulary_size=..., 

983 dim=...) 

984 table_config_two = tf.tpu.experimental.embedding.TableConfig( 

985 vocabulary_size=..., 

986 dim=...) 

987 feature_config = { 

988 'feature_one': tf.tpu.experimental.embedding.FeatureConfig( 

989 table=table_config_one), 

990 'feature_two': tf.tpu.experimental.embedding.FeatureConfig( 

991 table=table_config_one), 

992 'feature_three': tf.tpu.experimental.embedding.FeatureConfig( 

993 table=table_config_two)} 

994 embedding = tf.tpu.experimental.embedding.TPUEmbedding( 

995 feature_config=feature_config, 

996 batch_size=... 

997 optimizer=tf.tpu.experimental.embedding.Adam(0.1)) 

998 ``` 

999 

1000 The above configuration has 2 tables, and three features. The first two 

1001 features will be looked up in the first table and the third feature will be 

1002 looked up in the second table. 

1003 

1004 """ 

1005 

1006 def __init__(self, 

1007 vocabulary_size: int, 

1008 dim: int, 

1009 initializer: Optional[Callable[[Any], None]] = None, 

1010 optimizer: Optional[_Optimizer] = None, 

1011 combiner: Text = "mean", 

1012 name: Optional[Text] = None, 

1013 quantization_config: QuantizationConfig = None): 

1014 """Embedding table configuration. 

1015 

1016 Args: 

1017 vocabulary_size: Size of the table's vocabulary (number of rows). 

1018 dim: The embedding dimension (width) of the table. 

1019 initializer: A callable initializer taking one parameter, the shape of the 

1020 variable that will be initialized. Will be called once per task, to 

1021 initialize that task's shard of the embedding table. If not specified, 

1022 defaults to `truncated_normal_initializer` with mean `0.0` and standard 

1023 deviation `1/sqrt(dim)`. 

1024 optimizer: An optional instance of an optimizer parameters class, instance 

1025 of one of `tf.tpu.experimental.embedding.SGD`, 

1026 `tf.tpu.experimental.embedding.Adagrad` or 

1027 `tf.tpu.experimental.embedding.Adam`. If set will override the global 

1028 optimizer passed to `tf.tpu.experimental.embedding.TPUEmbedding`. 

1029 combiner: A string specifying how to reduce if there are multiple entries 

1030 in a single row. Currently 'mean', 'sqrtn', 'sum' are supported, with 

1031 'mean' the default. 'sqrtn' often achieves good accuracy, in particular 

1032 with bag-of-words columns. For more information, see 

1033 `tf.nn.embedding_lookup_sparse`. 

1034 name: An optional string used to name the table. Useful for debugging. 

1035 quantization_config: The simulated quantization config. An instance of 

1036 `tf.tpu.experimental.embedding.QuantizationConfig`. See the class for 

1037 more documentation. 

1038 

1039 Returns: 

1040 `TableConfig`. 

1041 

1042 Raises: 

1043 ValueError: if `vocabulary_size` is not a positive integer. 

1044 ValueError: if `dim` is not a positive integer. 

1045 ValueError: if `initializer` is specified and is not callable. 

1046 ValueError: if `combiner` is not supported. 

1047 """ 

1048 if not isinstance(vocabulary_size, int) or vocabulary_size < 1: 

1049 raise ValueError( 

1050 f"Argument `vocabulary_size` must be an int and must be >= 1. " 

1051 f"Received: {vocabulary_size}") 

1052 

1053 if not isinstance(dim, int) or dim < 1: 

1054 raise ValueError( 

1055 f"Argument `dim` (embedding dimension) " 

1056 f"must be an int and must be >= 1. Received: {dim}") 

1057 

1058 if (initializer is not None) and (not callable(initializer)): 

1059 raise ValueError( 

1060 f"Argument `initializer` must be a callable (or None). " 

1061 f"Received: {initializer}") 

1062 if initializer is None: 

1063 initializer = init_ops_v2.TruncatedNormal(mean=0.0, 

1064 stddev=1/math.sqrt(dim)) 

1065 accepted_combiners = ("mean", "sum", "sqrtn") 

1066 if combiner not in accepted_combiners: 

1067 raise ValueError( 

1068 f"Argument `combiner` must be one of {accepted_combiners}. " 

1069 f"Received: {combiner}") 

1070 

1071 self.vocabulary_size = vocabulary_size 

1072 self.dim = dim 

1073 self.initializer = initializer 

1074 self.optimizer = optimizer 

1075 self.combiner = combiner 

1076 self.name = name 

1077 self.quantization_config = quantization_config 

1078 

1079 def __repr__(self): 

1080 # If using the default initializer, just print "None" for clarity. 

1081 initializer = self.initializer 

1082 

1083 if isinstance(initializer, init_ops_v2.TruncatedNormal): 

1084 # PY2 type checking can't infer type of initializer even after if. 

1085 initializer = typing.cast(init_ops_v2.TruncatedNormal, initializer) 

1086 if (initializer.mean == 0.0 

1087 and math.isclose(initializer.stddev, 1/math.sqrt(self.dim))): 

1088 initializer = None 

1089 

1090 return ("TableConfig(vocabulary_size={vocabulary_size!r}, dim={dim!r}, " 

1091 "initializer={initializer!r}, optimizer={optimizer!r}, " 

1092 "combiner={combiner!r}, name={name!r}, " 

1093 "quantization_config={quantization!r})".format( 

1094 vocabulary_size=self.vocabulary_size, 

1095 dim=self.dim, 

1096 initializer=initializer, 

1097 optimizer=self.optimizer, 

1098 combiner=self.combiner, 

1099 name=self.name, 

1100 quantization=self.quantization_config, 

1101 )) 

1102 

1103 def _set_table_descriptor( 

1104 self, 

1105 table_descriptor: tpu_embedding_configuration_pb2 

1106 .TPUEmbeddingConfiguration.TableDescriptor, 

1107 num_hosts: int, 

1108 learning_rate_index: Dict[Callable[[], Any], int]): 

1109 """Set the table descriptor from the table data.""" 

1110 table_descriptor.name = self.name 

1111 

1112 # For small tables, we pad to the number of hosts so that at least one 

1113 # id will be assigned to each host. 

1114 table_descriptor.vocabulary_size = max(self.vocabulary_size, num_hosts) 

1115 table_descriptor.dimension = self.dim 

1116 

1117 parameters = table_descriptor.optimization_parameters 

1118 

1119 # We handle the learning rate separately here and don't allow the 

1120 # optimization class to handle this, as it doesn't know about dynamic 

1121 # rates. 

1122 if callable(self.optimizer.learning_rate): 

1123 parameters.learning_rate.dynamic.tag = ( 

1124 learning_rate_index[self.optimizer.learning_rate]) 

1125 else: 

1126 parameters.learning_rate.constant = self.optimizer.learning_rate 

1127 

1128 if self.optimizer.low_dimensional_packing_status: 

1129 parameters.low_dimensional_packing_status = ( 

1130 optimization_parameters_pb2.LowDimensionalPackingStatus.Status.ENABLED 

1131 ) 

1132 # Use optimizer to handle the rest of the parameters. 

1133 self.optimizer._set_optimization_parameters(parameters) # pylint: disable=protected-access 

1134 if self.quantization_config: 

1135 self.quantization_config._set_optimization_parameters(parameters) # pylint: disable=protected-access 

1136 

1137 
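# Illustrative sketch, not part of the original module: a custom `initializer`
# for TableConfig is a callable taking the shape of the shard to initialize,
# for example one of the `init_ops_v2` initializers wrapped in a function. The
# helper name and the uniform range are assumptions for illustration only.
def _example_uniform_table_initializer(shape):
  return init_ops_v2.RandomUniform(minval=-0.05, maxval=0.05)(shape)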

1138@tf_export("tpu.experimental.embedding.FeatureConfig") 

1139class FeatureConfig: 

1140 """Configuration data for one embedding feature. 

1141 

1142 This class holds the configuration data for a single embedding feature. The 

1143 main use is to assign features to `tf.tpu.experimental.embedding.TableConfig`s 

1144 via the table parameter: 

1145 

1146 ```python 

1147 table_config_one = tf.tpu.experimental.embedding.TableConfig( 

1148 vocabulary_size=..., 

1149 dim=...) 

1150 table_config_two = tf.tpu.experimental.embedding.TableConfig( 

1151 vocabulary_size=..., 

1152 dim=...) 

1153 feature_config = { 

1154 'feature_one': tf.tpu.experimental.embedding.FeatureConfig( 

1155 table=table_config_one), 

1156 'feature_two': tf.tpu.experimental.embedding.FeatureConfig( 

1157 table=table_config_one), 

1158 'feature_three': tf.tpu.experimental.embedding.FeatureConfig( 

1159 table=table_config_two)} 

1160 embedding = tf.tpu.experimental.embedding.TPUEmbedding( 

1161 feature_config=feature_config, 

1162 batch_size=... 

1163 optimizer=tf.tpu.experimental.embedding.Adam(0.1)) 

1164 ``` 

1165 

1166 The above configuration has 2 tables, and three features. The first two 

1167 features will be looked up in the first table and the third feature will be 

1168 looked up in the second table. 

1169 

1170 You can also specify the output shape for each feature. The output shape 

1171 should be the expected activation shape excluding the table dimension. For 

1172 dense and sparse tensors, the output shape should be the same as the input

1173 shape excluding the last dimension. For ragged tensors, the output shape can

1174 differ from the input shape.

1175 

1176 NOTE: The `max_sequence_length` will only be used when the input tensor has

1177 rank 2 and the `output_shape` is not set in the feature config. 

1178 

1179 When feeding features into `embedding.enqueue` they can be `tf.Tensor`s, 

1180 `tf.SparseTensor`s or `tf.RaggedTensor`s. When the argument 

1181 `max_sequence_length` is 0, the default, you should expect an output of

1182 `embedding.dequeue` for this feature of shape `(batch_size, dim)`. If 

1183 `max_sequence_length` is greater than 0, the feature is embedded as a sequence 

1184 and padded up to the given length. The shape of the output for this feature 

1185 will be `(batch_size, max_sequence_length, dim)`. 

1186 """ 

1187 

1188 def __init__(self, 

1189 table: TableConfig, 

1190 max_sequence_length: int = 0, 

1191 validate_weights_and_indices: bool = True, 

1192 output_shape: Optional[Union[List[int], TensorShape]] = None, 

1193 name: Optional[Text] = None): 

1194 """Feature configuration. 

1195 

1196 Args: 

1197 table: An instance of `tf.tpu.experimental.embedding.TableConfig`, 

1198 describing the table in which this feature should be looked up. 

1199 max_sequence_length: If positive, the feature is a sequence feature with 

1200 the corresponding maximum sequence length. If the sequence is longer 

1201 than this, it will be truncated. If 0, the feature is not a sequence 

1202 feature. 

1203 validate_weights_and_indices: If true, uses safe_embedding_lookup during 

1204 serving which ensures there are no empty rows and all weights and ids 

1205 are positive at the expense of extra compute cost. 

1206 output_shape: Optional argument to configure the output shape of the feature

1207 activation. If provided, the feature fed to `embedding.enqueue` has to

1208 match this shape (for ragged tensors, the input and output shapes can

1209 differ). If not provided, the shape can either be provided to

1210 `embedding.build` or auto-detected at runtime.

1211 name: An optional name for the feature, useful for debugging. 

1212 

1213 Returns: 

1214 `FeatureConfig`. 

1215 

1216 Raises: 

1217 ValueError: if `table` is not an instance of 

1218 `tf.tpu.experimental.embedding.TableConfig`. 

1219 ValueError: if `max_sequence_length` not an integer or is negative. 

1220 """ 

1221 if not isinstance(table, TableConfig): 

1222 raise ValueError(f"Argument `table` has invalid type {type(table)}. " 

1223 "Expected `tf.tpu.experimental.embedding.TableConfig`.") 

1224 

1225 if not isinstance(max_sequence_length, int) or max_sequence_length < 0: 

1226 raise ValueError( 

1227 f"Argument `max_sequence_length` must be an int and must be >= 0. " 

1228 f"Received: {max_sequence_length}") 

1229 

1230 self.table = table 

1231 self.max_sequence_length = max_sequence_length 

1232 self.name = name 

1233 self.output_shape = TensorShape(output_shape) 

1234 

1235 if not isinstance( 

1236 validate_weights_and_indices, bool): 

1237 raise ValueError( 

1238 f"Argument `validate_weights_and_indices` must be a boolean. " 

1239 f"Received: {validate_weights_and_indices}") 

1240 

1241 self.validate_weights_and_indices = validate_weights_and_indices 

1242 

1243 def __repr__(self): 

1244 return ("FeatureConfig(table={table!r}, " 

1245 "max_sequence_length={max_sequence_length!r}, " 

1246 "validate_weights_and_indices={validate_weights_and_indices!r}, " 

1247 "output_shape={output_shape!r}, name={name!r})".format( 

1248 table=self.table, 

1249 max_sequence_length=self.max_sequence_length, 

1250 validate_weights_and_indices=self.validate_weights_and_indices, 

1251 output_shape=self.output_shape, 

1252 name=self.name)) 

1253 

1254 

1255def log_tpu_embedding_configuration( 

1256 config: tpu_embedding_configuration_pb2.TPUEmbeddingConfiguration) -> None: 

1257 """Logs a TPUEmbeddingConfiguration proto across multiple statements. 

1258 

1259 Args: 

1260 config: TPUEmbeddingConfiguration proto to log. Necessary because 

1261 logging.info has a maximum length to each log statement, which 

1262 particularly large configs can exceed. 

1263 """ 

1264 logging.info("Beginning log of TPUEmbeddingConfiguration.") 

1265 for line in str(config).splitlines(): 

1266 logging.info(line) 

1267 logging.info("Done with log of TPUEmbeddingConfiguration.") 

1268 

1269 

1270def _sort_device_spec_strings(device_strings: Iterable[str]) -> List[str]: 
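# Sort device specs by (replica, task, device_index) so the resulting host
# order is deterministic; get_list_of_hosts below relies on this ordering.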

1271 sorted_specs = sorted( 

1272 (device_spec.DeviceSpecV2.from_string(spec) for spec in device_strings), 

1273 key=lambda s: (s.replica, s.task, s.device_index), 

1274 ) 

1275 return [spec.to_string() for spec in sorted_specs] 

1276 

1277 

1278def get_list_of_hosts(strategy: tpu_strategy.TPUStrategy) -> List[Text]: 

1279 """Returns a sorted list of CPU devices for the remote jobs. 

1280 

1281 Args: 

1282 strategy: A TPUStrategy object. 

1283 

1284 Returns: 

1285 A sorted list of device host strings. 

1286 """ 

1287 

1288 list_of_hosts = [] 

1289 # Elsewhere we assume that the list of hosts is sorted.

1290 for tpu_device in _sort_device_spec_strings(strategy.extended.worker_devices): 

1291 host = device_util.get_host_for_device(tpu_device) 

1292 if host not in list_of_hosts: 

1293 list_of_hosts.append(host) 

1294 assert len(list_of_hosts) == strategy.extended.num_hosts 

1295 return list_of_hosts