Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/tensorflow/python/ops/nn_impl.py: 29%
431 statements
coverage.py v7.4.0, created at 2024-01-03 07:57 +0000
1# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14# =============================================================================
15"""Implementation of Neural Net (NN) functions."""
17import math
19from tensorflow.python.distribute import distribute_lib
20from tensorflow.python.framework import constant_op
21from tensorflow.python.framework import dtypes
22from tensorflow.python.framework import ops
23from tensorflow.python.ops import array_ops
24from tensorflow.python.ops import array_ops_stack
25from tensorflow.python.ops import candidate_sampling_ops
26from tensorflow.python.ops import check_ops
27from tensorflow.python.ops import cond as tf_cond
28from tensorflow.python.ops import custom_gradient
29from tensorflow.python.ops import embedding_ops
30from tensorflow.python.ops import gen_array_ops # pylint: disable=unused-import
31from tensorflow.python.ops import gen_nn_ops
32from tensorflow.python.ops import gen_sparse_ops
33from tensorflow.python.ops import linalg_ops
34from tensorflow.python.ops import math_ops
35from tensorflow.python.ops import nn_ops
36from tensorflow.python.ops import variables
37from tensorflow.python.ops.losses import util as losses_util
38from tensorflow.python.platform import device_context
39from tensorflow.python.util import dispatch
40from tensorflow.python.util.deprecation import deprecated_args
41from tensorflow.python.util.deprecation import deprecated_argument_lookup
42from tensorflow.python.util.tf_export import tf_export
45@tf_export("nn.log_poisson_loss")
46@dispatch.add_dispatch_support
47def log_poisson_loss(targets, log_input, compute_full_loss=False, name=None):
48 """Computes log Poisson loss given `log_input`.
50 Gives the log-likelihood loss between the prediction and the target under the
51 assumption that the target has a Poisson distribution.
52 Caveat: By default, this is not the exact loss, but the loss minus a
53 constant term [log(z!)]. That has no effect for optimization, but
54 does not play well with relative loss comparisons. To compute an
55 approximation of the log factorial term, specify
56 compute_full_loss=True to enable Stirling's Approximation.
58 For brevity, let `c = log(x) = log_input`, `z = targets`. The log Poisson
59 loss is
61 -log(exp(-x) * (x^z) / z!)
62 = -log(exp(-x) * (x^z)) + log(z!)
63 ~ -log(exp(-x)) - log(x^z) [+ z * log(z) - z + 0.5 * log(2 * pi * z)]
64 [ Note the second term is the Stirling's Approximation for log(z!).
65 It is invariant to x and does not affect optimization, though
66 important for correct relative loss comparisons. It is only
67 computed when compute_full_loss == True. ]
68 = x - z * log(x) [+ z * log(z) - z + 0.5 * log(2 * pi * z)]
69 = exp(c) - z * c [+ z * log(z) - z + 0.5 * log(2 * pi * z)]
71 Args:
72 targets: A `Tensor` of the same type and shape as `log_input`.
73 log_input: A `Tensor` of type `float32` or `float64`.
74 compute_full_loss: whether to compute the full loss. If false, a constant
75 term is dropped in favor of more efficient optimization.
76 name: A name for the operation (optional).
78 Returns:
79 A `Tensor` of the same shape as `log_input` with the componentwise
80 log Poisson losses.
82 Raises:
83 ValueError: If `log_input` and `targets` do not have the same shape.
84 """
85 with ops.name_scope(name, "log_poisson_loss", [log_input, targets]) as name:
86 log_input = ops.convert_to_tensor(log_input, name="log_input")
87 targets = ops.convert_to_tensor(targets, name="targets")
88 try:
89 targets.get_shape().assert_is_compatible_with(log_input.get_shape())
90 except ValueError:
91 raise ValueError(
92 "`log_input` and `targets` must have the same shape, received "
93 f"({log_input.get_shape()} vs {targets.get_shape()}).")
95 result = math_ops.exp(log_input) - log_input * targets
96 if compute_full_loss:
97 # need to create constant tensors here so that their dtypes can be matched
98 # to that of the targets.
99 point_five = constant_op.constant(0.5, dtype=targets.dtype)
100 two_pi = constant_op.constant(2 * math.pi, dtype=targets.dtype)
102 stirling_approx = (targets * math_ops.log(targets)) - targets + (
103 point_five * math_ops.log(two_pi * targets))
104 zeros = array_ops.zeros_like(targets, dtype=targets.dtype)
105 ones = array_ops.ones_like(targets, dtype=targets.dtype)
106 cond = math_ops.logical_and(targets >= zeros, targets <= ones)
107 result += array_ops.where(cond, zeros, stirling_approx)
108 return result
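# --- Hedged usage sketch (illustration only, not part of the original module).
# Shows the difference between the default partial loss exp(c) - z * c and the
# full loss that adds Stirling's approximation of log(z!). The helper name
# `_example_log_poisson_loss` is hypothetical; assumes TensorFlow 2.x eager mode.
def _example_log_poisson_loss():
  import tensorflow as tf  # local import keeps the sketch self-contained
  targets = tf.constant([0., 1., 4.])        # observed counts z
  log_input = tf.constant([0.2, 0.5, 1.3])   # c = log of the predicted rate
  partial = tf.nn.log_poisson_loss(targets, log_input)
  full = tf.nn.log_poisson_loss(targets, log_input, compute_full_loss=True)
  # The two differ only by the Stirling term, which depends on `targets` alone
  # and therefore does not change gradients with respect to `log_input`.
  return partial, full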
111@tf_export(v1=["nn.sigmoid_cross_entropy_with_logits"])
112@dispatch.add_dispatch_support
113def sigmoid_cross_entropy_with_logits(
114 labels=None,
115 logits=None,
116 name=None):
117 """See sigmoid_cross_entropy_with_logits_v2."""
118 # pylint: disable=protected-access
119 nn_ops._ensure_xent_args("sigmoid_cross_entropy_with_logits", labels, logits)
120 # pylint: enable=protected-access
122 with ops.name_scope(name, "logistic_loss", [logits, labels]) as name:
123 logits = ops.convert_to_tensor(logits, name="logits")
124 labels = ops.convert_to_tensor(labels, name="labels")
125 try:
126 labels.get_shape().assert_is_compatible_with(logits.get_shape())
127 except ValueError:
128 raise ValueError("`logits` and `labels` must have the same shape, "
129 f"received ({logits.get_shape()} vs "
130 f"{labels.get_shape()}).")
132 # The logistic loss formula from above is
133 # x - x * z + log(1 + exp(-x))
134 # For x < 0, a more numerically stable formula is
135 # -x * z + log(1 + exp(x))
136 # Note that these two expressions can be combined into the following:
137 # max(x, 0) - x * z + log(1 + exp(-abs(x)))
138 # To allow computing gradients at zero, we define custom versions of max and
139 # abs functions.
140 zeros = array_ops.zeros_like(logits, dtype=logits.dtype)
141 cond = (logits >= zeros)
142 relu_logits = array_ops.where(cond, logits, zeros)
143 neg_abs_logits = array_ops.where(cond, -logits, logits) # pylint: disable=invalid-unary-operand-type
144 return math_ops.add(
145 relu_logits - logits * labels,
146 math_ops.log1p(math_ops.exp(neg_abs_logits)),
147 name=name)
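# --- Hedged verification sketch (illustration only, not part of the module).
# For moderate logits the stable form max(x, 0) - x * z + log(1 + exp(-|x|))
# used above matches the textbook loss z * -log(sigmoid(x)) +
# (1 - z) * -log(1 - sigmoid(x)). Hypothetical helper name; TF 2.x eager mode.
def _example_check_logistic_loss():
  import tensorflow as tf
  x = tf.constant([-2.0, -0.5, 0.0, 0.5, 2.0])   # logits
  z = tf.constant([0.0, 1.0, 0.5, 1.0, 0.0])     # labels in [0, 1]
  stable = tf.nn.sigmoid_cross_entropy_with_logits(labels=z, logits=x)
  naive = -(z * tf.math.log(tf.sigmoid(x)) +
            (1.0 - z) * tf.math.log(1.0 - tf.sigmoid(x)))
  tf.debugging.assert_near(stable, naive, atol=1e-4)
  return stable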
150# Note: intentionally calling this v2 to not allow existing code with indirect
151# imports to ignore the sentinel behavior.
152@tf_export("nn.sigmoid_cross_entropy_with_logits", v1=[])
153@dispatch.register_binary_elementwise_api
154@dispatch.add_dispatch_support
155def sigmoid_cross_entropy_with_logits_v2( # pylint: disable=invalid-name
156 labels=None,
157 logits=None,
158 name=None):
159 r"""Computes sigmoid cross entropy given `logits`.
161 Measures the probability error in tasks with two outcomes in which each
162 outcome is independent and need not have a fully certain label. For instance,
163 one could perform a regression where the probability of an event happening is
164 known and used as a label. This loss may also be used for binary
165 classification, where labels are either zero or one.
167 For brevity, let `x = logits`, `z = labels`. The logistic loss is
169 z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
170 = z * -log(1 / (1 + exp(-x))) + (1 - z) * -log(exp(-x) / (1 + exp(-x)))
171 = z * log(1 + exp(-x)) + (1 - z) * (-log(exp(-x)) + log(1 + exp(-x)))
172 = z * log(1 + exp(-x)) + (1 - z) * (x + log(1 + exp(-x)))
173 = (1 - z) * x + log(1 + exp(-x))
174 = x - x * z + log(1 + exp(-x))
176 For x < 0, to avoid overflow in exp(-x), we reformulate the above
178 x - x * z + log(1 + exp(-x))
179 = log(exp(x)) - x * z + log(1 + exp(-x))
180 = - x * z + log(1 + exp(x))
182 Hence, to ensure stability and avoid overflow, the implementation uses this
183 equivalent formulation
185 max(x, 0) - x * z + log(1 + exp(-abs(x)))
187 `logits` and `labels` must have the same type and shape.
189 >>> logits = tf.constant([1., -1., 0., 1., -1., 0., 0.])
190 >>> labels = tf.constant([0., 0., 0., 1., 1., 1., 0.5])
191 >>> tf.nn.sigmoid_cross_entropy_with_logits(
192 ... labels=labels, logits=logits).numpy()
193 array([1.3132617, 0.3132617, 0.6931472, 0.3132617, 1.3132617, 0.6931472,
194 0.6931472], dtype=float32)
196 Compared to the losses which handle multiple outcomes,
197 `tf.nn.softmax_cross_entropy_with_logits` for general multi-class
198 classification and `tf.nn.sparse_softmax_cross_entropy_with_logits` for more
199 efficient multi-class classification with hard labels,
200 `sigmoid_cross_entropy_with_logits` is a slight simplification for binary
201 classification:
203 sigmoid(x) = softmax([x, 0])[0]
205 $$\frac{1}{1 + e^{-x}} = \frac{e^x}{e^x + e^0}$$
207 While `sigmoid_cross_entropy_with_logits` works for soft binary labels
208 (probabilities between 0 and 1), it can also be used for binary classification
209 where the labels are hard. There is an equivalence between all three symbols
210 in this case, with a probability 0 indicating the second class or 1 indicating
211 the first class:
213 >>> sigmoid_logits = tf.constant([1., -1., 0.])
214 >>> softmax_logits = tf.stack([sigmoid_logits, tf.zeros_like(sigmoid_logits)],
215 ... axis=-1)
216 >>> soft_binary_labels = tf.constant([1., 1., 0.])
217 >>> soft_multiclass_labels = tf.stack(
218 ... [soft_binary_labels, 1. - soft_binary_labels], axis=-1)
219 >>> hard_labels = tf.constant([0, 0, 1])
220 >>> tf.nn.sparse_softmax_cross_entropy_with_logits(
221 ... labels=hard_labels, logits=softmax_logits).numpy()
222 array([0.31326166, 1.3132616 , 0.6931472 ], dtype=float32)
223 >>> tf.nn.softmax_cross_entropy_with_logits(
224 ... labels=soft_multiclass_labels, logits=softmax_logits).numpy()
225 array([0.31326166, 1.3132616, 0.6931472], dtype=float32)
226 >>> tf.nn.sigmoid_cross_entropy_with_logits(
227 ... labels=soft_binary_labels, logits=sigmoid_logits).numpy()
228 array([0.31326166, 1.3132616, 0.6931472], dtype=float32)
230 Args:
231 labels: A `Tensor` of the same type and shape as `logits`. Between 0 and 1,
232 inclusive.
233 logits: A `Tensor` of type `float32` or `float64`. Any real number.
234 name: A name for the operation (optional).
236 Returns:
237 A `Tensor` of the same shape as `logits` with the componentwise
238 logistic losses.
240 Raises:
241 ValueError: If `logits` and `labels` do not have the same shape.
242 """
243 return sigmoid_cross_entropy_with_logits(
244 logits=logits, labels=labels, name=name)
247sigmoid_cross_entropy_with_logits.__doc__ = (
248 sigmoid_cross_entropy_with_logits_v2.__doc__)
251@tf_export("nn.weighted_cross_entropy_with_logits", v1=[])
252@dispatch.add_dispatch_support
253def weighted_cross_entropy_with_logits_v2(labels, logits, pos_weight,
254 name=None):
255 """Computes a weighted cross entropy.
257 This is like `sigmoid_cross_entropy_with_logits()` except that `pos_weight`
258 allows one to trade off recall and precision by up- or down-weighting the
259 cost of a positive error relative to a negative error.
261 The usual cross-entropy cost is defined as:
263 labels * -log(sigmoid(logits)) +
264 (1 - labels) * -log(1 - sigmoid(logits))
266 A value `pos_weight > 1` decreases the false negative count, hence increasing
267 the recall.
268 Conversely setting `pos_weight < 1` decreases the false positive count and
269 increases the precision.
270 This can be seen from the fact that `pos_weight` is introduced as a
271 multiplicative coefficient for the positive labels term
272 in the loss expression:
274 labels * -log(sigmoid(logits)) * pos_weight +
275 (1 - labels) * -log(1 - sigmoid(logits))
277 For brevity, let `x = logits`, `z = labels`, `q = pos_weight`.
278 The loss is:
280 qz * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
281 = qz * -log(1 / (1 + exp(-x))) + (1 - z) * -log(exp(-x) / (1 + exp(-x)))
282 = qz * log(1 + exp(-x)) + (1 - z) * (-log(exp(-x)) + log(1 + exp(-x)))
283 = qz * log(1 + exp(-x)) + (1 - z) * (x + log(1 + exp(-x)))
284 = (1 - z) * x + (qz + 1 - z) * log(1 + exp(-x))
285 = (1 - z) * x + (1 + (q - 1) * z) * log(1 + exp(-x))
287 Setting `l = (1 + (q - 1) * z)`, to ensure stability and avoid overflow,
288 the implementation uses
290 (1 - z) * x + l * (log(1 + exp(-abs(x))) + max(-x, 0))
292 `logits` and `labels` must have the same type and shape.
294 >>> labels = tf.constant([1., 0.5, 0.])
295 >>> logits = tf.constant([1.5, -0.1, -10.])
296 >>> tf.nn.weighted_cross_entropy_with_logits(
297 ... labels=labels, logits=logits, pos_weight=tf.constant(1.5)).numpy()
298 array([3.0211994e-01, 8.8049585e-01, 4.5776367e-05], dtype=float32)
299 >>> tf.nn.weighted_cross_entropy_with_logits(
300 ... labels=labels, logits=logits, pos_weight=tf.constant(0.5)).numpy()
301 array([1.00706644e-01, 5.08297503e-01, 4.57763672e-05], dtype=float32)
303 Args:
304 labels: A `Tensor` of the same type and shape as `logits`, with values
305 between 0 and 1 inclusive.
306 logits: A `Tensor` of type `float32` or `float64`, any real numbers.
307 pos_weight: A coefficient to use on the positive examples, typically a
308 scalar but otherwise broadcastable to the shape of `logits`. Its value
309 should be non-negative.
310 name: A name for the operation (optional).
312 Returns:
313 A `Tensor` of the same shape as `logits` with the componentwise
314 weighted logistic losses.
316 Raises:
317 ValueError: If `logits` and `labels` do not have the same shape.
318 """
319 with ops.name_scope(name, "logistic_loss", [logits, labels]) as name:
320 logits = ops.convert_to_tensor(logits, name="logits")
321 labels = ops.convert_to_tensor(labels, name="labels")
322 try:
323 labels.get_shape().assert_is_compatible_with(logits.get_shape())
324 except ValueError:
325 raise ValueError("`logits` and `labels` must have the same shape, "
326 f"received ({logits.get_shape()} vs "
327 f"{labels.get_shape()}).")
329 # The logistic loss formula from above is
330 # (1 - z) * x + (1 + (q - 1) * z) * log(1 + exp(-x))
331 # For x < 0, a more numerically stable formula is
332 # (1 - z) * x + (1 + (q - 1) * z) * log(1 + exp(x)) - l * x
333 # To avoid branching, we use the combined version
334 # (1 - z) * x + l * (log(1 + exp(-abs(x))) + max(-x, 0))
335 log_weight = 1 + (pos_weight - 1) * labels
336 return math_ops.add(
337 (1 - labels) * logits,
338 log_weight * (math_ops.log1p(math_ops.exp(-math_ops.abs(logits))) +
339 nn_ops.relu(-logits)), # pylint: disable=invalid-unary-operand-type
340 name=name)
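# --- Hedged sanity-check sketch (illustration only, not part of the module).
# With pos_weight = 1 the weight l = 1 + (q - 1) * z collapses to 1, so the
# weighted loss reduces to tf.nn.sigmoid_cross_entropy_with_logits.
# Hypothetical helper name; TF 2.x eager mode.
def _example_weighted_ce_with_unit_pos_weight():
  import tensorflow as tf
  labels = tf.constant([1., 0.5, 0.])
  logits = tf.constant([1.5, -0.1, -10.])
  weighted = tf.nn.weighted_cross_entropy_with_logits(
      labels=labels, logits=logits, pos_weight=1.0)
  unweighted = tf.nn.sigmoid_cross_entropy_with_logits(
      labels=labels, logits=logits)
  tf.debugging.assert_near(weighted, unweighted, atol=1e-5)
  return weighted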
343@tf_export(v1=["nn.weighted_cross_entropy_with_logits"])
344@dispatch.add_dispatch_support
345@deprecated_args(None, "targets is deprecated, use labels instead", "targets")
346def weighted_cross_entropy_with_logits(labels=None,
347 logits=None,
348 pos_weight=None,
349 name=None,
350 targets=None):
351 """Computes a weighted cross entropy.
353 This is like `sigmoid_cross_entropy_with_logits()` except that `pos_weight`
354 allows one to trade off recall and precision by up- or down-weighting the
355 cost of a positive error relative to a negative error.
357 The usual cross-entropy cost is defined as:
359 labels * -log(sigmoid(logits)) +
360 (1 - labels) * -log(1 - sigmoid(logits))
362 A value `pos_weight > 1` decreases the false negative count, hence increasing
363 the recall.
364 Conversely setting `pos_weight < 1` decreases the false positive count and
365 increases the precision.
366 This can be seen from the fact that `pos_weight` is introduced as a
367 multiplicative coefficient for the positive labels term
368 in the loss expression:
370 labels * -log(sigmoid(logits)) * pos_weight +
371 (1 - labels) * -log(1 - sigmoid(logits))
373 For brevity, let `x = logits`, `z = labels`, `q = pos_weight`.
374 The loss is:
376 qz * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
377 = qz * -log(1 / (1 + exp(-x))) + (1 - z) * -log(exp(-x) / (1 + exp(-x)))
378 = qz * log(1 + exp(-x)) + (1 - z) * (-log(exp(-x)) + log(1 + exp(-x)))
379 = qz * log(1 + exp(-x)) + (1 - z) * (x + log(1 + exp(-x)))
380 = (1 - z) * x + (qz + 1 - z) * log(1 + exp(-x))
381 = (1 - z) * x + (1 + (q - 1) * z) * log(1 + exp(-x))
383 Setting `l = (1 + (q - 1) * z)`, to ensure stability and avoid overflow,
384 the implementation uses
386 (1 - z) * x + l * (log(1 + exp(-abs(x))) + max(-x, 0))
388 `logits` and `labels` must have the same type and shape.
390 Args:
391 labels: A `Tensor` of the same type and shape as `logits`.
392 logits: A `Tensor` of type `float32` or `float64`.
393 pos_weight: A coefficient to use on the positive examples.
394 name: A name for the operation (optional).
395 targets: Deprecated alias for labels.
397 Returns:
398 A `Tensor` of the same shape as `logits` with the componentwise
399 weighted logistic losses.
401 Raises:
402 ValueError: If `logits` and `labels` do not have the same shape.
403 """
404 labels = deprecated_argument_lookup("labels", labels, "targets", targets)
405 return weighted_cross_entropy_with_logits_v2(labels, logits, pos_weight, name)
408@tf_export("nn.compute_average_loss")
409@dispatch.add_dispatch_support
410def compute_average_loss(per_example_loss,
411 sample_weight=None,
412 global_batch_size=None):
413 """Scales per-example losses with sample_weights and computes their average.
415 Usage with distribution strategy and custom training loop:
417 ```python
418 with strategy.scope():
419 def compute_loss(labels, predictions, sample_weight=None):
421 # If you are using a `Loss` class instead, set reduction to `NONE` so that
422 # we can do the reduction afterwards and divide by global batch size.
423 per_example_loss = tf.keras.losses.sparse_categorical_crossentropy(
424 labels, predictions)
426 # Compute loss that is scaled by sample_weight and by global batch size.
427 return tf.nn.compute_average_loss(
428 per_example_loss,
429 sample_weight=sample_weight,
430 global_batch_size=GLOBAL_BATCH_SIZE)
431 ```
433 Args:
434 per_example_loss: Per-example loss.
435 sample_weight: Optional weighting for each example.
436 global_batch_size: Optional global batch size value. Defaults to (size of
437 first dimension of `per_example_loss`) * (number of replicas).
439 Returns:
440 Scalar loss value, obtained by summing the `per_example_loss` and dividing
441 by `global_batch_size`. If `global_batch_size` is zero, the result is zero.
442 """ # pylint: disable=g-doc-exception
443 per_example_loss = ops.convert_to_tensor(per_example_loss)
444 input_dtype = per_example_loss.dtype
446 with losses_util.check_per_example_loss_rank(per_example_loss):
447 if sample_weight is not None:
448 sample_weight = ops.convert_to_tensor(sample_weight)
449 per_example_loss = losses_util.scale_losses_by_sample_weight(
450 per_example_loss, sample_weight)
451 per_example_loss = math_ops.cast(per_example_loss, input_dtype)
453 if global_batch_size is None:
454 if (distribute_lib.has_strategy()
455 and distribute_lib.in_cross_replica_context()):
456 raise RuntimeError(
457 "You are calling `compute_average_loss` in cross replica context, "
458 "while it was expected to be called in replica context.")
460 num_replicas = distribute_lib.get_strategy().num_replicas_in_sync
461 per_replica_batch_size = array_ops.shape_v2(per_example_loss)[0]
462 global_batch_size = per_replica_batch_size * num_replicas
464 check_ops.assert_scalar_v2(
465 global_batch_size, message="global_batch_size must be scalar.")
466 check_ops.assert_integer_v2(
467 global_batch_size,
468 message="global_batch_size must be an integer.")
469 check_ops.assert_non_negative_v2(
470 global_batch_size, message="global_batch_size must be non-negative.")
472 loss = math_ops.reduce_sum(per_example_loss)
473 global_batch_size = math_ops.cast(global_batch_size, input_dtype)
474 return math_ops.div_no_nan(loss, global_batch_size)
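# --- Hedged usage sketch (illustration only, not part of the module).
# Outside a tf.distribute strategy there is a single replica, so
# compute_average_loss is just the sum of the (optionally weighted) losses
# divided by the local batch size. Hypothetical helper name; TF 2.x eager mode.
def _example_compute_average_loss():
  import tensorflow as tf
  per_example_loss = tf.constant([1.0, 2.0, 3.0, 4.0])
  sample_weight = tf.constant([1.0, 1.0, 0.5, 0.0])
  avg = tf.nn.compute_average_loss(per_example_loss,
                                   sample_weight=sample_weight)
  manual = tf.reduce_sum(per_example_loss * sample_weight) / 4.0
  tf.debugging.assert_near(avg, manual)
  return avg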
477@tf_export("nn.scale_regularization_loss")
478@dispatch.add_dispatch_support
479def scale_regularization_loss(regularization_loss):
480 """Scales the sum of the given regularization losses by number of replicas.
482 Usage with distribution strategy and custom training loop:
484 ```python
485 with strategy.scope():
486 def compute_loss(self, labels, predictions, sample_weight=None):
487 per_example_loss = tf.keras.losses.sparse_categorical_crossentropy(
488 labels, predictions)
490 # Compute loss that is scaled by sample_weight and by global batch size.
491 loss = tf.nn.compute_average_loss(
492 per_example_loss,
493 sample_weight=sample_weight,
494 global_batch_size=GLOBAL_BATCH_SIZE)
496 # Add scaled regularization losses.
497 loss += tf.nn.scale_regularization_loss(tf.nn.l2_loss(weights))
498 return loss
499 ```
501 Args:
502 regularization_loss: Regularization loss.
504 Returns:
505 Scalar loss value.
506 """ # pylint: disable=g-doc-exception
507 if (distribute_lib.has_strategy()
508 and distribute_lib.in_cross_replica_context()):
509 raise RuntimeError(
510 "You are calling `scale_regularization_loss` in cross replica context, "
511 "while it was expected to be called in replica context.")
513 num_replicas = distribute_lib.get_strategy().num_replicas_in_sync
514 return math_ops.reduce_sum(regularization_loss) / num_replicas
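# --- Hedged usage sketch (illustration only, not part of the module).
# Outside a strategy num_replicas_in_sync is 1, so this is simply the sum of
# the regularization terms; under a strategy each replica contributes
# sum / num_replicas so the term is counted once after aggregation.
# Hypothetical helper name; TF 2.x eager mode.
def _example_scale_regularization_loss():
  import tensorflow as tf
  weights = tf.constant([[0.5, -1.0], [2.0, 0.0]])
  return tf.nn.scale_regularization_loss(tf.nn.l2_loss(weights))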
517@tf_export(v1=["nn.relu_layer"])
518@dispatch.add_dispatch_support
519def relu_layer(x, weights, biases, name=None):
520 """Computes Relu(x * weight + biases).
522 Args:
523 x: a 2D tensor. Dimensions typically: batch, in_units
524 weights: a 2D tensor. Dimensions typically: in_units, out_units
525 biases: a 1D tensor. Dimensions: out_units
526 name: A name for the operation (optional). If not specified
527 "nn_relu_layer" is used.
529 Returns:
530 A 2-D Tensor computing relu(matmul(x, weights) + biases).
531 Dimensions typically: batch, out_units.
532 """
533 with ops.name_scope(name, "relu_layer", [x, weights, biases]) as name:
534 x = ops.convert_to_tensor(x, name="x")
535 weights = ops.convert_to_tensor(weights, name="weights")
536 biases = ops.convert_to_tensor(biases, name="biases")
537 xw_plus_b = nn_ops.bias_add(math_ops.matmul(x, weights), biases)
538 return nn_ops.relu(xw_plus_b, name=name)
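# --- Hedged usage sketch (illustration only, not part of the module).
# relu_layer is a v1 convenience wrapper for relu(matmul(x, weights) + biases);
# the equivalence is checked explicitly below. Hypothetical helper name;
# assumes the tf.compat.v1 symbols available in TF 2.x.
def _example_relu_layer():
  import tensorflow as tf
  x = tf.constant([[1.0, -2.0]])                   # [batch, in_units]
  weights = tf.constant([[3.0, 0.0], [0.0, 4.0]])  # [in_units, out_units]
  biases = tf.constant([0.5, -0.5])                # [out_units]
  out = tf.compat.v1.nn.relu_layer(x, weights, biases)
  tf.debugging.assert_near(out, tf.nn.relu(tf.matmul(x, weights) + biases))
  return out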
541@tf_export("nn.silu", "nn.swish")
542@dispatch.register_unary_elementwise_api
543@dispatch.add_dispatch_support
544def swish(features, beta=1.0):
545 # pylint: disable=g-doc-args
546 """Computes the SiLU or Swish activation function: `x * sigmoid(beta * x)`.
548 beta: Hyperparameter for the Swish activation function. Default value 1.0.
550 The SiLU activation function was introduced in "Gaussian Error Linear Units
551 (GELUs)" [Hendrycks et al. 2016](https://arxiv.org/abs/1606.08415) and
552 "Sigmoid-Weighted Linear Units for Neural Network Function Approximation in
553 Reinforcement Learning"
554 [Elfwing et al. 2017](https://arxiv.org/abs/1702.03118) and was independently
555 discovered (and called swish) in "Searching for Activation Functions"
556 [Ramachandran et al. 2017](https://arxiv.org/abs/1710.05941)
558 Args:
559 features: A `Tensor` representing preactivation values.
560 beta: A `Tensor` representing the value of the beta hyperparameter.
562 Returns:
563 The activation value.
564 """
565 # pylint: enable=g-doc-args
566 features = ops.convert_to_tensor(features, name="features")
567 beta = ops.convert_to_tensor(beta, name="beta")
568 beta = math_ops.cast(beta, features.dtype)
570 @custom_gradient.custom_gradient
571 def swish_impl(features, beta):
573 def grad(dy):
574 """Gradient for the Swish activation function."""
575 # Naively, x * tf.nn.sigmoid(x) requires keeping both x and sigmoid(x)
576 # around for backprop, effectively doubling the tensor's memory
577 # consumption. We use a control dependency here so that sigmoid(features)
578 # is re-computed during backprop (the control dep prevents it being
579 # de-duped with the forward pass) and we can free the sigmoid(features)
580 # expression immediately after use during the forward pass.
581 with ops.control_dependencies([dy]):
582 sigmoid_features = math_ops.sigmoid(beta * features)
584 activation_grad = (
585 sigmoid_features * (1.0 + (beta * features) *
586 (1.0 - sigmoid_features)))
587 beta_grad = math_ops.reduce_sum(
588 dy * math_ops.square(features) * sigmoid_features *
589 (1.0 - sigmoid_features))
590 return (dy * activation_grad, beta_grad)
592 return features * math_ops.sigmoid(beta * features), grad
594 return swish_impl(features, beta)
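# --- Hedged usage sketch (illustration only, not part of the module).
# silu/swish is x * sigmoid(beta * x); with the default beta = 1 it matches the
# plain product below, and the custom gradient above only changes how the
# backward pass is computed. Hypothetical helper name; TF 2.x eager mode.
def _example_silu():
  import tensorflow as tf
  x = tf.constant([-2.0, -0.5, 0.0, 0.5, 2.0])
  tf.debugging.assert_near(tf.nn.silu(x), x * tf.sigmoid(x))
  return tf.nn.silu(x, beta=1.5)  # a sharper gate for beta > 1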
597# pylint: disable=redefined-builtin
598@tf_export("linalg.normalize")
599@dispatch.add_dispatch_support
600def normalize(tensor, ord="euclidean", axis=None, name=None):
601 """Normalizes `tensor` along dimension `axis` using specified norm.
603 This uses `tf.linalg.norm` to compute the norm along `axis`.
605 This function can compute several different vector norms (the 1-norm, the
606 Euclidean or 2-norm, the inf-norm, and in general the p-norm for p > 0) and
607 matrix norms (Frobenius, 1-norm, 2-norm and inf-norm).
609 Args:
610 tensor: `Tensor` of types `float32`, `float64`, `complex64`, `complex128`
611 ord: Order of the norm. Supported values are `'fro'`, `'euclidean'`, `1`,
612 `2`, `np.inf` and any positive real number yielding the corresponding
613 p-norm. Default is `'euclidean'` which is equivalent to Frobenius norm if
614 `tensor` is a matrix and equivalent to 2-norm for vectors.
615 Some restrictions apply: a) The Frobenius norm `'fro'` is not defined for
616 vectors, b) If axis is a 2-tuple (matrix norm), only `'euclidean'`,
617 `'fro'`, `1`, `2`, `np.inf` are supported. See the description of `axis`
618 on how to compute norms for a batch of vectors or matrices stored in a
619 tensor.
620 axis: If `axis` is `None` (the default), the input is considered a vector
621 and a single vector norm is computed over the entire set of values in the
622 tensor, i.e. `norm(tensor, ord=ord)` is equivalent to
623 `norm(reshape(tensor, [-1]), ord=ord)`. If `axis` is a Python integer, the
624 input is considered a batch of vectors, and `axis` determines the axis in
625 `tensor` over which to compute vector norms. If `axis` is a 2-tuple of
626 Python integers it is considered a batch of matrices and `axis` determines
627 the axes in `tensor` over which to compute a matrix norm.
628 Negative indices are supported. Example: If you are passing a tensor that
629 can be either a matrix or a batch of matrices at runtime, pass
630 `axis=[-2,-1]` instead of `axis=None` to make sure that matrix norms are
631 computed.
632 name: The name of the op.
634 Returns:
635 normalized: A normalized `Tensor` with the same shape as `tensor`.
636 norm: The computed norms with the same shape and dtype as `tensor` but the
637 final axis is 1 instead. Same as running
638 `tf.cast(tf.linalg.norm(tensor, ord, axis, keepdims=True), tensor.dtype)`.
640 Raises:
641 ValueError: If `ord` or `axis` is invalid.
642 """
643 with ops.name_scope(name, "normalize", [tensor]) as name:
644 tensor = ops.convert_to_tensor(tensor)
645 norm = linalg_ops.norm(tensor, ord, axis, keepdims=True)
646 norm = math_ops.cast(norm, tensor.dtype)
647 normalized = tensor / norm
648 return normalized, norm
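# --- Hedged usage sketch (illustration only, not part of the module).
# tf.linalg.normalize returns both the normalized tensor and the norm it
# divided by; normalizing each row by its 2-norm leaves rows of unit length.
# Hypothetical helper name; TF 2.x eager mode.
def _example_linalg_normalize():
  import tensorflow as tf
  t = tf.constant([[3.0, 4.0], [6.0, 8.0]])
  normalized, norm = tf.linalg.normalize(t, ord=2, axis=1)
  # norm has shape [2, 1]; each normalized row equals [0.6, 0.8].
  tf.debugging.assert_near(tf.norm(normalized, axis=1), tf.ones([2]))
  return normalized, norm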
651@tf_export("math.l2_normalize", "linalg.l2_normalize", "nn.l2_normalize",
652 v1=["math.l2_normalize", "linalg.l2_normalize", "nn.l2_normalize"])
653@dispatch.add_dispatch_support
654@deprecated_args(None, "dim is deprecated, use axis instead", "dim")
655def l2_normalize(x, axis=None, epsilon=1e-12, name=None, dim=None):
656 """Normalizes along dimension `axis` using an L2 norm.
658 For a 1-D tensor with `axis = 0`, computes
660 output = x / sqrt(max(sum(x**2), epsilon))
662 For `x` with more dimensions, independently normalizes each 1-D slice along
663 dimension `axis`.
665 1-D tensor example:
666 >>> x = tf.constant([3.0, 4.0])
667 >>> tf.math.l2_normalize(x).numpy()
668 array([0.6, 0.8], dtype=float32)
670 2-D tensor example:
671 >>> x = tf.constant([[3.0], [4.0]])
672 >>> tf.math.l2_normalize(x, 0).numpy()
673 array([[0.6],
674 [0.8]], dtype=float32)
676 >>> x = tf.constant([[3.0], [4.0]])
677 >>> tf.math.l2_normalize(x, 1).numpy()
678 array([[1.],
679 [1.]], dtype=float32)
681 Args:
682 x: A `Tensor`.
683 axis: Dimension along which to normalize. A scalar or a vector of
684 integers.
685 epsilon: A lower bound value for the norm. Will use `sqrt(epsilon)` as the
686 divisor if `norm < sqrt(epsilon)`.
687 name: A name for this operation (optional).
688 dim: Deprecated, do not use.
690 Returns:
691 A `Tensor` with the same shape as `x`.
692 """
693 axis = deprecated_argument_lookup("axis", axis, "dim", dim)
694 with ops.name_scope(name, "l2_normalize", [x]) as name:
695 x = ops.convert_to_tensor(x, name="x")
696 if x.dtype.is_complex:
697 square_real = math_ops.square(math_ops.real(x))
698 square_imag = math_ops.square(math_ops.imag(x))
699 square_sum = math_ops.real(
700 math_ops.reduce_sum(square_real + square_imag, axis, keepdims=True))
701 x_inv_norm = math_ops.rsqrt(math_ops.maximum(square_sum, epsilon))
702 norm_real = math_ops.multiply(math_ops.real(x), x_inv_norm)
703 norm_imag = math_ops.multiply(math_ops.imag(x), x_inv_norm)
704 return math_ops.complex(norm_real, norm_imag, name=name)
705 square_sum = math_ops.reduce_sum(math_ops.square(x), axis, keepdims=True)
706 x_inv_norm = math_ops.rsqrt(math_ops.maximum(square_sum, epsilon))
707 return math_ops.multiply(x, x_inv_norm, name=name)
710def _count_nonzero(input_tensor, dtype=dtypes.int64):
711 """Same as math_ops.count_nonzero.
713 The reduction is done in dtype, which can be faster for 32-bit dtypes.
715 Args:
716 input_tensor: numeric tensor
717 dtype: reduction dtype
719 Returns:
720 number of nonzero values with type dtype
721 """
722 with ops.name_scope("count_nonzero", values=[input_tensor]):
723 zero = array_ops.zeros([], dtype=input_tensor.dtype)
724 nonzero_count = math_ops.reduce_sum(
725 math_ops.cast(
726 math_ops.not_equal(input_tensor, zero),
727 dtype=dtype), name="nonzero_count")
728 return nonzero_count
731@tf_export("math.zero_fraction", "nn.zero_fraction")
732@dispatch.add_dispatch_support
733def zero_fraction(value, name=None):
734 """Returns the fraction of zeros in `value`.
736 If `value` is empty, the result is `nan`.
738 This is useful in summaries to measure and report sparsity. For example,
740 ```python
741 z = tf.nn.relu(...)
742 summ = tf.compat.v1.summary.scalar('sparsity', tf.nn.zero_fraction(z))
743 ```
745 Args:
746 value: A tensor of numeric type.
747 name: A name for the operation (optional).
749 Returns:
750 The fraction of zeros in `value`, with type `float32`.
751 """
752 with ops.name_scope(name, "zero_fraction", [value]):
753 value = ops.convert_to_tensor(value, name="value")
754 size = array_ops.size(value, out_type=dtypes.int64)
755 # If the count is small, we can save memory/CPU with an int32 reduction.
756 num_nonzero = tf_cond.cond(
757 size <= dtypes.int32.max,
758 # pylint: disable=g-long-lambda
759 true_fn=lambda: math_ops.cast(
760 _count_nonzero(value, dtype=dtypes.int32),
761 dtype=dtypes.int64),
762 false_fn=lambda: _count_nonzero(value, dtype=dtypes.int64))
764 with ops.name_scope("counts_to_fraction"):
765 num_zero = size - num_nonzero
766 num_zero_float32 = math_ops.cast(num_zero, dtype=dtypes.float32)
767 size_float32 = math_ops.cast(size, dtype=dtypes.float32)
768 zero_fraction_float32 = num_zero_float32 / size_float32
770 return array_ops.identity(zero_fraction_float32, "fraction")
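# --- Hedged usage sketch (illustration only, not part of the module).
# A typical use is monitoring the sparsity of ReLU activations; here half the
# entries are zero, so the result is 0.5. Hypothetical helper name; TF 2.x
# eager mode.
def _example_zero_fraction():
  import tensorflow as tf
  activations = tf.nn.relu(tf.constant([-1.0, 2.0, -3.0, 4.0]))
  return tf.math.zero_fraction(activations)  # float32 scalar 0.5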
773# pylint: disable=redefined-builtin
774@tf_export(v1=["nn.depthwise_conv2d"])
775@dispatch.add_dispatch_support
776def depthwise_conv2d(input,
777 filter,
778 strides,
779 padding,
780 rate=None,
781 name=None,
782 data_format=None,
783 dilations=None):
784 """Depthwise 2-D convolution.
786 Given a 4D input tensor ('NHWC' or 'NCHW' data formats)
787 and a filter tensor of shape
788 `[filter_height, filter_width, in_channels, channel_multiplier]`
789 containing `in_channels` convolutional filters of depth 1, `depthwise_conv2d`
790 applies a different filter to each input channel (expanding from 1 channel
791 to `channel_multiplier` channels for each), then concatenates the results
792 together. The output has `in_channels * channel_multiplier` channels.
794 In detail, with the default NHWC format,
796 output[b, i, j, k * channel_multiplier + q] = sum_{di, dj}
797 filter[di, dj, k, q] * input[b, strides[1] * i + rate[0] * di,
798 strides[2] * j + rate[1] * dj, k]
800 Must have `strides[0] = strides[3] = 1`. For the most common case of the
801 same horizontal and vertical strides, `strides = [1, stride, stride, 1]`.
802 If any value in `rate` is greater than 1, we perform atrous depthwise
803 convolution, in which case all values in the `strides` tensor must be equal
804 to 1.
806 Usage Example:
808 >>> x = np.array([
809 ... [1., 2.],
810 ... [3., 4.],
811 ... [5., 6.]
812 ... ], dtype=np.float32).reshape((1, 3, 2, 1))
813 >>> kernel = np.array([
814 ... [1., 2.],
815 ... [3., 4]
816 ... ], dtype=np.float32).reshape((2, 1, 1, 2))
817 >>> tf.compat.v1.nn.depthwise_conv2d(x, kernel, strides=[1, 1, 1, 1],
818 ... padding='VALID').numpy()
819 array([[[[10., 14.],
820 [14., 20.]],
821 [[18., 26.],
822 [22., 32.]]]], dtype=float32)
824 >>> tf.compat.v1.nn.depthwise_conv2d(x, kernel, strides=[1, 1, 1, 1],
825 ... padding=[[0, 0], [1, 0], [1, 0], [0, 0]]
826 ... ).numpy()
827 array([[[[ 0., 0.],
828 [ 3., 4.],
829 [ 6., 8.]],
830 [[ 0., 0.],
831 [10., 14.],
832 [14., 20.]],
833 [[ 0., 0.],
834 [18., 26.],
835 [22., 32.]]]], dtype=float32)
837 Args:
838 input: 4-D with shape according to `data_format`.
839 filter: 4-D with shape
840 `[filter_height, filter_width, in_channels, channel_multiplier]`.
841 strides: 1-D of size 4. The stride of the sliding window for each
842 dimension of `input`.
843 padding: Controls how to pad the image before applying the convolution. Can
844 be the string `"SAME"` or `"VALID"` indicating the type of padding
845 algorithm to use, or a list indicating the explicit paddings at the start
846 and end of each dimension. When explicit padding is used and data_format
847 is `"NHWC"`, this should be in the form `[[0, 0], [pad_top, pad_bottom],
848 [pad_left, pad_right], [0, 0]]`. When explicit padding used and
849 data_format is `"NCHW"`, this should be in the form `[[0, 0], [0, 0],
850 [pad_top, pad_bottom], [pad_left, pad_right]]`.
851 rate: 1-D of size 2. The dilation rate in which we sample input values
852 across the `height` and `width` dimensions in atrous convolution. If it is
853 greater than 1, then all values of strides must be 1.
854 name: A name for this operation (optional).
855 data_format: The data format for input. Either "NHWC" (default) or "NCHW".
856 dilations: Alias of rate.
858 Returns:
859 A 4-D `Tensor` with shape according to `data_format`. E.g., for
860 "NHWC" format, shape is
861 `[batch, out_height, out_width, in_channels * channel_multiplier].`
862 """
863 rate = deprecated_argument_lookup("dilations", dilations, "rate", rate)
864 with ops.name_scope(name, "depthwise", [input, filter]) as name:
865 input = ops.convert_to_tensor(input, name="tensor_in")
866 filter = ops.convert_to_tensor(filter, name="filter_in")
867 if rate is None:
868 rate = [1, 1]
870 # Use depthwise_conv2d_native if executing on TPU.
871 if device_context.enclosing_tpu_context() is not None:
872 if data_format == "NCHW":
873 dilations = [1, 1, rate[0], rate[1]]
874 else:
875 dilations = [1, rate[0], rate[1], 1]
876 return nn_ops.depthwise_conv2d_native(
877 input=input,
878 filter=filter,
879 strides=strides,
880 padding=padding,
881 data_format=data_format,
882 dilations=dilations,
883 name=name)
885 def op(input_converted, _, padding):
886 return nn_ops.depthwise_conv2d_native(
887 input=input_converted,
888 filter=filter,
889 strides=strides,
890 padding=padding,
891 data_format=data_format,
892 name=name)
894 return nn_ops.with_space_to_batch(
895 input=input,
896 filter_shape=array_ops.shape(filter),
897 dilation_rate=rate,
898 padding=padding,
899 data_format=data_format,
900 op=op)
903@tf_export("nn.depthwise_conv2d", v1=[])
904@dispatch.add_dispatch_support
905def depthwise_conv2d_v2(input,
906 filter,
907 strides,
908 padding,
909 data_format=None,
910 dilations=None,
911 name=None):
912 """Depthwise 2-D convolution.
914 Given a 4D input tensor ('NHWC' or 'NCHW' data formats)
915 and a filter tensor of shape
916 `[filter_height, filter_width, in_channels, channel_multiplier]`
917 containing `in_channels` convolutional filters of depth 1, `depthwise_conv2d`
918 applies a different filter to each input channel (expanding from 1 channel
919 to `channel_multiplier` channels for each), then concatenates the results
920 together. The output has `in_channels * channel_multiplier` channels.
922 In detail, with the default NHWC format,
924 output[b, i, j, k * channel_multiplier + q] =
925 sum_{di, dj} filter[di, dj, k, q] *
926 input[b, strides[1] * i + dilations[0] * di,
927 strides[2] * j + dilations[1] * dj, k]
929 Must have `strides[0] = strides[3] = 1`. For the most common case of the
930 same horizontal and vertical strides, `strides = [1, stride, stride, 1]`.
931 If any value in `dilations` is greater than 1, we perform atrous depthwise
932 convolution, in which case all values in the `strides` tensor must be equal
933 to 1.
935 Usage Example:
937 >>> x = np.array([
938 ... [1., 2.],
939 ... [3., 4.],
940 ... [5., 6.]
941 ... ], dtype=np.float32).reshape((1, 3, 2, 1))
942 >>> kernel = np.array([
943 ... [1., 2.],
944 ... [3., 4]
945 ... ], dtype=np.float32).reshape((2, 1, 1, 2))
946 >>> tf.nn.depthwise_conv2d(x, kernel, strides=[1, 1, 1, 1],
947 ... padding='VALID').numpy()
948 array([[[[10., 14.],
949 [14., 20.]],
950 [[18., 26.],
951 [22., 32.]]]], dtype=float32)
953 >>> tf.nn.depthwise_conv2d(x, kernel, strides=[1, 1, 1, 1],
954 ... padding=[[0, 0], [1, 0], [1, 0], [0, 0]]).numpy()
955 array([[[[ 0., 0.],
956 [ 3., 4.],
957 [ 6., 8.]],
958 [[ 0., 0.],
959 [10., 14.],
960 [14., 20.]],
961 [[ 0., 0.],
962 [18., 26.],
963 [22., 32.]]]], dtype=float32)
965 Args:
966 input: 4-D with shape according to `data_format`.
967 filter: 4-D with shape
968 `[filter_height, filter_width, in_channels, channel_multiplier]`.
969 strides: 1-D of size 4. The stride of the sliding window for each
970 dimension of `input`.
971 padding: Controls how to pad the image before applying the convolution. Can
972 be the string `"SAME"` or `"VALID"` indicating the type of padding
973 algorithm to use, or a list indicating the explicit paddings at the start
974 and end of each dimension. See
975 [here](https://www.tensorflow.org/api_docs/python/tf/nn#notes_on_padding_2)
976 for more information. When explicit padding is used and data_format
977 is `"NHWC"`, this should be in the form `[[0, 0], [pad_top, pad_bottom],
978 [pad_left, pad_right], [0, 0]]`. When explicit padding used and
979 data_format is `"NCHW"`, this should be in the form `[[0, 0], [0, 0],
980 [pad_top, pad_bottom], [pad_left, pad_right]]`.
981 data_format: The data format for input. Either "NHWC" (default) or "NCHW".
982 dilations: 1-D of size 2. The dilation rate in which we sample input values
983 across the `height` and `width` dimensions in atrous convolution. If it is
984 greater than 1, then all values of strides must be 1.
985 name: A name for this operation (optional).
987 Returns:
988 A 4-D `Tensor` with shape according to `data_format`. E.g., for
989 "NHWC" format, shape is
990 `[batch, out_height, out_width, in_channels * channel_multiplier].`
991 """
992 return depthwise_conv2d(input=input,
993 filter=filter,
994 strides=strides,
995 padding=padding,
996 rate=dilations,
997 name=name,
998 data_format=data_format)
1000# pylint: enable=redefined-builtin
1003# pylint: disable=redefined-builtin,line-too-long
1004@tf_export(v1=["nn.separable_conv2d"])
1005@dispatch.add_dispatch_support
1006def separable_conv2d(input,
1007 depthwise_filter,
1008 pointwise_filter,
1009 strides,
1010 padding,
1011 rate=None,
1012 name=None,
1013 data_format=None,
1014 dilations=None):
1015 """2-D convolution with separable filters.
1017 Performs a depthwise convolution that acts separately on channels followed by
1018 a pointwise convolution that mixes channels. Note that this is separability
1019 between dimensions `[1, 2]` and `3`, not spatial separability between
1020 dimensions `1` and `2`.
1022 In detail, with the default NHWC format,
1024 output[b, i, j, k] = sum_{di, dj, q, r}
1025 input[b, strides[1] * i + di, strides[2] * j + dj, q] *
1026 depthwise_filter[di, dj, q, r] *
1027 pointwise_filter[0, 0, q * channel_multiplier + r, k]
1029 `strides` controls the strides for the depthwise convolution only, since
1030 the pointwise convolution has implicit strides of `[1, 1, 1, 1]`. Must have
1031 `strides[0] = strides[3] = 1`. For the most common case of the same
1032 horizontal and vertical strides, `strides = [1, stride, stride, 1]`.
1033 If any value in `rate` is greater than 1, we perform atrous depthwise
1034 convolution, in which case all values in the `strides` tensor must be equal
1035 to 1.
1037 Args:
1038 input: 4-D `Tensor` with shape according to `data_format`.
1039 depthwise_filter: 4-D `Tensor` with shape
1040 `[filter_height, filter_width, in_channels, channel_multiplier]`.
1041 Contains `in_channels` convolutional filters of depth 1.
1042 pointwise_filter: 4-D `Tensor` with shape
1043 `[1, 1, channel_multiplier * in_channels, out_channels]`. Pointwise
1044 filter to mix channels after `depthwise_filter` has convolved spatially.
1045 strides: 1-D of size 4. The strides for the depthwise convolution for
1046 each dimension of `input`.
1047 padding: Controls how to pad the image before applying the depthwise
1048 convolution. Can be the string `"SAME"` or `"VALID"` indicating the type
1049 of padding algorithm to use, or a Python list indicating the explicit
1050 paddings at the start and end of each dimension. When explicit padding is
1051 used and data_format is `"NHWC"`, this should be in the form `[[0, 0],
1052 [pad_top, pad_bottom], [pad_left, pad_right], [0, 0]]`. When explicit
1053 padding used and data_format is `"NCHW"`, this should be in the form
1054 `[[0, 0], [0, 0], [pad_top, pad_bottom], [pad_left, pad_right]]`.
1055 rate: 1-D of size 2. The dilation rate in which we sample input values
1056 across the `height` and `width` dimensions in atrous convolution. If it is
1057 greater than 1, then all values of strides must be 1.
1058 name: A name for this operation (optional).
1059 data_format: The data format for input. Either "NHWC" (default) or "NCHW".
1060 dilations: Alias of rate.
1062 Returns:
1063 A 4-D `Tensor` with shape according to 'data_format'. For
1064 example, with data_format="NHWC", shape is [batch, out_height,
1065 out_width, out_channels].
1066 """
1067 rate = deprecated_argument_lookup("dilations", dilations, "rate", rate)
1068 with ops.name_scope(name, "separable_conv2d",
1069 [input, depthwise_filter, pointwise_filter]) as name:
1070 input = ops.convert_to_tensor(input, name="tensor_in")
1071 depthwise_filter = ops.convert_to_tensor(
1072 depthwise_filter, name="depthwise_filter")
1073 pointwise_filter = ops.convert_to_tensor(
1074 pointwise_filter, name="pointwise_filter")
1076 pointwise_filter_shape = pointwise_filter.get_shape().with_rank(4)
1077 pointwise_filter_shape.dims[0].assert_is_compatible_with(1)
1078 pointwise_filter_shape.dims[1].assert_is_compatible_with(1)
1080 if rate is None:
1081 rate = [1, 1]
1083 # The layout of the ops in the graph are expected to be as follows:
1084 # depthwise_conv2d // Conv2D op corresponding to native depthwise conv.
1085 # separable_conv2d // Conv2D op corresponding to the pointwise conv.
1087 def op(input_converted, _, padding):
1088 return nn_ops.depthwise_conv2d_native(
1089 input=input_converted,
1090 filter=depthwise_filter,
1091 strides=strides,
1092 padding=padding,
1093 data_format=data_format,
1094 name="depthwise")
1096 depthwise = nn_ops.with_space_to_batch(
1097 input=input,
1098 filter_shape=array_ops.shape(depthwise_filter),
1099 dilation_rate=rate,
1100 padding=padding,
1101 data_format=data_format,
1102 op=op)
1104 return nn_ops.conv2d(
1105 depthwise,
1106 pointwise_filter, [1, 1, 1, 1],
1107 padding="VALID",
1108 data_format=data_format,
1109 name=name)
1112@tf_export("nn.separable_conv2d", v1=[])
1113@dispatch.add_dispatch_support
1114def separable_conv2d_v2(
1115 input,
1116 depthwise_filter,
1117 pointwise_filter,
1118 strides,
1119 padding,
1120 data_format=None,
1121 dilations=None,
1122 name=None,
1123):
1124 """2-D convolution with separable filters.
1126 Performs a depthwise convolution that acts separately on channels followed by
1127 a pointwise convolution that mixes channels. Note that this is separability
1128 between dimensions `[1, 2]` and `3`, not spatial separability between
1129 dimensions `1` and `2`.
1131 In detail, with the default NHWC format,
1133 output[b, i, j, k] = sum_{di, dj, q, r}
1134 input[b, strides[1] * i + di, strides[2] * j + dj, q] *
1135 depthwise_filter[di, dj, q, r] *
1136 pointwise_filter[0, 0, q * channel_multiplier + r, k]
1138 `strides` controls the strides for the depthwise convolution only, since
1139 the pointwise convolution has implicit strides of `[1, 1, 1, 1]`. Must have
1140 `strides[0] = strides[3] = 1`. For the most common case of the same
1141 horizontal and vertical strides, `strides = [1, stride, stride, 1]`.
1142 If any value in `rate` is greater than 1, we perform atrous depthwise
1143 convolution, in which case all values in the `strides` tensor must be equal
1144 to 1.
1146 Args:
1147 input: 4-D `Tensor` with shape according to `data_format`.
1148 depthwise_filter: 4-D `Tensor` with shape `[filter_height, filter_width,
1149 in_channels, channel_multiplier]`. Contains `in_channels` convolutional
1150 filters of depth 1.
1151 pointwise_filter: 4-D `Tensor` with shape `[1, 1, channel_multiplier *
1152 in_channels, out_channels]`. Pointwise filter to mix channels after
1153 `depthwise_filter` has convolved spatially.
1154 strides: 1-D of size 4. The strides for the depthwise convolution for each
1155 dimension of `input`.
1156 padding: Controls how to pad the image before applying the depthwise
1157 convolution. Can be the string `"SAME"` or `"VALID"` indicating the type
1158 of padding algorithm to use, or a Python list indicating the explicit
1159 paddings at the start and end of each dimension. When explicit padding is
1160 used and data_format is `"NHWC"`, this should be in the form `[[0, 0],
1161 [pad_top, pad_bottom], [pad_left, pad_right], [0, 0]]`. When explicit
1162 padding used and data_format is `"NCHW"`, this should be in the form
1163 `[[0, 0], [0, 0], [pad_top, pad_bottom], [pad_left, pad_right]]`.
1164 data_format: The data format for input. Either "NHWC" (default) or "NCHW".
1165 dilations: 1-D of size 2. The dilation rate in which we sample input values
1166 across the `height` and `width` dimensions in atrous convolution. If it is
1167 greater than 1, then all values of strides must be 1.
1168 name: A name for this operation (optional).
1170 Returns:
1171 A 4-D `Tensor` with shape according to 'data_format'. For
1172 example, with data_format="NHWC", shape is [batch, out_height,
1173 out_width, out_channels].
1174 """
1175 return separable_conv2d(
1176 input,
1177 depthwise_filter,
1178 pointwise_filter,
1179 strides,
1180 padding,
1181 rate=dilations,
1182 name=name,
1183 data_format=data_format)
1185# pylint: enable=redefined-builtin,line-too-long
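# --- Hedged usage sketch (illustration only, not part of the module).
# Illustrates the shape contract of separable_conv2d: a depthwise stage with a
# [fh, fw, in_channels, channel_multiplier] filter followed by a 1x1 pointwise
# mix down to out_channels. Hypothetical helper name; TF 2.x eager mode.
def _example_separable_conv2d():
  import tensorflow as tf
  x = tf.random.normal([1, 8, 8, 3])              # NHWC input
  depthwise = tf.random.normal([3, 3, 3, 2])      # channel_multiplier = 2
  pointwise = tf.random.normal([1, 1, 3 * 2, 5])  # out_channels = 5
  y = tf.nn.separable_conv2d(x, depthwise, pointwise,
                             strides=[1, 1, 1, 1], padding="SAME")
  return y  # shape [1, 8, 8, 5]: "SAME" padding and unit strides keep 8x8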
1188@tf_export(v1=["nn.sufficient_statistics"])
1189@dispatch.add_dispatch_support
1190def sufficient_statistics(x, axes, shift=None, keep_dims=None, name=None,
1191 keepdims=None):
1192 """Calculate the sufficient statistics for the mean and variance of `x`.
1194 These sufficient statistics are computed using the one pass algorithm on
1195 an input that's optionally shifted. See:
1196 https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Computing_shifted_data
1198 For example:
1199 >>> t = [[1, 2, 3], [4, 5, 6]]
1200 >>> sufficient_statistics(t, [1])
1201 (<tf.Tensor: shape=(), dtype=int32, numpy=3>, <tf.Tensor: shape=(2,),
1202 dtype=int32, numpy=array([ 6, 15], dtype=int32)>, <tf.Tensor: shape=(2,),
1203 dtype=int32, numpy=array([14, 77], dtype=int32)>, None)
1204 >>> sufficient_statistics(t, [-1])
1205 (<tf.Tensor: shape=(), dtype=int32, numpy=3>, <tf.Tensor: shape=(2,),
1206 dtype=int32, numpy=array([ 6, 15], dtype=int32)>, <tf.Tensor: shape=(2,),
1207 dtype=int32, numpy=array([14, 77], dtype=int32)>, None)
1209 Args:
1210 x: A `Tensor`.
1211 axes: Array of ints. Axes along which to compute mean and variance. As in
1212 Python, the axes can also be negative numbers. A negative axis is
1213 interpreted as counting from the end of the rank, i.e., axis +
1214 rank(values)-th dimension.
1215 shift: A `Tensor` containing the value by which to shift the data for
1216 numerical stability, or `None` if no shift is to be performed. A shift
1217 close to the true mean provides the most numerically stable results.
1218 keep_dims: produce statistics with the same dimensionality as the input.
1219 name: Name used to scope the operations that compute the sufficient stats.
1220 keepdims: Alias for keep_dims.
1222 Returns:
1223 Four `Tensor` objects of the same type as `x`:
1225 * the count (number of elements to average over).
1226 * the (possibly shifted) sum of the elements in the array.
1227 * the (possibly shifted) sum of squares of the elements in the array.
1228 * the shift by which the mean must be corrected or None if `shift` is None.
1229 """
1230 axes = list(set(axes))
1231 keep_dims = deprecated_argument_lookup(
1232 "keepdims", keepdims, "keep_dims", keep_dims)
1233 if keep_dims is None:
1234 keep_dims = False
1235 with ops.name_scope(name, "sufficient_statistics", [x, shift]):
1236 x = ops.convert_to_tensor(x, name="x")
1237 x_shape = x.get_shape()
1238 if x_shape.rank is not None and all(
1239 x_shape.dims[d].value is not None for d in axes):
1240 counts = 1
1241 for d in axes:
1242 counts *= x_shape.dims[d].value
1243 counts = constant_op.constant(counts, dtype=x.dtype)
1244 else: # shape needs to be inferred at runtime.
1245 # Normalize axes to be positive. Required for gather.
1246 rank = array_ops.rank(x)
1247 positive_axes = [axis + rank if axis < 0 else axis for axis in axes]
1248 x_dims = array_ops.gather(
1249 math_ops.cast(array_ops.shape(x), x.dtype), positive_axes)
1250 counts = math_ops.reduce_prod(x_dims, name="count")
1251 if shift is not None:
1252 shift = ops.convert_to_tensor(shift, name="shift")
1253 m_ss = math_ops.subtract(x, shift)
1254 v_ss = math_ops.squared_difference(x, shift)
1255 else: # no shift.
1256 m_ss = x
1257 v_ss = math_ops.square(x)
1258 m_ss = math_ops.reduce_sum(m_ss, axes, keepdims=keep_dims, name="mean_ss")
1259 v_ss = math_ops.reduce_sum(v_ss, axes, keepdims=keep_dims, name="var_ss")
1260 return counts, m_ss, v_ss, shift
1263@tf_export("nn.sufficient_statistics", v1=[])
1264@dispatch.add_dispatch_support
1265def sufficient_statistics_v2(x, axes, shift=None, keepdims=False, name=None):
1266 """Calculate the sufficient statistics for the mean and variance of `x`.
1268 These sufficient statistics are computed using the one pass algorithm on
1269 an input that's optionally shifted. See:
1270 https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Computing_shifted_data
1272 Args:
1273 x: A `Tensor`.
1274 axes: Array of ints. Axes along which to compute mean and variance.
1275 shift: A `Tensor` containing the value by which to shift the data for
1276 numerical stability, or `None` if no shift is to be performed. A shift
1277 close to the true mean provides the most numerically stable results.
1278 keepdims: produce statistics with the same dimensionality as the input.
1279 name: Name used to scope the operations that compute the sufficient stats.
1281 Returns:
1282 Four `Tensor` objects of the same type as `x`:
1284 * the count (number of elements to average over).
1285 * the (possibly shifted) sum of the elements in the array.
1286 * the (possibly shifted) sum of squares of the elements in the array.
1287 * the shift by which the mean must be corrected or None if `shift` is None.
1288 """
1289 return sufficient_statistics(
1290 x=x, axes=axes, shift=shift, keep_dims=keepdims, name=name)
1293@tf_export("nn.normalize_moments")
1294@dispatch.add_dispatch_support
1295def normalize_moments(counts, mean_ss, variance_ss, shift, name=None):
1296 """Calculate the mean and variance of based on the sufficient statistics.
1298 Args:
1299 counts: A `Tensor` containing the total count of the data (one value).
1300 mean_ss: A `Tensor` containing the mean sufficient statistics: the (possibly
1301 shifted) sum of the elements to average over.
1302 variance_ss: A `Tensor` containing the variance sufficient statistics: the
1303 (possibly shifted) sum of squares of the data to compute the variance over.
1304 shift: A `Tensor` containing the value by which the data is shifted for
1305 numerical stability, or `None` if no shift was performed.
1306 name: Name used to scope the operations that compute the moments.
1308 Returns:
1309 Two `Tensor` objects: `mean` and `variance`.
1310 """
1311 with ops.name_scope(name, "normalize", [counts, mean_ss, variance_ss, shift]):
1312 divisor = math_ops.reciprocal(counts, name="divisor")
1313 if shift is not None:
1314 shifted_mean = math_ops.multiply(mean_ss, divisor, name="shifted_mean")
1315 mean = math_ops.add(shifted_mean, shift, name="mean")
1316 else: # no shift.
1317 shifted_mean = math_ops.multiply(mean_ss, divisor, name="mean")
1318 mean = shifted_mean
1319 variance = math_ops.subtract(
1320 math_ops.multiply(variance_ss, divisor),
1321 math_ops.square(shifted_mean),
1322 name="variance")
1323 return (mean, variance)
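# --- Hedged usage sketch (illustration only, not part of the module).
# sufficient_statistics and normalize_moments compose into the same result as
# tf.nn.moments: the count, (shifted) sum and sum of squares are reduced once
# and then turned into mean and variance. Hypothetical helper name; TF 2.x
# eager mode.
def _example_moments_from_sufficient_statistics():
  import tensorflow as tf
  x = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
  counts, mean_ss, var_ss, shift = tf.nn.sufficient_statistics(x, axes=[0])
  mean, variance = tf.nn.normalize_moments(counts, mean_ss, var_ss, shift)
  direct_mean, direct_variance = tf.nn.moments(x, axes=[0])
  tf.debugging.assert_near(mean, direct_mean)
  tf.debugging.assert_near(variance, direct_variance)
  return mean, variance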
1326@tf_export(v1=["nn.moments"])
1327@dispatch.add_dispatch_support
1328def moments(
1329 x,
1330 axes,
1331 shift=None, # pylint: disable=unused-argument
1332 name=None,
1333 keep_dims=None,
1334 keepdims=None):
1335 """Calculate the mean and variance of `x`.
1337 The mean and variance are calculated by aggregating the contents of `x`
1338 across `axes`. If `x` is 1-D and `axes = [0]` this is just the mean
1339 and variance of a vector.
1341 Note: shift is currently not used; the true mean is computed and used.
1343 When using these moments for batch normalization (see
1344 `tf.nn.batch_normalization`):
1346 * for so-called "global normalization", used with convolutional filters with
1347 shape `[batch, height, width, depth]`, pass `axes=[0, 1, 2]`.
1348 * for simple batch normalization pass `axes=[0]` (batch only).
1350 Args:
1351 x: A `Tensor`.
1352 axes: Array of ints. Axes along which to compute mean and
1353 variance.
1354 shift: Not used in the current implementation
1355 name: Name used to scope the operations that compute the moments.
1356 keep_dims: produce moments with the same dimensionality as the input.
1357 keepdims: Alias to keep_dims.
1359 Returns:
1360 Two `Tensor` objects: `mean` and `variance`.
1361 """
1362 keep_dims = deprecated_argument_lookup(
1363 "keepdims", keepdims, "keep_dims", keep_dims)
1364 if keep_dims is None:
1365 keep_dims = False
1366 with ops.name_scope(name, "moments", [x, axes]):
1367 # The dynamic range of fp16 is too limited to support the collection of
1368 # sufficient statistics. As a workaround we simply perform the operations
1369 # on 32-bit floats before converting the mean and variance back to fp16
1370 y = math_ops.cast(x, dtypes.float32) if x.dtype == dtypes.float16 else x
1371 # Compute true mean while keeping the dims for proper broadcasting.
1372 mean = math_ops.reduce_mean(y, axes, keepdims=True, name="mean")
1373 # sample variance, not unbiased variance
1374 # Note: stop_gradient does not change the gradient that gets
1375 # backpropagated to the mean from the variance calculation,
1376 # because that gradient is zero
1377 variance = math_ops.reduce_mean(
1378 math_ops.squared_difference(y, array_ops.stop_gradient(mean)),
1379 axes,
1380 keepdims=True,
1381 name="variance")
1382 if not keep_dims:
1383 mean = array_ops.squeeze(mean, axes)
1384 variance = array_ops.squeeze(variance, axes)
1385 if x.dtype == dtypes.float16:
1386 return (math_ops.cast(mean, dtypes.float16),
1387 math_ops.cast(variance, dtypes.float16))
1388 else:
1389 return (mean, variance)
1392@tf_export("nn.moments", v1=[])
1393@dispatch.add_dispatch_support
1394def moments_v2(
1395 x,
1396 axes,
1397 shift=None,
1398 keepdims=False,
1399 name=None):
1400 """Calculates the mean and variance of `x`.
1402 The mean and variance are calculated by aggregating the contents of `x`
1403 across `axes`. If `x` is 1-D and `axes = [0]` this is just the mean
1404 and variance of a vector.
1406 Note: shift is currently not used; the true mean is computed and used.
1408 When using these moments for batch normalization (see
1409 `tf.nn.batch_normalization`):
1411 * for so-called "global normalization", used with convolutional filters with
1412 shape `[batch, height, width, depth]`, pass `axes=[0, 1, 2]`.
1413 * for simple batch normalization pass `axes=[0]` (batch only).
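A brief sketch of the `keepdims=True` form that pairs naturally with
`tf.nn.batch_normalization` (shapes are illustrative):

```python
import tensorflow as tf

x = tf.random.normal([8, 4, 4, 3])   # [batch, height, width, depth]
mean, variance = tf.nn.moments(x, axes=[0, 1, 2], keepdims=True)
# mean.shape == variance.shape == [1, 1, 1, 3], broadcastable against x.
```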
1415 Args:
1416 x: A `Tensor`.
1417 axes: Array of ints. Axes along which to compute mean and
1418 variance.
1419 shift: Not used in the current implementation.
1420 keepdims: Produce moments with the same dimensionality as the input.
1421 name: Name used to scope the operations that compute the moments.
1423 Returns:
1424 Two `Tensor` objects: `mean` and `variance`.
1425 """
1426 return moments(x=x, axes=axes, shift=shift, name=name, keep_dims=keepdims)
1429@tf_export(v1=["nn.weighted_moments"])
1430@dispatch.add_dispatch_support
1431def weighted_moments(x, axes, frequency_weights, name=None, keep_dims=None,
1432 keepdims=None):
1433 """Returns the frequency-weighted mean and variance of `x`.
1435 Args:
1436 x: A tensor.
1437 axes: 1-d tensor of int32 values; these are the axes along which
1438 to compute mean and variance.
1439 frequency_weights: A tensor of positive weights which can be
1440 broadcast with x.
1441 name: Name used to scope the operation.
1442 keep_dims: Produce moments with the same dimensionality as the input.
1443 keepdims: Alias of keep_dims.
1445 Returns:
1446 Two tensors: `weighted_mean` and `weighted_variance`.
1447 """
1448 keep_dims = deprecated_argument_lookup(
1449 "keepdims", keepdims, "keep_dims", keep_dims)
1450 if keep_dims is None:
1451 keep_dims = False
1452 with ops.name_scope(name, "weighted_moments", [x, frequency_weights, axes]):
1453 x = ops.convert_to_tensor(x, name="x")
1454 frequency_weights = ops.convert_to_tensor(
1455 frequency_weights, name="frequency_weights")
1457 # Unlike moments(), this just uses a simpler two-pass method.
1459 # See comment in moments() WRT precision; it applies here too.
1460 needs_cast = x.dtype == dtypes.float16
1461 if needs_cast:
1462 x = math_ops.cast(x, dtypes.float32)
1464 if frequency_weights.dtype != x.dtype:
1465 frequency_weights = math_ops.cast(frequency_weights, x.dtype)
1467 # Note that we use keepdims=True for our reductions regardless of the arg;
1468 # this is so that the results remain broadcast-compatible with the inputs.
1469 weighted_input_sum = math_ops.reduce_sum(
1470 frequency_weights * x, axes, name="weighted_input_sum", keepdims=True)
1472 # The shape of the weights isn't necessarily the same as x's
1473 # shape, just broadcast-compatible with it -- so this expression
1474 # performs broadcasting to give a per-item weight, with the same
1475 # shape as (frequency_weights * x). This avoids having to reason
1476 # through all the broadcast logic to compute a correct
1477 # sum_of_weights.
1478 broadcasted_weights = frequency_weights + array_ops.zeros_like(x)
1480 sum_of_weights = math_ops.reduce_sum(
1481 broadcasted_weights, axes, name="sum_of_weights", keepdims=True)
1483 weighted_mean = math_ops.div_no_nan(weighted_input_sum, sum_of_weights)
1485 # Have the weighted mean; now on to variance:
1486 weighted_distsq = math_ops.reduce_sum(
1487 frequency_weights * math_ops.squared_difference(x, weighted_mean),
1488 axes,
1489 name="weighted_distsq",
1490 keepdims=True)
1492 weighted_variance = math_ops.div_no_nan(weighted_distsq, sum_of_weights)
1494 if not keep_dims:
1495 weighted_mean = array_ops.squeeze(weighted_mean, axis=axes)
1496 weighted_variance = array_ops.squeeze(
1497 weighted_variance, axis=axes)
1499 if needs_cast:
1500 weighted_mean = math_ops.cast(weighted_mean, dtypes.float16)
1501 weighted_variance = math_ops.cast(weighted_variance, dtypes.float16)
1503 return weighted_mean, weighted_variance
1506@tf_export("nn.weighted_moments", v1=[])
1507@dispatch.add_dispatch_support
1508def weighted_moments_v2(x, axes, frequency_weights, keepdims=False, name=None):
1509 """Returns the frequency-weighted mean and variance of `x`.
1511 Args:
1512 x: A tensor.
1513 axes: 1-d tensor of int32 values; these are the axes along which
1514 to compute mean and variance.
1515 frequency_weights: A tensor of positive weights which can be
1516 broadcast with x.
1517 keepdims: Produce moments with the same dimensionality as the input.
1518 name: Name used to scope the operation.
1520 Returns:
1521 Two tensors: `weighted_mean` and `weighted_variance`.
1522 """
1523 return weighted_moments(
1524 x=x,
1525 axes=axes,
1526 frequency_weights=frequency_weights,
1527 name=name,
1528 keep_dims=keepdims)
1531@tf_export("nn.batch_normalization")
1532@dispatch.add_dispatch_support
1533def batch_normalization(x,
1534 mean,
1535 variance,
1536 offset,
1537 scale,
1538 variance_epsilon,
1539 name=None):
1540 r"""Batch normalization.
1542 Normalizes a tensor by `mean` and `variance`, and applies (optionally) a
1543 `scale` \\(\gamma\\) to it, as well as an `offset` \\(\beta\\):
1545 \\(\frac{\gamma(x-\mu)}{\sigma}+\beta\\)
1547 `mean`, `variance`, `offset` and `scale` are all expected to be of one of two
1548 shapes:
1550 * In all generality, they can have the same number of dimensions as the
1551 input `x`, with identical sizes as `x` for the dimensions that are not
1552 normalized over (the 'depth' dimension(s)), and dimension 1 for the
1553 others which are being normalized over.
1554 `mean` and `variance` in this case would typically be the outputs of
1555 `tf.nn.moments(..., keepdims=True)` during training, or running averages
1556 thereof during inference.
1557 * In the common case where the 'depth' dimension is the last dimension in
1558 the input tensor `x`, they may be one dimensional tensors of the same
1559 size as the 'depth' dimension.
1560 This is the case for example for the common `[batch, depth]` layout of
1561 fully-connected layers, and `[batch, height, width, depth]` for
1562 convolutions.
1563 `mean` and `variance` in this case would typically be the outputs of
1564 `tf.nn.moments(..., keepdims=False)` during training, or running averages
1565 thereof during inference.
1567 See equation 11 in Algorithm 2 of source:
1568 [Batch Normalization: Accelerating Deep Network Training by
1569 Reducing Internal Covariate Shift; S. Ioffe, C. Szegedy]
1570 (http://arxiv.org/abs/1502.03167).
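A minimal usage sketch for the common `[batch, depth]` case (illustrative
values and epsilon):

```python
import tensorflow as tf

x = tf.random.normal([32, 10])                # [batch, depth]
mean, variance = tf.nn.moments(x, axes=[0])   # per-depth statistics
offset = tf.zeros([10])
scale = tf.ones([10])
y = tf.nn.batch_normalization(x, mean, variance, offset, scale,
                              variance_epsilon=1e-3)
# y is approximately zero-mean, unit-variance along the batch axis.
```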
1572 Args:
1573 x: Input `Tensor` of arbitrary dimensionality.
1574 mean: A mean `Tensor`.
1575 variance: A variance `Tensor`.
1576 offset: An offset `Tensor`, often denoted \\(\beta\\) in equations, or
1577 None. If present, will be added to the normalized tensor.
1578 scale: A scale `Tensor`, often denoted \\(\gamma\\) in equations, or
1579 `None`. If present, the scale is applied to the normalized tensor.
1580 variance_epsilon: A small float number to avoid dividing by 0.
1581 name: A name for this operation (optional).
1583 Returns:
1584 the normalized, scaled, offset tensor.
1586 References:
1587 Batch Normalization - Accelerating Deep Network Training by Reducing
1588 Internal Covariate Shift:
1589 [Ioffe et al., 2015](http://arxiv.org/abs/1502.03167)
1590 ([pdf](http://proceedings.mlr.press/v37/ioffe15.pdf))
1591 """
1592 with ops.name_scope(name, "batchnorm", [x, mean, variance, scale, offset]):
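# The code below computes scale * (x - mean) * rsqrt(variance + eps) + offset,
# refactored as x * inv + (offset - mean * inv) where inv already folds in
# `scale`, so only two broadcast operands are applied to `x`.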
1593 inv = math_ops.rsqrt(variance + variance_epsilon)
1594 if scale is not None:
1595 inv *= scale
1596 # Note: tensorflow/contrib/quantize/python/fold_batch_norms.py depends on
1597 # the precise order of ops that are generated by the expression below.
1598 return x * math_ops.cast(inv, x.dtype) + math_ops.cast(
1599 offset - mean * inv if offset is not None else -mean * inv, x.dtype)
1602@tf_export(v1=["nn.fused_batch_norm"])
1603@dispatch.add_dispatch_support
1604def fused_batch_norm(
1605 x,
1606 scale,
1607 offset, # pylint: disable=invalid-name
1608 mean=None,
1609 variance=None,
1610 epsilon=0.001,
1611 data_format="NHWC",
1612 is_training=True,
1613 name=None,
1614 exponential_avg_factor=1.0):
1615 r"""Batch normalization.
1618 See Source: [Batch Normalization: Accelerating Deep Network Training by
1619 Reducing Internal Covariate Shift; S. Ioffe, C. Szegedy]
1620 (http://arxiv.org/abs/1502.03167).
1622 Args:
1623 x: Input `Tensor` of 4 or 5 dimensions.
1624 scale: A `Tensor` of 1 dimension for scaling.
1625 offset: A `Tensor` of 1 dimension for bias.
1626 mean: A `Tensor` of 1 dimension for population mean. The shape and meaning
1627 of this argument depends on the value of is_training and
1628 exponential_avg_factor as follows:
1629 is_training==False (inference):
1630 Mean must be a `Tensor` of the same shape as scale containing the
1631 estimated population mean computed during training.
1632 is_training==True and exponential_avg_factor == 1.0:
1633 Mean must be None.
1634 is_training==True and exponential_avg_factor != 1.0:
1635 Mean must be a `Tensor` of the same shape as scale containing the
1636 exponential running mean.
1637 variance: A `Tensor` of 1 dimension for population variance. The shape and
1638 meaning of this argument depends on the value of is_training and
1639 exponential_avg_factor as follows:
1640 is_training==False (inference):
1641 Variance must be a `Tensor` of the same shape as scale containing
1642 the estimated population variance computed during training.
1643 is_training==True and exponential_avg_factor == 1.0:
1644 Variance must be None.
1645 is_training==True and exponential_avg_factor != 1.0:
1646 Variance must be a `Tensor` of the same shape as scale containing
1647 the exponential running variance.
1648 epsilon: A small float number added to the variance of x.
1649 data_format: The data format for x. Supports "NHWC" (default) or "NCHW" for
1650 4D tensors and "NDHWC" or "NCDHW" for 5D tensors.
1651 is_training: A bool value to specify if the operation is used for
1652 training or inference.
1653 name: A name for this operation (optional).
1654 exponential_avg_factor: A float number (usually between 0 and 1) used
1655 for controlling the decay of the running
1656 population average of mean and variance.
1657 If set to 1.0, the current batch average is
1658 returned.
1660 Returns:
1661 y: A 4D or 5D Tensor for the normalized, scaled, offset x.
1662 running_mean: A 1D Tensor for the exponential running mean of x.
1663 The output value is (1 - exponential_avg_factor) * mean +
1664 exponential_avg_factor * batch_mean, where batch_mean
1665 is the mean of the current batch in x.
1666 running_var: A 1D Tensor for the exponential running variance of x.
1667 The output value is (1 - exponential_avg_factor) * variance +
1668 exponential_avg_factor * batch_variance, where batch_variance
1669 is the variance of the current batch in x.
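A worked instance of the running-average update described above (plain
Python arithmetic; the values are made up):

```python
exponential_avg_factor = 0.1
running_mean_in, batch_mean = 0.0, 2.0
running_mean_out = ((1 - exponential_avg_factor) * running_mean_in
                    + exponential_avg_factor * batch_mean)   # 0.2
```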
1671 References:
1672 Batch Normalization - Accelerating Deep Network Training by Reducing
1673 Internal Covariate Shift:
1674 [Ioffe et al., 2015](http://proceedings.mlr.press/v37/ioffe15.html)
1675 ([pdf](http://proceedings.mlr.press/v37/ioffe15.pdf))
1676 """
1677 if (not is_training or exponential_avg_factor != 1.0) and (
1678 (mean is None) or (variance is None)):
1679 raise ValueError("Both `mean` and `variance` must be a 1D tensor when "
1680 "`is_training` is False or `exponential_avg_factor` != "
1681 f"1.0. Received: `mean` {mean!r} and `variance` "
1682 f"{variance!r}")
1683 x = ops.convert_to_tensor(x, name="input")
1684 scale = ops.convert_to_tensor(scale, name="scale")
1685 offset = ops.convert_to_tensor(offset, name="offset")
1686 if mean is None:
1687 mean = constant_op.constant([])
1688 if variance is None:
1689 variance = constant_op.constant([])
1691 y, running_mean, running_var, _, _, _ = gen_nn_ops.fused_batch_norm_v3(
1692 x,
1693 scale,
1694 offset,
1695 mean,
1696 variance,
1697 epsilon=epsilon,
1698 exponential_avg_factor=exponential_avg_factor,
1699 data_format=data_format,
1700 is_training=is_training,
1701 name=name)
1702 return y, running_mean, running_var
1705@tf_export(v1=["nn.batch_norm_with_global_normalization"])
1706@dispatch.add_dispatch_support
1707def batch_norm_with_global_normalization(t=None,
1708 m=None,
1709 v=None,
1710 beta=None,
1711 gamma=None,
1712 variance_epsilon=None,
1713 scale_after_normalization=None,
1714 name=None,
1715 input=None, # pylint: disable=redefined-builtin
1716 mean=None,
1717 variance=None):
1718 """Batch normalization.
1720 This op is deprecated. See `tf.nn.batch_normalization`.
1722 Args:
1723 t: A 4D input Tensor.
1724 m: A 1D mean Tensor with size matching the last dimension of t.
1725 This is the first output from tf.nn.moments,
1726 or a saved moving average thereof.
1727 v: A 1D variance Tensor with size matching the last dimension of t.
1728 This is the second output from tf.nn.moments,
1729 or a saved moving average thereof.
1730 beta: A 1D beta Tensor with size matching the last dimension of t.
1731 An offset to be added to the normalized tensor.
1732 gamma: A 1D gamma Tensor with size matching the last dimension of t.
1733 If "scale_after_normalization" is true, this tensor will be multiplied
1734 with the normalized tensor.
1735 variance_epsilon: A small float number to avoid dividing by 0.
1736 scale_after_normalization: A bool indicating whether the resulting tensor
1737 should be multiplied by gamma.
1738 name: A name for this operation (optional).
1739 input: Alias for t.
1740 mean: Alias for m.
1741 variance: Alias for v.
1743 Returns:
1744 A batch-normalized `t`.
1746 References:
1747 Batch Normalization - Accelerating Deep Network Training by Reducing
1748 Internal Covariate Shift:
1749 [Ioffe et al., 2015](http://proceedings.mlr.press/v37/ioffe15.html)
1750 ([pdf](http://proceedings.mlr.press/v37/ioffe15.pdf))
1751 """
1752 t = deprecated_argument_lookup("input", input, "t", t)
1753 m = deprecated_argument_lookup("mean", mean, "m", m)
1754 v = deprecated_argument_lookup("variance", variance, "v", v)
1755 return batch_normalization(t, m, v, beta, gamma if scale_after_normalization
1756 else None, variance_epsilon, name)
1759# pylint: disable=redefined-builtin,line-too-long
1760@tf_export("nn.batch_norm_with_global_normalization", v1=[])
1761@dispatch.add_dispatch_support
1762def batch_norm_with_global_normalization_v2(input,
1763 mean,
1764 variance,
1765 beta,
1766 gamma,
1767 variance_epsilon,
1768 scale_after_normalization,
1769 name=None):
1770 """Batch normalization.
1772 This op is deprecated. See `tf.nn.batch_normalization`.
1774 Args:
1775 input: A 4D input Tensor.
1776 mean: A 1D mean Tensor with size matching the last dimension of `input`.
1777 This is the first output from tf.nn.moments,
1778 or a saved moving average thereof.
1779 variance: A 1D variance Tensor with size matching the last dimension of `input`.
1780 This is the second output from tf.nn.moments,
1781 or a saved moving average thereof.
1782 beta: A 1D beta Tensor with size matching the last dimension of `input`.
1783 An offset to be added to the normalized tensor.
1784 gamma: A 1D gamma Tensor with size matching the last dimension of `input`.
1785 If "scale_after_normalization" is true, this tensor will be multiplied
1786 with the normalized tensor.
1787 variance_epsilon: A small float number to avoid dividing by 0.
1788 scale_after_normalization: A bool indicating whether the resulting tensor
1789 should be multiplied by gamma.
1790 name: A name for this operation (optional).
1792 Returns:
1793 A batch-normalized `input`.
1795 References:
1796 Batch Normalization - Accelerating Deep Network Training by Reducing Internal Covariate Shift:
1797 [Ioffe et al., 2015](http://proceedings.mlr.press/v37/ioffe15.html)
1798 ([pdf](http://proceedings.mlr.press/v37/ioffe15.pdf))
1799 """
1800 return batch_norm_with_global_normalization(t=input,
1801 m=mean,
1802 v=variance,
1803 beta=beta,
1804 gamma=gamma,
1805 variance_epsilon=variance_epsilon,
1806 scale_after_normalization=scale_after_normalization,
1807 name=name)
1809# pylint: enable=redefined-builtin,line-too-long
1812def _sum_rows(x):
1813 """Returns a vector summing up each row of the matrix x."""
1814 # _sum_rows(x) is equivalent to math_ops.reduce_sum(x, 1) when x is
1815 # a matrix. The gradient of _sum_rows(x) is more efficient than
1816 # reduce_sum(x, 1)'s gradient in today's implementation. Therefore,
1817 # we use _sum_rows(x) in the nce_loss() computation since the loss
1818 # is mostly used for training.
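  # Illustrative example: for x = [[1., 2.], [3., 4.]], `ones` has shape
  # [2, 1], matmul(x, ones) is [[3.], [7.]], and the final reshape yields
  # [3., 7.] -- the same values as math_ops.reduce_sum(x, 1).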
1819 cols = array_ops.shape(x)[1]
1820 ones_shape = array_ops_stack.stack([cols, 1])
1821 ones = array_ops.ones(ones_shape, x.dtype)
1822 return array_ops.reshape(math_ops.matmul(x, ones), [-1])
1825def _compute_sampled_logits(weights,
1826 biases,
1827 labels,
1828 inputs,
1829 num_sampled,
1830 num_classes,
1831 num_true=1,
1832 sampled_values=None,
1833 subtract_log_q=True,
1834 remove_accidental_hits=False,
1835 partition_strategy="mod",
1836 name=None,
1837 seed=None):
1838 """Helper function for nce_loss and sampled_softmax_loss functions.
1840 Computes sampled output training logits and labels suitable for implementing
1841 e.g. noise-contrastive estimation (see nce_loss) or sampled softmax (see
1842 sampled_softmax_loss).
1844 Note: In the case where num_true > 1, we assign to each target class
1845 the target probability 1 / num_true so that the target probabilities
1846 sum to 1 per-example.
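A shape sketch (illustrative numbers only):

```python
# batch_size=3, num_true=2, num_sampled=5 (hypothetical values)
# out_logits.shape == out_labels.shape == [3, num_true + num_sampled] == [3, 7]
# each row of out_labels == [0.5, 0.5, 0., 0., 0., 0., 0.]
```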
1848 Args:
1849 weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
1850 objects whose concatenation along dimension 0 has shape
1851 `[num_classes, dim]`. The (possibly-partitioned) class embeddings.
1852 biases: A `Tensor` of shape `[num_classes]`. The (possibly-partitioned)
1853 class biases.
1854 labels: A `Tensor` of type `int64` and shape `[batch_size,
1855 num_true]`. The target classes. Note that this format differs from
1856 the `labels` argument of `nn.softmax_cross_entropy_with_logits`.
1857 inputs: A `Tensor` of shape `[batch_size, dim]`. The forward
1858 activations of the input network.
1859 num_sampled: An `int`. The number of classes to randomly sample per batch.
1860 num_classes: An `int`. The number of possible classes.
1861 num_true: An `int`. The number of target classes per training example.
1862 sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
1863 `sampled_expected_count`) returned by a `*_candidate_sampler` function.
1864 (if None, we default to `log_uniform_candidate_sampler`)
1865 subtract_log_q: A `bool`. Whether to subtract the log expected count of
1866 the labels in the sample to get the logits of the true labels.
1867 Default is True. Turn off for Negative Sampling.
1868 remove_accidental_hits: A `bool`. Whether to remove "accidental hits"
1869 where a sampled class equals one of the target classes. Default is
1870 False.
1871 partition_strategy: A string specifying the partitioning strategy, relevant
1872 if `len(weights) > 1`. Currently `"div"` and `"mod"` are supported.
1873 Default is `"mod"`. See `tf.nn.embedding_lookup` for more details.
1874 name: A name for the operation (optional).
1875 seed: Random seed for candidate sampling. Defaults to None, which doesn't
1876 set the op-level random seed for candidate sampling.
1877 Returns:
1878 out_logits: `Tensor` object with shape
1879 `[batch_size, num_true + num_sampled]`, for passing to either
1880 `nn.sigmoid_cross_entropy_with_logits` (NCE) or
1881 `nn.softmax_cross_entropy_with_logits` (sampled softmax).
1882 out_labels: A Tensor object with the same shape as `out_logits`.
1883 """
1885 if isinstance(weights, variables.PartitionedVariable):
1886 weights = list(weights)
1887 if not isinstance(weights, list):
1888 weights = [weights]
1890 with ops.name_scope(name, "compute_sampled_logits",
1891 weights + [biases, inputs, labels]):
1892 if labels.dtype != dtypes.int64:
1893 labels = math_ops.cast(labels, dtypes.int64)
1894 labels_flat = array_ops.reshape(labels, [-1])
1896 # Sample the negative labels.
1897 # sampled shape: [num_sampled] tensor
1898 # true_expected_count shape = [batch_size, 1] tensor
1899 # sampled_expected_count shape = [num_sampled] tensor
1900 if sampled_values is None:
1901 sampled_values = candidate_sampling_ops.log_uniform_candidate_sampler(
1902 true_classes=labels,
1903 num_true=num_true,
1904 num_sampled=num_sampled,
1905 unique=True,
1906 range_max=num_classes,
1907 seed=seed)
1908 # NOTE: pylint cannot tell that 'sampled_values' is a sequence
1909 # pylint: disable=unpacking-non-sequence
1910 sampled, true_expected_count, sampled_expected_count = (
1911 array_ops.stop_gradient(s) for s in sampled_values)
1912 # pylint: enable=unpacking-non-sequence
1913 sampled = math_ops.cast(sampled, dtypes.int64)
1915 # labels_flat is a [batch_size * num_true] tensor
1916 # sampled is a [num_sampled] int tensor
1917 all_ids = array_ops.concat([labels_flat, sampled], 0)
1919 # Retrieve the true weights and the logits of the sampled weights.
1921 # weights shape is [num_classes, dim]
1922 all_w = embedding_ops.embedding_lookup(
1923 weights, all_ids, partition_strategy=partition_strategy)
1924 if all_w.dtype != inputs.dtype:
1925 all_w = math_ops.cast(all_w, inputs.dtype)
1927 # true_w shape is [batch_size * num_true, dim]
1928 true_w = array_ops.slice(all_w, [0, 0],
1929 array_ops_stack.stack(
1930 [array_ops.shape(labels_flat)[0], -1]))
1932 sampled_w = array_ops.slice(
1933 all_w,
1934 array_ops_stack.stack([array_ops.shape(labels_flat)[0], 0]), [-1, -1])
1935 # inputs has shape [batch_size, dim]
1936 # sampled_w has shape [num_sampled, dim]
1937 # Apply X*W', which yields [batch_size, num_sampled]
1938 sampled_logits = math_ops.matmul(inputs, sampled_w, transpose_b=True)
1940 # Retrieve the true and sampled biases, compute the true logits, and
1941 # add the biases to the true and sampled logits.
1942 all_b = embedding_ops.embedding_lookup(
1943 biases, all_ids, partition_strategy=partition_strategy)
1944 if all_b.dtype != inputs.dtype:
1945 all_b = math_ops.cast(all_b, inputs.dtype)
1946 # true_b is a [batch_size * num_true] tensor
1947 # sampled_b is a [num_sampled] float tensor
1948 true_b = array_ops.slice(all_b, [0], array_ops.shape(labels_flat))
1949 sampled_b = array_ops.slice(all_b, array_ops.shape(labels_flat), [-1])
1951 # inputs shape is [batch_size, dim]
1952 # true_w shape is [batch_size * num_true, dim]
1953 # row_wise_dots is [batch_size, num_true, dim]
1954 dim = array_ops.shape(true_w)[1:2]
1955 new_true_w_shape = array_ops.concat([[-1, num_true], dim], 0)
1956 row_wise_dots = math_ops.multiply(
1957 array_ops.expand_dims(inputs, 1),
1958 array_ops.reshape(true_w, new_true_w_shape))
1959 # We want the row-wise dot plus biases which yields a
1960 # [batch_size, num_true] tensor of true_logits.
1961 dots_as_matrix = array_ops.reshape(row_wise_dots,
1962 array_ops.concat([[-1], dim], 0))
1963 true_logits = array_ops.reshape(_sum_rows(dots_as_matrix), [-1, num_true])
1964 true_b = array_ops.reshape(true_b, [-1, num_true])
1965 true_logits += true_b
1966 sampled_logits += sampled_b
1968 if remove_accidental_hits:
1969 acc_hits = candidate_sampling_ops.compute_accidental_hits(
1970 labels, sampled, num_true=num_true)
1971 acc_indices, acc_ids, acc_weights = acc_hits
1973 # This is how SparseToDense expects the indices.
1974 acc_indices_2d = array_ops.reshape(acc_indices, [-1, 1])
1975 acc_ids_2d_int32 = array_ops.reshape(
1976 math_ops.cast(acc_ids, dtypes.int32), [-1, 1])
1977 sparse_indices = array_ops.concat([acc_indices_2d, acc_ids_2d_int32], 1,
1978 "sparse_indices")
1979 # Create sampled_logits_shape = [batch_size, num_sampled]
1980 sampled_logits_shape = array_ops.concat(
1981 [array_ops.shape(labels)[:1],
1982 array_ops.expand_dims(num_sampled, 0)], 0)
1983 if sampled_logits.dtype != acc_weights.dtype:
1984 acc_weights = math_ops.cast(acc_weights, sampled_logits.dtype)
1985 sampled_logits += gen_sparse_ops.sparse_to_dense(
1986 sparse_indices,
1987 sampled_logits_shape,
1988 acc_weights,
1989 default_value=0.0,
1990 validate_indices=False)
1992 if subtract_log_q:
1993 # Subtract log of Q(l), prior probability that l appears in sampled.
1994 true_logits -= math_ops.log(true_expected_count)
1995 sampled_logits -= math_ops.log(sampled_expected_count)
1997 # Construct output logits and labels. The true labels/logits start at col 0.
1998 out_logits = array_ops.concat([true_logits, sampled_logits], 1)
2000 # true_logits is a float tensor, ones_like(true_logits) is a float
2001 # tensor of ones. We then divide by num_true to ensure the per-example
2002 # labels sum to 1.0, i.e. form a proper probability distribution.
2003 out_labels = array_ops.concat([
2004 array_ops.ones_like(true_logits) / num_true,
2005 array_ops.zeros_like(sampled_logits)
2006 ], 1)
2008 return out_logits, out_labels
2011@tf_export("nn.nce_loss", v1=[])
2012@dispatch.add_dispatch_support
2013def nce_loss_v2(weights,
2014 biases,
2015 labels,
2016 inputs,
2017 num_sampled,
2018 num_classes,
2019 num_true=1,
2020 sampled_values=None,
2021 remove_accidental_hits=False,
2022 name="nce_loss"):
2023 """Computes and returns the noise-contrastive estimation training loss.
2025 See [Noise-contrastive estimation: A new estimation principle for
2026 unnormalized statistical
2027 models](http://proceedings.mlr.press/v9/gutmann10a).
2028 Also see our [Candidate Sampling Algorithms
2029 Reference](https://www.tensorflow.org/extras/candidate_sampling.pdf)
2031 A common use case is to use this method for training, and calculate the full
2032 sigmoid loss for evaluation or inference as in the following example:
2034 ```python
2035 if mode == "train":
2036 loss = tf.nn.nce_loss(
2037 weights=weights,
2038 biases=biases,
2039 labels=labels,
2040 inputs=inputs,
2041 ...)
2042 elif mode == "eval":
2043 logits = tf.matmul(inputs, tf.transpose(weights))
2044 logits = tf.nn.bias_add(logits, biases)
2045 labels_one_hot = tf.one_hot(labels, n_classes)
2046 loss = tf.nn.sigmoid_cross_entropy_with_logits(
2047 labels=labels_one_hot,
2048 logits=logits)
2049 loss = tf.reduce_sum(loss, axis=1)
2050 ```
2052 Note: when doing embedding lookup on `weights` and `biases`, the "div"
2053 partition strategy will be used. Support for other partition strategies
2054 will be added later.
2056 Note: By default this uses a log-uniform (Zipfian) distribution for sampling,
2057 so your labels must be sorted in order of decreasing frequency to achieve
2058 good results. For more details, see
2059 `tf.random.log_uniform_candidate_sampler`.
2061 Note: In the case where `num_true` > 1, we assign to each target class
2062 the target probability 1 / `num_true` so that the target probabilities
2063 sum to 1 per-example.
2065 Note: It would be useful to allow a variable number of target classes per
2066 example. We hope to provide this functionality in a future release.
2067 For now, if you have a variable number of target classes, you can pad them
2068 out to a constant number by either repeating them or by padding
2069 with an otherwise unused class.
2071 Args:
2072 weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
2073 objects whose concatenation along dimension 0 has shape [num_classes,
2074 dim]. The (possibly-partitioned) class embeddings.
2075 biases: A `Tensor` of shape `[num_classes]`. The class biases.
2076 labels: A `Tensor` of type `int64` and shape `[batch_size, num_true]`. The
2077 target classes.
2078 inputs: A `Tensor` of shape `[batch_size, dim]`. The forward activations of
2079 the input network.
2080 num_sampled: An `int`. The number of negative classes to randomly sample
2081 per batch. This single sample of negative classes is evaluated for each
2082 element in the batch.
2083 num_classes: An `int`. The number of possible classes.
2084 num_true: An `int`. The number of target classes per training example.
2085 sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
2086 `sampled_expected_count`) returned by a `*_candidate_sampler` function.
2087 (if None, we default to `log_uniform_candidate_sampler`)
2088 remove_accidental_hits: A `bool`. Whether to remove "accidental hits"
2089 where a sampled class equals one of the target classes. If set to `True`,
2090 this is a "Sampled Logistic" loss instead of NCE, and we are learning to
2091 generate log-odds instead of log probabilities. See our [Candidate
2092 Sampling Algorithms Reference]
2093 (https://www.tensorflow.org/extras/candidate_sampling.pdf). Default is
2094 False.
2095 name: A name for the operation (optional).
2097 Returns:
2098 A `batch_size` 1-D tensor of per-example NCE losses.
2099 """
2100 # TODO(yuefengz): get partition_strategy from either variables or distribution
2101 # strategies.
2102 return nce_loss(
2103 weights,
2104 biases,
2105 labels,
2106 inputs,
2107 num_sampled,
2108 num_classes,
2109 num_true=num_true,
2110 sampled_values=sampled_values,
2111 remove_accidental_hits=remove_accidental_hits,
2112 partition_strategy="div",
2113 name=name)
2116@tf_export(v1=["nn.nce_loss"])
2117@dispatch.add_dispatch_support
2118def nce_loss(weights,
2119 biases,
2120 labels,
2121 inputs,
2122 num_sampled,
2123 num_classes,
2124 num_true=1,
2125 sampled_values=None,
2126 remove_accidental_hits=False,
2127 partition_strategy="mod",
2128 name="nce_loss"):
2129 """Computes and returns the noise-contrastive estimation training loss.
2131 A common use case is to use this method for training, and calculate the full
2132 sigmoid loss for evaluation or inference. In this case, you must set
2133 `partition_strategy="div"` for the two losses to be consistent, as in the
2134 following example:
2136 ```python
2137 if mode == "train":
2138 loss = tf.nn.nce_loss(
2139 weights=weights,
2140 biases=biases,
2141 labels=labels,
2142 inputs=inputs,
2143 ...,
2144 partition_strategy="div")
2145 elif mode == "eval":
2146 logits = tf.matmul(inputs, tf.transpose(weights))
2147 logits = tf.nn.bias_add(logits, biases)
2148 labels_one_hot = tf.one_hot(labels, n_classes)
2149 loss = tf.nn.sigmoid_cross_entropy_with_logits(
2150 labels=labels_one_hot,
2151 logits=logits)
2152 loss = tf.reduce_sum(loss, axis=1)
2153 ```
2155 Note: By default this uses a log-uniform (Zipfian) distribution for sampling,
2156 so your labels must be sorted in order of decreasing frequency to achieve
2157 good results. For more details, see
2158 `tf.random.log_uniform_candidate_sampler`.
2160 Note: In the case where `num_true` > 1, we assign to each target class
2161 the target probability 1 / `num_true` so that the target probabilities
2162 sum to 1 per-example.
2164 Note: It would be useful to allow a variable number of target classes per
2165 example. We hope to provide this functionality in a future release.
2166 For now, if you have a variable number of target classes, you can pad them
2167 out to a constant number by either repeating them or by padding
2168 with an otherwise unused class.
2170 Args:
2171 weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
2172 objects whose concatenation along dimension 0 has shape
2173 [num_classes, dim]. The (possibly-partitioned) class embeddings.
2174 biases: A `Tensor` of shape `[num_classes]`. The class biases.
2175 labels: A `Tensor` of type `int64` and shape `[batch_size,
2176 num_true]`. The target classes.
2177 inputs: A `Tensor` of shape `[batch_size, dim]`. The forward
2178 activations of the input network.
2179 num_sampled: An `int`. The number of negative classes to randomly sample
2180 per batch. This single sample of negative classes is evaluated for each
2181 element in the batch.
2182 num_classes: An `int`. The number of possible classes.
2183 num_true: An `int`. The number of target classes per training example.
2184 sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
2185 `sampled_expected_count`) returned by a `*_candidate_sampler` function.
2186 (if None, we default to `log_uniform_candidate_sampler`)
2187 remove_accidental_hits: A `bool`. Whether to remove "accidental hits"
2188 where a sampled class equals one of the target classes. If set to
2189 `True`, this is a "Sampled Logistic" loss instead of NCE, and we are
2190 learning to generate log-odds instead of log probabilities. See
2191 our Candidate Sampling Algorithms Reference
2192 ([pdf](https://www.tensorflow.org/extras/candidate_sampling.pdf)).
2193 Default is False.
2194 partition_strategy: A string specifying the partitioning strategy, relevant
2195 if `len(weights) > 1`. Currently `"div"` and `"mod"` are supported.
2196 Default is `"mod"`. See `tf.nn.embedding_lookup` for more details.
2197 name: A name for the operation (optional).
2199 Returns:
2200 A `batch_size` 1-D tensor of per-example NCE losses.
2202 References:
2203 Noise-contrastive estimation - A new estimation principle for unnormalized
2204 statistical models:
2205 [Gutmann et al., 2010](http://proceedings.mlr.press/v9/gutmann10a)
2206 ([pdf](http://proceedings.mlr.press/v9/gutmann10a/gutmann10a.pdf))
2207 """
2208 logits, labels = _compute_sampled_logits(
2209 weights=weights,
2210 biases=biases,
2211 labels=labels,
2212 inputs=inputs,
2213 num_sampled=num_sampled,
2214 num_classes=num_classes,
2215 num_true=num_true,
2216 sampled_values=sampled_values,
2217 subtract_log_q=True,
2218 remove_accidental_hits=remove_accidental_hits,
2219 partition_strategy=partition_strategy,
2220 name=name)
2221 sampled_losses = sigmoid_cross_entropy_with_logits(
2222 labels=labels, logits=logits, name="sampled_losses")
2223 # sampled_losses is batch_size x {true_loss, sampled_losses...}
2224 # We sum out true and sampled losses.
2225 return _sum_rows(sampled_losses)
2228@tf_export("nn.sampled_softmax_loss", v1=[])
2229@dispatch.add_dispatch_support
2230def sampled_softmax_loss_v2(weights,
2231 biases,
2232 labels,
2233 inputs,
2234 num_sampled,
2235 num_classes,
2236 num_true=1,
2237 sampled_values=None,
2238 remove_accidental_hits=True,
2239 seed=None,
2240 name="sampled_softmax_loss"):
2241 """Computes and returns the sampled softmax training loss.
2243 This is a faster way to train a softmax classifier over a huge number of
2244 classes.
2246 This operation is for training only. It is generally an underestimate of
2247 the full softmax loss.
2249 A common use case is to use this method for training, and calculate the full
2250 softmax loss for evaluation or inference as in the following example:
2252 ```python
2253 if mode == "train":
2254 loss = tf.nn.sampled_softmax_loss(
2255 weights=weights,
2256 biases=biases,
2257 labels=labels,
2258 inputs=inputs,
2259 ...)
2260 elif mode == "eval":
2261 logits = tf.matmul(inputs, tf.transpose(weights))
2262 logits = tf.nn.bias_add(logits, biases)
2263 labels_one_hot = tf.one_hot(labels, n_classes)
2264 loss = tf.nn.softmax_cross_entropy_with_logits(
2265 labels=labels_one_hot,
2266 logits=logits)
2267 ```
2269 See our [Candidate Sampling Algorithms Reference]
2270 (https://www.tensorflow.org/extras/candidate_sampling.pdf)
2272 Also see Section 3 of [Jean et al., 2014](http://arxiv.org/abs/1412.2007)
2273 ([pdf](http://arxiv.org/pdf/1412.2007.pdf)) for the math.
2275 Note: when doing embedding lookup on `weights` and `biases`, the "div"
2276 partition strategy will be used. Support for other partition strategies
2277 will be added later.
2279 Args:
2280 weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
2281 objects whose concatenation along dimension 0 has shape [num_classes,
2282 dim]. The (possibly-sharded) class embeddings.
2283 biases: A `Tensor` of shape `[num_classes]`. The class biases.
2284 labels: A `Tensor` of type `int64` and shape `[batch_size, num_true]`. The
2285 target classes. Note that this format differs from the `labels` argument
2286 of `nn.softmax_cross_entropy_with_logits`.
2287 inputs: A `Tensor` of shape `[batch_size, dim]`. The forward activations of
2288 the input network.
2289 num_sampled: An `int`. The number of classes to randomly sample per batch.
2290 num_classes: An `int`. The number of possible classes.
2291 num_true: An `int`. The number of target classes per training example.
2292 sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
2293 `sampled_expected_count`) returned by a `*_candidate_sampler` function.
2294 (if None, we default to `log_uniform_candidate_sampler`)
2295 remove_accidental_hits: A `bool`. Whether to remove "accidental hits"
2296 where a sampled class equals one of the target classes. Default is True.
2297 seed: Random seed for candidate sampling. Defaults to None, which doesn't
2298 set the op-level random seed for candidate sampling.
2299 name: A name for the operation (optional).
2301 Returns:
2302 A `batch_size` 1-D tensor of per-example sampled softmax losses.
2304 """
2305 return sampled_softmax_loss(
2306 weights,
2307 biases,
2308 labels,
2309 inputs,
2310 num_sampled,
2311 num_classes,
2312 num_true=num_true,
2313 sampled_values=sampled_values,
2314 remove_accidental_hits=remove_accidental_hits,
2315 partition_strategy="div",
2316 name=name,
2317 seed=seed)
2320@tf_export(v1=["nn.sampled_softmax_loss"])
2321@dispatch.add_dispatch_support
2322def sampled_softmax_loss(weights,
2323 biases,
2324 labels,
2325 inputs,
2326 num_sampled,
2327 num_classes,
2328 num_true=1,
2329 sampled_values=None,
2330 remove_accidental_hits=True,
2331 partition_strategy="mod",
2332 name="sampled_softmax_loss",
2333 seed=None):
2334 """Computes and returns the sampled softmax training loss.
2336 This is a faster way to train a softmax classifier over a huge number of
2337 classes.
2339 This operation is for training only. It is generally an underestimate of
2340 the full softmax loss.
2342 A common use case is to use this method for training, and calculate the full
2343 softmax loss for evaluation or inference. In this case, you must set
2344 `partition_strategy="div"` for the two losses to be consistent, as in the
2345 following example:
2347 ```python
2348 if mode == "train":
2349 loss = tf.nn.sampled_softmax_loss(
2350 weights=weights,
2351 biases=biases,
2352 labels=labels,
2353 inputs=inputs,
2354 ...,
2355 partition_strategy="div")
2356 elif mode == "eval":
2357 logits = tf.matmul(inputs, tf.transpose(weights))
2358 logits = tf.nn.bias_add(logits, biases)
2359 labels_one_hot = tf.one_hot(labels, n_classes)
2360 loss = tf.nn.softmax_cross_entropy_with_logits(
2361 labels=labels_one_hot,
2362 logits=logits)
2363 ```
2365 See our Candidate Sampling Algorithms Reference
2366 ([pdf](https://www.tensorflow.org/extras/candidate_sampling.pdf)).
2367 Also see Section 3 of (Jean et al., 2014) for the math.
2369 Args:
2370 weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
2371 objects whose concatenation along dimension 0 has shape
2372 [num_classes, dim]. The (possibly-sharded) class embeddings.
2373 biases: A `Tensor` of shape `[num_classes]`. The class biases.
2374 labels: A `Tensor` of type `int64` and shape `[batch_size,
2375 num_true]`. The target classes. Note that this format differs from
2376 the `labels` argument of `nn.softmax_cross_entropy_with_logits`.
2377 inputs: A `Tensor` of shape `[batch_size, dim]`. The forward
2378 activations of the input network.
2379 num_sampled: An `int`. The number of classes to randomly sample per batch.
2380 num_classes: An `int`. The number of possible classes.
2381 num_true: An `int`. The number of target classes per training example.
2382 sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
2383 `sampled_expected_count`) returned by a `*_candidate_sampler` function.
2384 (if None, we default to `log_uniform_candidate_sampler`)
2385 remove_accidental_hits: A `bool`. Whether to remove "accidental hits"
2386 where a sampled class equals one of the target classes. Default is
2387 True.
2388 partition_strategy: A string specifying the partitioning strategy, relevant
2389 if `len(weights) > 1`. Currently `"div"` and `"mod"` are supported.
2390 Default is `"mod"`. See `tf.nn.embedding_lookup` for more details.
2391 name: A name for the operation (optional).
2392 seed: Random seed for candidate sampling. Defaults to None, which doesn't
2393 set the op-level random seed for candidate sampling.
2395 Returns:
2396 A `batch_size` 1-D tensor of per-example sampled softmax losses.
2398 References:
2399 On Using Very Large Target Vocabulary for Neural Machine Translation:
2400 [Jean et al., 2014]
2401 (https://aclanthology.coli.uni-saarland.de/papers/P15-1001/p15-1001)
2402 ([pdf](http://aclweb.org/anthology/P15-1001))
2403 """
2404 logits, labels = _compute_sampled_logits(
2405 weights=weights,
2406 biases=biases,
2407 labels=labels,
2408 inputs=inputs,
2409 num_sampled=num_sampled,
2410 num_classes=num_classes,
2411 num_true=num_true,
2412 sampled_values=sampled_values,
2413 subtract_log_q=True,
2414 remove_accidental_hits=remove_accidental_hits,
2415 partition_strategy=partition_strategy,
2416 name=name,
2417 seed=seed)
2418 labels = array_ops.stop_gradient(labels, name="labels_stop_gradient")
2419 sampled_losses = nn_ops.softmax_cross_entropy_with_logits_v2(
2420 labels=labels, logits=logits)
2421 # sampled_losses is a [batch_size] tensor.
2422 return sampled_losses