Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/tensorflow/python/ops/candidate_sampling_ops.py: 67%
42 statements
« prev ^ index » next coverage.py v7.4.0, created at 2024-01-03 07:57 +0000
« prev ^ index » next coverage.py v7.4.0, created at 2024-01-03 07:57 +0000
1# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14# ==============================================================================
16"""Wrappers for candidate sampling operations."""
18from tensorflow.python.framework import random_seed
19from tensorflow.python.ops import array_ops # pylint: disable=unused-import
20from tensorflow.python.ops import gen_candidate_sampling_ops
21from tensorflow.python.ops import math_ops # pylint: disable=unused-import
22from tensorflow.python.util import deprecation
23from tensorflow.python.util import dispatch
24from tensorflow.python.util.tf_export import tf_export
@tf_export(
    'random.uniform_candidate_sampler',
    v1=['random.uniform_candidate_sampler', 'nn.uniform_candidate_sampler'])
@dispatch.add_dispatch_support
@deprecation.deprecated_endpoints('nn.uniform_candidate_sampler')
def uniform_candidate_sampler(true_classes,
                              num_true,
                              num_sampled,
                              unique,
                              range_max,
                              seed=None,
                              name=None):
  """Draws candidate classes from a uniform base distribution.

  The op randomly draws a tensor of classes (`sampled_candidates`) from the
  integer range `[0, range_max)`.

  For a short introduction to Candidate Sampling, see the [Candidate Sampling
  Algorithms
  Reference](http://www.tensorflow.org/extras/candidate_sampling.pdf).

  Draws from the base distribution are made without replacement when
  `unique=True` and with replacement when `unique=False`.

  Here the base distribution is the uniform distribution over the integers in
  `[0, range_max)`.

  Two further tensors are returned, `true_expected_count` and
  `sampled_expected_count`. They give how many times each target class
  (`true_classes`) and each sampled class (`sampled_candidates`) is expected
  to appear in an average tensor of sampled classes, and correspond to
  `Q(y|x)` as defined in the [Candidate Sampling Algorithms
  Reference](http://www.tensorflow.org/extras/candidate_sampling.pdf).
  With `unique=True` these are post-rejection probabilities and are only
  computed approximately.

  This function, like the other `*_candidate_sampler` functions, supplies only
  the building blocks for the Candidate Sampling algorithms listed in the big
  table of the [Candidate Sampling Algorithms
  Reference](http://www.tensorflow.org/extras/candidate_sampling.pdf); the
  algorithms themselves still have to be implemented by the caller.

  For instance, per that table, "negative samples" means different things in
  different algorithms. In NCE it means `S_i` (simply the sampled classes),
  which may overlap with the true classes; in Sampled Logistic it means
  `S_i - T_i`, which excludes them. The returned `sampled_candidates`
  corresponds to `S_i`, not to any algorithm-specific notion of "negative
  samples". Choosing an algorithm and deriving its "negative samples" (e.g.
  `S_i - T_i`) is the caller's responsibility.

  Likewise, `true_classes` is only used to compute the `true_expected_count`
  output (a by-product of the main calculation), which some algorithms in that
  table need. It does not exclude true classes from `sampled_candidates`; that
  step is algorithm-specific and is also left to the caller.

  Args:
    true_classes: A `Tensor` of type `int64` and shape `[batch_size,
      num_true]`. The target classes.
    num_true: An `int`. The number of target classes per training example.
    num_sampled: An `int`. The number of classes to randomly sample. The
      returned `sampled_candidates` has shape `[num_sampled]`. When
      `unique=True`, `num_sampled` must not exceed `range_max`.
    unique: A `bool`. Whether all sampled classes in a batch must be unique.
    range_max: An `int`. The number of possible classes.
    seed: An `int`. An operation-specific seed (optional). It is expanded by
      `random_seed.get_seed` into the `seed`/`seed2` pair given to the op.
    name: A name for the operation (optional).

  Returns:
    sampled_candidates: A tensor of type `int64` and shape `[num_sampled]`.
      The sampled classes, all distinct when `unique=True` and possibly
      containing duplicates when `unique=False`. As explained above, they may
      overlap with the true classes.
    true_expected_count: A tensor of type `float` with the same shape as
      `true_classes`. The expected count of each of `true_classes` under the
      sampling distribution.
    sampled_expected_count: A tensor of type `float` with the same shape as
      `sampled_candidates`. The expected count of each of
      `sampled_candidates` under the sampling distribution.
  """
  op_seed, op_seed2 = random_seed.get_seed(seed)
  sampler = gen_candidate_sampling_ops.uniform_candidate_sampler
  return sampler(
      true_classes,
      num_true,
      num_sampled,
      unique,
      range_max,
      seed=op_seed,
      seed2=op_seed2,
      name=name)
@tf_export(
    'random.log_uniform_candidate_sampler',
    v1=[
        'random.log_uniform_candidate_sampler',
        'nn.log_uniform_candidate_sampler'
    ])
@dispatch.add_dispatch_support
@deprecation.deprecated_endpoints('nn.log_uniform_candidate_sampler')
def log_uniform_candidate_sampler(true_classes,
                                  num_true,
                                  num_sampled,
                                  unique,
                                  range_max,
                                  seed=None,
                                  name=None):
  """Draws candidate classes from a log-uniform (Zipfian) base distribution.

  The op randomly draws a tensor of classes (`sampled_candidates`) from the
  integer range `[0, range_max)`.

  For a short introduction to Candidate Sampling, see the [Candidate Sampling
  Algorithms
  Reference](http://www.tensorflow.org/extras/candidate_sampling.pdf).

  Draws from the base distribution are made without replacement when
  `unique=True` and with replacement when `unique=False`.

  Here the base distribution is an approximately log-uniform, or Zipfian,
  distribution:

  `P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)`

  The sampler is a good fit when the target classes roughly follow such a
  distribution, for example when the classes are the words of a lexicon
  sorted by decreasing frequency. Do not use this op if your classes are not
  ordered by decreasing frequency.

  Two further tensors are returned, `true_expected_count` and
  `sampled_expected_count`. They give how many times each target class
  (`true_classes`) and each sampled class (`sampled_candidates`) is expected
  to appear in an average tensor of sampled classes, and correspond to
  `Q(y|x)` as defined in the [Candidate Sampling Algorithms
  Reference](http://www.tensorflow.org/extras/candidate_sampling.pdf).
  With `unique=True` these are post-rejection probabilities and are only
  computed approximately.

  This function, like the other `*_candidate_sampler` functions, supplies only
  the building blocks for the Candidate Sampling algorithms listed in the big
  table of the [Candidate Sampling Algorithms
  Reference](http://www.tensorflow.org/extras/candidate_sampling.pdf); the
  algorithms themselves still have to be implemented by the caller.

  For instance, per that table, "negative samples" means different things in
  different algorithms. In NCE it means `S_i` (simply the sampled classes),
  which may overlap with the true classes; in Sampled Logistic it means
  `S_i - T_i`, which excludes them. The returned `sampled_candidates`
  corresponds to `S_i`, not to any algorithm-specific notion of "negative
  samples". Choosing an algorithm and deriving its "negative samples" (e.g.
  `S_i - T_i`) is the caller's responsibility.

  Likewise, `true_classes` is only used to compute the `true_expected_count`
  output (a by-product of the main calculation), which some algorithms in that
  table need. It does not exclude true classes from `sampled_candidates`; that
  step is algorithm-specific and is also left to the caller.

  Args:
    true_classes: A `Tensor` of type `int64` and shape `[batch_size,
      num_true]`. The target classes.
    num_true: An `int`. The number of target classes per training example.
    num_sampled: An `int`. The number of classes to randomly sample.
    unique: A `bool`. Whether all sampled classes in a batch must be unique.
    range_max: An `int`. The number of possible classes.
    seed: An `int`. An operation-specific seed (optional). It is expanded by
      `random_seed.get_seed` into the `seed`/`seed2` pair given to the op.
    name: A name for the operation (optional).

  Returns:
    sampled_candidates: A tensor of type `int64` and shape `[num_sampled]`.
      The sampled classes. As explained above, they may overlap with the true
      classes.
    true_expected_count: A tensor of type `float` with the same shape as
      `true_classes`. The expected count of each of `true_classes` under the
      sampling distribution.
    sampled_expected_count: A tensor of type `float` with the same shape as
      `sampled_candidates`. The expected count of each of
      `sampled_candidates` under the sampling distribution.
  """
  op_seed, op_seed2 = random_seed.get_seed(seed)
  sampler = gen_candidate_sampling_ops.log_uniform_candidate_sampler
  return sampler(
      true_classes,
      num_true,
      num_sampled,
      unique,
      range_max,
      seed=op_seed,
      seed2=op_seed2,
      name=name)
@tf_export(
    'random.learned_unigram_candidate_sampler',
    'nn.learned_unigram_candidate_sampler')
@dispatch.add_dispatch_support
@deprecation.deprecated_endpoints(['nn.learned_unigram_candidate_sampler'])
def learned_unigram_candidate_sampler(true_classes,
                                      num_true,
                                      num_sampled,
                                      unique,
                                      range_max,
                                      seed=None,
                                      name=None):
  """Draws candidate classes from a distribution learned during training.

  The op randomly draws a tensor of classes (`sampled_candidates`) from the
  integer range `[0, range_max)`.

  For a short introduction to Candidate Sampling, see the [Candidate Sampling
  Algorithms
  Reference](http://www.tensorflow.org/extras/candidate_sampling.pdf).

  Draws from the base distribution are made without replacement when
  `unique=True` and with replacement when `unique=False`.

  Here the base distribution is built on the fly while training: it is a
  unigram distribution over the target classes seen so far. Each integer in
  `[0, range_max)` starts with a weight of 1, and its weight grows by 1 every
  time it is seen as a target class. The base distribution is not written to
  checkpoints, so it starts over when the model is reloaded.

  Two further tensors are returned, `true_expected_count` and
  `sampled_expected_count`. They give how many times each target class
  (`true_classes`) and each sampled class (`sampled_candidates`) is expected
  to appear in an average tensor of sampled classes, and correspond to
  `Q(y|x)` as defined in the [Candidate Sampling Algorithms
  Reference](http://www.tensorflow.org/extras/candidate_sampling.pdf).
  With `unique=True` these are post-rejection probabilities and are only
  computed approximately.

  This function, like the other `*_candidate_sampler` functions, supplies only
  the building blocks for the Candidate Sampling algorithms listed in the big
  table of the [Candidate Sampling Algorithms
  Reference](http://www.tensorflow.org/extras/candidate_sampling.pdf); the
  algorithms themselves still have to be implemented by the caller.

  For instance, per that table, "negative samples" means different things in
  different algorithms. In NCE it means `S_i` (simply the sampled classes),
  which may overlap with the true classes; in Sampled Logistic it means
  `S_i - T_i`, which excludes them. The returned `sampled_candidates`
  corresponds to `S_i`, not to any algorithm-specific notion of "negative
  samples". Choosing an algorithm and deriving its "negative samples" (e.g.
  `S_i - T_i`) is the caller's responsibility.

  Likewise, `true_classes` is only used to compute the `true_expected_count`
  output (a by-product of the main calculation), which some algorithms in that
  table need. It does not exclude true classes from `sampled_candidates`; that
  step is algorithm-specific and is also left to the caller.

  Args:
    true_classes: A `Tensor` of type `int64` and shape `[batch_size,
      num_true]`. The target classes.
    num_true: An `int`. The number of target classes per training example.
    num_sampled: An `int`. The number of classes to randomly sample.
    unique: A `bool`. Whether all sampled classes in a batch must be unique.
    range_max: An `int`. The number of possible classes. Must not exceed the
      largest int32 value (2147483647).
    seed: An `int`. An operation-specific seed (optional). It is expanded by
      `random_seed.get_seed` into the `seed`/`seed2` pair given to the op.
    name: A name for the operation (optional).

  Returns:
    sampled_candidates: A tensor of type `int64` and shape `[num_sampled]`.
      The sampled classes. As explained above, they may overlap with the true
      classes.
    true_expected_count: A tensor of type `float` with the same shape as
      `true_classes`. The expected count of each of `true_classes` under the
      sampling distribution.
    sampled_expected_count: A tensor of type `float` with the same shape as
      `sampled_candidates`. The expected count of each of
      `sampled_candidates` under the sampling distribution.

  Raises:
    ValueError: If `range_max` is greater than 2147483647.
  """
  op_seed, op_seed2 = random_seed.get_seed(seed)
  # Reject ranges beyond the largest int32 value before reaching the op.
  max_int32 = 2147483647
  if range_max > max_int32:
    raise ValueError(f'Value of range_max:{range_max} is too large to handle')
  sampler = gen_candidate_sampling_ops.learned_unigram_candidate_sampler
  return sampler(
      true_classes,
      num_true,
      num_sampled,
      unique,
      range_max,
      seed=op_seed,
      seed2=op_seed2,
      name=name)
@tf_export(
    'random.fixed_unigram_candidate_sampler',
    'nn.fixed_unigram_candidate_sampler')
@dispatch.add_dispatch_support
def fixed_unigram_candidate_sampler(true_classes, num_true, num_sampled,
                                    unique, range_max, vocab_file='',
                                    distortion=1.0, num_reserved_ids=0,
                                    num_shards=1, shard=0, unigrams=(),
                                    seed=None, name=None):
  """Draws candidate classes from a caller-supplied (fixed) base distribution.

  The op randomly draws a tensor of classes (`sampled_candidates`) from the
  integer range `[0, range_max)`.

  For a short introduction to Candidate Sampling, see the [Candidate Sampling
  Algorithms
  Reference](http://www.tensorflow.org/extras/candidate_sampling.pdf).

  Draws from the base distribution are made without replacement when
  `unique=True` and with replacement when `unique=False`.

  Here the base distribution is either read from a file or handed in as an
  in-memory array. Optionally, the distribution can be skewed by raising the
  weights to a distortion power.

  Two further tensors are returned, `true_expected_count` and
  `sampled_expected_count`. They give how many times each target class
  (`true_classes`) and each sampled class (`sampled_candidates`) is expected
  to appear in an average tensor of sampled classes, and correspond to
  `Q(y|x)` as defined in the [Candidate Sampling Algorithms
  Reference](http://www.tensorflow.org/extras/candidate_sampling.pdf).
  With `unique=True` these are post-rejection probabilities and are only
  computed approximately.

  This function, like the other `*_candidate_sampler` functions, supplies only
  the building blocks for the Candidate Sampling algorithms listed in the big
  table of the [Candidate Sampling Algorithms
  Reference](http://www.tensorflow.org/extras/candidate_sampling.pdf); the
  algorithms themselves still have to be implemented by the caller.

  For instance, per that table, "negative samples" means different things in
  different algorithms. In NCE it means `S_i` (simply the sampled classes),
  which may overlap with the true classes; in Sampled Logistic it means
  `S_i - T_i`, which excludes them. The returned `sampled_candidates`
  corresponds to `S_i`, not to any algorithm-specific notion of "negative
  samples". Choosing an algorithm and deriving its "negative samples" (e.g.
  `S_i - T_i`) is the caller's responsibility.

  Likewise, `true_classes` is only used to compute the `true_expected_count`
  output (a by-product of the main calculation), which some algorithms in that
  table need. It does not exclude true classes from `sampled_candidates`; that
  step is algorithm-specific and is also left to the caller.

  Args:
    true_classes: A `Tensor` of type `int64` and shape `[batch_size,
      num_true]`. The target classes.
    num_true: An `int`. The number of target classes per training example.
    num_sampled: An `int`. The number of classes to randomly sample.
    unique: A `bool`. Whether all sampled classes in a batch must be unique.
    range_max: An `int`. The number of possible classes.
    vocab_file: A file in a CSV-like format in which every valid line stands
      for one valid word ID. IDs run sequentially, beginning at
      `num_reserved_ids`. The last entry of each line is expected to be the
      count or relative probability for that ID. Exactly one of `vocab_file`
      and `unigrams` needs to be passed to this operation.
    distortion: Skews the unigram probability distribution. Every weight is
      raised to the power of `distortion` before it is added to the internal
      unigram distribution. Consequently `distortion = 1.0` yields regular
      unigram sampling (as defined by the vocab file), while
      `distortion = 0.0` yields a uniform distribution.
    num_reserved_ids: Lets users optionally reserve the IDs in
      `[0, num_reserved_ids)`, for example to use ID 0 for a special unknown
      word token. Reserved IDs get a sampling probability of 0.
    num_shards: A sampler can draw from a subset of the original range so that
      the overall computation is sped up through parallelism. Together with
      `shard`, this parameter gives the number of partitions used in the
      overall computation.
    shard: A sampler can draw from a subset of the original range so that the
      overall computation is sped up through parallelism. Together with
      `num_shards`, this parameter gives the partition number handled by this
      operation when partitioning is in use.
    unigrams: A list of unigram counts or probabilities, one per ID, in
      sequential order. Exactly one of `vocab_file` and `unigrams` should be
      passed to this operation.
    seed: An `int`. An operation-specific seed (optional). It is expanded by
      `random_seed.get_seed` into the `seed`/`seed2` pair given to the op.
    name: A name for the operation (optional).

  Returns:
    sampled_candidates: A tensor of type `int64` and shape `[num_sampled]`.
      The sampled classes. As explained above, they may overlap with the true
      classes.
    true_expected_count: A tensor of type `float` with the same shape as
      `true_classes`. The expected count of each of `true_classes` under the
      sampling distribution.
    sampled_expected_count: A tensor of type `float` with the same shape as
      `sampled_candidates`. The expected count of each of
      `sampled_candidates` under the sampling distribution.
  """
  op_seed, op_seed2 = random_seed.get_seed(seed)
  sampler = gen_candidate_sampling_ops.fixed_unigram_candidate_sampler
  return sampler(
      true_classes,
      num_true,
      num_sampled,
      unique,
      range_max,
      vocab_file=vocab_file,
      distortion=distortion,
      num_reserved_ids=num_reserved_ids,
      num_shards=num_shards,
      shard=shard,
      unigrams=unigrams,
      seed=op_seed,
      seed2=op_seed2,
      name=name)
@tf_export('random.all_candidate_sampler', 'nn.all_candidate_sampler')
def all_candidate_sampler(true_classes, num_true, num_sampled, unique,
                          seed=None, name=None):
  """Generate the set of all classes.

  Deterministically generates and returns the set of all possible classes.
  For testing purposes. There is no need to use this, since you might as
  well use full softmax or full logistic regression.

  Args:
    true_classes: A `Tensor` of type `int64` and shape `[batch_size,
      num_true]`. The target classes.
    num_true: An `int`. The number of target classes per training example.
    num_sampled: An `int`. The number of possible classes.
    unique: A `bool`. Ignored.
    seed: An `int`. An operation-specific seed (optional). It is expanded by
      `random_seed.get_seed` into the `seed`/`seed2` pair given to the op.
    name: A name for the operation (optional).

  Returns:
    sampled_candidates: A tensor of type `int64` and shape `[num_sampled]`.
      This operation deterministically returns the entire range
      `[0, num_sampled)`.
    true_expected_count: A tensor of type `float`. Same shape as
      `true_classes`. The expected counts under the sampling distribution
      of each of `true_classes`. All returned values are 1.0.
    sampled_expected_count: A tensor of type `float`. Same shape as
      `sampled_candidates`. The expected counts under the sampling distribution
      of each of `sampled_candidates`. All returned values are 1.0.
  """
  seed1, seed2 = random_seed.get_seed(seed)
  return gen_candidate_sampling_ops.all_candidate_sampler(
      true_classes, num_true, num_sampled, unique, seed=seed1, seed2=seed2,
      name=name)
@tf_export('nn.compute_accidental_hits')
@dispatch.add_dispatch_support
def compute_accidental_hits(true_classes,
                            sampled_candidates,
                            num_true,
                            seed=None,
                            name=None):
  """Finds the positions in `sampled_candidates` that match `true_classes`.

  Within Candidate Sampling, this operation makes it possible to virtually
  remove sampled classes that happen to coincide with target classes, as is
  done in Sampled Softmax and Sampled Logistic.

  See our [Candidate Sampling Algorithms
  Reference](http://www.tensorflow.org/extras/candidate_sampling.pdf).

  The `sampled_candidates` are presumed to be unique.

  An 'accidental hit' is a target class that matches one of the sampled
  classes. Each hit is reported as a triple `(index, id, weight)`: `index` is
  the row number in `true_classes`, `id` is the position in
  `sampled_candidates`, and weight is `-FLOAT_MAX`.

  The result of this op is meant to be passed through a `sparse_to_dense`
  operation and then added to the logits of the sampled classes. Doing so
  removes the contradictory effect of accidentally sampling the true target
  classes as noise classes for the same example.

  Args:
    true_classes: A `Tensor` of type `int64` and shape `[batch_size,
      num_true]`. The target classes.
    sampled_candidates: A tensor of type `int64` and shape `[num_sampled]`.
      The sampled_candidates output of CandidateSampler.
    num_true: An `int`. The number of target classes per training example.
    seed: An `int`. An operation-specific seed (optional). It is expanded by
      `random_seed.get_seed` into the `seed`/`seed2` pair given to the op.
    name: A name for the operation (optional).

  Returns:
    indices: A `Tensor` of type `int32` and shape `[num_accidental_hits]`.
      Values indicate rows in `true_classes`.
    ids: A `Tensor` of type `int64` and shape `[num_accidental_hits]`.
      Values indicate positions in `sampled_candidates`.
    weights: A `Tensor` of type `float` and shape `[num_accidental_hits]`.
      Each value is `-FLOAT_MAX`.
  """
  op_seed, op_seed2 = random_seed.get_seed(seed)
  hits_op = gen_candidate_sampling_ops.compute_accidental_hits
  return hits_op(
      true_classes,
      sampled_candidates,
      num_true,
      seed=op_seed,
      seed2=op_seed2,
      name=name)