Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/tensorflow/python/training/checkpoint_ops.py: 18%
73 statements
« prev ^ index » next coverage.py v7.4.0, created at 2024-01-03 07:57 +0000
« prev ^ index » next coverage.py v7.4.0, created at 2024-01-03 07:57 +0000
1# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14# ==============================================================================
15"""Operations for generating and loading vocab remappings."""
16import math
18from tensorflow.python.framework import dtypes
19from tensorflow.python.framework import ops
20from tensorflow.python.ops import array_ops
21from tensorflow.python.ops import gen_checkpoint_ops
22from tensorflow.python.ops import init_ops
23from tensorflow.python.ops import math_ops
# Register the remapping ops as non-differentiable so gradient construction
# does not attempt to backprop through them.
ops.NotDifferentiable("GenerateVocabRemapping")
ops.NotDifferentiable("LoadAndRemapMatrix")
def _load_and_remap_matrix(ckpt_path,
                           old_tensor_name,
                           new_row_vocab_offset,
                           num_rows_to_load,
                           new_col_vocab_size,
                           initializer,
                           old_row_vocab_size=-1,
                           old_row_vocab_file=None,
                           new_row_vocab_file=None,
                           old_col_vocab_file=None,
                           new_col_vocab_file=None,
                           num_row_oov_buckets=0,
                           num_col_oov_buckets=0,
                           max_rows_in_memory=-1):
  """Loads a 2-D (matrix) `Tensor` from checkpoint.

  Generates 1D-remappings for rows and columns using the
  `GenerateVocabRemapping` op, and initializes any anticipated values with the
  provided initializer. Then, uses the `LoadAndRemapMatrix` op to create a
  matrix that loads existing values from the checkpoint, while filling out
  "missing" values with the newly initialized values. See
  contrib/framework/ops/checkpoint_ops.cc for more information on the wrapped
  functionality (LoadAndRemapMatrix). This wrapper can be used to perform only
  row remapping or only col remapping. If only row remapping is desired,
  {new,old}_col_vocab_file should be `None`, and vice versa for column
  remapping.

  NOTE: This only supports div-partitioning the vocabulary on the 1st dimension
  (row axis) via `new_row_vocab_offset`.

  Args:
    ckpt_path: Path to the TensorFlow checkpoint (version 2, `TensorBundle`)
      from which the old matrix `Tensor` will be loaded.
    old_tensor_name: Name of the 2-D `Tensor` to load from checkpoint.
    new_row_vocab_offset: A 0-indexed integer representing what line to
      start reading at in the new row vocabulary. Used for partitioned
      variables.
    num_rows_to_load: Number of rows to load for the new vocabulary (note: to
      support variable partitioning and partial loading, this does not need to
      be the same as the number of entries in `new_row_vocab_file`).
    new_col_vocab_size: Number of columns to load - should be the same as the
      number of entries in `new_col_vocab_file`, since we don't support
      partitioning along the column axis.
    initializer: Callable initializer function that accepts a 1-D tensor as the
      arg to specify the shape of the returned tensor. Used to initialize
      missing values.
    old_row_vocab_size: The number of entries to consider in the old vocabulary.
      With the default value of -1, the entire old row vocabulary file will be
      used. Otherwise, only the first `old_row_vocab_size` entries will be
      considered for remapping. Must be smaller than the length of
      `old_row_vocab_file`. NOTE: we do not provide an equivalent
      `old_col_vocab_size` for classes.
    old_row_vocab_file: A scalar `Tensor` of type `string` containing the
      path to the old row vocabulary file. Can be None, which represents no
      remapping on the row axis.
    new_row_vocab_file: A scalar `Tensor` of type `string` containing the path
      to the new row vocabulary file. Can be None, which represents no remapping
      on the row axis - in which case, `new_row_vocab_offset` and
      `num_rows_to_load` work under the assumption that the new row vocab is the
      same as the old row vocab.
    old_col_vocab_file: A scalar `Tensor` of type `string` containing the
      path to the old column vocabulary file. Can be None, which represents no
      remapping on the column axis.
    new_col_vocab_file: A scalar `Tensor` of type `string` containing the path
      to the new column vocabulary file. Can be None, which represents no
      remapping on the column axis - in which case, `new_col_vocab_size` works
      under the assumption that the new col vocab is the same as the old col
      vocab.
    num_row_oov_buckets: `int` specifying the number of out-of-vocabulary rows
      to append. Must be >= 0.
    num_col_oov_buckets: `int` specifying the number of out-of-vocabulary
      columns to append. Must be >= 0.
    max_rows_in_memory: `int` specifying the maximum number of rows to load from
      the checkpoint at once. If less than or equal to 0, the entire matrix will
      be loaded into memory. Setting this arg trades increased disk reads for
      lower memory usage.

  Returns:
    A Tensor of shape `[num_rows_to_load + num_row_oov_buckets,
    new_col_vocab_size + num_col_oov_buckets]`, with values loaded from the
    specified tensor in the checkpoint, and any missing or OOV values
    initialized with the given `initializer`.

  Raises:
    ValueError: If `num_row_oov_buckets` or `num_col_oov_buckets` < 0.
    ValueError: If either `old_row_vocab_file` or `new_row_vocab_file` is
      provided, while the other is not. Same for `old_col_vocab_file` and
      `new_col_vocab_file`.
    ValueError: If neither row vocabs or col vocabs are provided.
  """
  if num_row_oov_buckets < 0:
    raise ValueError("num_row_oov_buckets must be >= 0, but received %d" %
                     num_row_oov_buckets)
  if num_col_oov_buckets < 0:
    raise ValueError("num_col_oov_buckets must be >= 0, but received %d" %
                     num_col_oov_buckets)

  # Row/col vocab files must be provided in pairs: remapping on an axis needs
  # both the old and the new vocabulary for that axis.
  if bool(old_row_vocab_file) != bool(new_row_vocab_file):
    raise ValueError(
        "old_row_vocab_file and new_row_vocab_file must both be specified or "
        "left unspecified. old_row_vocab_file='{}', new_row_vocab_file='{}'".
        format(old_row_vocab_file, new_row_vocab_file))
  if bool(old_col_vocab_file) != bool(new_col_vocab_file):
    raise ValueError(
        "old_col_vocab_file and new_col_vocab_file must both be specified or "
        "left unspecified. old_col_vocab_file='{}', new_col_vocab_file='{}'".
        format(old_col_vocab_file, new_col_vocab_file))

  remap_rows = new_row_vocab_file and old_row_vocab_file
  remap_cols = new_col_vocab_file and old_col_vocab_file
  if not (remap_rows or remap_cols):
    raise ValueError(
        "Must provide either row or column vocab files. If no remapping is "
        "necessary, consider using `tf.contrib.framework.init_from_checkpoint` "
        "instead.")

  num_rows_present = num_rows_to_load
  if remap_rows:
    row_remapping, num_rows_present = (
        gen_checkpoint_ops.generate_vocab_remapping(
            new_vocab_file=new_row_vocab_file,
            old_vocab_file=old_row_vocab_file,
            new_vocab_offset=new_row_vocab_offset,
            num_new_vocab=num_rows_to_load,
            old_vocab_size=old_row_vocab_size))
  else:
    # Even when the rows are not being reordered, we still need to generate a
    # remapping to account for initializing partitioned Variables (when
    # new_row_vocab_offset is non-zero).
    row_remapping = math_ops.range(
        new_row_vocab_offset,
        new_row_vocab_offset + num_rows_to_load,
        dtype=dtypes.int64)

  col_remapping = []
  num_cols_present = new_col_vocab_size
  if remap_cols:
    col_remapping, num_cols_present = (
        gen_checkpoint_ops.generate_vocab_remapping(
            new_vocab_file=new_col_vocab_file,
            old_vocab_file=old_col_vocab_file,
            new_vocab_offset=0,  # Offset is unused for cols (no partitioning).
            num_new_vocab=new_col_vocab_size))

  # Number of cells that need fresh values: the full output
  # (num_rows_to_load x new_col_vocab_size) minus the cells found in the
  # checkpoint (num_rows_present x num_cols_present). The initializer is
  # asked for them as a single column vector of shape [count, 1].
  init_vals = initializer([
      num_rows_to_load * new_col_vocab_size -
      num_rows_present * num_cols_present, 1
  ])
  return_tensor = gen_checkpoint_ops.load_and_remap_matrix(
      ckpt_path=ckpt_path,
      old_tensor_name=old_tensor_name,
      row_remapping=row_remapping,
      col_remapping=col_remapping,
      initializing_values=init_vals,
      num_rows=num_rows_to_load,
      num_cols=new_col_vocab_size,
      max_rows_in_memory=max_rows_in_memory)

  # Add OOV row(s) and column(s).
  if num_row_oov_buckets > 0:
    init_row_oov_val = initializer([num_row_oov_buckets, new_col_vocab_size])
    init_row_oov_val = ops.convert_to_tensor(init_row_oov_val)
    return_tensor = array_ops.concat([return_tensor, init_row_oov_val], 0)
  if num_col_oov_buckets > 0:
    # We need to add any row OOV to the new column shape.
    init_col_oov_val = initializer(
        [num_rows_to_load + num_row_oov_buckets, num_col_oov_buckets])
    init_col_oov_val = ops.convert_to_tensor(init_col_oov_val)
    return_tensor = array_ops.concat([return_tensor, init_col_oov_val], 1)

  return return_tensor
def _load_and_remap_matrix_initializer(ckpt_path,
                                       old_tensor_name,
                                       new_row_vocab_size,
                                       new_col_vocab_size,
                                       old_row_vocab_size=-1,
                                       old_row_vocab_file=None,
                                       new_row_vocab_file=None,
                                       old_col_vocab_file=None,
                                       new_col_vocab_file=None,
                                       num_row_oov_buckets=0,
                                       num_col_oov_buckets=0,
                                       initializer=None,
                                       max_rows_in_memory=-1):
  r"""Returns a var initializer for loading and remapping a 2-D (matrix) tensor.

  The returned initializer loads a 2-D (matrix) `Tensor` with name
  `old_tensor_name` from the checkpoint at `ckpt_path`. It will reorder the
  rows/columns according to the specified vocab files and append additional
  out-of-vocabulary rows/columns according to the number of OOV buckets.

  The format of the file at the `{old,new}_{row,col}_vocab_file` path should be
  a text file, with each line containing a single entity within the vocabulary.
  Let the function `line_of(f, "x")` return the 0-indexed line number of the
  entity "x" in file f, and the function `entity_at(f, i)` return the entity at
  line i of file f. Then, row i of the new output matrix will be taken from row
  `line_of(old_row_vocab_file, entity_at(new_row_vocab_file, i))` of the old
  matrix. If any entity in `new_row_vocab_file` is not found in
  `old_row_vocab_file`, that row is considered a "missing" row, and its values
  will be initialized using the `initializer` arg. The same logic also applies
  for the columns.

  For example, assuming that:

  * `old_row_vocab_file` contains "mercury\nvenus\nmars"
  * `new_row_vocab_file` contains "venus\njupiter\nmercury"
  * `old_col_vocab_file` contains "good\nbetter\nbest"
  * `new_col_vocab_file` contains "good\nbest\nfantastic"
  * `initializer` returns the natural numbers `[1, 2, 3, 4, ...]`
  * `w(i, j)` represents the value from row i, column j of the old matrix

  Then the new output matrix will look like:

  `[[w(1, 0), w(1, 2), 1],
    [2, 3, 4],
    [w(0, 0), w(0, 2), 5]]`

  If we further specify that:

  * `num_row_oov_buckets` == 2
  * `num_col_oov_buckets` == 1

  Then the new output matrix will look like:

  `[[w(1, 0), w(1, 2), 1, 12],
    [2, 3, 4, 13],
    [w(0, 0), w(0, 2), 5, 14],
    [6, 7, 8, 15],
    [9, 10, 11, 16]]`

  If `{old,new}_row_vocab_file` are None, we assume that the old and new row
  vocab files are the same, and no row remapping is done. If
  `{old,new}_col_vocab_file` are None, we assume that the old and new column
  vocab files are the same, and no column remapping is done.

  The returned initializer only supports div-partitioning along the row axis. It
  does not support partitioning along the column axis (as this is not common in
  practice) or mod-partitioning.

  NOTE: When this is used to warm-start variables, client code should use
  `tf.lookup.index_table_from_tensor()` like
  contrib/layers/python/layers/feature_column.py does, as opposed to
  `tf.feature_to_id()` - in order to ensure the underlying lookup tables are the
  same.

  Args:
    ckpt_path: Path to the TensorFlow checkpoint (version 2, `TensorBundle`)
      from which the old matrix `Tensor` will be loaded.
    old_tensor_name: Name of the 2-D `Tensor` to load from checkpoint.
    new_row_vocab_size: `int` specifying the number of entries in
      `new_row_vocab_file`. If no row remapping is needed (no row vocab
      provided), this should be equal to the number of rows to load from the old
      matrix (which can theoretically be smaller than the number of rows in the
      old matrix).
    new_col_vocab_size: `int` specifying the number of entries in
      `new_col_vocab_file`. If no column remapping is needed (no column vocab
      provided), this should be equal to the number of columns in the old
      matrix.
    old_row_vocab_size: The number of entries to consider in the old vocabulary.
      With the default value of -1, the entire old row vocabulary file will be
      used. Otherwise, only the first `old_row_vocab_size` entries will be
      considered for remapping. Must be smaller than the length of
      `old_row_vocab_file`. NOTE: we do not provide an equivalent
      `old_col_vocab_size` for classes.
    old_row_vocab_file: A scalar `Tensor` of type `string` containing the
      path to the old row vocabulary file. Can be None, which represents no
      remapping on the row axis.
    new_row_vocab_file: A scalar `Tensor` of type `string` containing the path
      to the new row vocabulary file. Can be None, which represents no remapping
      on the row axis.
    old_col_vocab_file: A scalar `Tensor` of type `string` containing the
      path to the old column vocabulary file. Can be None, which represents no
      remapping on the column axis.
    new_col_vocab_file: A scalar `Tensor` of type `string` containing the path
      to the new column vocabulary file. Can be None, which represents no
      remapping on the column axis.
    num_row_oov_buckets: `int` specifying the number of out-of-vocabulary rows
      to append. Must be >= 0.
    num_col_oov_buckets: `int` specifying the number of out-of-vocabulary
      columns to append. Must be >= 0.
    initializer: Initializer function to initialize missing values. Accepts a
      1-D tensor as the arg to specify the shape of the returned tensor. If
      `None`, defaults to using `zeros_initializer()`.
    max_rows_in_memory: `int` specifying the maximum number of rows to load from
      the checkpoint at once. If less than or equal to 0, the entire matrix will
      be loaded into memory. Setting this arg trades increased disk reads for
      lower memory usage.

  Returns:
    A variable initializer function that should be used to initialize a
    (potentially partitioned) `Variable` whose complete shape is
    `[new_row_vocab_size + num_row_oov_buckets, new_col_vocab_size +
    num_col_oov_buckets]`.

  Raises:
    TypeError: If `initializer` is specified but not callable.
  """
  if initializer is None:
    # TODO(b/25671353): Consider using sqrt(6/(fan_in + fan_out)) instead, from
    # Glorot and Bengio, 2010.
    initializer = init_ops.zeros_initializer()

  if not callable(initializer):
    raise TypeError(
        "initializer must be callable, instead of being {} of type {}.".format(
            initializer, type(initializer)))

  def _initializer(shape, dtype=dtypes.float32, partition_info=None):
    """Variable initializer.

    Args:
      shape: Shape of `Tensor` to return. Should include OOV on both axes.
      dtype: Must be float32.
      partition_info: variable_scope._PartitionInfo.

    Returns:
      `Tensor` of shape `shape`.

    Raises:
      TypeError: If `dtype` is anything other than float32.
      ValueError: For shape mismatch upon invocation.
    """
    # Sanity checks.
    if dtype != dtypes.float32:
      raise TypeError(
          "Currently, only float32 is supported. Received dtype: {}".format(
              dtype))
    if len(shape) != 2:
      raise ValueError("Expected 2-dim shape, but received: {}".format(shape))
    if shape[0] <= 0:
      raise ValueError(
          "Expected 1st dim of shape to be > 0, but received shape: {}".format(
              shape))
    if shape[1] != (new_col_vocab_size + num_col_oov_buckets):
      raise ValueError(
          "Expected 2nd dim of shape to be new_col_vocab_size ({}) + "
          "num_col_oov_buckets ({}) = {}, but received shape: {}".format(
              new_col_vocab_size, num_col_oov_buckets,
              new_col_vocab_size + num_col_oov_buckets, shape))

    # For a partitioned variable, `offset` is the row index (in the complete,
    # unpartitioned variable) at which this partition starts.
    offset = 0
    if partition_info is not None:
      offset = partition_info.single_offset(shape)

    if offset + shape[0] > new_row_vocab_size + num_row_oov_buckets:
      raise ValueError(
          "Trying to initialize {} additional rows after {} rows have already "
          "been initialized, which would exceed expected total row count of "
          "new_row_vocab_size ({}) + num_row_oov_buckets ({}) = {}.".format(
              shape[0], offset, new_row_vocab_size, num_row_oov_buckets,
              new_row_vocab_size + num_row_oov_buckets))

    # Rows of this partition that fall past `new_row_vocab_size` in the
    # complete variable are OOV rows (freshly initialized); the rest are
    # loaded/remapped from the checkpoint.
    row_oov_buckets_to_use = min(shape[0],
                                 max(0, offset + shape[0] - new_row_vocab_size))
    num_rows_to_load = shape[0] - row_oov_buckets_to_use

    # We may be operating on an OOV-only partition, in which case we newly
    # initialize all rows of this partition.
    if offset > new_row_vocab_size:
      if shape[0] != row_oov_buckets_to_use:
        raise ValueError(
            "Partitioned variable offset is greater than new vocab size and "
            "not operating on OOV-only partition.")
      return initializer(shape)

    return _load_and_remap_matrix(
        ckpt_path=ckpt_path,
        old_tensor_name=old_tensor_name,
        new_row_vocab_offset=offset,
        num_rows_to_load=num_rows_to_load,
        new_col_vocab_size=new_col_vocab_size,
        initializer=initializer,
        old_row_vocab_size=old_row_vocab_size,
        old_row_vocab_file=old_row_vocab_file,
        new_row_vocab_file=new_row_vocab_file,
        old_col_vocab_file=old_col_vocab_file,
        new_col_vocab_file=new_col_vocab_file,
        num_row_oov_buckets=row_oov_buckets_to_use,
        num_col_oov_buckets=num_col_oov_buckets,
        max_rows_in_memory=max_rows_in_memory)

  return _initializer
def _load_embedding_initializer(ckpt_path,
                                embedding_tensor_name,
                                new_vocab_size,
                                embedding_dim,
                                old_vocab_file,
                                new_vocab_file,
                                old_vocab_size=-1,
                                num_oov_buckets=0,
                                initializer=None,
                                max_rows_in_memory=-1):
  """Returns a variable initializer for loading pre-trained embeddings.

  Thin specialization of `_load_and_remap_matrix_initializer()` for embedding
  weights: only the row (vocabulary) axis is remapped according to the old and
  new vocab files, while the column (embedding-dimension) axis is loaded
  unchanged. See `_load_and_remap_matrix_initializer()` for the full remapping
  semantics.

  NOTE: Only for use with div-partitioned variables / vocabularies.

  Args:
    ckpt_path: Path to the TensorFlow checkpoint (version 2, `TensorBundle`)
      from which the old matrix `Tensor` will be loaded.
    embedding_tensor_name: Name of the 2-D `Tensor` to load from checkpoint.
    new_vocab_size: Number of entries in the new vocab.
    embedding_dim: `int` specifying the dimension of the embedding vectors from
      the checkpoint. Must match the number of columns in the old embedding
      matrix.
    old_vocab_file: A scalar `Tensor` of type `string` containing the path to
      the old vocabulary file.
    new_vocab_file: A scalar `Tensor` of type `string` containing the path to
      the new vocabulary file.
    old_vocab_size: The number of entries to consider in the old vocabulary.
      With the default value of -1, the entire old row vocabulary file will be
      used. Otherwise, only the first `old_vocab_size` entries will be
      considered for remapping. Must be smaller than the length of
      `old_row_vocab_file`.
    num_oov_buckets: `int` specifying the number of out-of-vocabulary buckets
      to use. Must be >= 0.
    initializer: Initializer function that accepts a 1-D tensor as the arg to
      specify the shape of the returned tensor. If `None`, defaults to using
      `truncated_normal_initializer()`.
    max_rows_in_memory: `int` specifying the maximum number of rows to load
      from the checkpoint at once. If less than or equal to 0, the entire
      matrix will be loaded into memory. Setting this arg trades increased disk
      reads for lower memory usage.

  Returns:
    A variable initializer function.
  """
  if initializer is None:
    # TODO(b/25671353): This should be kept in sync with the stddev used by
    # feature_column.py's _EmbeddingColumn.
    stddev = 1.0 / math.sqrt(embedding_dim)
    initializer = init_ops.truncated_normal_initializer(stddev=stddev)

  # Delegate to the general matrix initializer, remapping rows only: both
  # column vocab files are None, and no column OOV buckets are appended.
  remap_kwargs = {
      "ckpt_path": ckpt_path,
      "old_tensor_name": embedding_tensor_name,
      "new_row_vocab_size": new_vocab_size,
      "new_col_vocab_size": embedding_dim,
      "old_row_vocab_size": old_vocab_size,
      "old_row_vocab_file": old_vocab_file,
      "new_row_vocab_file": new_vocab_file,
      "old_col_vocab_file": None,
      "new_col_vocab_file": None,
      "num_row_oov_buckets": num_oov_buckets,
      "num_col_oov_buckets": 0,
      "initializer": initializer,
      "max_rows_in_memory": max_rows_in_memory,
  }
  return _load_and_remap_matrix_initializer(**remap_kwargs)