Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/tensorflow/python/training/checkpoint_ops.py: 18%
73 statements
« prev ^ index » next coverage.py v7.4.0, created at 2024-01-03 07:57 +0000
« prev ^ index » next coverage.py v7.4.0, created at 2024-01-03 07:57 +0000
1# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14# ==============================================================================
15"""Operations for generating and loading vocab remappings."""
16import math
18from tensorflow.python.framework import dtypes
19from tensorflow.python.framework import ops
20from tensorflow.python.ops import array_ops
21from tensorflow.python.ops import gen_checkpoint_ops
22from tensorflow.python.ops import init_ops
23from tensorflow.python.ops import math_ops
# Register the remapping ops as non-differentiable so gradient construction
# does not attempt to backprop through them.
ops.NotDifferentiable("GenerateVocabRemapping")
ops.NotDifferentiable("LoadAndRemapMatrix")
def _load_and_remap_matrix(ckpt_path,
                           old_tensor_name,
                           new_row_vocab_offset,
                           num_rows_to_load,
                           new_col_vocab_size,
                           initializer,
                           old_row_vocab_size=-1,
                           old_row_vocab_file=None,
                           new_row_vocab_file=None,
                           old_col_vocab_file=None,
                           new_col_vocab_file=None,
                           num_row_oov_buckets=0,
                           num_col_oov_buckets=0,
                           max_rows_in_memory=-1):
  """Loads a 2-D (matrix) `Tensor` from checkpoint.

  Generates 1D-remappings for rows and columns using the
  `GenerateVocabRemapping` op, and initializes any anticipated values with the
  provided initializer. Then, uses the `LoadAndRemapMatrix` op to create a
  matrix that loads existing values from the checkpoint, while filling out
  "missing" values with the newly initialized values. See
  contrib/framework/ops/checkpoint_ops.cc for more information on the wrapped
  functionality (LoadAndRemapMatrix). This wrapper can be used to perform only
  row remapping or only col remapping. If only row remapping is desired,
  {new,old}_col_vocab_file should be `None`, and vice versa for column
  remapping.

  NOTE: This only supports div-partitioning the vocabulary on the 1st dimension
  (row axis) via `new_row_vocab_offset`.

  Args:
    ckpt_path: Path to the TensorFlow checkpoint (version 2, `TensorBundle`)
      from which the old matrix `Tensor` will be loaded.
    old_tensor_name: Name of the 2-D `Tensor` to load from checkpoint.
    new_row_vocab_offset: A 0-indexed integer representing what line to
      start reading at in the new row vocabulary. Used for partitioned
      variables.
    num_rows_to_load: Number of rows to load for the new vocabulary (note: to
      support variable partitioning and partial loading, this does not need to
      be the same as the number of entries in `new_row_vocab_file`).
    new_col_vocab_size: Number of columns to load - should be the same as the
      number of entries in `new_col_vocab_file`, since we don't support
      partitioning along the column axis.
    initializer: Callable initializer function that accepts a 1-D tensor as the
      arg to specify the shape of the returned tensor. Used to initialize
      missing values.
    old_row_vocab_size: The number of entries to consider in the old vocabulary.
      With the default value of -1, the entire old row vocabulary file will be
      used. Otherwise, only the first `old_row_vocab_size` entries will be
      considered for remapping. Must be smaller than the length of
      `old_row_vocab_file`. NOTE: we do not provide an equivalent
      `old_col_vocab_size` for classes.
    old_row_vocab_file: A scalar `Tensor` of type `string` containing the
      path to the old row vocabulary file. Can be None, which represents no
      remapping on the row axis.
    new_row_vocab_file: A scalar `Tensor` of type `string` containing the path
      to the new row vocabulary file. Can be None, which represents no remapping
      on the row axis - in which case, `new_row_vocab_offset` and
      `num_rows_to_load` work under the assumption that the new row vocab is the
      same as the old row vocab.
    old_col_vocab_file: A scalar `Tensor` of type `string` containing the
      path to the old column vocabulary file. Can be None, which represents no
      remapping on the column axis.
    new_col_vocab_file: A scalar `Tensor` of type `string` containing the path
      to the new column vocabulary file. Can be None, which represents no
      remapping on the column axis - in which case, `new_col_vocab_size` works
      under the assumption that the new col vocab is the same as the old col
      vocab.
    num_row_oov_buckets: `int` specifying the number of out-of-vocabulary rows
      to append. Must be >= 0.
    num_col_oov_buckets: `int` specifying the number of out-of-vocabulary
      columns to append. Must be >= 0.
    max_rows_in_memory: `int` specifying the maximum number of rows to load from
      the checkpoint at once. If less than or equal to 0, the entire matrix will
      be loaded into memory. Setting this arg trades increased disk reads for
      lower memory usage.

  Returns:
    A Tensor of shape `[num_rows_to_load + num_row_oov_buckets,
    new_col_vocab_size + num_col_oov_buckets]`, with values loaded from the
    specified tensor in the checkpoint, and any missing or OOV values
    initialized with the given `initializer`.

  Raises:
    ValueError: If `num_row_oov_buckets` or `num_col_oov_buckets` < 0.
    ValueError: If either `old_row_vocab_file` or `new_row_vocab_file` is
      provided, while the other is not. Same for `old_col_vocab_file` and
      `new_col_vocab_file`.
    ValueError: If neither row vocabs or col vocabs are provided.
  """
  if num_row_oov_buckets < 0:
    raise ValueError("num_row_oov_buckets must be >= 0, but received %d" %
                     num_row_oov_buckets)
  if num_col_oov_buckets < 0:
    raise ValueError("num_col_oov_buckets must be >= 0, but received %d" %
                     num_col_oov_buckets)

  # Row/col vocab files must be provided in pairs: remapping on an axis needs
  # both the old and the new vocabulary for that axis.
  if bool(old_row_vocab_file) != bool(new_row_vocab_file):
    raise ValueError(
        "old_row_vocab_file and new_row_vocab_file must both be specified or "
        "left unspecified. old_row_vocab_file='{}', new_row_vocab_file='{}'".
        format(old_row_vocab_file, new_row_vocab_file))
  if bool(old_col_vocab_file) != bool(new_col_vocab_file):
    raise ValueError(
        "old_col_vocab_file and new_col_vocab_file must both be specified or "
        "left unspecified. old_col_vocab_file='{}', new_col_vocab_file='{}'".
        format(old_col_vocab_file, new_col_vocab_file))

  remap_rows = new_row_vocab_file and old_row_vocab_file
  remap_cols = new_col_vocab_file and old_col_vocab_file
  if not (remap_rows or remap_cols):
    raise ValueError(
        "Must provide either row or column vocab files. If no remapping is "
        "necessary, consider using `tf.contrib.framework.init_from_checkpoint` "
        "instead.")

  num_rows_present = num_rows_to_load
  if remap_rows:
    row_remapping, num_rows_present = (
        gen_checkpoint_ops.generate_vocab_remapping(
            new_vocab_file=new_row_vocab_file,
            old_vocab_file=old_row_vocab_file,
            new_vocab_offset=new_row_vocab_offset,
            num_new_vocab=num_rows_to_load,
            old_vocab_size=old_row_vocab_size))
  else:
    # Even when the rows are not being reordered, we still need to generate a
    # remapping to account for initializing partitioned Variables (when
    # new_row_vocab_offset is non-zero).
    row_remapping = math_ops.range(
        new_row_vocab_offset,
        new_row_vocab_offset + num_rows_to_load,
        dtype=dtypes.int64)

  col_remapping = []
  num_cols_present = new_col_vocab_size
  if remap_cols:
    col_remapping, num_cols_present = (
        gen_checkpoint_ops.generate_vocab_remapping(
            new_vocab_file=new_col_vocab_file,
            old_vocab_file=old_col_vocab_file,
            new_vocab_offset=0,  # Offset is unused for cols (no partitioning).
            num_new_vocab=new_col_vocab_size))

  # Number of cells that need fresh values: the full output
  # (num_rows_to_load x new_col_vocab_size) minus the cells found in the
  # checkpoint (num_rows_present x num_cols_present). The initializer is
  # asked for them as a single column vector of shape [count, 1].
  init_vals = initializer([
      num_rows_to_load * new_col_vocab_size -
      num_rows_present * num_cols_present, 1
  ])
  return_tensor = gen_checkpoint_ops.load_and_remap_matrix(
      ckpt_path=ckpt_path,
      old_tensor_name=old_tensor_name,
      row_remapping=row_remapping,
      col_remapping=col_remapping,
      initializing_values=init_vals,
      num_rows=num_rows_to_load,
      num_cols=new_col_vocab_size,
      max_rows_in_memory=max_rows_in_memory)

  # Add OOV row(s) and column(s).
  if num_row_oov_buckets > 0:
    init_row_oov_val = initializer([num_row_oov_buckets, new_col_vocab_size])
    init_row_oov_val = ops.convert_to_tensor(init_row_oov_val)
    return_tensor = array_ops.concat([return_tensor, init_row_oov_val], 0)
  if num_col_oov_buckets > 0:
    # We need to add any row OOV to the new column shape.
    init_col_oov_val = initializer(
        [num_rows_to_load + num_row_oov_buckets, num_col_oov_buckets])
    init_col_oov_val = ops.convert_to_tensor(init_col_oov_val)
    return_tensor = array_ops.concat([return_tensor, init_col_oov_val], 1)

  return return_tensor
def _load_and_remap_matrix_initializer(ckpt_path,
                                       old_tensor_name,
                                       new_row_vocab_size,
                                       new_col_vocab_size,
                                       old_row_vocab_size=-1,
                                       old_row_vocab_file=None,
                                       new_row_vocab_file=None,
                                       old_col_vocab_file=None,
                                       new_col_vocab_file=None,
                                       num_row_oov_buckets=0,
                                       num_col_oov_buckets=0,
                                       initializer=None,
                                       max_rows_in_memory=-1):
  r"""Returns a var initializer for loading and remapping a 2-D (matrix) tensor.

  The returned initializer loads a 2-D (matrix) `Tensor` with name
  `old_tensor_name` from the checkpoint at `ckpt_path`. It will reorder the
  rows/columns according to the specified vocab files and append additional
  out-of-vocabulary rows/columns according to the number of OOV buckets.

  The format of the file at the `{old,new}_{row,col}_vocab_file` path should be
  a text file, with each line containing a single entity within the vocabulary.
  Let the function `line_of(f, "x")` return the 0-indexed line number of the
  entity "x" in file f, and the function `entity_at(f, i)` return the entity at
  line i of file f. Then, row i of the new output matrix will be taken from row
  `line_of(old_row_vocab_file, entity_at(new_row_vocab_file, i))` of the old
  matrix. If any entity in `new_row_vocab_file` is not found in
  `old_row_vocab_file`, that row is considered a "missing" row, and its values
  will be initialized using the `initializer` arg. The same logic also applies
  for the columns.

  For example, assuming that:

  * `old_row_vocab_file` contains "mercury\nvenus\nmars"
  * `new_row_vocab_file` contains "venus\njupiter\nmercury"
  * `old_col_vocab_file` contains "good\nbetter\nbest"
  * `new_col_vocab_file` contains "good\nbest\nfantastic"
  * `initializer` returns the natural numbers `[1, 2, 3, 4, ...]`
  * `w(i, j)` represents the value from row i, column j of the old matrix

  Then the new output matrix will look like:

  `[[w(1, 0), w(1, 2), 1],
    [2, 3, 4],
    [w(0, 0), w(0, 2), 5]]`

  If we further specify that:

  * `num_row_oov_buckets` == 2
  * `num_col_oov_buckets` == 1

  Then the new output matrix will look like:

  `[[w(1, 0), w(1, 2), 1, 12],
    [2, 3, 4, 13],
    [w(0, 0), w(0, 2), 5, 14],
    [6, 7, 8, 15],
    [9, 10, 11, 16]]`

  If `{old,new}_row_vocab_file` are None, we assume that the old and new row
  vocab files are the same, and no row remapping is done. If
  `{old,new}_col_vocab_file` are None, we assume that the old and new column
  vocab files are the same, and no column remapping is done.

  The returned initializer only supports div-partitioning along the row axis. It
  does not support partitioning along the column axis (as this is not common in
  practice) or mod-partitioning.

  NOTE: When this is used to warm-start variables, client code should use
  `tf.lookup.index_table_from_tensor()` like
  contrib/layers/python/layers/feature_column.py does, as opposed to
  `tf.feature_to_id()` - in order to ensure the underlying lookup tables are the
  same.

  Args:
    ckpt_path: Path to the TensorFlow checkpoint (version 2, `TensorBundle`)
      from which the old matrix `Tensor` will be loaded.
    old_tensor_name: Name of the 2-D `Tensor` to load from checkpoint.
    new_row_vocab_size: `int` specifying the number of entries in
      `new_row_vocab_file`. If no row remapping is needed (no row vocab
      provided), this should be equal to the number of rows to load from the old
      matrix (which can theoretically be smaller than the number of rows in the
      old matrix).
    new_col_vocab_size: `int` specifying the number of entries in
      `new_col_vocab_file`. If no column remapping is needed (no column vocab
      provided), this should be equal to the number of columns in the old
      matrix.
    old_row_vocab_size: The number of entries to consider in the old vocabulary.
      With the default value of -1, the entire old row vocabulary file will be
      used. Otherwise, only the first `old_row_vocab_size` entries will be
      considered for remapping. Must be smaller than the length of
      `old_row_vocab_file`. NOTE: we do not provide an equivalent
      `old_col_vocab_size` for classes.
    old_row_vocab_file: A scalar `Tensor` of type `string` containing the
      path to the old row vocabulary file. Can be None, which represents no
      remapping on the row axis.
    new_row_vocab_file: A scalar `Tensor` of type `string` containing the path
      to the new row vocabulary file. Can be None, which represents no remapping
      on the row axis.
    old_col_vocab_file: A scalar `Tensor` of type `string` containing the
      path to the old column vocabulary file. Can be None, which represents no
      remapping on the column axis.
    new_col_vocab_file: A scalar `Tensor` of type `string` containing the path
      to the new column vocabulary file. Can be None, which represents no
      remapping on the column axis.
    num_row_oov_buckets: `int` specifying the number of out-of-vocabulary rows
      to append. Must be >= 0.
    num_col_oov_buckets: `int` specifying the number of out-of-vocabulary
      columns to append. Must be >= 0.
    initializer: Initializer function to initialize missing values. Accepts a
      1-D tensor as the arg to specify the shape of the returned tensor. If
      `None`, defaults to using `zeros_initializer()`.
    max_rows_in_memory: `int` specifying the maximum number of rows to load from
      the checkpoint at once. If less than or equal to 0, the entire matrix will
      be loaded into memory. Setting this arg trades increased disk reads for
      lower memory usage.

  Returns:
    A variable initializer function that should be used to initialize a
    (potentially partitioned) `Variable` whose complete shape is
    `[new_row_vocab_size + num_row_oov_buckets, new_col_vocab_size +
    num_col_oov_buckets]`.

  Raises:
    TypeError: If `initializer` is specified but not callable.
  """
  if initializer is None:
    # TODO(b/25671353): Consider using sqrt(6/(fan_in + fan_out)) instead, from
    # Glorot and Bengio, 2010.
    initializer = init_ops.zeros_initializer()

  if not callable(initializer):
    raise TypeError(
        "initializer must be callable, instead of being {} of type {}.".format(
            initializer, type(initializer)))

  def _initializer(shape, dtype=dtypes.float32, partition_info=None):
    """Variable initializer.

    Args:
      shape: Shape of `Tensor` to return. Should include OOV on both axes.
      dtype: Must be float32.
      partition_info: variable_scope._PartitionInfo.

    Returns:
      `Tensor` of shape `shape`.

    Raises:
      TypeError: If `dtype` is anything other than float32.
      ValueError: For shape mismatch upon invocation.
    """
    # Sanity checks.
    if dtype != dtypes.float32:
      raise TypeError(
          "Currently, only float32 is supported. Received dtype: {}".format(
              dtype))
    if len(shape) != 2:
      raise ValueError("Expected 2-dim shape, but received: {}".format(shape))
    if shape[0] <= 0:
      raise ValueError(
          "Expected 1st dim of shape to be > 0, but received shape: {}".format(
              shape))
    if shape[1] != (new_col_vocab_size + num_col_oov_buckets):
      raise ValueError(
          "Expected 2nd dim of shape to be new_col_vocab_size ({}) + "
          "num_col_oov_buckets ({}) = {}, but received shape: {}".format(
              new_col_vocab_size, num_col_oov_buckets,
              new_col_vocab_size + num_col_oov_buckets, shape))

    # For a partitioned variable, `offset` is the row index (in the complete,
    # unpartitioned variable) at which this partition starts.
    offset = 0
    if partition_info is not None:
      offset = partition_info.single_offset(shape)

    if offset + shape[0] > new_row_vocab_size + num_row_oov_buckets:
      raise ValueError(
          "Trying to initialize {} additional rows after {} rows have already "
          "been initialized, which would exceed expected total row count of "
          "new_row_vocab_size ({}) + num_row_oov_buckets ({}) = {}.".format(
              shape[0], offset, new_row_vocab_size, num_row_oov_buckets,
              new_row_vocab_size + num_row_oov_buckets))

    # Rows of this partition that fall past `new_row_vocab_size` in the
    # complete variable are OOV rows (freshly initialized); the rest are
    # loaded/remapped from the checkpoint.
    row_oov_buckets_to_use = min(shape[0],
                                 max(0, offset + shape[0] - new_row_vocab_size))
    num_rows_to_load = shape[0] - row_oov_buckets_to_use

    # We may be operating on an OOV-only partition, in which case we newly
    # initialize all rows of this partition.
    if offset > new_row_vocab_size:
      if shape[0] != row_oov_buckets_to_use:
        raise ValueError(
            "Partitioned variable offset is greater than new vocab size and "
            "not operating on OOV-only partition.")
      return initializer(shape)

    return _load_and_remap_matrix(
        ckpt_path=ckpt_path,
        old_tensor_name=old_tensor_name,
        new_row_vocab_offset=offset,
        num_rows_to_load=num_rows_to_load,
        new_col_vocab_size=new_col_vocab_size,
        initializer=initializer,
        old_row_vocab_size=old_row_vocab_size,
        old_row_vocab_file=old_row_vocab_file,
        new_row_vocab_file=new_row_vocab_file,
        old_col_vocab_file=old_col_vocab_file,
        new_col_vocab_file=new_col_vocab_file,
        num_row_oov_buckets=row_oov_buckets_to_use,
        num_col_oov_buckets=num_col_oov_buckets,
        max_rows_in_memory=max_rows_in_memory)

  return _initializer
def _load_embedding_initializer(ckpt_path,
                                embedding_tensor_name,
                                new_vocab_size,
                                embedding_dim,
                                old_vocab_file,
                                new_vocab_file,
                                old_vocab_size=-1,
                                num_oov_buckets=0,
                                initializer=None,
                                max_rows_in_memory=-1):
  """Returns a variable initializer for loading pre-trained embeddings.

  Thin specialization of `_load_and_remap_matrix_initializer()` for embedding
  weights: only the row (vocabulary) axis is remapped according to the old and
  new vocab files, while the column (embedding-dimension) axis is loaded
  unchanged. See `_load_and_remap_matrix_initializer()` for the full remapping
  semantics.

  NOTE: Only for use with div-partitioned variables / vocabularies.

  Args:
    ckpt_path: Path to the TensorFlow checkpoint (version 2, `TensorBundle`)
      from which the old matrix `Tensor` will be loaded.
    embedding_tensor_name: Name of the 2-D `Tensor` to load from checkpoint.
    new_vocab_size: Number of entries in the new vocab.
    embedding_dim: `int` specifying the dimension of the embedding vectors from
      the checkpoint. Must match the number of columns in the old embedding
      matrix.
    old_vocab_file: A scalar `Tensor` of type `string` containing the path to
      the old vocabulary file.
    new_vocab_file: A scalar `Tensor` of type `string` containing the path to
      the new vocabulary file.
    old_vocab_size: The number of entries to consider in the old vocabulary.
      With the default value of -1, the entire old row vocabulary file will be
      used. Otherwise, only the first `old_vocab_size` entries will be
      considered for remapping. Must be smaller than the length of
      `old_row_vocab_file`.
    num_oov_buckets: `int` specifying the number of out-of-vocabulary buckets
      to use. Must be >= 0.
    initializer: Initializer function that accepts a 1-D tensor as the arg to
      specify the shape of the returned tensor. If `None`, defaults to using
      `truncated_normal_initializer()`.
    max_rows_in_memory: `int` specifying the maximum number of rows to load
      from the checkpoint at once. If less than or equal to 0, the entire
      matrix will be loaded into memory. Setting this arg trades increased disk
      reads for lower memory usage.

  Returns:
    A variable initializer function.
  """
  if initializer is None:
    # TODO(b/25671353): This should be kept in sync with the stddev used by
    # feature_column.py's _EmbeddingColumn.
    stddev = 1.0 / math.sqrt(embedding_dim)
    initializer = init_ops.truncated_normal_initializer(stddev=stddev)

  # Delegate to the general matrix initializer, remapping rows only: both
  # column vocab files are None, and no column OOV buckets are appended.
  remap_kwargs = {
      "ckpt_path": ckpt_path,
      "old_tensor_name": embedding_tensor_name,
      "new_row_vocab_size": new_vocab_size,
      "new_col_vocab_size": embedding_dim,
      "old_row_vocab_size": old_vocab_size,
      "old_row_vocab_file": old_vocab_file,
      "new_row_vocab_file": new_vocab_file,
      "old_col_vocab_file": None,
      "new_col_vocab_file": None,
      "num_row_oov_buckets": num_oov_buckets,
      "num_col_oov_buckets": 0,
      "initializer": initializer,
      "max_rows_in_memory": max_rows_in_memory,
  }
  return _load_and_remap_matrix_initializer(**remap_kwargs)