# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Operations for generating and loading vocab remappings."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import math

from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import gen_checkpoint_ops
from tensorflow.python.ops import init_ops
from tensorflow.python.ops import math_ops

ops.NotDifferentiable("GenerateVocabRemapping")
ops.NotDifferentiable("LoadAndRemapMatrix")


def _load_and_remap_matrix(ckpt_path,
                           old_tensor_name,
                           new_row_vocab_offset,
                           num_rows_to_load,
                           new_col_vocab_size,
                           initializer,
                           old_row_vocab_size=-1,
                           old_row_vocab_file=None,
                           new_row_vocab_file=None,
                           old_col_vocab_file=None,
                           new_col_vocab_file=None,
                           num_row_oov_buckets=0,
                           num_col_oov_buckets=0,
                           max_rows_in_memory=-1):
  """Loads a 2-D (matrix) `Tensor` from checkpoint.

  Generates 1D-remappings for rows and columns using the
  `GenerateVocabRemapping` op, and initializes any anticipated values with the
  provided initializer. Then, uses the `LoadAndRemapMatrix` op to create a
  matrix that loads existing values from the checkpoint, while filling out
  "missing" values with the newly initialized values. See
  contrib/framework/ops/checkpoint_ops.cc for more information on the wrapped
  functionality (LoadAndRemapMatrix). This wrapper can be used to perform only
  row remapping or only col remapping. If only row remapping is desired,
  {new,old}_col_vocab_file should be `None`, and vice versa for column
  remapping.

  NOTE: This only supports div-partitioning the vocabulary on the 1st dimension
  (row axis) via `new_row_vocab_offset`.

  Args:
    ckpt_path: Path to the TensorFlow checkpoint (version 2, `TensorBundle`)
      from which the old matrix `Tensor` will be loaded.
    old_tensor_name: Name of the 2-D `Tensor` to load from checkpoint.
    new_row_vocab_offset: A 0-indexed integer representing what line to
      start reading at in the new row vocabulary. Used for partitioned
      variables.
    num_rows_to_load: Number of rows to load for the new vocabulary (note: to
      support variable partitioning and partial loading, this does not need to
      be the same as the number of entries in `new_row_vocab_file`).
    new_col_vocab_size: Number of columns to load - should be the same as the
      number of entries in `new_col_vocab_file`, since we don't support
      partitioning along the column axis.
    initializer: Callable initializer function that accepts a 1-D tensor as the
      arg to specify the shape of the returned tensor. Used to initialize
      missing values.
    old_row_vocab_size: The number of entries to consider in the old vocabulary.
      With the default value of -1, the entire old row vocabulary file will be
      used.  Otherwise, only the first `old_row_vocab_size` entries will be
      considered for remapping. Must be smaller than the length of
      `old_row_vocab_file`.  NOTE: we do not provide an equivalent
      `old_col_vocab_size` for classes.
    old_row_vocab_file: A scalar `Tensor` of type `string` containing the
      path to the old row vocabulary file. Can be None, which represents no
      remapping on the row axis.
    new_row_vocab_file: A scalar `Tensor` of type `string` containing the path
      to the new row vocabulary file. Can be None, which represents no remapping
      on the row axis - in which case, `new_row_vocab_offset` and
      `num_rows_to_load` work under the assumption that the new row vocab is the
      same as the old row vocab.
    old_col_vocab_file: A scalar `Tensor` of type `string` containing the
      path to the old column vocabulary file. Can be None, which represents no
      remapping on the column axis.
    new_col_vocab_file: A scalar `Tensor` of type `string` containing the path
      to the new column vocabulary file. Can be None, which represents no
      remapping on the column axis - in which case, `new_col_vocab_size` works
      under the assumption that the new col vocab is the same as the old col
      vocab.
    num_row_oov_buckets: `int` specifying the number of out-of-vocabulary rows
      to append. Must be >= 0.
    num_col_oov_buckets: `int` specifying the number of out-of-vocabulary
      columns to append. Must be >= 0.
    max_rows_in_memory: `int` specifying the maximum number of rows to load from
      the checkpoint at once. If less than or equal to 0, the entire matrix will
      be loaded into memory. Setting this arg trades increased disk reads for
      lower memory usage.

  Returns:
    A Tensor of shape `[num_rows_to_load + num_row_oov_buckets,
    new_col_vocab_size + num_col_oov_buckets]`, with values loaded from the
    specified tensor in the checkpoint, and any missing or OOV values
    initialized with the given `initializer`.

  Raises:
    ValueError: If `num_row_oov_buckets` or `num_col_oov_buckets` < 0.
    ValueError: If either `old_row_vocab_file` or `new_row_vocab_file` is
      provided, while the other is not. Same for `old_col_vocab_file` and
      `new_col_vocab_file`.
    ValueError: If neither row vocabs nor col vocabs are provided.
  """
  if num_row_oov_buckets < 0:
    raise ValueError("num_row_oov_buckets must be >= 0, but received %d" %
                     num_row_oov_buckets)
  if num_col_oov_buckets < 0:
    raise ValueError("num_col_oov_buckets must be >= 0, but received %d" %
                     num_col_oov_buckets)

  if bool(old_row_vocab_file) != bool(new_row_vocab_file):
    raise ValueError(
        "old_row_vocab_file and new_row_vocab_file must both be specified or "
        "left unspecified. old_row_vocab_file='{}', new_row_vocab_file='{}'".
        format(old_row_vocab_file, new_row_vocab_file))
  if bool(old_col_vocab_file) != bool(new_col_vocab_file):
    raise ValueError(
        "old_col_vocab_file and new_col_vocab_file must both be specified or "
        "left unspecified. old_col_vocab_file='{}', new_col_vocab_file='{}'".
        format(old_col_vocab_file, new_col_vocab_file))

  remap_rows = new_row_vocab_file and old_row_vocab_file
  remap_cols = new_col_vocab_file and old_col_vocab_file
  if not (remap_rows or remap_cols):
    raise ValueError(
        "Must provide either row or column vocab files. If no remapping is "
        "necessary, consider using `tf.contrib.framework.init_from_checkpoint` "
        "instead.")

  num_rows_present = num_rows_to_load
  if remap_rows:
    row_remapping, num_rows_present = (
        gen_checkpoint_ops.generate_vocab_remapping(
            new_vocab_file=new_row_vocab_file,
            old_vocab_file=old_row_vocab_file,
            new_vocab_offset=new_row_vocab_offset,
            num_new_vocab=num_rows_to_load,
            old_vocab_size=old_row_vocab_size))
  else:
    # Even when the rows are not being reordered, we still need to generate a
    # remapping to account for initializing partitioned Variables (when
    # new_row_vocab_offset is non-zero).
    row_remapping = math_ops.range(
        new_row_vocab_offset,
        new_row_vocab_offset + num_rows_to_load,
        dtype=dtypes.int64)

  col_remapping = []
  num_cols_present = new_col_vocab_size
  if remap_cols:
    col_remapping, num_cols_present = (
        gen_checkpoint_ops.generate_vocab_remapping(
            new_vocab_file=new_col_vocab_file,
            old_vocab_file=old_col_vocab_file,
            new_vocab_offset=0,  # Offset is unused for cols (no partitioning).
            num_new_vocab=new_col_vocab_size))

  init_vals = initializer([
      num_rows_to_load * new_col_vocab_size -
      num_rows_present * num_cols_present, 1
  ])
  return_tensor = gen_checkpoint_ops.load_and_remap_matrix(
      ckpt_path=ckpt_path,
      old_tensor_name=old_tensor_name,
      row_remapping=row_remapping,
      col_remapping=col_remapping,
      initializing_values=init_vals,
      num_rows=num_rows_to_load,
      num_cols=new_col_vocab_size,
      max_rows_in_memory=max_rows_in_memory)

  # Add OOV row(s) and column(s).
  if num_row_oov_buckets > 0:
    init_row_oov_val = initializer([num_row_oov_buckets, new_col_vocab_size])
    init_row_oov_val = ops.convert_to_tensor(init_row_oov_val)
    return_tensor = array_ops.concat([return_tensor, init_row_oov_val], 0)
  if num_col_oov_buckets > 0:
    # We need to add any row OOV to the new column shape.
    init_col_oov_val = initializer(
        [num_rows_to_load + num_row_oov_buckets, num_col_oov_buckets])
    init_col_oov_val = ops.convert_to_tensor(init_col_oov_val)
    return_tensor = array_ops.concat([return_tensor, init_col_oov_val], 1)

  return return_tensor


def _load_and_remap_matrix_initializer(ckpt_path,
                                       old_tensor_name,
                                       new_row_vocab_size,
                                       new_col_vocab_size,
                                       old_row_vocab_size=-1,
                                       old_row_vocab_file=None,
                                       new_row_vocab_file=None,
                                       old_col_vocab_file=None,
                                       new_col_vocab_file=None,
                                       num_row_oov_buckets=0,
                                       num_col_oov_buckets=0,
                                       initializer=None,
                                       max_rows_in_memory=-1):
  r"""Returns a var initializer for loading and remapping a 2-D (matrix) tensor.

  The returned initializer loads a 2-D (matrix) `Tensor` with name
  `old_tensor_name` from the checkpoint at `ckpt_path`. It will reorder the
  rows/columns according to the specified vocab files and append additional
  out-of-vocabulary rows/columns according to the number of OOV buckets.

  The format of the file at the `{old,new}_{row,col}_vocab_file` path should be
  a text file, with each line containing a single entity within the vocabulary.
  Let the function `line_of(f, "x")` return the 0-indexed line number of the
  entity "x" in file f, and the function `entity_at(f, i)` return the entity at
  line i of file f. Then, row i of the new output matrix will be taken from row
  `line_of(old_row_vocab_file, entity_at(new_row_vocab_file, i))` of the old
  matrix. If any entity in `new_row_vocab_file` is not found in
  `old_row_vocab_file`, that row is considered a "missing" row, and its values
  will be initialized using the `initializer` arg. The same logic also applies
  for the columns.

  For example, assuming that:

  * `old_row_vocab_file` contains "mercury\nvenus\nmars"
  * `new_row_vocab_file` contains "venus\njupiter\nmercury"
  * `old_col_vocab_file` contains "good\nbetter\nbest"
  * `new_col_vocab_file` contains "good\nbest\nfantastic"
  * `initializer` returns the natural numbers `[1, 2, 3, 4, ...]`
  * `w(i, j)` represents the value from row i, column j of the old matrix

  Then the new output matrix will look like:

  `[[w(1, 0), w(1, 2), 1],
    [2, 3, 4],
    [w(0, 0), w(0, 2), 5]]`

  If we further specify that:

  * `num_row_oov_buckets` == 2
  * `num_col_oov_buckets` == 1

  Then the new output matrix will look like:

  `[[w(1, 0), w(1, 2), 1, 12],
    [2, 3, 4, 13],
    [w(0, 0), w(0, 2), 5, 14],
    [6, 7, 8, 15],
    [9, 10, 11, 16]]`

  If `{old,new}_row_vocab_file` are None, we assume that the old and new row
  vocab files are the same, and no row remapping is done. If
  `{old,new}_col_vocab_file` are None, we assume that the old and new column
  vocab files are the same, and no column remapping is done.

  The returned initializer only supports div-partitioning along the row axis. It
  does not support partitioning along the column axis (as this is not common in
  practice) or mod-partitioning.

  NOTE: When this is used to warm-start variables, client code should use
  `tf.lookup.index_table_from_tensor()` like
  contrib/layers/python/layers/feature_column.py does, as opposed to
  `tf.feature_to_id()` - in order to ensure the underlying lookup tables are the
  same.

  Args:
    ckpt_path: Path to the TensorFlow checkpoint (version 2, `TensorBundle`)
      from which the old matrix `Tensor` will be loaded.
    old_tensor_name: Name of the 2-D `Tensor` to load from checkpoint.
    new_row_vocab_size: `int` specifying the number of entries in
      `new_row_vocab_file`. If no row remapping is needed (no row vocab
      provided), this should be equal to the number of rows to load from the old
      matrix (which can theoretically be smaller than the number of rows in the
      old matrix).
    new_col_vocab_size: `int` specifying the number of entries in
      `new_col_vocab_file`. If no column remapping is needed (no column vocab
      provided), this should be equal to the number of columns in the old
      matrix.
    old_row_vocab_size: The number of entries to consider in the old vocabulary.
      With the default value of -1, the entire old row vocabulary file will be
      used.  Otherwise, only the first `old_row_vocab_size` entries will be
      considered for remapping. Must be smaller than the length of
      `old_row_vocab_file`.  NOTE: we do not provide an equivalent
      `old_col_vocab_size` for classes.
    old_row_vocab_file: A scalar `Tensor` of type `string` containing the
      path to the old row vocabulary file. Can be None, which represents no
      remapping on the row axis.
    new_row_vocab_file: A scalar `Tensor` of type `string` containing the path
      to the new row vocabulary file. Can be None, which represents no remapping
      on the row axis.
    old_col_vocab_file: A scalar `Tensor` of type `string` containing the
      path to the old column vocabulary file. Can be None, which represents no
      remapping on the column axis.
    new_col_vocab_file: A scalar `Tensor` of type `string` containing the path
      to the new column vocabulary file. Can be None, which represents no
      remapping on the column axis.
    num_row_oov_buckets: `int` specifying the number of out-of-vocabulary rows
      to append. Must be >= 0.
    num_col_oov_buckets: `int` specifying the number of out-of-vocabulary
      columns to append. Must be >= 0.
    initializer: Initializer function to initialize missing values. Accepts a
      1-D tensor as the arg to specify the shape of the returned tensor. If
      `None`, defaults to using `zeros_initializer()`.
    max_rows_in_memory: `int` specifying the maximum number of rows to load from
      the checkpoint at once. If less than or equal to 0, the entire matrix will
      be loaded into memory. Setting this arg trades increased disk reads for
      lower memory usage.

  Returns:
    A variable initializer function that should be used to initialize a
    (potentially partitioned) `Variable` whose complete shape is
    `[new_row_vocab_size + num_row_oov_buckets, new_col_vocab_size +
    num_col_oov_buckets]`.

  Raises:
    TypeError: If `initializer` is specified but not callable.
  """
  if initializer is None:
    # TODO(b/25671353): Consider using sqrt(6/(fan_in + fan_out)) instead, from
    # Glorot and Bengio, 2010.
    initializer = init_ops.zeros_initializer()

  if not callable(initializer):
    raise TypeError(
        "initializer must be callable, instead of being {} of type {}.".format(
            initializer, type(initializer)))

  def _initializer(shape, dtype=dtypes.float32, partition_info=None):
    """Variable initializer.

    Args:
      shape: Shape of `Tensor` to return. Should include OOV on both axes.
      dtype: Must be float32.
      partition_info: variable_scope._PartitionInfo.

    Returns:
      `Tensor` of shape `shape`.

    Raises:
      TypeError: If `dtype` is anything other than float32.
      ValueError: For shape mismatch upon invocation.
    """
    # Sanity checks.
    if dtype != dtypes.float32:
      raise TypeError(
          "Currently, only float32 is supported. Received dtype: {}".format(
              dtype))
    if len(shape) != 2:
      raise ValueError("Expected 2-dim shape, but received: {}".format(shape))
    if shape[0] <= 0:
      raise ValueError(
          "Expected 1st dim of shape to be > 0, but received shape: {}".format(
              shape))
    if shape[1] != (new_col_vocab_size + num_col_oov_buckets):
      raise ValueError(
          "Expected 2nd dim of shape to be new_col_vocab_size ({}) + "
          "num_col_oov_buckets ({}) = {}, but received shape: {}".format(
              new_col_vocab_size, num_col_oov_buckets,
              new_col_vocab_size + num_col_oov_buckets, shape))

    offset = 0
    if partition_info is not None:
      offset = partition_info.single_offset(shape)

    if offset + shape[0] > new_row_vocab_size + num_row_oov_buckets:
      raise ValueError(
          "Trying to initialize {} additional rows after {} rows have already "
          "been initialized, which would exceed expected total row count of "
          "new_row_vocab_size ({}) + num_row_oov_buckets ({}) = {}.".format(
              shape[0], offset, new_row_vocab_size, num_row_oov_buckets,
              new_row_vocab_size + num_row_oov_buckets))

    row_oov_buckets_to_use = min(shape[0],
                                 max(0, offset + shape[0] - new_row_vocab_size))
    num_rows_to_load = shape[0] - row_oov_buckets_to_use

    # We may be operating on an OOV-only partition, in which case we newly
    # initialize all rows of this partition.
    if offset > new_row_vocab_size:
      if shape[0] != row_oov_buckets_to_use:
        raise ValueError(
            "Partitioned variable offset is greater than new vocab size and "
            "not operating on OOV-only partition.")
      return initializer(shape)

    return _load_and_remap_matrix(
        ckpt_path=ckpt_path,
        old_tensor_name=old_tensor_name,
        new_row_vocab_offset=offset,
        num_rows_to_load=num_rows_to_load,
        new_col_vocab_size=new_col_vocab_size,
        initializer=initializer,
        old_row_vocab_size=old_row_vocab_size,
        old_row_vocab_file=old_row_vocab_file,
        new_row_vocab_file=new_row_vocab_file,
        old_col_vocab_file=old_col_vocab_file,
        new_col_vocab_file=new_col_vocab_file,
        num_row_oov_buckets=row_oov_buckets_to_use,
        num_col_oov_buckets=num_col_oov_buckets,
        max_rows_in_memory=max_rows_in_memory)

  return _initializer


def _load_embedding_initializer(ckpt_path,
                                embedding_tensor_name,
                                new_vocab_size,
                                embedding_dim,
                                old_vocab_file,
                                new_vocab_file,
                                old_vocab_size=-1,
                                num_oov_buckets=0,
                                initializer=None,
                                max_rows_in_memory=-1):
  """Returns a variable initializer for loading pre-trained embeddings.

  Wrapper around `load_and_remap_matrix_initializer()` specialized for loading
  embedding weights and remapping according to the provided vocab files. See
  docs for `load_and_remap_matrix_initializer()` for more details.

  NOTE: Only for use with div-partitioned variables / vocabularies.

  Args:
    ckpt_path: Path to the TensorFlow checkpoint (version 2, `TensorBundle`)
      from which the old matrix `Tensor` will be loaded.
    embedding_tensor_name: Name of the 2-D `Tensor` to load from checkpoint.
    new_vocab_size: Number of entries in the new vocab.
    embedding_dim: `int` specifying the dimension of the embedding vectors from
      the checkpoint. Must match the number of columns in the old embedding
      matrix.
    old_vocab_file: A scalar `Tensor` of type `string` containing the
      path to the old vocabulary file.
    new_vocab_file: A scalar `Tensor` of type `string` containing the
      path to the new vocabulary file.
    old_vocab_size: The number of entries to consider in the old vocabulary.
      With the default value of -1, the entire old row vocabulary file will be
      used.  Otherwise, only the first `old_vocab_size` entries will be
      considered for remapping. Must be smaller than the length of
      `old_row_vocab_file`.
    num_oov_buckets: `int` specifying the number of out-of-vocabulary
      buckets to use. Must be >= 0.
    initializer: Initializer function that accepts a 1-D tensor as the arg to
      specify the shape of the returned tensor. If `None`, defaults to using
      `truncated_normal_initializer()`.
    max_rows_in_memory: `int` specifying the maximum number of rows to load from
      the checkpoint at once. If less than or equal to 0, the entire matrix will
      be loaded into memory. Setting this arg trades increased disk reads for
      lower memory usage.

  Returns:
    A variable initializer function.
  """
  if initializer is None:
    # TODO(b/25671353): This should be kept in sync with the stddev used by
    # feature_column.py's _EmbeddingColumn.
    initializer = init_ops.truncated_normal_initializer(
        stddev=1.0 / math.sqrt(embedding_dim))

  return _load_and_remap_matrix_initializer(
      ckpt_path=ckpt_path,
      old_tensor_name=embedding_tensor_name,
      new_row_vocab_size=new_vocab_size,
      new_col_vocab_size=embedding_dim,
      old_row_vocab_size=old_vocab_size,
      old_row_vocab_file=old_vocab_file,
      new_row_vocab_file=new_vocab_file,
      old_col_vocab_file=None,
      new_col_vocab_file=None,
      num_row_oov_buckets=num_oov_buckets,
      num_col_oov_buckets=0,
      initializer=initializer,
      max_rows_in_memory=max_rows_in_memory)