# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Operations for generating and loading vocab remappings."""
import math

from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import gen_checkpoint_ops
from tensorflow.python.ops import init_ops
from tensorflow.python.ops import math_ops

# The vocab-remapping ops are lookup/loading utilities used at variable
# initialization time; no gradients should flow through them.
ops.NotDifferentiable("GenerateVocabRemapping")
ops.NotDifferentiable("LoadAndRemapMatrix")


def _load_and_remap_matrix(ckpt_path,
                           old_tensor_name,
                           new_row_vocab_offset,
                           num_rows_to_load,
                           new_col_vocab_size,
                           initializer,
                           old_row_vocab_size=-1,
                           old_row_vocab_file=None,
                           new_row_vocab_file=None,
                           old_col_vocab_file=None,
                           new_col_vocab_file=None,
                           num_row_oov_buckets=0,
                           num_col_oov_buckets=0,
                           max_rows_in_memory=-1):
  """Loads a 2-D (matrix) `Tensor` from checkpoint.

  Generates 1D-remappings for rows and columns using the
  `GenerateVocabRemapping` op, and initializes any anticipated values with the
  provided initializer. Then, uses the `LoadAndRemapMatrix` op to create a
  matrix that loads existing values from the checkpoint, while filling out
  "missing" values with the newly initialized values. See
  contrib/framework/ops/checkpoint_ops.cc for more information on the wrapped
  functionality (LoadAndRemapMatrix). This wrapper can be used to perform only
  row remapping or only col remapping. If only row remapping is desired,
  {new,old}_col_vocab_file should be `None`, and vice versa for column
  remapping.

  NOTE: This only supports div-partitioning the vocabulary on the 1st dimension
  (row axis) via `new_row_vocab_offset`.

  Args:
    ckpt_path: Path to the TensorFlow checkpoint (version 2, `TensorBundle`)
      from which the old matrix `Tensor` will be loaded.
    old_tensor_name: Name of the 2-D `Tensor` to load from checkpoint.
    new_row_vocab_offset: A 0-indexed integer representing what line to
      start reading at in the new row vocabulary. Used for partitioned
      variables.
    num_rows_to_load: Number of rows to load for the new vocabulary (note: to
      support variable partitioning and partial loading, this does not need to
      be the same as the number of entries in `new_row_vocab_file`).
    new_col_vocab_size: Number of columns to load - should be the same as the
      number of entries in `new_col_vocab_file`, since we don't support
      partitioning along the column axis.
    initializer: Callable initializer function that accepts a 1-D tensor as the
      arg to specify the shape of the returned tensor. Used to initialize
      missing values.
    old_row_vocab_size: The number of entries to consider in the old vocabulary.
      With the default value of -1, the entire old row vocabulary file will be
      used.  Otherwise, only the first `old_row_vocab_size` entries will be
      considered for remapping. Must be smaller than the length of
      `old_row_vocab_file`.  NOTE: we do not provide an equivalent
      `old_col_vocab_size` for classes.
    old_row_vocab_file: A scalar `Tensor` of type `string` containing the
      path to the old row vocabulary file. Can be None, which represents no
      remapping on the row axis.
    new_row_vocab_file: A scalar `Tensor` of type `string` containing the path
      to the new row vocabulary file. Can be None, which represents no remapping
      on the row axis - in which case, `new_row_vocab_offset` and
      `num_rows_to_load` work under the assumption that the new row vocab is the
      same as the old row vocab.
    old_col_vocab_file: A scalar `Tensor` of type `string` containing the
      path to the old column vocabulary file. Can be None, which represents no
      remapping on the column axis.
    new_col_vocab_file: A scalar `Tensor` of type `string` containing the path
      to the new column vocabulary file. Can be None, which represents no
      remapping on the column axis - in which case, `new_col_vocab_size` works
      under the assumption that the new col vocab is the same as the old col
      vocab.
    num_row_oov_buckets: `int` specifying the number of out-of-vocabulary rows
      to append. Must be >= 0.
    num_col_oov_buckets: `int` specifying the number of out-of-vocabulary
      columns to append. Must be >= 0.
    max_rows_in_memory: `int` specifying the maximum number of rows to load from
      the checkpoint at once. If less than or equal to 0, the entire matrix will
      be loaded into memory. Setting this arg trades increased disk reads for
      lower memory usage.

  Returns:
    A Tensor of shape `[num_rows_to_load + num_row_oov_buckets,
    new_col_vocab_size + num_col_oov_buckets]`, with values loaded from the
    specified tensor in the checkpoint, and any missing or OOV values
    initialized with the given `initializer`.

  Raises:
    ValueError: If `num_row_oov_buckets` or `num_col_oov_buckets` < 0.
    ValueError: If either `old_row_vocab_file` or `new_row_vocab_file` is
      provided, while the other is not. Same for `old_col_vocab_file` and
      `new_col_vocab_file`.
    ValueError: If neither row vocabs or col vocabs are provided.
  """
  if num_row_oov_buckets < 0:
    raise ValueError("num_row_oov_buckets must be >= 0, but received %d" %
                     num_row_oov_buckets)
  if num_col_oov_buckets < 0:
    raise ValueError("num_col_oov_buckets must be >= 0, but received %d" %
                     num_col_oov_buckets)

  # Old/new vocab files must be provided in pairs for each axis being remapped.
  if bool(old_row_vocab_file) != bool(new_row_vocab_file):
    raise ValueError(
        "old_row_vocab_file and new_row_vocab_file must both be specified or "
        "left unspecified. old_row_vocab_file='{}', new_row_vocab_file='{}'".
        format(old_row_vocab_file, new_row_vocab_file))
  if bool(old_col_vocab_file) != bool(new_col_vocab_file):
    raise ValueError(
        "old_col_vocab_file and new_col_vocab_file must both be specified or "
        "left unspecified. old_col_vocab_file='{}', new_col_vocab_file='{}'".
        format(old_col_vocab_file, new_col_vocab_file))

  remap_rows = new_row_vocab_file and old_row_vocab_file
  remap_cols = new_col_vocab_file and old_col_vocab_file
  if not (remap_rows or remap_cols):
    raise ValueError(
        "Must provide either row or column vocab files. If no remapping is "
        "necessary, consider using `tf.contrib.framework.init_from_checkpoint` "
        "instead.")

  num_rows_present = num_rows_to_load
  if remap_rows:
    row_remapping, num_rows_present = (
        gen_checkpoint_ops.generate_vocab_remapping(
            new_vocab_file=new_row_vocab_file,
            old_vocab_file=old_row_vocab_file,
            new_vocab_offset=new_row_vocab_offset,
            num_new_vocab=num_rows_to_load,
            old_vocab_size=old_row_vocab_size))
  else:
    # Even when the rows are not being reordered, we still need to generate a
    # remapping to account for initializing partitioned Variables (when
    # new_row_vocab_offset is non-zero).
    row_remapping = math_ops.range(
        new_row_vocab_offset,
        new_row_vocab_offset + num_rows_to_load,
        dtype=dtypes.int64)

  col_remapping = []
  num_cols_present = new_col_vocab_size
  if remap_cols:
    col_remapping, num_cols_present = (
        gen_checkpoint_ops.generate_vocab_remapping(
            new_vocab_file=new_col_vocab_file,
            old_vocab_file=old_col_vocab_file,
            new_vocab_offset=0,  # Offset is unused for cols (no partitioning).
            num_new_vocab=new_col_vocab_size))

  # One initializing value is needed for every cell of the output that will
  # NOT be loaded from the checkpoint: total cells minus the block of cells
  # present in both old and new vocabs.
  init_vals = initializer([
      num_rows_to_load * new_col_vocab_size -
      num_rows_present * num_cols_present, 1
  ])
  return_tensor = gen_checkpoint_ops.load_and_remap_matrix(
      ckpt_path=ckpt_path,
      old_tensor_name=old_tensor_name,
      row_remapping=row_remapping,
      col_remapping=col_remapping,
      initializing_values=init_vals,
      num_rows=num_rows_to_load,
      num_cols=new_col_vocab_size,
      max_rows_in_memory=max_rows_in_memory)

  # Add OOV row(s) and column(s).
  if num_row_oov_buckets > 0:
    init_row_oov_val = initializer([num_row_oov_buckets, new_col_vocab_size])
    init_row_oov_val = ops.convert_to_tensor(init_row_oov_val)
    return_tensor = array_ops.concat([return_tensor, init_row_oov_val], 0)
  if num_col_oov_buckets > 0:
    # We need to add any row OOV to the new column shape.
    init_col_oov_val = initializer(
        [num_rows_to_load + num_row_oov_buckets, num_col_oov_buckets])
    init_col_oov_val = ops.convert_to_tensor(init_col_oov_val)
    return_tensor = array_ops.concat([return_tensor, init_col_oov_val], 1)

  return return_tensor


def _load_and_remap_matrix_initializer(ckpt_path,
                                       old_tensor_name,
                                       new_row_vocab_size,
                                       new_col_vocab_size,
                                       old_row_vocab_size=-1,
                                       old_row_vocab_file=None,
                                       new_row_vocab_file=None,
                                       old_col_vocab_file=None,
                                       new_col_vocab_file=None,
                                       num_row_oov_buckets=0,
                                       num_col_oov_buckets=0,
                                       initializer=None,
                                       max_rows_in_memory=-1):
  r"""Returns a var initializer for loading and remapping a 2-D (matrix) tensor.

  The returned initializer loads a 2-D (matrix) `Tensor` with name
  `old_tensor_name` from the checkpoint at `ckpt_path`. It will reorder the
  rows/columns according to the specified vocab files and append additional
  out-of-vocabulary rows/columns according to the number of OOV buckets.

  The format of the file at the `{old,new}_{row,col}_vocab_file` path should be
  a text file, with each line containing a single entity within the vocabulary.
  Let the function `line_of(f, "x")` return the 0-indexed line number of the
  entity "x" in file f, and the function `entity_at(f, i)` return the entity at
  line i of file f. Then, row i of the new output matrix will be taken from row
  `line_of(old_row_vocab_file, entity_at(new_row_vocab_file, i))` of the old
  matrix. If any entity in `new_row_vocab_file` is not found in
  `old_row_vocab_file`, that row is considered a "missing" row, and its values
  will be initialized using the `initializer` arg. The same logic also applies
  for the columns.

  For example, assuming that:

  * `old_row_vocab_file` contains "mercury\nvenus\nmars"
  * `new_row_vocab_file` contains "venus\njupiter\nmercury"
  * `old_col_vocab_file` contains "good\nbetter\nbest"
  * `new_col_vocab_file` contains "good\nbest\nfantastic"
  * `initializer` returns the natural numbers `[1, 2, 3, 4, ...]`
  * `w(i, j)` represents the value from row i, column j of the old matrix

  Then the new output matrix will look like:

  `[[w(1, 0), w(1, 2), 1],
    [2,       3,       4],
    [w(0, 0), w(0, 2), 5]]`

  If we further specify that:

  * `num_row_oov_buckets` == 2
  * `num_col_oov_buckets` == 1

  Then the new output matrix will look like:

  `[[w(1, 0), w(1, 2), 1,  12],
    [2,       3,       4,  13],
    [w(0, 0), w(0, 2), 5,  14],
    [6,       7,       8,  15],
    [9,       10,      11, 16]]`

  If `{old,new}_row_vocab_file` are None, we assume that the old and new row
  vocab files are the same, and no row remapping is done. If
  `{old,new}_col_vocab_file` are None, we assume that the old and new column
  vocab files are the same, and no column remapping is done.

  The returned initializer only supports div-partitioning along the row axis. It
  does not support partitioning along the column axis (as this is not common in
  practice) or mod-partitioning.

  NOTE: When this is used to warm-start variables, client code should use
  `tf.lookup.index_table_from_tensor()` like
  contrib/layers/python/layers/feature_column.py does, as opposed to
  `tf.feature_to_id()` - in order to ensure the underlying lookup tables are the
  same.

  Args:
    ckpt_path: Path to the TensorFlow checkpoint (version 2, `TensorBundle`)
      from which the old matrix `Tensor` will be loaded.
    old_tensor_name: Name of the 2-D `Tensor` to load from checkpoint.
    new_row_vocab_size: `int` specifying the number of entries in
      `new_row_vocab_file`. If no row remapping is needed (no row vocab
      provided), this should be equal to the number of rows to load from the old
      matrix (which can theoretically be smaller than the number of rows in the
      old matrix).
    new_col_vocab_size: `int` specifying the number of entries in
      `new_col_vocab_file`. If no column remapping is needed (no column vocab
      provided), this should be equal to the number of columns in the old
      matrix.
    old_row_vocab_size: The number of entries to consider in the old vocabulary.
      With the default value of -1, the entire old row vocabulary file will be
      used.  Otherwise, only the first `old_row_vocab_size` entries will be
      considered for remapping. Must be smaller than the length of
      `old_row_vocab_file`.  NOTE: we do not provide an equivalent
      `old_col_vocab_size` for classes.
    old_row_vocab_file: A scalar `Tensor` of type `string` containing the
      path to the old row vocabulary file. Can be None, which represents no
      remapping on the row axis.
    new_row_vocab_file: A scalar `Tensor` of type `string` containing the path
      to the new row vocabulary file. Can be None, which represents no remapping
      on the row axis.
    old_col_vocab_file: A scalar `Tensor` of type `string` containing the
      path to the old column vocabulary file. Can be None, which represents no
      remapping on the column axis.
    new_col_vocab_file: A scalar `Tensor` of type `string` containing the path
      to the new column vocabulary file. Can be None, which represents no
      remapping on the column axis.
    num_row_oov_buckets: `int` specifying the number of out-of-vocabulary rows
      to append. Must be >= 0.
    num_col_oov_buckets: `int` specifying the number of out-of-vocabulary
      columns to append. Must be >= 0.
    initializer: Initializer function to initialize missing values. Accepts a
      1-D tensor as the arg to specify the shape of the returned tensor. If
      `None`, defaults to using `zeros_initializer()`.
    max_rows_in_memory: `int` specifying the maximum number of rows to load from
      the checkpoint at once. If less than or equal to 0, the entire matrix will
      be loaded into memory. Setting this arg trades increased disk reads for
      lower memory usage.

  Returns:
    A variable initializer function that should be used to initialize a
    (potentially partitioned) `Variable` whose complete shape is
    `[new_row_vocab_size + num_row_oov_buckets, new_col_vocab_size +
    num_col_oov_buckets]`.

  Raises:
    TypeError: If `initializer` is specified but not callable.
  """
  if initializer is None:
    # TODO(b/25671353): Consider using sqrt(6/(fan_in + fan_out)) instead, from
    # Glorot and Bengio, 2010.
    initializer = init_ops.zeros_initializer()

  if not callable(initializer):
    raise TypeError(
        "initializer must be callable, instead of being {} of type {}.".format(
            initializer, type(initializer)))

  def _initializer(shape, dtype=dtypes.float32, partition_info=None):
    """Variable initializer.

    Args:
      shape: Shape of `Tensor` to return. Should include OOV on both axes.
      dtype: Must be float32.
      partition_info: variable_scope._PartitionInfo.

    Returns:
      `Tensor` of shape `shape`.

    Raises:
      TypeError: If `dtype` is anything other than float32.
      ValueError: For shape mismatch upon invocation.
    """
    # Sanity checks.
    if dtype != dtypes.float32:
      raise TypeError(
          "Currently, only float32 is supported. Received dtype: {}".format(
              dtype))
    if len(shape) != 2:
      raise ValueError("Expected 2-dim shape, but received: {}".format(shape))
    if shape[0] <= 0:
      raise ValueError(
          "Expected 1st dim of shape to be > 0, but received shape: {}".format(
              shape))
    if shape[1] != (new_col_vocab_size + num_col_oov_buckets):
      raise ValueError(
          "Expected 2nd dim of shape to be new_col_vocab_size ({}) + "
          "num_col_oov_buckets ({}) = {}, but received shape: {}".format(
              new_col_vocab_size, num_col_oov_buckets,
              new_col_vocab_size + num_col_oov_buckets, shape))

    # For a partitioned variable, the row offset of this partition within the
    # full variable; 0 for an unpartitioned variable.
    offset = 0
    if partition_info is not None:
      offset = partition_info.single_offset(shape)

    if offset + shape[0] > new_row_vocab_size + num_row_oov_buckets:
      raise ValueError(
          "Trying to initialize {} additional rows after {} rows have already "
          "been initialized, which would exceed expected total row count of "
          "new_row_vocab_size ({}) + num_row_oov_buckets ({}) = {}.".format(
              shape[0], offset, new_row_vocab_size, num_row_oov_buckets,
              new_row_vocab_size + num_row_oov_buckets))

    # Rows of this partition that fall past new_row_vocab_size are OOV rows;
    # the rest are loaded (and possibly remapped) from the checkpoint.
    row_oov_buckets_to_use = min(shape[0],
                                 max(0, offset + shape[0] - new_row_vocab_size))
    num_rows_to_load = shape[0] - row_oov_buckets_to_use

    # We may be operating on an OOV-only partition, in which case we newly
    # initialize all rows of this partition.
    if offset > new_row_vocab_size:
      if shape[0] != row_oov_buckets_to_use:
        raise ValueError(
            "Partitioned variable offset is greater than new vocab size and "
            "not operating on OOV-only partition.")
      return initializer(shape)

    return _load_and_remap_matrix(
        ckpt_path=ckpt_path,
        old_tensor_name=old_tensor_name,
        new_row_vocab_offset=offset,
        num_rows_to_load=num_rows_to_load,
        new_col_vocab_size=new_col_vocab_size,
        initializer=initializer,
        old_row_vocab_size=old_row_vocab_size,
        old_row_vocab_file=old_row_vocab_file,
        new_row_vocab_file=new_row_vocab_file,
        old_col_vocab_file=old_col_vocab_file,
        new_col_vocab_file=new_col_vocab_file,
        num_row_oov_buckets=row_oov_buckets_to_use,
        num_col_oov_buckets=num_col_oov_buckets,
        max_rows_in_memory=max_rows_in_memory)

  return _initializer


def _load_embedding_initializer(ckpt_path,
                                embedding_tensor_name,
                                new_vocab_size,
                                embedding_dim,
                                old_vocab_file,
                                new_vocab_file,
                                old_vocab_size=-1,
                                num_oov_buckets=0,
                                initializer=None,
                                max_rows_in_memory=-1):
  """Returns a variable initializer for loading pre-trained embeddings.

  Wrapper around `load_and_remap_matrix_initializer()` specialized for loading
  embedding weights and remapping according to the provided vocab files. See
  docs for `load_and_remap_matrix_initializer()` for more details.

  NOTE: Only for use with div-partitioned variables / vocabularies.

  Args:
    ckpt_path: Path to the TensorFlow checkpoint (version 2, `TensorBundle`)
      from which the old matrix `Tensor` will be loaded.
    embedding_tensor_name: Name of the 2-D `Tensor` to load from checkpoint.
    new_vocab_size: Number of entries in the new vocab.
    embedding_dim: `int` specifying the dimension of the embedding vectors from
      the checkpoint. Must match the number of columns in the old embedding
      matrix.
    old_vocab_file: A scalar `Tensor` of type `string` containing the
      path to the old vocabulary file.
    new_vocab_file: A scalar `Tensor` of type `string` containing the
      path to the new vocabulary file.
    old_vocab_size: The number of entries to consider in the old vocabulary.
      With the default value of -1, the entire old row vocabulary file will be
      used.  Otherwise, only the first `old_vocab_size` entries will be
      considered for remapping. Must be smaller than the length of
      `old_vocab_file`.
    num_oov_buckets: `int` specifying the number of out-of-vocabulary
      buckets to use. Must be >= 0.
    initializer: Initializer function that accepts a 1-D tensor as the arg to
      specify the shape of the returned tensor. If `None`, defaults to using
      `truncated_normal_initializer()`.
    max_rows_in_memory: `int` specifying the maximum number of rows to load from
      the checkpoint at once. If less than or equal to 0, the entire matrix will
      be loaded into memory. Setting this arg trades increased disk reads for
      lower memory usage.

  Returns:
    A variable initializer function.
  """
  if initializer is None:
    # TODO(b/25671353): This should be kept in sync with the stddev used by
    # feature_column.py's _EmbeddingColumn.
    initializer = init_ops.truncated_normal_initializer(
        stddev=1.0 / math.sqrt(embedding_dim))

  # Embeddings are only ever remapped along the row (vocab) axis; the column
  # (embedding dimension) axis is passed through unchanged.
  return _load_and_remap_matrix_initializer(
      ckpt_path=ckpt_path,
      old_tensor_name=embedding_tensor_name,
      new_row_vocab_size=new_vocab_size,
      new_col_vocab_size=embedding_dim,
      old_row_vocab_size=old_vocab_size,
      old_row_vocab_file=old_vocab_file,
      new_row_vocab_file=new_vocab_file,
      old_col_vocab_file=None,
      new_col_vocab_file=None,
      num_row_oov_buckets=num_oov_buckets,
      num_col_oov_buckets=0,
      initializer=initializer,
      max_rows_in_memory=max_rows_in_memory)