# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Operations for generating and loading vocab remappings."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import math

from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import gen_checkpoint_ops
from tensorflow.python.ops import init_ops
from tensorflow.python.ops import math_ops

# The remapping ops are pure lookup/loading utilities; register them as
# non-differentiable so gradient construction never flows through them.
ops.NotDifferentiable("GenerateVocabRemapping")
ops.NotDifferentiable("LoadAndRemapMatrix")

def _load_and_remap_matrix(ckpt_path,
                           old_tensor_name,
                           new_row_vocab_offset,
                           num_rows_to_load,
                           new_col_vocab_size,
                           initializer,
                           old_row_vocab_size=-1,
                           old_row_vocab_file=None,
                           new_row_vocab_file=None,
                           old_col_vocab_file=None,
                           new_col_vocab_file=None,
                           num_row_oov_buckets=0,
                           num_col_oov_buckets=0,
                           max_rows_in_memory=-1):
  """Loads a 2-D (matrix) `Tensor` from checkpoint, remapping rows/columns.

  Builds 1-D row/column remappings with the `GenerateVocabRemapping` op,
  creates initial values for every cell not present in the checkpoint via
  `initializer`, and materializes the combined matrix with the
  `LoadAndRemapMatrix` op. See contrib/framework/ops/checkpoint_ops.cc for
  more information on the wrapped functionality (LoadAndRemapMatrix).
  Row-only or column-only remapping is supported: leave
  `{old,new}_col_vocab_file` as `None` for row-only remapping, and vice versa
  for column-only remapping.

  NOTE: Only div-partitioning of the vocabulary on the 1st dimension (row
  axis) via `new_row_vocab_offset` is supported.

  Args:
    ckpt_path: Path to the TensorFlow checkpoint (version 2, `TensorBundle`)
      from which the old matrix `Tensor` will be loaded.
    old_tensor_name: Name of the 2-D `Tensor` to load from checkpoint.
    new_row_vocab_offset: 0-indexed line at which to start reading the new row
      vocabulary. Used for partitioned variables.
    num_rows_to_load: Number of rows to load for the new vocabulary. To
      support variable partitioning and partial loading, this may differ from
      the number of entries in `new_row_vocab_file`.
    new_col_vocab_size: Number of columns to load; should equal the number of
      entries in `new_col_vocab_file`, since partitioning along the column
      axis is unsupported.
    initializer: Callable that accepts a 1-D shape tensor and returns a tensor
      of that shape; used to initialize missing values.
    old_row_vocab_size: Number of entries of the old row vocabulary to
      consider. With the default value of -1, the entire old row vocabulary
      file is used; otherwise only the first `old_row_vocab_size` entries are
      considered for remapping. Must be smaller than the length of
      `old_row_vocab_file`. NOTE: no equivalent `old_col_vocab_size` is
      provided for classes.
    old_row_vocab_file: Scalar string `Tensor` with the path to the old row
      vocabulary file. Can be None, meaning no remapping on the row axis.
    new_row_vocab_file: Scalar string `Tensor` with the path to the new row
      vocabulary file. Can be None, meaning no remapping on the row axis - in
      which case `new_row_vocab_offset` and `num_rows_to_load` work under the
      assumption that the new row vocab is the same as the old row vocab.
    old_col_vocab_file: Scalar string `Tensor` with the path to the old column
      vocabulary file. Can be None, meaning no remapping on the column axis.
    new_col_vocab_file: Scalar string `Tensor` with the path to the new column
      vocabulary file. Can be None, meaning no remapping on the column axis -
      in which case `new_col_vocab_size` works under the assumption that the
      new col vocab is the same as the old col vocab.
    num_row_oov_buckets: `int` number of out-of-vocabulary rows to append.
      Must be >= 0.
    num_col_oov_buckets: `int` number of out-of-vocabulary columns to append.
      Must be >= 0.
    max_rows_in_memory: `int` maximum number of checkpoint rows to load at
      once. If <= 0, the entire matrix is loaded into memory. Setting this
      trades increased disk reads for lower memory usage.

  Returns:
    A `Tensor` of shape `[num_rows_to_load + num_row_oov_buckets,
    new_col_vocab_size + num_col_oov_buckets]`, with values loaded from the
    specified checkpoint tensor and any missing or OOV values initialized
    with the given `initializer`.

  Raises:
    ValueError: If `num_row_oov_buckets` or `num_col_oov_buckets` < 0.
    ValueError: If either `old_row_vocab_file` or `new_row_vocab_file` is
      provided while the other is not; same for `old_col_vocab_file` and
      `new_col_vocab_file`.
    ValueError: If neither row vocabs nor col vocabs are provided.
  """
  # Validate the OOV bucket counts up front.
  for bucket_arg_name, bucket_count in (
      ("num_row_oov_buckets", num_row_oov_buckets),
      ("num_col_oov_buckets", num_col_oov_buckets)):
    if bucket_count < 0:
      raise ValueError("%s must be >= 0, but received %d" %
                       (bucket_arg_name, bucket_count))

  # Vocab files must be supplied in matched old/new pairs, per axis.
  for axis, old_vocab, new_vocab in (
      ("row", old_row_vocab_file, new_row_vocab_file),
      ("col", old_col_vocab_file, new_col_vocab_file)):
    if bool(old_vocab) != bool(new_vocab):
      raise ValueError(
          "old_{0}_vocab_file and new_{0}_vocab_file must both be specified "
          "or left unspecified. old_{0}_vocab_file='{1}', "
          "new_{0}_vocab_file='{2}'".format(axis, old_vocab, new_vocab))

  row_remap_needed = bool(new_row_vocab_file and old_row_vocab_file)
  col_remap_needed = bool(new_col_vocab_file and old_col_vocab_file)
  if not (row_remap_needed or col_remap_needed):
    raise ValueError(
        "Must provide either row or column vocab files. If no remapping is "
        "necessary, consider using `tf.contrib.framework.init_from_checkpoint` "
        "instead.")

  # Row remapping: either derived from the old/new vocab files, or a shifted
  # identity range. The range is still needed in the no-remap case so that
  # partitioned Variables (non-zero new_row_vocab_offset) load correctly.
  if row_remap_needed:
    row_remapping, num_rows_present = (
        gen_checkpoint_ops.generate_vocab_remapping(
            new_vocab_file=new_row_vocab_file,
            old_vocab_file=old_row_vocab_file,
            new_vocab_offset=new_row_vocab_offset,
            num_new_vocab=num_rows_to_load,
            old_vocab_size=old_row_vocab_size))
  else:
    row_remapping = math_ops.range(
        new_row_vocab_offset,
        new_row_vocab_offset + num_rows_to_load,
        dtype=dtypes.int64)
    num_rows_present = num_rows_to_load

  # Column remapping: an empty list tells LoadAndRemapMatrix "no column
  # remapping".
  if col_remap_needed:
    col_remapping, num_cols_present = (
        gen_checkpoint_ops.generate_vocab_remapping(
            new_vocab_file=new_col_vocab_file,
            old_vocab_file=old_col_vocab_file,
            new_vocab_offset=0,  # Offset is unused for cols (no partitioning).
            num_new_vocab=new_col_vocab_size))
  else:
    col_remapping = []
    num_cols_present = new_col_vocab_size

  # One initializing value is needed for every cell of the output that has no
  # counterpart in the checkpoint.
  num_missing_values = (num_rows_to_load * new_col_vocab_size -
                        num_rows_present * num_cols_present)
  init_vals = initializer([num_missing_values, 1])
  loaded_matrix = gen_checkpoint_ops.load_and_remap_matrix(
      ckpt_path=ckpt_path,
      old_tensor_name=old_tensor_name,
      row_remapping=row_remapping,
      col_remapping=col_remapping,
      initializing_values=init_vals,
      num_rows=num_rows_to_load,
      num_cols=new_col_vocab_size,
      max_rows_in_memory=max_rows_in_memory)

  # Append OOV rows first (axis 0), then OOV columns (axis 1). The column
  # block must also span the freshly appended OOV rows, hence its height of
  # num_rows_to_load + num_row_oov_buckets.
  if num_row_oov_buckets > 0:
    row_oov_block = ops.convert_to_tensor(
        initializer([num_row_oov_buckets, new_col_vocab_size]))
    loaded_matrix = array_ops.concat([loaded_matrix, row_oov_block], 0)
  if num_col_oov_buckets > 0:
    col_oov_block = ops.convert_to_tensor(
        initializer([num_rows_to_load + num_row_oov_buckets,
                     num_col_oov_buckets]))
    loaded_matrix = array_ops.concat([loaded_matrix, col_oov_block], 1)

  return loaded_matrix


def _load_and_remap_matrix_initializer(ckpt_path,
                                       old_tensor_name,
                                       new_row_vocab_size,
                                       new_col_vocab_size,
                                       old_row_vocab_size=-1,
                                       old_row_vocab_file=None,
                                       new_row_vocab_file=None,
                                       old_col_vocab_file=None,
                                       new_col_vocab_file=None,
                                       num_row_oov_buckets=0,
                                       num_col_oov_buckets=0,
                                       initializer=None,
                                       max_rows_in_memory=-1):
  r"""Returns a var initializer for loading and remapping a 2-D (matrix) tensor.

  The returned initializer loads a 2-D (matrix) `Tensor` named
  `old_tensor_name` from the checkpoint at `ckpt_path`, reorders its
  rows/columns according to the specified vocab files, and appends additional
  out-of-vocabulary rows/columns per the given OOV bucket counts.

  Each `{old,new}_{row,col}_vocab_file` is a text file with one vocabulary
  entity per line. Let `line_of(f, "x")` denote the 0-indexed line number of
  entity "x" in file f, and `entity_at(f, i)` the entity at line i of file f.
  Row i of the new output matrix is then taken from row
  `line_of(old_row_vocab_file, entity_at(new_row_vocab_file, i))` of the old
  matrix. Any entity of `new_row_vocab_file` absent from `old_row_vocab_file`
  is a "missing" row whose values come from the `initializer` arg. The same
  logic applies to columns.

  For example, assuming that:

  * `old_row_vocab_file` contains "mercury\nvenus\nmars"
  * `new_row_vocab_file` contains "venus\njupiter\nmercury"
  * `old_col_vocab_file` contains "good\nbetter\nbest"
  * `new_col_vocab_file` contains "good\nbest\nfantastic"
  * `initializer` returns the natural numbers `[1, 2, 3, 4, ...]`
  * `w(i, j)` represents the value from row i, column j of the old matrix

  Then the new output matrix will look like:

  `[[w(1, 0), w(1, 2), 1],
    [2,       3,       4],
    [w(0, 0), w(0, 2), 5]]`

  If we further specify that:

  * `num_row_oov_buckets` == 2
  * `num_col_oov_buckets` == 1

  Then the new output matrix will look like:

  `[[w(1, 0), w(1, 2), 1,  12],
    [2,       3,       4,  13],
    [w(0, 0), w(0, 2), 5,  14],
    [6,       7,       8,  15],
    [9,       10,      11, 16]]`

  If `{old,new}_row_vocab_file` are None, the old and new row vocabs are
  assumed identical and no row remapping is done; likewise for
  `{old,new}_col_vocab_file` and column remapping.

  The returned initializer only supports div-partitioning along the row axis.
  It supports neither partitioning along the column axis (uncommon in
  practice) nor mod-partitioning.

  NOTE: When this is used to warm-start variables, client code should use
  `tf.lookup.index_table_from_tensor()` like
  contrib/layers/python/layers/feature_column.py does, as opposed to
  `tf.feature_to_id()` - in order to ensure the underlying lookup tables are
  the same.

  Args:
    ckpt_path: Path to the TensorFlow checkpoint (version 2, `TensorBundle`)
      from which the old matrix `Tensor` will be loaded.
    old_tensor_name: Name of the 2-D `Tensor` to load from checkpoint.
    new_row_vocab_size: `int` number of entries in `new_row_vocab_file`. If no
      row remapping is needed (no row vocab provided), this should equal the
      number of rows to load from the old matrix (which can theoretically be
      smaller than the number of rows in the old matrix).
    new_col_vocab_size: `int` number of entries in `new_col_vocab_file`. If no
      column remapping is needed (no column vocab provided), this should equal
      the number of columns in the old matrix.
    old_row_vocab_size: Number of entries of the old row vocabulary to
      consider. With the default value of -1, the entire old row vocabulary
      file is used; otherwise only the first `old_row_vocab_size` entries are
      considered for remapping. Must be smaller than the length of
      `old_row_vocab_file`. NOTE: no equivalent `old_col_vocab_size` is
      provided for classes.
    old_row_vocab_file: Scalar string `Tensor` with the path to the old row
      vocabulary file. Can be None, meaning no remapping on the row axis.
    new_row_vocab_file: Scalar string `Tensor` with the path to the new row
      vocabulary file. Can be None, meaning no remapping on the row axis.
    old_col_vocab_file: Scalar string `Tensor` with the path to the old column
      vocabulary file. Can be None, meaning no remapping on the column axis.
    new_col_vocab_file: Scalar string `Tensor` with the path to the new column
      vocabulary file. Can be None, meaning no remapping on the column axis.
    num_row_oov_buckets: `int` number of out-of-vocabulary rows to append.
      Must be >= 0.
    num_col_oov_buckets: `int` number of out-of-vocabulary columns to append.
      Must be >= 0.
    initializer: Callable that accepts a 1-D shape tensor and returns a tensor
      of that shape; used to initialize missing values. If `None`, defaults to
      `zeros_initializer()`.
    max_rows_in_memory: `int` maximum number of checkpoint rows to load at
      once. If <= 0, the entire matrix is loaded into memory. Setting this
      trades increased disk reads for lower memory usage.

  Returns:
    A variable initializer function that should be used to initialize a
    (potentially partitioned) `Variable` whose complete shape is
    `[new_row_vocab_size + num_row_oov_buckets, new_col_vocab_size +
    num_col_oov_buckets]`.

  Raises:
    TypeError: If `initializer` is specified but not callable.
  """
  if initializer is None:
    # TODO(b/25671353): Consider using sqrt(6/(fan_in + fan_out)) instead, from
    # Glorot and Bengio, 2010.
    initializer = init_ops.zeros_initializer()
  elif not callable(initializer):
    raise TypeError(
        "initializer must be callable, instead of being {} of type {}.".format(
            initializer, type(initializer)))

  def _initializer(shape, dtype=dtypes.float32, partition_info=None):
    """Variable initializer (closure over the outer args).

    Args:
      shape: Shape of `Tensor` to return. Should include OOV on both axes.
      dtype: Must be float32.
      partition_info: variable_scope._PartitionInfo.

    Returns:
      `Tensor` of shape `shape`.

    Raises:
      TypeError: If `dtype` is anything other than float32.
      ValueError: For shape mismatch upon invocation.
    """
    # Sanity checks on dtype and the requested shape.
    if dtype != dtypes.float32:
      raise TypeError(
          "Currently, only float32 is supported. Received dtype: {}".format(
              dtype))
    if len(shape) != 2:
      raise ValueError("Expected 2-dim shape, but received: {}".format(shape))
    if shape[0] <= 0:
      raise ValueError(
          "Expected 1st dim of shape to be > 0, but received shape: {}".format(
              shape))
    expected_num_cols = new_col_vocab_size + num_col_oov_buckets
    if shape[1] != expected_num_cols:
      raise ValueError(
          "Expected 2nd dim of shape to be new_col_vocab_size ({}) + "
          "num_col_oov_buckets ({}) = {}, but received shape: {}".format(
              new_col_vocab_size, num_col_oov_buckets, expected_num_cols,
              shape))

    # Row offset of this partition within the full variable (0 when the
    # variable is unpartitioned).
    row_offset = (partition_info.single_offset(shape)
                  if partition_info is not None else 0)

    total_rows = new_row_vocab_size + num_row_oov_buckets
    if row_offset + shape[0] > total_rows:
      raise ValueError(
          "Trying to initialize {} additional rows after {} rows have already "
          "been initialized, which would exceed expected total row count of "
          "new_row_vocab_size ({}) + num_row_oov_buckets ({}) = {}.".format(
              shape[0], row_offset, new_row_vocab_size, num_row_oov_buckets,
              total_rows))

    # Split this partition's rows into those past the end of the real vocab
    # (OOV buckets) and those that come from the checkpoint.
    oov_rows_in_partition = min(
        shape[0], max(0, row_offset + shape[0] - new_row_vocab_size))
    rows_from_checkpoint = shape[0] - oov_rows_in_partition

    # An OOV-only partition starts entirely past the new vocab; every row of
    # it is newly initialized and nothing is loaded from the checkpoint.
    if row_offset > new_row_vocab_size:
      if shape[0] != oov_rows_in_partition:
        raise ValueError(
            "Partitioned variable offset is greater than new vocab size and "
            "not operating on OOV-only partition.")
      return initializer(shape)

    return _load_and_remap_matrix(
        ckpt_path=ckpt_path,
        old_tensor_name=old_tensor_name,
        new_row_vocab_offset=row_offset,
        num_rows_to_load=rows_from_checkpoint,
        new_col_vocab_size=new_col_vocab_size,
        initializer=initializer,
        old_row_vocab_size=old_row_vocab_size,
        old_row_vocab_file=old_row_vocab_file,
        new_row_vocab_file=new_row_vocab_file,
        old_col_vocab_file=old_col_vocab_file,
        new_col_vocab_file=new_col_vocab_file,
        num_row_oov_buckets=oov_rows_in_partition,
        num_col_oov_buckets=num_col_oov_buckets,
        max_rows_in_memory=max_rows_in_memory)

  return _initializer


def _load_embedding_initializer(ckpt_path,
                                embedding_tensor_name,
                                new_vocab_size,
                                embedding_dim,
                                old_vocab_file,
                                new_vocab_file,
                                old_vocab_size=-1,
                                num_oov_buckets=0,
                                initializer=None,
                                max_rows_in_memory=-1):
  """Returns a variable initializer for loading pre-trained embeddings.

  Thin wrapper around `load_and_remap_matrix_initializer()` specialized for
  embedding weights: rows are remapped between the old and new vocab files,
  while columns (the embedding dimensions) are never remapped. See docs for
  `load_and_remap_matrix_initializer()` for more details.

  NOTE: Only for use with div-partitioned variables / vocabularies.

  Args:
    ckpt_path: Path to the TensorFlow checkpoint (version 2, `TensorBundle`)
      from which the old matrix `Tensor` will be loaded.
    embedding_tensor_name: Name of the 2-D `Tensor` to load from checkpoint.
    new_vocab_size: Number of entries in the new vocab.
    embedding_dim: `int` dimension of the embedding vectors in the checkpoint.
      Must match the number of columns in the old embedding matrix.
    old_vocab_file: Scalar string `Tensor` with the path to the old vocabulary
      file.
    new_vocab_file: Scalar string `Tensor` with the path to the new vocabulary
      file.
    old_vocab_size: Number of entries of the old vocabulary to consider. With
      the default value of -1, the entire old vocabulary file is used;
      otherwise only the first `old_vocab_size` entries are considered for
      remapping. Must be smaller than the length of `old_vocab_file`.
    num_oov_buckets: `int` number of out-of-vocabulary buckets to use. Must
      be >= 0.
    initializer: Callable that accepts a 1-D shape tensor and returns a tensor
      of that shape; used for missing values. If `None`, defaults to
      `truncated_normal_initializer()`.
    max_rows_in_memory: `int` maximum number of checkpoint rows to load at
      once. If <= 0, the entire matrix is loaded into memory. Setting this
      trades increased disk reads for lower memory usage.

  Returns:
    A variable initializer function.
  """
  if initializer is None:
    # TODO(b/25671353): This should be kept in sync with the stddev used by
    # feature_column.py's _EmbeddingColumn.
    stddev = 1.0 / math.sqrt(embedding_dim)
    initializer = init_ops.truncated_normal_initializer(stddev=stddev)

  # Embeddings never remap columns, so both col vocab files are None and no
  # column OOV buckets are requested.
  return _load_and_remap_matrix_initializer(
      ckpt_path=ckpt_path,
      old_tensor_name=embedding_tensor_name,
      new_row_vocab_size=new_vocab_size,
      new_col_vocab_size=embedding_dim,
      old_row_vocab_size=old_vocab_size,
      old_row_vocab_file=old_vocab_file,
      new_row_vocab_file=new_vocab_file,
      old_col_vocab_file=None,
      new_col_vocab_file=None,
      num_row_oov_buckets=num_oov_buckets,
      num_col_oov_buckets=0,
      initializer=initializer,
      max_rows_in_memory=max_rows_in_memory)