# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Ops for hybrid model training."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import threading

from tensorflow.contrib.tensor_forest.hybrid.ops import gen_training_ops
from tensorflow.contrib.util import loader
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.platform import resource_loader
from tensorflow.python.platform import tf_logging as logging

TRAINING_OPS_FILE = '_training_ops.so'

_training_ops = None
_ops_lock = threading.Lock()

# TODO(b/31222613): Some of these ops are probably differentiable, and
# there may be latent bugs here.
ops.NotDifferentiable('HardRoutingFunction')
ops.NotDifferentiable('RoutingGradient')
ops.NotDifferentiable('KFeatureDataGradient')
ops.NotDifferentiable('KFeatureRoutingGradient')
ops.NotDifferentiable('KFeatureWeightGradient')
ops.NotDifferentiable('UnpackPath')
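# (Note: ops.NotDifferentiable registers these op types as having no gradient,
# so gradient construction treats them as constants and propagates None
# through them instead of raising a LookupError for a missing registration.)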


@ops.RegisterGradient('RoutingFunction')
def _RoutingFunctionGradient(op, grad):
  """The gradient of RoutingFunction.

  Args:
    op: The RoutingFunction op.
    grad: Gradient with respect to the output of the RoutingFunction op.

  Returns:
    Gradients with respect to the inputs of the RoutingFunction op.
  """
  routing_gradient = gen_training_ops.routing_gradient

  input_data_tensor = op.inputs[0]
  tree_weights_tensor = op.inputs[1]
  tree_thresholds_tensor = op.inputs[2]

  routing_function_tensor = op.outputs[0]

  # The derivatives below are each defined over one or two of three dimensions:
  # (batch_size, num_nodes, num_features).  We explicitly expand each derivative
  # to three dimensions to ensure that they're broadcasted correctly.

  # dl / du is the derivative of the loss with respect to the output of the
  # routing function, which is provided by tensorflow.
  #
  # dl / du has dimension (batch_size, num_nodes), which we expand to
  # (batch_size, num_nodes, 1).
  dl_du = array_ops.expand_dims(grad, 2)

  # du / df is the derivative of the output of the routing function with
  # respect to the decision function at each node.  It is computed by
  # routing_gradient_op.cc.
  #
  # du / df has dimension (batch_size, num_nodes), which we expand to
  # (batch_size, num_nodes, 1).
  du_df = array_ops.expand_dims(
      routing_gradient(
          input_data_tensor,
          tree_weights_tensor,
          tree_thresholds_tensor,
          routing_function_tensor,
          max_nodes=op.get_attr('max_nodes')),
      2)

  # df / dx is the derivative of the decision function with respect to the
  # input data.  f_i(x) = (-t_i * x + b_i), so df_i / dx = -t_i.
  #
  # df / dx has dimension (num_nodes, num_features), which we expand to
  # (1, num_nodes, num_features).
  df_dx = -array_ops.expand_dims(tree_weights_tensor, 0)

  # df / dt is the derivative of the decision function with respect to its
  # parameters. f_i(x) = (-t_i * x + b_i), so df_i / d t_i = -x.
  #
  # df / dt has dimension (batch_size, num_features), which we expand to
  # (batch_size, 1, num_features).
  df_dt = -array_ops.expand_dims(input_data_tensor, 1)

  # df / db is the derivative of the decision function with respect to its
  # bias parameter. f_i(x) = (-t_i * x + b_i), so df_i / d b_i = 1.
  #
  # df / db has dimension (num_nodes), which we expand to
  # (1, num_nodes, 1).
  df_db = array_ops.expand_dims(
      array_ops.expand_dims(array_ops.ones_like(tree_thresholds_tensor), 0), 2)

  # Compute the derivatives of the loss with respect to the inputs using the
  # chain rule (backpropagation).
  dl_dx = math_ops.reduce_mean(dl_du * du_df * df_dx, 1)
  dl_dt = math_ops.reduce_mean(dl_du * du_df * df_dt, 0)
  dl_db = math_ops.reduce_mean(array_ops.squeeze(dl_du * du_df * df_db, [2]), 0)
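
  # Shape walk-through (illustrative; B = batch_size, N = num_nodes,
  # F = num_features):
  #   dl_dx: (B, N, 1) * (B, N, 1) * (1, N, F) -> (B, N, F); the mean over
  #     axis 1 gives (B, F), matching input_data.
  #   dl_dt: (B, N, 1) * (B, N, 1) * (B, 1, F) -> (B, N, F); the mean over
  #     axis 0 gives (N, F), matching tree_weights.
  #   dl_db: (B, N, 1) * (B, N, 1) * (1, N, 1) -> (B, N, 1); squeezing axis 2
  #     and taking the mean over axis 0 gives (N,), matching tree_thresholds.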

  input_gradients = [dl_dx, dl_dt, dl_db]

  return input_gradients


@ops.RegisterGradient('StochasticHardRoutingFunction')
def _StochasticHardRoutingFunctionGradient(op, routing_grad, unused_path_grad):
  """The gradient of StochasticHardRoutingFunction.

  Args:
    op: The StochasticHardRoutingFunction op.
    routing_grad: Gradient with respect to the path probability output of
      the op.
    unused_path_grad: Gradient with respect to the path output of the op;
      unused, since the sampled path is not differentiable.

  Returns:
    Gradients with respect to the inputs of the StochasticHardRoutingFunction
    op.
  """
  gradient_op = gen_training_ops.stochastic_hard_routing_gradient
  unpack_path_op = gen_training_ops.unpack_path

  input_data_tensor = op.inputs[0]
  tree_weights_tensor = op.inputs[1]
  tree_thresholds_tensor = op.inputs[2]

  path_probability_tensor = op.outputs[0]
  path_tensor = op.outputs[1]

  # The derivatives below are each defined over one or two of three dimensions:
  # (batch_size, num_nodes, num_features).  We explicitly expand each derivative
  # to three dimensions to ensure that they're broadcasted correctly.
  du_df_raw, df_dx_raw, df_dt_raw, df_db_raw = gradient_op(
      input_data_tensor,
      tree_weights_tensor,
      tree_thresholds_tensor,
      path_probability_tensor,
      path_tensor,
      tree_depth=op.get_attr('tree_depth'))

  # dl / du is the derivative of the loss with respect to the output of the
  # routing function, which is provided by tensorflow.
  #
  # dl / du has dimension (batch_size, num_nodes), which we expand to
  # (batch_size, num_nodes, 1).
  dl_du = array_ops.expand_dims(unpack_path_op(path_tensor, routing_grad), 2)
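  # (unpack_path scatters the per-depth gradient values in routing_grad along
  # each sample's path into a dense (batch_size, num_nodes) tensor, with zeros
  # at nodes off the path; this reading is inferred from the op's use here.)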

  # du / df is the derivative of the output of the routing function with respect
  # to the decision function at each node.  It is computed by
  # stochastic_hard_routing_gradient_op.cc.
  #
  # du / df has dimension (batch_size, num_nodes), which we expand to
  # (batch_size, num_nodes, 1).
  du_df = array_ops.expand_dims(du_df_raw, 2)

  # df / dx is the derivative of the decision function with respect to the input
  # data.  f(x) = (-t * x + b), so df / dx = -t for the selected features and
  # zero elsewhere.
  #
  # df / dx has dimension (num_nodes, num_features), which we expand to
  # (1, num_nodes, num_features).
  df_dx = array_ops.expand_dims(df_dx_raw, 0)

  # df / dt is the derivative of the decision function with respect to its
  # parameters. f(x) = (-t * x + b), so df / dt = -x[feature].
  #
  # df / dt has dimension (batch_size, num_nodes, num_features).
  df_dt = -df_dt_raw

  # df / db is the derivative of the decision function with respect to its
  # bias parameter. f(x) = (-t * x + b), so df / db = 1.
  #
  # df / db has dimension (num_nodes), which we expand to
  # (1, num_nodes, 1).
  df_db = array_ops.expand_dims(array_ops.expand_dims(df_db_raw, 0), 2)

  # Compute the derivatives of the loss with respect to the inputs using the
  # chain rule (backpropagation).
  dl_dx = math_ops.reduce_mean(dl_du * du_df * df_dx, 1)
  dl_dt = math_ops.reduce_mean(dl_du * du_df * df_dt, 0)
  dl_db = math_ops.reduce_mean(array_ops.squeeze(dl_du * du_df * df_db, [2]), 0)
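  # (These reductions follow the same shape pattern as in
  # _RoutingFunctionGradient; see the walk-through there.)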

  input_gradients = [dl_dx, dl_dt, dl_db]

  return input_gradients


@ops.RegisterGradient('KFeatureRoutingFunction')
def _KFeatureRoutingFunctionGradient(op, grad):
  """The gradient of KFeatureRoutingFunction.

  Args:
    op: The KFeatureRoutingFunction op.
    grad: Gradient with respect to the output of the KFeatureRoutingFunction
      op.

  Returns:
    Gradients with respect to the inputs of the KFeatureRoutingFunction op.
  """
  gradient_op = gen_training_ops.k_feature_gradient

  input_data_tensor = op.inputs[0]
  tree_weights_tensor = op.inputs[1]
  tree_thresholds_tensor = op.inputs[2]

  routing_function_tensor = op.outputs[0]

  # The derivatives below are each defined over one or two of three dimensions:
  # (batch_size, num_nodes, num_features).  We explicitly expand each derivative
  # to three dimensions to ensure that they're broadcasted correctly.
  du_df_raw, df_dx_raw, df_dt_raw = gradient_op(
      input_data_tensor,
      tree_weights_tensor,
      tree_thresholds_tensor,
      routing_function_tensor,
      layer_num=op.get_attr('layer_num'),
      random_seed=op.get_attr('random_seed'))

  # dl / du is the derivative of the loss with respect to the output of the
  # routing function, which is provided by tensorflow.
  #
  # dl / du has dimension (batch_size, num_nodes), which we expand to
  # (batch_size, num_nodes, 1).
  dl_du = array_ops.expand_dims(grad, 2)

  # du / df is the derivative of the output of the routing function with respect
  # to the decision function at each node.  It is computed by
  # k_feature_gradient_op.cc.
  #
  # du / df has dimension (batch_size, num_nodes), which we expand to
  # (batch_size, num_nodes, 1).
  du_df = array_ops.expand_dims(du_df_raw, 2)

  # df / dx is the derivative of the decision function with respect to the input
  # data.  f(x) = (-t * x + b), so df / dx = -t for the selected features and
  # zero elsewhere.
  #
  # df / dx has dimension (num_nodes, num_features), which we expand to
  # (1, num_nodes, num_features).
  df_dx = array_ops.expand_dims(df_dx_raw, 0)

  # df / dt is the derivative of the decision function with respect to its
  # parameters. f(x) = (-t * x + b), so df / dt = -x[feature].
  #
  # df / dt has dimension (batch_size, num_nodes, num_features).
  df_dt = -df_dt_raw

  # df / db is the derivative of the decision function with respect to its
  # bias parameter. f(x) = (-t * x + b), so df / db = 1.
  #
  # df / db has dimension (num_nodes), which we expand to
  # (1, num_nodes, 1).
  df_db = array_ops.expand_dims(
      array_ops.expand_dims(array_ops.ones_like(tree_thresholds_tensor), 0), 2)

  # Compute the derivatives of the loss with respect to the inputs using the
  # chain rule (backpropagation).
  dl_dx = math_ops.reduce_mean(dl_du * du_df * df_dx, 1)
  dl_dt = math_ops.reduce_mean(dl_du * du_df * df_dt, 0)
  dl_db = math_ops.reduce_mean(array_ops.squeeze(dl_du * du_df * df_db, [2]), 0)

  input_gradients = [dl_dx, dl_dt, dl_db]

  return input_gradients


# Workaround for the fact that importing tensorflow imports contrib
# (even if a user isn't using this or any other contrib op), but
# there's not yet any guarantee that the shared object exists.
# Without the lazy loading below, "import tensorflow" would always
# crash, even for users that never use contrib.
def Load():
285  """Load training ops library and return the loaded module."""
286  with _ops_lock:
287    global _training_ops
288    if not _training_ops:
289      ops_path = resource_loader.get_path_to_datafile(TRAINING_OPS_FILE)
290      logging.info('data path: %s', ops_path)
291      _training_ops = loader.load_op_library(ops_path)
292
293      assert _training_ops, 'Could not load _training_ops.so'
294  return _training_ops
295
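

# Usage sketch (illustrative; the module path below reflects this file's
# assumed location under tensorflow.contrib, and everything not defined
# above is an assumption):
#
#   from tensorflow.contrib.tensor_forest.hybrid.python.ops import training_ops
#
#   training_ops.Load()  # Thread-safe and idempotent: loads _training_ops.so
#                        # once and returns the cached module thereafter.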