# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Ops for hybrid model training."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import threading

from tensorflow.contrib.tensor_forest.hybrid.ops import gen_training_ops
from tensorflow.contrib.util import loader
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.platform import resource_loader
from tensorflow.python.platform import tf_logging as logging

TRAINING_OPS_FILE = '_training_ops.so'

_training_ops = None
_ops_lock = threading.Lock()

# TODO(b/31222613): Some of these ops are probably differentiable, and
# there may be latent bugs here.
ops.NotDifferentiable('HardRoutingFunction')
ops.NotDifferentiable('RoutingGradient')
ops.NotDifferentiable('KFeatureDataGradient')
ops.NotDifferentiable('KFeatureRoutingGradient')
ops.NotDifferentiable('KFeatureWeightGradient')
ops.NotDifferentiable('UnpackPath')


@ops.RegisterGradient('RoutingFunction')
def _RoutingFunctionGradient(op, grad):
  """The gradient of RoutingFunction.

  Args:
    op: The RoutingFunction op.
    grad: Gradient with respect to the output of the RoutingFunction op.

  Returns:
    Gradients with respect to the inputs of the RoutingFunction op.
  """
  routing_gradient = gen_training_ops.routing_gradient

  input_data_tensor = op.inputs[0]
  tree_weights_tensor = op.inputs[1]
  tree_thresholds_tensor = op.inputs[2]

  routing_function_tensor = op.outputs[0]

  # The derivatives below are each defined over one or two of three dimensions:
  # (batch_size, num_nodes, num_features). We explicitly expand each derivative
  # to three dimensions to ensure that they're broadcast correctly.

  # dl / du is the derivative of the loss with respect to the output of the
  # routing function, which is provided by TensorFlow.
  #
  # dl / du has dimension (batch_size, num_nodes), which we expand to
  # (batch_size, num_nodes, 1).
  dl_du = array_ops.expand_dims(grad, 2)

  # du / df is the derivative of the output of the routing function with
  # respect to the decision function at each node. It is computed by
  # routing_gradient_op.cc.
  #
  # du / df has dimension (batch_size, num_nodes), which we expand to
  # (batch_size, num_nodes, 1).
  du_df = array_ops.expand_dims(
      routing_gradient(
          input_data_tensor,
          tree_weights_tensor,
          tree_thresholds_tensor,
          routing_function_tensor,
          max_nodes=op.get_attr('max_nodes')),
      2)

  # df / dx is the derivative of the decision function with respect to the
  # input data. f_i(x) = (-t_i * x + b_i), so df_i / dx = -t_i.
  #
  # df / dx has dimension (num_nodes, num_features), which we expand to
  # (1, num_nodes, num_features).
  df_dx = -array_ops.expand_dims(tree_weights_tensor, 0)

  # df / dt is the derivative of the decision function with respect to its
  # weight parameters. f_i(x) = (-t_i * x + b_i), so df_i / dt_i = -x.
  #
  # df / dt has dimension (batch_size, num_features), which we expand to
  # (batch_size, 1, num_features).
  df_dt = -array_ops.expand_dims(input_data_tensor, 1)

  # df / db is the derivative of the decision function with respect to its
  # bias parameter. f_i(x) = (-t_i * x + b_i), so df_i / db_i = 1.
  #
  # df / db has dimension (num_nodes), which we expand to
  # (1, num_nodes, 1).
  df_db = array_ops.expand_dims(
      array_ops.expand_dims(array_ops.ones_like(tree_thresholds_tensor), 0), 2)

  # Compute the derivatives of the loss with respect to the inputs using the
  # chain rule (backpropagation).
  dl_dx = math_ops.reduce_mean(dl_du * du_df * df_dx, 1)
  dl_dt = math_ops.reduce_mean(dl_du * du_df * df_dt, 0)
  dl_db = math_ops.reduce_mean(array_ops.squeeze(dl_du * du_df * df_db, [2]), 0)

  input_gradients = [dl_dx, dl_dt, dl_db]

  return input_gradients
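# A minimal sketch of how the gradient registered above gets picked up,
# assuming the forward op is exposed as gen_training_ops.routing_function
# with the same three inputs and max_nodes attr (the tensor names here are
# illustrative, not part of this module):
#
#   probs = gen_training_ops.routing_function(
#       input_data, tree_weights, tree_thresholds, max_nodes=max_nodes)
#   loss = math_ops.reduce_sum(probs)
#   # tf.gradients finds _RoutingFunctionGradient through the registry and
#   # returns [dl_dx, dl_dt, dl_db] as computed above.
#   grads = tf.gradients(loss, [input_data, tree_weights, tree_thresholds])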
@ops.RegisterGradient('StochasticHardRoutingFunction')
def _StochasticHardRoutingFunctionGradient(op, routing_grad, unused_path_grad):
  """The gradient of StochasticHardRoutingFunction.

  Args:
    op: The StochasticHardRoutingFunction op.
    routing_grad: Gradient with respect to the routing probability output of
      the op.
    unused_path_grad: Gradient with respect to the path output of the op,
      which is unused.

  Returns:
    Gradients with respect to the inputs of the StochasticHardRoutingFunction
    op.
  """
  gradient_op = gen_training_ops.stochastic_hard_routing_gradient
  unpack_path_op = gen_training_ops.unpack_path

  input_data_tensor = op.inputs[0]
  tree_weights_tensor = op.inputs[1]
  tree_thresholds_tensor = op.inputs[2]

  path_probability_tensor = op.outputs[0]
  path_tensor = op.outputs[1]

  # The derivatives below are each defined over one or two of three dimensions:
  # (batch_size, num_nodes, num_features). We explicitly expand each derivative
  # to three dimensions to ensure that they're broadcast correctly.
  du_df_raw, df_dx_raw, df_dt_raw, df_db_raw = gradient_op(
      input_data_tensor,
      tree_weights_tensor,
      tree_thresholds_tensor,
      path_probability_tensor,
      path_tensor,
      tree_depth=op.get_attr('tree_depth'))

  # dl / du is the derivative of the loss with respect to the output of the
  # routing function, which is provided by TensorFlow. The gradient on the
  # sampled path is scattered back to the corresponding nodes by UnpackPath.
  #
  # dl / du has dimension (batch_size, num_nodes), which we expand to
  # (batch_size, num_nodes, 1).
  dl_du = array_ops.expand_dims(unpack_path_op(path_tensor, routing_grad), 2)

  # du / df is the derivative of the output of the routing function with
  # respect to the decision function at each node. It is computed by
  # stochastic_hard_routing_gradient_op.cc.
  #
  # du / df has dimension (batch_size, num_nodes), which we expand to
  # (batch_size, num_nodes, 1).
  du_df = array_ops.expand_dims(du_df_raw, 2)

  # df / dx is the derivative of the decision function with respect to the
  # input data. f(x) = (-t * x + b), so df / dx = -t for the selected features
  # and zero elsewhere.
  #
  # df / dx has dimension (num_nodes, num_features), which we expand to
  # (1, num_nodes, num_features).
  df_dx = array_ops.expand_dims(df_dx_raw, 0)

  # df / dt is the derivative of the decision function with respect to its
  # weight parameters. f(x) = (-t * x + b), so df / dt = -x[feature].
  #
  # df / dt has dimension (batch_size, num_nodes, num_features).
  df_dt = -df_dt_raw

  # df / db is the derivative of the decision function with respect to its
  # bias parameter. f(x) = (-t * x + b), so df / db = 1.
  #
  # df / db has dimension (num_nodes), which we expand to
  # (1, num_nodes, 1).
  df_db = array_ops.expand_dims(array_ops.expand_dims(df_db_raw, 0), 2)

  # Compute the derivatives of the loss with respect to the inputs using the
  # chain rule (backpropagation).
  dl_dx = math_ops.reduce_mean(dl_du * du_df * df_dx, 1)
  dl_dt = math_ops.reduce_mean(dl_du * du_df * df_dt, 0)
  dl_db = math_ops.reduce_mean(array_ops.squeeze(dl_du * du_df * df_db, [2]), 0)

  input_gradients = [dl_dx, dl_dt, dl_db]

  return input_gradients
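# All three gradients in this module assemble dl/dx, dl/dt and dl/db with the
# same three-way broadcast. A self-contained sketch of the shape algebra, with
# made-up sizes (batch_size=2, num_nodes=3, num_features=4) and NumPy standing
# in for the TF ops:
#
#   import numpy as np
#   dl_du = np.ones((2, 3, 1))  # (batch_size, num_nodes, 1)
#   du_df = np.ones((2, 3, 1))  # (batch_size, num_nodes, 1)
#   df_dx = np.ones((1, 3, 4))  # (1, num_nodes, num_features)
#   dl_dx = (dl_du * du_df * df_dx).mean(axis=1)  # -> (2, 4), one gradient
#   # row per example, matching reduce_mean(dl_du * du_df * df_dx, 1) above.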
@ops.RegisterGradient('KFeatureRoutingFunction')
def _KFeatureRoutingFunctionGradient(op, grad):
  """The gradient of KFeatureRoutingFunction.

  Args:
    op: The KFeatureRoutingFunction op.
    grad: Gradient with respect to the output of the KFeatureRoutingFunction
      op.

  Returns:
    Gradients with respect to the inputs of the KFeatureRoutingFunction op.
  """
  gradient_op = gen_training_ops.k_feature_gradient

  input_data_tensor = op.inputs[0]
  tree_weights_tensor = op.inputs[1]
  tree_thresholds_tensor = op.inputs[2]

  routing_function_tensor = op.outputs[0]

  # The derivatives below are each defined over one or two of three dimensions:
  # (batch_size, num_nodes, num_features). We explicitly expand each derivative
  # to three dimensions to ensure that they're broadcast correctly.
  du_df_raw, df_dx_raw, df_dt_raw = gradient_op(
      input_data_tensor,
      tree_weights_tensor,
      tree_thresholds_tensor,
      routing_function_tensor,
      layer_num=op.get_attr('layer_num'),
      random_seed=op.get_attr('random_seed'))

  # dl / du is the derivative of the loss with respect to the output of the
  # routing function, which is provided by TensorFlow.
  #
  # dl / du has dimension (batch_size, num_nodes), which we expand to
  # (batch_size, num_nodes, 1).
  dl_du = array_ops.expand_dims(grad, 2)

  # du / df is the derivative of the output of the routing function with
  # respect to the decision function at each node. It is computed by
  # k_feature_gradient_op.cc.
  #
  # du / df has dimension (batch_size, num_nodes), which we expand to
  # (batch_size, num_nodes, 1).
  du_df = array_ops.expand_dims(du_df_raw, 2)

  # df / dx is the derivative of the decision function with respect to the
  # input data. f(x) = (-t * x + b), so df / dx = -t for the selected features
  # and zero elsewhere.
  #
  # df / dx has dimension (num_nodes, num_features), which we expand to
  # (1, num_nodes, num_features).
  df_dx = array_ops.expand_dims(df_dx_raw, 0)

  # df / dt is the derivative of the decision function with respect to its
  # weight parameters. f(x) = (-t * x + b), so df / dt = -x[feature].
  #
  # df / dt has dimension (batch_size, num_nodes, num_features).
  df_dt = -df_dt_raw

  # df / db is the derivative of the decision function with respect to its
  # bias parameter. f(x) = (-t * x + b), so df / db = 1.
  #
  # df / db has dimension (num_nodes), which we expand to
  # (1, num_nodes, 1).
  df_db = array_ops.expand_dims(
      array_ops.expand_dims(array_ops.ones_like(tree_thresholds_tensor), 0), 2)

  # Compute the derivatives of the loss with respect to the inputs using the
  # chain rule (backpropagation).
  dl_dx = math_ops.reduce_mean(dl_du * du_df * df_dx, 1)
  dl_dt = math_ops.reduce_mean(dl_du * du_df * df_dt, 0)
  dl_db = math_ops.reduce_mean(array_ops.squeeze(dl_du * du_df * df_db, [2]), 0)

  input_gradients = [dl_dx, dl_dt, dl_db]

  return input_gradients
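# For reference, the decision function that the comments above describe,
# written out once: at node i with weight vector t_i and bias b_i,
#
#   f_i(x) = -t_i . x + b_i
#   df_i / dx = -t_i,    df_i / dt_i = -x,    df_i / db_i = 1
#
# and each input gradient is the chain-rule product dl/du * du/df * df/d(.),
# reduced over the dimension that the corresponding input does not carry.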
# Workaround for the fact that importing tensorflow imports contrib
# (even if a user isn't using this or any other contrib op), but
# there's not yet any guarantee that the shared object exists. If the
# library were loaded eagerly at import time, "import tensorflow" would
# crash whenever the shared object is missing, even for users who never
# use contrib.
def Load():
  """Load the training ops library and return the loaded module."""
  with _ops_lock:
    global _training_ops
    if not _training_ops:
      ops_path = resource_loader.get_path_to_datafile(TRAINING_OPS_FILE)
      logging.info('data path: %s', ops_path)
      _training_ops = loader.load_op_library(ops_path)

      assert _training_ops, 'Could not load _training_ops.so'
  return _training_ops
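# A minimal usage sketch (assuming the shared object has been built and
# shipped alongside this module):
#
#   training_ops = Load()  # loads _training_ops.so on the first call
#   training_ops = Load()  # subsequent calls return the cached module
#
# _ops_lock makes the lazy load safe if Load() races across threads.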