# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# pylint: disable=invalid-name
# pylint: disable=missing-function-docstring
"""MobileNet v3 models for Keras."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function


from tensorflow.python.keras import backend
from tensorflow.python.keras import models
from tensorflow.python.keras.applications import imagenet_utils
from tensorflow.python.keras.layers import VersionAwareLayers
from tensorflow.python.keras.utils import data_utils
from tensorflow.python.keras.utils import layer_utils
from tensorflow.python.lib.io import file_io
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.util.tf_export import keras_export


# TODO(scottzhu): Change this to the GCS path.
BASE_WEIGHT_PATH = ('https://storage.googleapis.com/tensorflow/'
                    'keras-applications/mobilenet_v3/')
# Maps '<model_type>[_minimalistic]_224_<alpha>_float' to a pair of file
# hashes: index 0 is used for the weights file with the top, index 1 for the
# '_no_top' file (see the weight-loading code in `MobileNetV3`).
WEIGHTS_HASHES = {
    'large_224_0.75_float': ('765b44a33ad4005b3ac83185abf1d0eb',
                             'e7b4d1071996dd51a2c2ca2424570e20'),
    'large_224_1.0_float': ('59e551e166be033d707958cf9e29a6a7',
                            '037116398e07f018c0005ffcb0406831'),
    'large_minimalistic_224_1.0_float': ('675e7b876c45c57e9e63e6d90a36599c',
                                         'a2c33aed672524d1d0b4431808177695'),
    'small_224_0.75_float': ('cb65d4e5be93758266aa0a7f2c6708b7',
                             '4d2fe46f1c1f38057392514b0df1d673'),
    'small_224_1.0_float': ('8768d4c2e7dee89b9d02b2d03d65d862',
                            'be7100780f875c06bcab93d76641aa26'),
    'small_minimalistic_224_1.0_float': ('99cd97fb2fcdad2bf028eb838de69e37',
                                         '20d4e357df3f7a6361f3a288857b1051'),
}

layers = VersionAwareLayers()


BASE_DOCSTRING = """Instantiates the {name} architecture.

  Reference:
  - [Searching for MobileNetV3](
      https://arxiv.org/pdf/1905.02244.pdf) (ICCV 2019)

  The following table describes the performance of MobileNets:
  ------------------------------------------------------------------------
  MACs stands for Multiply Adds

  |Classification Checkpoint|MACs(M)|Parameters(M)|Top1 Accuracy|Pixel1 CPU(ms)|
  |---|---|---|---|---|
  | mobilenet_v3_large_1.0_224 | 217 | 5.4 | 75.6 | 51.2 |
  | mobilenet_v3_large_0.75_224 | 155 | 4.0 | 73.3 | 39.8 |
  | mobilenet_v3_large_minimalistic_1.0_224 | 209 | 3.9 | 72.3 | 44.1 |
  | mobilenet_v3_small_1.0_224 | 66 | 2.9 | 68.1 | 15.8 |
  | mobilenet_v3_small_0.75_224 | 44 | 2.4 | 65.4 | 12.8 |
  | mobilenet_v3_small_minimalistic_1.0_224 | 65 | 2.0 | 61.9 | 12.2 |

  The weights for all 6 models are obtained and translated from the Tensorflow
  checkpoints from TensorFlow checkpoints found [here]
  (https://github.com/tensorflow/models/tree/master/research/slim/nets/mobilenet/README.md).

  Optionally loads weights pre-trained on ImageNet.

  Args:
    input_shape: Optional shape tuple, to be specified if you would
      like to use a model with an input image resolution that is not
      (224, 224, 3).
      It should have exactly 3 inputs channels (224, 224, 3).
      You can also omit this option if you would like
      to infer input_shape from an input_tensor.
      If you choose to include both input_tensor and input_shape then
      input_shape will be used if they match, if the shapes
      do not match then we will throw an error.
      E.g. `(160, 160, 3)` would be one valid value.
    alpha: controls the width of the network. This is known as the
      depth multiplier in the MobileNetV3 paper, but the name is kept for
      consistency with MobileNetV1 in Keras.
      - If `alpha` < 1.0, proportionally decreases the number
          of filters in each layer.
      - If `alpha` > 1.0, proportionally increases the number
          of filters in each layer.
      - If `alpha` = 1, default number of filters from the paper
          are used at each layer.
    minimalistic: In addition to large and small models this module also
      contains so-called minimalistic models, these models have the same
      per-layer dimensions characteristic as MobilenetV3 however, they don't
      utilize any of the advanced blocks (squeeze-and-excite units, hard-swish,
      and 5x5 convolutions). While these models are less efficient on CPU, they
      are much more performant on GPU/DSP.
    include_top: Boolean, whether to include the fully-connected
      layer at the top of the network. Defaults to `True`.
    weights: String, one of `None` (random initialization),
      'imagenet' (pre-training on ImageNet),
      or the path to the weights file to be loaded.
    input_tensor: Optional Keras tensor (i.e. output of
      `layers.Input()`)
      to use as image input for the model.
    pooling: String, optional pooling mode for feature extraction
      when `include_top` is `False`.
      - `None` means that the output of the model
          will be the 4D tensor output of the
          last convolutional block.
      - `avg` means that global average pooling
          will be applied to the output of the
          last convolutional block, and thus
          the output of the model will be a
          2D tensor.
      - `max` means that global max pooling will
          be applied.
    classes: Integer, optional number of classes to classify images
      into, only to be specified if `include_top` is True, and
      if no `weights` argument is specified.
    dropout_rate: fraction of the input units to drop on the last layer.
    classifier_activation: A `str` or callable. The activation function to use
      on the "top" layer. Ignored unless `include_top=True`. Set
      `classifier_activation=None` to return the logits of the "top" layer.

  Call arguments:
    inputs: A floating point `numpy.array` or a `tf.Tensor`, 4D with 3 color
      channels, with values in the range [0, 255].

  Returns:
    A `keras.Model` instance.

  Raises:
    ValueError: in case of invalid argument for `weights`,
      or invalid input shape or invalid alpha, rows when
      weights='imagenet'
    ValueError: if `classifier_activation` is not `softmax` or `None` when
      using a pretrained top layer.
"""


def MobileNetV3(stack_fn,
                last_point_ch,
                input_shape=None,
                alpha=1.0,
                model_type='large',
                minimalistic=False,
                include_top=True,
                weights='imagenet',
                input_tensor=None,
                classes=1000,
                pooling=None,
                dropout_rate=0.2,
                classifier_activation='softmax'):
  """Builds a MobileNetV3 model around a variant-specific stack of blocks.

  This is the shared implementation behind `MobileNetV3Small` and
  `MobileNetV3Large`; see `BASE_DOCSTRING` for the user-facing description of
  the arguments the two wrappers forward unchanged.

  Args:
    stack_fn: Callable invoked as `stack_fn(x, kernel, activation, se_ratio)`
      that applies the variant's inverted residual blocks to `x` and returns
      the resulting tensor.
    last_point_ch: Number of filters of the `Conv_2` layer. When `alpha` is
      greater than 1.0 it is scaled by `alpha` and rounded with `_depth`.
    input_shape: Optional shape tuple without the batch dimension.
    alpha: Width multiplier. With `weights='imagenet'` only `0.75` and `1.0`
      (non-minimalistic) or `1.0` (minimalistic) are accepted.
    model_type: String appended to the model name and used to build the
      pretrained weights file name (the wrappers pass 'small' or 'large').
    minimalistic: If true, uses 3x3 kernels, ReLU activations and no
      squeeze-and-excite; otherwise 5x5 kernels, hard-swish and an SE ratio
      of 0.25 are handed to `stack_fn`.
    include_top: Whether to append the classification head.
    weights: `None`, 'imagenet', or a path to a weights file.
    input_tensor: Optional tensor to use as the model input.
    classes: Number of output classes of the `Logits` layer.
    pooling: 'avg', 'max' or `None`; only used when `include_top` is false.
    dropout_rate: Dropout rate applied before the `Logits` layer; dropout is
      skipped when it is not greater than 0.
    classifier_activation: Activation applied by the `Predictions` layer.

  Returns:
    A `models.Model` named `'MobilenetV3' + model_type`.

  Raises:
    ValueError: If `weights` is invalid, if `classes` is not 1000 while
      loading ImageNet weights with the top, if `input_tensor` is not a
      valid tensor or disagrees with `input_shape`, if the spatial size is
      below 32x32, or if `alpha` has no pretrained ImageNet weights.
  """
  if not (weights in {'imagenet', None} or file_io.file_exists_v2(weights)):
    raise ValueError('The `weights` argument should be either '
                     '`None` (random initialization), `imagenet` '
                     '(pre-training on ImageNet), '
                     'or the path to the weights file to be loaded.')

  if weights == 'imagenet' and include_top and classes != 1000:
    raise ValueError('If using `weights` as `"imagenet"` with `include_top` '
                     'as true, `classes` should be 1000')

  # Determine proper input shape and default size.
  # If both input_shape and input_tensor are used, they should match
  if input_shape is not None and input_tensor is not None:
    try:
      is_input_t_tensor = backend.is_keras_tensor(input_tensor)
    except ValueError:
      try:
        is_input_t_tensor = backend.is_keras_tensor(
            layer_utils.get_source_inputs(input_tensor))
      except ValueError:
        raise ValueError('input_tensor: ', input_tensor,
                         'is not type input_tensor')
    if is_input_t_tensor:
      # Both branches compare the column dimension. `int_shape` includes the
      # batch axis while `input_shape` does not, hence the index offset of
      # one. (The channels_first branch used to compare the tensor's channel
      # axis against the requested row count, rejecting valid inputs.)
      if backend.image_data_format() == 'channels_first':
        if backend.int_shape(input_tensor)[3] != input_shape[2]:
          raise ValueError('input_shape: ', input_shape, 'and input_tensor: ',
                           input_tensor,
                           'do not meet the same shape requirements')
      else:
        if backend.int_shape(input_tensor)[2] != input_shape[1]:
          raise ValueError('input_shape: ', input_shape, 'and input_tensor: ',
                           input_tensor,
                           'do not meet the same shape requirements')
    else:
      raise ValueError('input_tensor specified: ', input_tensor,
                       'is not a keras tensor')

  # If input_shape is None, infer shape from input_tensor
  if input_shape is None and input_tensor is not None:

    try:
      # Only used for validation: raises ValueError for non-tensor inputs.
      backend.is_keras_tensor(input_tensor)
    except ValueError:
      raise ValueError('input_tensor: ', input_tensor, 'is type: ',
                       type(input_tensor), 'which is not a valid type')

    # Infer the shape for every valid tensor, Keras tensor or not; otherwise
    # `input_shape` would still be None when it is indexed below, although
    # non-Keras tensors are explicitly wrapped with `layers.Input` later on.
    # NOTE(review): assumes `int_shape` returns a rank-4 tuple here -- confirm
    # for tensors of unknown rank.
    tensor_shape = backend.int_shape(input_tensor)
    if backend.image_data_format() == 'channels_first':
      rows = tensor_shape[2]
      cols = tensor_shape[3]
      input_shape = (3, rows, cols)
    else:
      rows = tensor_shape[1]
      cols = tensor_shape[2]
      input_shape = (rows, cols, 3)
  # If input_shape is None and input_tensor is None using standard shape
  if input_shape is None and input_tensor is None:
    input_shape = (None, None, 3)

  if backend.image_data_format() == 'channels_last':
    row_axis, col_axis = (0, 1)
  else:
    row_axis, col_axis = (1, 2)
  rows = input_shape[row_axis]
  cols = input_shape[col_axis]
  if rows and cols and (rows < 32 or cols < 32):
    raise ValueError('Input size must be at least 32x32; got `input_shape=' +
                     str(input_shape) + '`')
  if weights == 'imagenet':
    if (not minimalistic and alpha not in [0.75, 1.0]
        or minimalistic and alpha != 1.0):
      raise ValueError('If imagenet weights are being loaded, '
                       'alpha can be one of `0.75`, `1.0` for non minimalistic'
                       ' or `1.0` for minimalistic only.')

    if rows != cols or rows != 224:
      logging.warning('`input_shape` is undefined or non-square, '
                      'or `rows` is not 224.'
                      ' Weights for input shape (224, 224) will be'
                      ' loaded as the default.')

  if input_tensor is None:
    img_input = layers.Input(shape=input_shape)
  else:
    if not backend.is_keras_tensor(input_tensor):
      img_input = layers.Input(tensor=input_tensor, shape=input_shape)
    else:
      img_input = input_tensor

  channel_axis = 1 if backend.image_data_format() == 'channels_first' else -1

  if minimalistic:
    kernel = 3
    activation = relu
    se_ratio = None
  else:
    kernel = 5
    activation = hard_swish
    se_ratio = 0.25

  x = img_input
  # Preprocessing is part of the model: x / 127.5 - 1.
  x = layers.Rescaling(scale=1. / 127.5, offset=-1.)(x)
  x = layers.Conv2D(
      16,
      kernel_size=3,
      strides=(2, 2),
      padding='same',
      use_bias=False,
      name='Conv')(x)
  x = layers.BatchNormalization(
      axis=channel_axis, epsilon=1e-3,
      momentum=0.999, name='Conv/BatchNorm')(x)
  x = activation(x)

  x = stack_fn(x, kernel, activation, se_ratio)

  last_conv_ch = _depth(backend.int_shape(x)[channel_axis] * 6)

  # if the width multiplier is greater than 1 we
  # increase the number of output channels
  if alpha > 1.0:
    last_point_ch = _depth(last_point_ch * alpha)
  x = layers.Conv2D(
      last_conv_ch,
      kernel_size=1,
      padding='same',
      use_bias=False,
      name='Conv_1')(x)
  x = layers.BatchNormalization(
      axis=channel_axis, epsilon=1e-3,
      momentum=0.999, name='Conv_1/BatchNorm')(x)
  x = activation(x)
  x = layers.Conv2D(
      last_point_ch,
      kernel_size=1,
      padding='same',
      use_bias=True,
      name='Conv_2')(x)
  x = activation(x)

  if include_top:
    x = layers.GlobalAveragePooling2D()(x)
    # Restore 1x1 spatial dimensions so the classifier can be a 1x1 conv.
    if channel_axis == 1:
      x = layers.Reshape((last_point_ch, 1, 1))(x)
    else:
      x = layers.Reshape((1, 1, last_point_ch))(x)
    if dropout_rate > 0:
      x = layers.Dropout(dropout_rate)(x)
    x = layers.Conv2D(classes, kernel_size=1, padding='same', name='Logits')(x)
    x = layers.Flatten()(x)
    imagenet_utils.validate_activation(classifier_activation, weights)
    x = layers.Activation(activation=classifier_activation,
                          name='Predictions')(x)
  else:
    if pooling == 'avg':
      x = layers.GlobalAveragePooling2D(name='avg_pool')(x)
    elif pooling == 'max':
      x = layers.GlobalMaxPooling2D(name='max_pool')(x)
  # Ensure that the model takes into account
  # any potential predecessors of `input_tensor`.
  if input_tensor is not None:
    inputs = layer_utils.get_source_inputs(input_tensor)
  else:
    inputs = img_input

  # Create model.
  model = models.Model(inputs, x, name='MobilenetV3' + model_type)

  # Load weights.
  if weights == 'imagenet':
    # Normalise through float() so an integer `alpha=1` (which passes the
    # validation above because 1 == 1.0) maps to the '1.0' key instead of
    # raising a KeyError on '..._224_1_float'.
    model_name = '{}{}_224_{}_float'.format(
        model_type, '_minimalistic' if minimalistic else '',
        str(float(alpha)))
    if include_top:
      file_name = 'weights_mobilenet_v3_' + model_name + '.h5'
      file_hash = WEIGHTS_HASHES[model_name][0]
    else:
      file_name = 'weights_mobilenet_v3_' + model_name + '_no_top.h5'
      file_hash = WEIGHTS_HASHES[model_name][1]
    weights_path = data_utils.get_file(
        file_name,
        BASE_WEIGHT_PATH + file_name,
        cache_subdir='models',
        file_hash=file_hash)
    model.load_weights(weights_path)
  elif weights is not None:
    model.load_weights(weights)

  return model


# NOTE: the docstrings of `MobileNetV3Small` and `MobileNetV3Large` are
# assigned from `BASE_DOCSTRING` right after their definitions.
@keras_export('keras.applications.MobileNetV3Small')
def MobileNetV3Small(input_shape=None,
                     alpha=1.0,
                     minimalistic=False,
                     include_top=True,
                     weights='imagenet',
                     input_tensor=None,
                     classes=1000,
                     pooling=None,
                     dropout_rate=0.2,
                     classifier_activation='softmax'):

  def stack_fn(x, kernel, activation, se_ratio):
    # Arguments of `_inverted_res_block`:
    # (x, expansion, filters, kernel_size, stride, se_ratio, activation, id).

    def depth(d):
      return _depth(d * alpha)

    x = _inverted_res_block(x, 1, depth(16), 3, 2, se_ratio, relu, 0)
    x = _inverted_res_block(x, 72. / 16, depth(24), 3, 2, None, relu, 1)
    x = _inverted_res_block(x, 88. / 24, depth(24), 3, 1, None, relu, 2)
    x = _inverted_res_block(x, 4, depth(40), kernel, 2, se_ratio, activation, 3)
    x = _inverted_res_block(x, 6, depth(40), kernel, 1, se_ratio, activation, 4)
    x = _inverted_res_block(x, 6, depth(40), kernel, 1, se_ratio, activation, 5)
    x = _inverted_res_block(x, 3, depth(48), kernel, 1, se_ratio, activation, 6)
    x = _inverted_res_block(x, 3, depth(48), kernel, 1, se_ratio, activation, 7)
    x = _inverted_res_block(x, 6, depth(96), kernel, 2, se_ratio, activation, 8)
    x = _inverted_res_block(x, 6, depth(96), kernel, 1, se_ratio, activation, 9)
    x = _inverted_res_block(x, 6, depth(96), kernel, 1, se_ratio, activation,
                            10)
    return x

  return MobileNetV3(stack_fn, 1024, input_shape, alpha, 'small', minimalistic,
                     include_top, weights, input_tensor, classes, pooling,
                     dropout_rate, classifier_activation)


@keras_export('keras.applications.MobileNetV3Large')
def MobileNetV3Large(input_shape=None,
                     alpha=1.0,
                     minimalistic=False,
                     include_top=True,
                     weights='imagenet',
                     input_tensor=None,
                     classes=1000,
                     pooling=None,
                     dropout_rate=0.2,
                     classifier_activation='softmax'):

  def stack_fn(x, kernel, activation, se_ratio):
    # Arguments of `_inverted_res_block`:
    # (x, expansion, filters, kernel_size, stride, se_ratio, activation, id).

    def depth(d):
      return _depth(d * alpha)

    x = _inverted_res_block(x, 1, depth(16), 3, 1, None, relu, 0)
    x = _inverted_res_block(x, 4, depth(24), 3, 2, None, relu, 1)
    x = _inverted_res_block(x, 3, depth(24), 3, 1, None, relu, 2)
    x = _inverted_res_block(x, 3, depth(40), kernel, 2, se_ratio, relu, 3)
    x = _inverted_res_block(x, 3, depth(40), kernel, 1, se_ratio, relu, 4)
    x = _inverted_res_block(x, 3, depth(40), kernel, 1, se_ratio, relu, 5)
    x = _inverted_res_block(x, 6, depth(80), 3, 2, None, activation, 6)
    x = _inverted_res_block(x, 2.5, depth(80), 3, 1, None, activation, 7)
    x = _inverted_res_block(x, 2.3, depth(80), 3, 1, None, activation, 8)
    x = _inverted_res_block(x, 2.3, depth(80), 3, 1, None, activation, 9)
    x = _inverted_res_block(x, 6, depth(112), 3, 1, se_ratio, activation, 10)
    x = _inverted_res_block(x, 6, depth(112), 3, 1, se_ratio, activation, 11)
    x = _inverted_res_block(x, 6, depth(160), kernel, 2, se_ratio, activation,
                            12)
    x = _inverted_res_block(x, 6, depth(160), kernel, 1, se_ratio, activation,
                            13)
    x = _inverted_res_block(x, 6, depth(160), kernel, 1, se_ratio, activation,
                            14)
    return x

  return MobileNetV3(stack_fn, 1280, input_shape, alpha, 'large', minimalistic,
                     include_top, weights, input_tensor, classes, pooling,
                     dropout_rate, classifier_activation)


MobileNetV3Small.__doc__ = BASE_DOCSTRING.format(name='MobileNetV3Small')
MobileNetV3Large.__doc__ = BASE_DOCSTRING.format(name='MobileNetV3Large')


def relu(x):
  """Applies a fresh, unnamed `ReLU` layer to `x`."""
  return layers.ReLU()(x)


def hard_sigmoid(x):
  """Computes `relu6(x + 3) / 6` using a `ReLU` layer capped at 6."""
  return layers.ReLU(6.)(x + 3.) * (1. / 6.)


def hard_swish(x):
  """Computes `x * hard_sigmoid(x)` with a `Multiply` layer."""
  return layers.Multiply()([hard_sigmoid(x), x])


# This function is taken from the original tf repo.
# It ensures that all layers have a channel number that is divisible by 8
# It can be seen here:
# https://github.com/tensorflow/models/blob/master/research/
# slim/nets/mobilenet/mobilenet.py


def _depth(v, divisor=8, min_value=None):
  """Rounds `v` to the nearest multiple of `divisor`.

  Args:
    v: Requested number of channels (may be fractional).
    divisor: The result is a multiple of this value.
    min_value: Lower bound of the result; defaults to `divisor`.

  Returns:
    The rounded channel count, never less than `min_value` and never more
    than 10% below `v`.
  """
  if min_value is None:
    min_value = divisor
  new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
  # Make sure that round down does not go down by more than 10%.
  if new_v < 0.9 * v:
    new_v += divisor
  return new_v


def _se_block(inputs, filters, se_ratio, prefix):
  """Applies a squeeze-and-excite gate to `inputs`.

  Args:
    inputs: Input tensor; it is expected to have `filters` channels since the
      pooled result is reshaped to `filters` channels.
    filters: Number of channels of `inputs` and of the gating tensor.
    se_ratio: Fraction of `filters` used for the bottleneck convolution
      (rounded with `_depth`).
    prefix: Name prefix for the created layers.

  Returns:
    `inputs` multiplied by the hard-sigmoid gate.
  """
  x = layers.GlobalAveragePooling2D(name=prefix + 'squeeze_excite/AvgPool')(
      inputs)
  # Back to 1x1 spatial dimensions so 1x1 convolutions can be used.
  if backend.image_data_format() == 'channels_first':
    x = layers.Reshape((filters, 1, 1))(x)
  else:
    x = layers.Reshape((1, 1, filters))(x)
  x = layers.Conv2D(
      _depth(filters * se_ratio),
      kernel_size=1,
      padding='same',
      name=prefix + 'squeeze_excite/Conv')(
          x)
  x = layers.ReLU(name=prefix + 'squeeze_excite/Relu')(x)
  x = layers.Conv2D(
      filters,
      kernel_size=1,
      padding='same',
      name=prefix + 'squeeze_excite/Conv_1')(
          x)
  x = hard_sigmoid(x)
  x = layers.Multiply(name=prefix + 'squeeze_excite/Mul')([inputs, x])
  return x


def _inverted_res_block(x, expansion, filters, kernel_size, stride, se_ratio,
                        activation, block_id):
  """Applies one inverted residual block: expand, depthwise, SE, project.

  Args:
    x: Input tensor.
    expansion: Multiplier applied to the input channel count to size the
      expansion convolution (and the SE block).
    filters: Number of output channels of the projection convolution.
    kernel_size: Kernel size of the depthwise convolution.
    stride: Stride of the depthwise convolution; for a stride of 2 the input
      is explicitly zero-padded and 'valid' padding is used.
    se_ratio: Squeeze-and-excite ratio, or a falsy value to skip the SE block.
    activation: Callable applied after the expansion and depthwise stages.
    block_id: Index used in layer names; block 0 has no expansion stage and
      uses the un-numbered 'expanded_conv/' prefix.

  Returns:
    The output tensor, with a residual connection added when `stride` is 1
    and the input and output channel counts are equal.
  """
  channel_axis = 1 if backend.image_data_format() == 'channels_first' else -1
  shortcut = x
  prefix = 'expanded_conv/'
  infilters = backend.int_shape(x)[channel_axis]
  if block_id:
    # Expand
    prefix = 'expanded_conv_{}/'.format(block_id)
    x = layers.Conv2D(
        _depth(infilters * expansion),
        kernel_size=1,
        padding='same',
        use_bias=False,
        name=prefix + 'expand')(
            x)
    x = layers.BatchNormalization(
        axis=channel_axis,
        epsilon=1e-3,
        momentum=0.999,
        name=prefix + 'expand/BatchNorm')(
            x)
    x = activation(x)

  if stride == 2:
    x = layers.ZeroPadding2D(
        padding=imagenet_utils.correct_pad(x, kernel_size),
        name=prefix + 'depthwise/pad')(
            x)
  x = layers.DepthwiseConv2D(
      kernel_size,
      strides=stride,
      padding='same' if stride == 1 else 'valid',
      use_bias=False,
      name=prefix + 'depthwise')(
          x)
  x = layers.BatchNormalization(
      axis=channel_axis,
      epsilon=1e-3,
      momentum=0.999,
      name=prefix + 'depthwise/BatchNorm')(
          x)
  x = activation(x)

  if se_ratio:
    x = _se_block(x, _depth(infilters * expansion), se_ratio, prefix)

  x = layers.Conv2D(
      filters,
      kernel_size=1,
      padding='same',
      use_bias=False,
      name=prefix + 'project')(
          x)
  x = layers.BatchNormalization(
      axis=channel_axis,
      epsilon=1e-3,
      momentum=0.999,
      name=prefix + 'project/BatchNorm')(
          x)

  if stride == 1 and infilters == filters:
    x = layers.Add(name=prefix + 'Add')([shortcut, x])
  return x


@keras_export('keras.applications.mobilenet_v3.preprocess_input')
def preprocess_input(x, data_format=None):  # pylint: disable=unused-argument
  """A placeholder method for backward compatibility.

  The preprocessing logic has been included in the mobilenet_v3 model
  implementation. Users are no longer required to call this method to normalize
  the input data. This method does nothing and only kept as a placeholder to
  align the API surface between old and new version of model.

  Args:
    x: A floating point `numpy.array` or a `tf.Tensor`.
    data_format: Optional data format of the image tensor/array. Defaults to
      None, in which case the global setting
      `tf.keras.backend.image_data_format()` is used (unless you changed it,
      it defaults to "channels_last").

  Returns:
    Unchanged `numpy.array` or `tf.Tensor`.
  """

  return x


@keras_export('keras.applications.mobilenet_v3.decode_predictions')
def decode_predictions(preds, top=5):
  return imagenet_utils.decode_predictions(preds, top=top)


decode_predictions.__doc__ = imagenet_utils.decode_predictions.__doc__