# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# pylint: disable=invalid-name
"""MobileNet v2 models for Keras.

MobileNetV2 is a general architecture and can be used for multiple use cases.
Depending on the use case, it can use different input layer size and
different width factors. This allows different width models to reduce
the number of multiply-adds and thereby
reduce inference cost on mobile devices.

MobileNetV2 is very similar to the original MobileNet,
except that it uses inverted residual blocks with
bottlenecking features. It has a drastically lower
parameter count than the original MobileNet.
MobileNets support any input size greater
than 32 x 32, with larger image sizes
offering better performance.

The number of parameters and number of multiply-adds
can be modified by using the `alpha` parameter,
which increases/decreases the number of filters in each layer.
By altering the image size and `alpha` parameter,
all 22 models from the paper can be built, with ImageNet weights provided.

The paper demonstrates the performance of MobileNets using `alpha` values of
0.35, 0.5, 0.75, 1.0 (also called 100 % MobileNet), 1.3, and 1.4.
For each of these `alpha` values, weights for 5 different input image sizes
are provided (224, 192, 160, 128, and 96).

The following table describes the performance of
MobileNet on various input sizes:
------------------------------------------------------------------------
MACs stands for Multiply Adds
| Classification Checkpoint | MACs (M) | Parameters (M) | Top 1 Accuracy | Top 5 Accuracy |
|---------------------------|----------|----------------|----------------|----------------|
| [mobilenet_v2_1.4_224]    | 582      | 6.06           | 75.0           | 92.5           |
| [mobilenet_v2_1.3_224]    | 509      | 5.34           | 74.4           | 92.1           |
| [mobilenet_v2_1.0_224]    | 300      | 3.47           | 71.8           | 91.0           |
| [mobilenet_v2_1.0_192]    | 221      | 3.47           | 70.7           | 90.1           |
| [mobilenet_v2_1.0_160]    | 154      | 3.47           | 68.8           | 89.0           |
| [mobilenet_v2_1.0_128]    | 99       | 3.47           | 65.3           | 86.9           |
| [mobilenet_v2_1.0_96]     | 56       | 3.47           | 60.3           | 83.2           |
| [mobilenet_v2_0.75_224]   | 209      | 2.61           | 69.8           | 89.6           |
| [mobilenet_v2_0.75_192]   | 153      | 2.61           | 68.7           | 88.9           |
| [mobilenet_v2_0.75_160]   | 107      | 2.61           | 66.4           | 87.3           |
| [mobilenet_v2_0.75_128]   | 69       | 2.61           | 63.2           | 85.3           |
| [mobilenet_v2_0.75_96]    | 39       | 2.61           | 58.8           | 81.6           |
| [mobilenet_v2_0.5_224]    | 97       | 1.95           | 65.4           | 86.4           |
| [mobilenet_v2_0.5_192]    | 71       | 1.95           | 63.9           | 85.4           |
| [mobilenet_v2_0.5_160]    | 50       | 1.95           | 61.0           | 83.2           |
| [mobilenet_v2_0.5_128]    | 32       | 1.95           | 57.7           | 80.8           |
| [mobilenet_v2_0.5_96]     | 18       | 1.95           | 51.2           | 75.8           |
| [mobilenet_v2_0.35_224]   | 59       | 1.66           | 60.3           | 82.9           |
| [mobilenet_v2_0.35_192]   | 43       | 1.66           | 58.2           | 81.2           |
| [mobilenet_v2_0.35_160]   | 30       | 1.66           | 55.7           | 79.1           |
| [mobilenet_v2_0.35_128]   | 20       | 1.66           | 50.8           | 75.0           |
| [mobilenet_v2_0.35_96]    | 11       | 1.66           | 45.5           | 70.4           |

  Reference:
  - [MobileNetV2: Inverted Residuals and Linear Bottlenecks](
      https://arxiv.org/abs/1801.04381) (CVPR 2018)
"""

from tensorflow.python.keras import backend
from tensorflow.python.keras.applications import imagenet_utils
from tensorflow.python.keras.engine import training
from tensorflow.python.keras.layers import VersionAwareLayers
from tensorflow.python.keras.utils import data_utils
from tensorflow.python.keras.utils import layer_utils
from tensorflow.python.lib.io import file_io
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.util.tf_export import keras_export

BASE_WEIGHT_PATH = ('https://storage.googleapis.com/tensorflow/'
                    'keras-applications/mobilenet_v2/')
# Set on every `MobileNetV2` call (from `kwargs['layers']` or a fresh
# `VersionAwareLayers`) and read by `_inverted_res_block`.
layers = None

# Square input resolutions for which ImageNet weights are published.
_PRETRAINED_SIZES = (96, 128, 160, 192, 224)

# `(filters, stride, expansion)` of each inverted residual block; the position
# in the tuple is the `block_id` passed to `_inverted_res_block`.
_BLOCK_SPECS = (
    (16, 1, 1),
    (24, 2, 6),
    (24, 1, 6),
    (32, 2, 6),
    (32, 1, 6),
    (32, 1, 6),
    (64, 2, 6),
    (64, 1, 6),
    (64, 1, 6),
    (64, 1, 6),
    (96, 1, 6),
    (96, 1, 6),
    (96, 1, 6),
    (160, 2, 6),
    (160, 1, 6),
    (160, 1, 6),
    (320, 1, 6),
)


@keras_export('keras.applications.mobilenet_v2.MobileNetV2',
              'keras.applications.MobileNetV2')
def MobileNetV2(input_shape=None,
                alpha=1.0,
                include_top=True,
                weights='imagenet',
                input_tensor=None,
                pooling=None,
                classes=1000,
                classifier_activation='softmax',
                **kwargs):
  """Instantiates the MobileNetV2 architecture.

  MobileNetV2 is very similar to the original MobileNet,
  except that it uses inverted residual blocks with
  bottlenecking features. It has a drastically lower
  parameter count than the original MobileNet.
  MobileNets support any input size greater
  than 32 x 32, with larger image sizes
  offering better performance.

  Reference:
  - [MobileNetV2: Inverted Residuals and Linear Bottlenecks](
      https://arxiv.org/abs/1801.04381) (CVPR 2018)

  This function returns a Keras image classification model,
  optionally loaded with weights pre-trained on ImageNet.

  For image classification use cases, see
  [this page for detailed examples](
    https://keras.io/api/applications/#usage-examples-for-image-classification-models).

  For transfer learning use cases, make sure to read the
  [guide to transfer learning & fine-tuning](
    https://keras.io/guides/transfer_learning/).

  Note: each Keras Application expects a specific kind of input preprocessing.
  For MobileNetV2, call `tf.keras.applications.mobilenet_v2.preprocess_input`
  on your inputs before passing them to the model.
  `mobilenet_v2.preprocess_input` will scale input pixels between -1 and 1.

  Args:
    input_shape: Optional shape tuple, to be specified if you would
      like to use a model with an input image resolution that is not
      (224, 224, 3).
      It should have exactly 3 inputs channels (224, 224, 3).
      You can also omit this option if you would like
      to infer input_shape from an input_tensor.
      If you choose to include both input_tensor and input_shape then
      input_shape will be used if they match, if the shapes
      do not match then we will throw an error.
      E.g. `(160, 160, 3)` would be one valid value.
    alpha: Float, larger than zero, controls the width of the network.
      This is known as the width multiplier in the MobileNetV2 paper,
      but the name is kept for consistency with `applications.MobileNetV1`
      model in Keras.
      - If `alpha` < 1.0, proportionally decreases the number
          of filters in each layer.
      - If `alpha` > 1.0, proportionally increases the number
          of filters in each layer.
      - If `alpha` = 1.0, default number of filters from the paper
          are used at each layer.
      When loading ImageNet weights, `alpha` must be one of
      0.35, 0.50, 0.75, 1.0, 1.3 or 1.4.
    include_top: Boolean, whether to include the fully-connected
      layer at the top of the network. Defaults to `True`.
    weights: String, one of `None` (random initialization),
      'imagenet' (pre-training on ImageNet),
      or the path to the weights file to be loaded.
    input_tensor: Optional Keras tensor (i.e. output of
      `layers.Input()`)
      to use as image input for the model.
    pooling: String, optional pooling mode for feature extraction
      when `include_top` is `False`.
      - `None` means that the output of the model
          will be the 4D tensor output of the
          last convolutional block.
      - `avg` means that global average pooling
          will be applied to the output of the
          last convolutional block, and thus
          the output of the model will be a
          2D tensor.
      - `max` means that global max pooling will
          be applied.
    classes: Integer, optional number of classes to classify images
      into, only to be specified if `include_top` is True, and
      if no `weights` argument is specified.
    classifier_activation: A `str` or callable. The activation function to use
      on the "top" layer. Ignored unless `include_top=True`. Set
      `classifier_activation=None` to return the logits of the "top" layer.
      When loading pretrained weights, `classifier_activation` can only
      be `None` or `"softmax"`.
    **kwargs: For backwards compatibility only. The only recognized key is
      `layers`, which replaces the default `VersionAwareLayers()` provider.

  Returns:
    A `keras.Model` instance.

  Raises:
    ValueError: on unknown keyword arguments, an invalid `weights` value,
      `classes != 1000` with ImageNet weights and `include_top`, an
      `input_tensor` that is not a valid/Keras tensor or does not match
      `input_shape`, or an unsupported `alpha` for ImageNet weights.
  """
  global layers
  if 'layers' in kwargs:
    layers = kwargs.pop('layers')
  else:
    layers = VersionAwareLayers()
  if kwargs:
    raise ValueError('Unknown argument(s): %s' % (kwargs,))
  if not (weights in {'imagenet', None} or file_io.file_exists_v2(weights)):
    raise ValueError('The `weights` argument should be either '
                     '`None` (random initialization), `imagenet` '
                     '(pre-training on ImageNet), '
                     'or the path to the weights file to be loaded.')

  if weights == 'imagenet' and include_top and classes != 1000:
    raise ValueError('If using `weights` as `"imagenet"` with `include_top` '
                     'as true, `classes` should be 1000')

  channels_first = backend.image_data_format() == 'channels_first'

  # Determine proper input shape and default size.
  # If both input_shape and input_tensor are used, they should match
  if input_shape is not None and input_tensor is not None:
    try:
      is_input_t_tensor = backend.is_keras_tensor(input_tensor)
    except ValueError:
      try:
        is_input_t_tensor = backend.is_keras_tensor(
            layer_utils.get_source_inputs(input_tensor))
      except ValueError:
        raise ValueError(
            'input_tensor: {} is not type input_tensor'.format(input_tensor))
    if not is_input_t_tensor:
      raise ValueError('input_tensor specified: {} is not a keras '
                       'tensor'.format(input_tensor))
    # `int_shape` includes the batch axis while `input_shape` does not, so the
    # width axis sits one position further along in the tensor shape.
    tensor_shape = backend.int_shape(input_tensor)
    if channels_first:
      shapes_match = tensor_shape[3] == input_shape[2]
    else:
      shapes_match = tensor_shape[2] == input_shape[1]
    if not shapes_match:
      raise ValueError(
          'input_shape: {} and input_tensor: {} do not meet the same '
          'shape requirements'.format(input_shape, input_tensor))

  # If input_shape is None, infer shape from input_tensor
  if input_shape is None and input_tensor is not None:
    try:
      is_input_t_tensor = backend.is_keras_tensor(input_tensor)
    except ValueError:
      raise ValueError(
          'input_tensor: {} is type: {} which is not a valid type'.format(
              input_tensor, type(input_tensor)))

    if is_input_t_tensor:
      tensor_shape = backend.int_shape(input_tensor)
      if channels_first:
        rows, cols = tensor_shape[2], tensor_shape[3]
      else:
        rows, cols = tensor_shape[1], tensor_shape[2]
      default_size = _pretrained_size_or_default(rows, cols)
    else:
      default_size = 224

  # If input_shape is None and no input_tensor
  elif input_shape is None:
    default_size = 224

  # If input_shape is not None, assume default size
  else:
    if channels_first:
      rows, cols = input_shape[1], input_shape[2]
    else:
      rows, cols = input_shape[0], input_shape[1]
    default_size = _pretrained_size_or_default(rows, cols)

  input_shape = imagenet_utils.obtain_input_shape(
      input_shape,
      default_size=default_size,
      min_size=32,
      data_format=backend.image_data_format(),
      require_flatten=include_top,
      weights=weights)

  if backend.image_data_format() == 'channels_last':
    row_axis, col_axis = (0, 1)
  else:
    row_axis, col_axis = (1, 2)
  rows = input_shape[row_axis]
  cols = input_shape[col_axis]

  if weights == 'imagenet':
    if alpha not in [0.35, 0.50, 0.75, 1.0, 1.3, 1.4]:
      raise ValueError('If imagenet weights are being loaded, '
                       'alpha can be one of `0.35`, `0.50`, `0.75`, '
                       '`1.0`, `1.3` or `1.4` only.')

    if rows != cols or rows not in _PRETRAINED_SIZES:
      # `rows` also selects the weight file and the model name below.
      rows = 224
      logging.warning('`input_shape` is undefined or non-square, '
                      'or `rows` is not in [96, 128, 160, 192, 224].'
                      ' Weights for input shape (224, 224) will be'
                      ' loaded as the default.')

  if input_tensor is None:
    img_input = layers.Input(shape=input_shape)
  else:
    if not backend.is_keras_tensor(input_tensor):
      img_input = layers.Input(tensor=input_tensor, shape=input_shape)
    else:
      img_input = input_tensor

  channel_axis = 1 if backend.image_data_format() == 'channels_first' else -1

  first_block_filters = _make_divisible(32 * alpha, 8)
  x = layers.Conv2D(
      first_block_filters,
      kernel_size=3,
      strides=(2, 2),
      padding='same',
      use_bias=False,
      name='Conv1')(img_input)
  x = layers.BatchNormalization(
      axis=channel_axis, epsilon=1e-3, momentum=0.999, name='bn_Conv1')(
          x)
  x = layers.ReLU(6., name='Conv1_relu')(x)

  for block_id, (filters, stride, expansion) in enumerate(_BLOCK_SPECS):
    x = _inverted_res_block(
        x,
        filters=filters,
        alpha=alpha,
        stride=stride,
        expansion=expansion,
        block_id=block_id)

  # no alpha applied to last conv as stated in the paper:
  # if the width multiplier is greater than 1 we
  # increase the number of output channels
  if alpha > 1.0:
    last_block_filters = _make_divisible(1280 * alpha, 8)
  else:
    last_block_filters = 1280

  x = layers.Conv2D(
      last_block_filters, kernel_size=1, use_bias=False, name='Conv_1')(
          x)
  x = layers.BatchNormalization(
      axis=channel_axis, epsilon=1e-3, momentum=0.999, name='Conv_1_bn')(
          x)
  x = layers.ReLU(6., name='out_relu')(x)

  if include_top:
    x = layers.GlobalAveragePooling2D()(x)
    imagenet_utils.validate_activation(classifier_activation, weights)
    x = layers.Dense(classes, activation=classifier_activation,
                     name='predictions')(x)

  else:
    if pooling == 'avg':
      x = layers.GlobalAveragePooling2D()(x)
    elif pooling == 'max':
      x = layers.GlobalMaxPooling2D()(x)

  # Ensure that the model takes into account
  # any potential predecessors of `input_tensor`.
  if input_tensor is not None:
    inputs = layer_utils.get_source_inputs(input_tensor)
  else:
    inputs = img_input

  # Create model.
  model = training.Model(inputs, x, name='mobilenetv2_%0.2f_%s' % (alpha, rows))

  # Load weights.
  if weights == 'imagenet':
    # The two published variants differ only by the `_no_top` suffix.
    model_name = ('mobilenet_v2_weights_tf_dim_ordering_tf_kernels_' +
                  str(float(alpha)) + '_' + str(rows) +
                  ('' if include_top else '_no_top') + '.h5')
    weight_path = BASE_WEIGHT_PATH + model_name
    weights_path = data_utils.get_file(
        model_name, weight_path, cache_subdir='models')
    model.load_weights(weights_path)
  elif weights is not None:
    model.load_weights(weights)

  return model


def _pretrained_size_or_default(rows, cols):
  """Returns `rows` if it is a square pretrained resolution, else 224."""
  if rows == cols and rows in _PRETRAINED_SIZES:
    return rows
  return 224


def _inverted_res_block(inputs, expansion, stride, alpha, filters, block_id):
  """Inverted ResNet block.

  Applies a 1x1 expansion convolution (skipped when `block_id` is 0), a 3x3
  depthwise convolution and a 1x1 projection convolution, each followed by
  batch normalization. ReLU6 follows the expansion and depthwise stages only.

  Args:
    inputs: Input tensor of the block.
    expansion: Integer multiplier applied to the input channel count to get
      the number of filters of the expansion convolution.
    stride: Stride of the depthwise convolution. For a stride of 2 the input
      is zero-padded explicitly and the convolution uses 'valid' padding.
    alpha: Width multiplier applied to `filters`.
    filters: Base number of output filters before `alpha` is applied and the
      result is made divisible by 8.
    block_id: Integer used to build the layer-name prefix. Block 0 uses the
      prefix 'expanded_conv_' and has no expansion stage.

  Returns:
    The output tensor of the block; `inputs` is added to it when the input
    and output channel counts are equal and `stride` is 1.
  """
  channel_axis = 1 if backend.image_data_format() == 'channels_first' else -1

  in_channels = backend.int_shape(inputs)[channel_axis]
  pointwise_conv_filters = int(filters * alpha)
  pointwise_filters = _make_divisible(pointwise_conv_filters, 8)
  x = inputs
  prefix = 'block_{}_'.format(block_id)

  if block_id:
    # Expand
    x = layers.Conv2D(
        expansion * in_channels,
        kernel_size=1,
        padding='same',
        use_bias=False,
        activation=None,
        name=prefix + 'expand')(
            x)
    x = layers.BatchNormalization(
        axis=channel_axis,
        epsilon=1e-3,
        momentum=0.999,
        name=prefix + 'expand_BN')(
            x)
    x = layers.ReLU(6., name=prefix + 'expand_relu')(x)
  else:
    prefix = 'expanded_conv_'

  # Depthwise
  if stride == 2:
    x = layers.ZeroPadding2D(
        padding=imagenet_utils.correct_pad(x, 3),
        name=prefix + 'pad')(x)
  x = layers.DepthwiseConv2D(
      kernel_size=3,
      strides=stride,
      activation=None,
      use_bias=False,
      padding='same' if stride == 1 else 'valid',
      name=prefix + 'depthwise')(
          x)
  x = layers.BatchNormalization(
      axis=channel_axis,
      epsilon=1e-3,
      momentum=0.999,
      name=prefix + 'depthwise_BN')(
          x)

  x = layers.ReLU(6., name=prefix + 'depthwise_relu')(x)

  # Project
  x = layers.Conv2D(
      pointwise_filters,
      kernel_size=1,
      padding='same',
      use_bias=False,
      activation=None,
      name=prefix + 'project')(
          x)
  x = layers.BatchNormalization(
      axis=channel_axis,
      epsilon=1e-3,
      momentum=0.999,
      name=prefix + 'project_BN')(
          x)

  if in_channels == pointwise_filters and stride == 1:
    return layers.Add(name=prefix + 'add')([inputs, x])
  return x


def _make_divisible(v, divisor, min_value=None):
  """Rounds `v` to a multiple of `divisor` that is at least `min_value`.

  Args:
    v: Number to round.
    divisor: Integer the result must be a multiple of.
    min_value: Lower bound for the result; defaults to `divisor`.

  Returns:
    The multiple of `divisor` nearest to `v` (never below `min_value`),
    increased by one `divisor` if rounding lost more than 10% of `v`.
  """
  if min_value is None:
    min_value = divisor
  new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
  # Make sure that round down does not go down by more than 10%.
  if new_v < 0.9 * v:
    new_v += divisor
  return new_v


@keras_export('keras.applications.mobilenet_v2.preprocess_input')
def preprocess_input(x, data_format=None):
  # The docstring is assigned at the bottom of this module.
  return imagenet_utils.preprocess_input(x, data_format=data_format, mode='tf')


@keras_export('keras.applications.mobilenet_v2.decode_predictions')
def decode_predictions(preds, top=5):
  # The docstring is assigned at the bottom of this module.
  return imagenet_utils.decode_predictions(preds, top=top)


preprocess_input.__doc__ = imagenet_utils.PREPROCESS_INPUT_DOC.format(
    mode='',
    ret=imagenet_utils.PREPROCESS_INPUT_RET_DOC_TF,
    error=imagenet_utils.PREPROCESS_INPUT_ERROR_DOC)
decode_predictions.__doc__ = imagenet_utils.decode_predictions.__doc__