1# Copyright 2018 The TensorFlow Authors. All Rights Reserved. 2# 3# Licensed under the Apache License, Version 2.0 (the "License"); 4# you may not use this file except in compliance with the License. 5# You may obtain a copy of the License at 6# 7# http://www.apache.org/licenses/LICENSE-2.0 8# 9# Unless required by applicable law or agreed to in writing, software 10# distributed under the License is distributed on an "AS IS" BASIS, 11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12# See the License for the specific language governing permissions and 13# limitations under the License. 14# ============================================================================== 15# pylint: disable=invalid-name 16"""MobileNet v2 models for Keras. 17 18MobileNetV2 is a general architecture and can be used for multiple use cases. 19Depending on the use case, it can use different input layer size and 20different width factors. This allows different width models to reduce 21the number of multiply-adds and thereby 22reduce inference cost on mobile devices. 23 24MobileNetV2 is very similar to the original MobileNet, 25except that it uses inverted residual blocks with 26bottlenecking features. It has a drastically lower 27parameter count than the original MobileNet. 28MobileNets support any input size greater 29than 32 x 32, with larger image sizes 30offering better performance. 31 32The number of parameters and number of multiply-adds 33can be modified by using the `alpha` parameter, 34which increases/decreases the number of filters in each layer. 35By altering the image size and `alpha` parameter, 36all 22 models from the paper can be built, with ImageNet weights provided. 37 38The paper demonstrates the performance of MobileNets using `alpha` values of 391.0 (also called 100 % MobileNet), 0.35, 0.5, 0.75, 1.0, 1.3, and 1.4 40For each of these `alpha` values, weights for 5 different input image sizes 41are provided (224, 192, 160, 128, and 96). 42 43The following table describes the performance of 44MobileNet on various input sizes: 45------------------------------------------------------------------------ 46MACs stands for Multiply Adds 47 Classification Checkpoint|MACs (M)|Parameters (M)|Top 1 Accuracy|Top 5 Accuracy 48--------------------------|------------|---------------|---------|----|--------- 49| [mobilenet_v2_1.4_224] | 582 | 6.06 | 75.0 | 92.5 | 50| [mobilenet_v2_1.3_224] | 509 | 5.34 | 74.4 | 92.1 | 51| [mobilenet_v2_1.0_224] | 300 | 3.47 | 71.8 | 91.0 | 52| [mobilenet_v2_1.0_192] | 221 | 3.47 | 70.7 | 90.1 | 53| [mobilenet_v2_1.0_160] | 154 | 3.47 | 68.8 | 89.0 | 54| [mobilenet_v2_1.0_128] | 99 | 3.47 | 65.3 | 86.9 | 55| [mobilenet_v2_1.0_96] | 56 | 3.47 | 60.3 | 83.2 | 56| [mobilenet_v2_0.75_224] | 209 | 2.61 | 69.8 | 89.6 | 57| [mobilenet_v2_0.75_192] | 153 | 2.61 | 68.7 | 88.9 | 58| [mobilenet_v2_0.75_160] | 107 | 2.61 | 66.4 | 87.3 | 59| [mobilenet_v2_0.75_128] | 69 | 2.61 | 63.2 | 85.3 | 60| [mobilenet_v2_0.75_96] | 39 | 2.61 | 58.8 | 81.6 | 61| [mobilenet_v2_0.5_224] | 97 | 1.95 | 65.4 | 86.4 | 62| [mobilenet_v2_0.5_192] | 71 | 1.95 | 63.9 | 85.4 | 63| [mobilenet_v2_0.5_160] | 50 | 1.95 | 61.0 | 83.2 | 64| [mobilenet_v2_0.5_128] | 32 | 1.95 | 57.7 | 80.8 | 65| [mobilenet_v2_0.5_96] | 18 | 1.95 | 51.2 | 75.8 | 66| [mobilenet_v2_0.35_224] | 59 | 1.66 | 60.3 | 82.9 | 67| [mobilenet_v2_0.35_192] | 43 | 1.66 | 58.2 | 81.2 | 68| [mobilenet_v2_0.35_160] | 30 | 1.66 | 55.7 | 79.1 | 69| [mobilenet_v2_0.35_128] | 20 | 1.66 | 50.8 | 75.0 | 70| [mobilenet_v2_0.35_96] | 11 | 1.66 | 45.5 | 70.4 | 71 72""" 73from __future__ import absolute_import 74from __future__ import division 75from __future__ import print_function 76 77import os 78 79from tensorflow.python.keras import backend 80from tensorflow.python.keras import layers 81from tensorflow.python.keras.applications import imagenet_utils 82from tensorflow.python.keras.engine import training 83from tensorflow.python.keras.utils import data_utils 84from tensorflow.python.keras.utils import layer_utils 85from tensorflow.python.platform import tf_logging as logging 86from tensorflow.python.util.tf_export import keras_export 87 88 89BASE_WEIGHT_PATH = ('https://storage.googleapis.com/tensorflow/' 90 'keras-applications/mobilenet_v2/') 91 92 93@keras_export('keras.applications.mobilenet_v2.MobileNetV2', 94 'keras.applications.MobileNetV2') 95def MobileNetV2(input_shape=None, 96 alpha=1.0, 97 include_top=True, 98 weights='imagenet', 99 input_tensor=None, 100 pooling=None, 101 classes=1000, 102 **kwargs): 103 """Instantiates the MobileNetV2 architecture. 104 105 Reference paper: 106 - [MobileNetV2: Inverted Residuals and Linear Bottlenecks] 107 (https://arxiv.org/abs/1801.04381) (CVPR 2018) 108 109 Optionally loads weights pre-trained on ImageNet. 110 111 Arguments: 112 input_shape: Optional shape tuple, to be specified if you would 113 like to use a model with an input image resolution that is not 114 (224, 224, 3). 115 It should have exactly 3 inputs channels (224, 224, 3). 116 You can also omit this option if you would like 117 to infer input_shape from an input_tensor. 118 If you choose to include both input_tensor and input_shape then 119 input_shape will be used if they match, if the shapes 120 do not match then we will throw an error. 121 E.g. `(160, 160, 3)` would be one valid value. 122 alpha: Float between 0 and 1. controls the width of the network. 123 This is known as the width multiplier in the MobileNetV2 paper, 124 but the name is kept for consistency with `applications.MobileNetV1` 125 model in Keras. 126 - If `alpha` < 1.0, proportionally decreases the number 127 of filters in each layer. 128 - If `alpha` > 1.0, proportionally increases the number 129 of filters in each layer. 130 - If `alpha` = 1, default number of filters from the paper 131 are used at each layer. 132 include_top: Boolean, whether to include the fully-connected 133 layer at the top of the network. Defaults to `True`. 134 weights: String, one of `None` (random initialization), 135 'imagenet' (pre-training on ImageNet), 136 or the path to the weights file to be loaded. 137 input_tensor: Optional Keras tensor (i.e. output of 138 `layers.Input()`) 139 to use as image input for the model. 140 pooling: String, optional pooling mode for feature extraction 141 when `include_top` is `False`. 142 - `None` means that the output of the model 143 will be the 4D tensor output of the 144 last convolutional block. 145 - `avg` means that global average pooling 146 will be applied to the output of the 147 last convolutional block, and thus 148 the output of the model will be a 149 2D tensor. 150 - `max` means that global max pooling will 151 be applied. 152 classes: Integer, optional number of classes to classify images 153 into, only to be specified if `include_top` is True, and 154 if no `weights` argument is specified. 155 **kwargs: For backwards compatibility only. 156 157 Returns: 158 A `keras.Model` instance. 159 160 Raises: 161 ValueError: in case of invalid argument for `weights`, 162 or invalid input shape or invalid alpha, rows when 163 weights='imagenet' 164 """ 165 if 'layers' in kwargs: 166 global layers 167 layers = kwargs.pop('layers') 168 if kwargs: 169 raise ValueError('Unknown argument(s): %s' % (kwargs,)) 170 if not (weights in {'imagenet', None} or os.path.exists(weights)): 171 raise ValueError('The `weights` argument should be either ' 172 '`None` (random initialization), `imagenet` ' 173 '(pre-training on ImageNet), ' 174 'or the path to the weights file to be loaded.') 175 176 if weights == 'imagenet' and include_top and classes != 1000: 177 raise ValueError('If using `weights` as `"imagenet"` with `include_top` ' 178 'as true, `classes` should be 1000') 179 180 # Determine proper input shape and default size. 181 # If both input_shape and input_tensor are used, they should match 182 if input_shape is not None and input_tensor is not None: 183 try: 184 is_input_t_tensor = backend.is_keras_tensor(input_tensor) 185 except ValueError: 186 try: 187 is_input_t_tensor = backend.is_keras_tensor( 188 layer_utils.get_source_inputs(input_tensor)) 189 except ValueError: 190 raise ValueError('input_tensor: ', input_tensor, 191 'is not type input_tensor') 192 if is_input_t_tensor: 193 if backend.image_data_format == 'channels_first': 194 if backend.int_shape(input_tensor)[1] != input_shape[1]: 195 raise ValueError('input_shape: ', input_shape, 'and input_tensor: ', 196 input_tensor, 197 'do not meet the same shape requirements') 198 else: 199 if backend.int_shape(input_tensor)[2] != input_shape[1]: 200 raise ValueError('input_shape: ', input_shape, 'and input_tensor: ', 201 input_tensor, 202 'do not meet the same shape requirements') 203 else: 204 raise ValueError('input_tensor specified: ', input_tensor, 205 'is not a keras tensor') 206 207 # If input_shape is None, infer shape from input_tensor 208 if input_shape is None and input_tensor is not None: 209 210 try: 211 backend.is_keras_tensor(input_tensor) 212 except ValueError: 213 raise ValueError('input_tensor: ', input_tensor, 'is type: ', 214 type(input_tensor), 'which is not a valid type') 215 216 if input_shape is None and not backend.is_keras_tensor(input_tensor): 217 default_size = 224 218 elif input_shape is None and backend.is_keras_tensor(input_tensor): 219 if backend.image_data_format() == 'channels_first': 220 rows = backend.int_shape(input_tensor)[2] 221 cols = backend.int_shape(input_tensor)[3] 222 else: 223 rows = backend.int_shape(input_tensor)[1] 224 cols = backend.int_shape(input_tensor)[2] 225 226 if rows == cols and rows in [96, 128, 160, 192, 224]: 227 default_size = rows 228 else: 229 default_size = 224 230 231 # If input_shape is None and no input_tensor 232 elif input_shape is None: 233 default_size = 224 234 235 # If input_shape is not None, assume default size 236 else: 237 if backend.image_data_format() == 'channels_first': 238 rows = input_shape[1] 239 cols = input_shape[2] 240 else: 241 rows = input_shape[0] 242 cols = input_shape[1] 243 244 if rows == cols and rows in [96, 128, 160, 192, 224]: 245 default_size = rows 246 else: 247 default_size = 224 248 249 input_shape = imagenet_utils.obtain_input_shape( 250 input_shape, 251 default_size=default_size, 252 min_size=32, 253 data_format=backend.image_data_format(), 254 require_flatten=include_top, 255 weights=weights) 256 257 if backend.image_data_format() == 'channels_last': 258 row_axis, col_axis = (0, 1) 259 else: 260 row_axis, col_axis = (1, 2) 261 rows = input_shape[row_axis] 262 cols = input_shape[col_axis] 263 264 if weights == 'imagenet': 265 if alpha not in [0.35, 0.50, 0.75, 1.0, 1.3, 1.4]: 266 raise ValueError('If imagenet weights are being loaded, ' 267 'alpha can be one of `0.35`, `0.50`, `0.75`, ' 268 '`1.0`, `1.3` or `1.4` only.') 269 270 if rows != cols or rows not in [96, 128, 160, 192, 224]: 271 rows = 224 272 logging.warning('`input_shape` is undefined or non-square, ' 273 'or `rows` is not in [96, 128, 160, 192, 224].' 274 ' Weights for input shape (224, 224) will be' 275 ' loaded as the default.') 276 277 if input_tensor is None: 278 img_input = layers.Input(shape=input_shape) 279 else: 280 if not backend.is_keras_tensor(input_tensor): 281 img_input = layers.Input(tensor=input_tensor, shape=input_shape) 282 else: 283 img_input = input_tensor 284 285 channel_axis = 1 if backend.image_data_format() == 'channels_first' else -1 286 287 first_block_filters = _make_divisible(32 * alpha, 8) 288 x = layers.ZeroPadding2D( 289 padding=imagenet_utils.correct_pad(img_input, 3), 290 name='Conv1_pad')(img_input) 291 x = layers.Conv2D( 292 first_block_filters, 293 kernel_size=3, 294 strides=(2, 2), 295 padding='valid', 296 use_bias=False, 297 name='Conv1')( 298 x) 299 x = layers.BatchNormalization( 300 axis=channel_axis, epsilon=1e-3, momentum=0.999, name='bn_Conv1')( 301 x) 302 x = layers.ReLU(6., name='Conv1_relu')(x) 303 304 x = _inverted_res_block( 305 x, filters=16, alpha=alpha, stride=1, expansion=1, block_id=0) 306 307 x = _inverted_res_block( 308 x, filters=24, alpha=alpha, stride=2, expansion=6, block_id=1) 309 x = _inverted_res_block( 310 x, filters=24, alpha=alpha, stride=1, expansion=6, block_id=2) 311 312 x = _inverted_res_block( 313 x, filters=32, alpha=alpha, stride=2, expansion=6, block_id=3) 314 x = _inverted_res_block( 315 x, filters=32, alpha=alpha, stride=1, expansion=6, block_id=4) 316 x = _inverted_res_block( 317 x, filters=32, alpha=alpha, stride=1, expansion=6, block_id=5) 318 319 x = _inverted_res_block( 320 x, filters=64, alpha=alpha, stride=2, expansion=6, block_id=6) 321 x = _inverted_res_block( 322 x, filters=64, alpha=alpha, stride=1, expansion=6, block_id=7) 323 x = _inverted_res_block( 324 x, filters=64, alpha=alpha, stride=1, expansion=6, block_id=8) 325 x = _inverted_res_block( 326 x, filters=64, alpha=alpha, stride=1, expansion=6, block_id=9) 327 328 x = _inverted_res_block( 329 x, filters=96, alpha=alpha, stride=1, expansion=6, block_id=10) 330 x = _inverted_res_block( 331 x, filters=96, alpha=alpha, stride=1, expansion=6, block_id=11) 332 x = _inverted_res_block( 333 x, filters=96, alpha=alpha, stride=1, expansion=6, block_id=12) 334 335 x = _inverted_res_block( 336 x, filters=160, alpha=alpha, stride=2, expansion=6, block_id=13) 337 x = _inverted_res_block( 338 x, filters=160, alpha=alpha, stride=1, expansion=6, block_id=14) 339 x = _inverted_res_block( 340 x, filters=160, alpha=alpha, stride=1, expansion=6, block_id=15) 341 342 x = _inverted_res_block( 343 x, filters=320, alpha=alpha, stride=1, expansion=6, block_id=16) 344 345 # no alpha applied to last conv as stated in the paper: 346 # if the width multiplier is greater than 1 we 347 # increase the number of output channels 348 if alpha > 1.0: 349 last_block_filters = _make_divisible(1280 * alpha, 8) 350 else: 351 last_block_filters = 1280 352 353 x = layers.Conv2D( 354 last_block_filters, kernel_size=1, use_bias=False, name='Conv_1')( 355 x) 356 x = layers.BatchNormalization( 357 axis=channel_axis, epsilon=1e-3, momentum=0.999, name='Conv_1_bn')( 358 x) 359 x = layers.ReLU(6., name='out_relu')(x) 360 361 if include_top: 362 x = layers.GlobalAveragePooling2D()(x) 363 x = layers.Dense( 364 classes, activation='softmax', use_bias=True, name='Logits')( 365 x) 366 else: 367 if pooling == 'avg': 368 x = layers.GlobalAveragePooling2D()(x) 369 elif pooling == 'max': 370 x = layers.GlobalMaxPooling2D()(x) 371 372 # Ensure that the model takes into account 373 # any potential predecessors of `input_tensor`. 374 if input_tensor is not None: 375 inputs = layer_utils.get_source_inputs(input_tensor) 376 else: 377 inputs = img_input 378 379 # Create model. 380 model = training.Model(inputs, x, name='mobilenetv2_%0.2f_%s' % (alpha, rows)) 381 382 # Load weights. 383 if weights == 'imagenet': 384 if include_top: 385 model_name = ('mobilenet_v2_weights_tf_dim_ordering_tf_kernels_' + 386 str(alpha) + '_' + str(rows) + '.h5') 387 weight_path = BASE_WEIGHT_PATH + model_name 388 weights_path = data_utils.get_file( 389 model_name, weight_path, cache_subdir='models') 390 else: 391 model_name = ('mobilenet_v2_weights_tf_dim_ordering_tf_kernels_' + 392 str(alpha) + '_' + str(rows) + '_no_top' + '.h5') 393 weight_path = BASE_WEIGHT_PATH + model_name 394 weights_path = data_utils.get_file( 395 model_name, weight_path, cache_subdir='models') 396 model.load_weights(weights_path) 397 elif weights is not None: 398 model.load_weights(weights) 399 400 return model 401 402 403def _inverted_res_block(inputs, expansion, stride, alpha, filters, block_id): 404 """Inverted ResNet block.""" 405 channel_axis = 1 if backend.image_data_format() == 'channels_first' else -1 406 407 in_channels = backend.int_shape(inputs)[channel_axis] 408 pointwise_conv_filters = int(filters * alpha) 409 pointwise_filters = _make_divisible(pointwise_conv_filters, 8) 410 x = inputs 411 prefix = 'block_{}_'.format(block_id) 412 413 if block_id: 414 # Expand 415 x = layers.Conv2D( 416 expansion * in_channels, 417 kernel_size=1, 418 padding='same', 419 use_bias=False, 420 activation=None, 421 name=prefix + 'expand')( 422 x) 423 x = layers.BatchNormalization( 424 axis=channel_axis, 425 epsilon=1e-3, 426 momentum=0.999, 427 name=prefix + 'expand_BN')( 428 x) 429 x = layers.ReLU(6., name=prefix + 'expand_relu')(x) 430 else: 431 prefix = 'expanded_conv_' 432 433 # Depthwise 434 if stride == 2: 435 x = layers.ZeroPadding2D( 436 padding=imagenet_utils.correct_pad(x, 3), 437 name=prefix + 'pad')(x) 438 x = layers.DepthwiseConv2D( 439 kernel_size=3, 440 strides=stride, 441 activation=None, 442 use_bias=False, 443 padding='same' if stride == 1 else 'valid', 444 name=prefix + 'depthwise')( 445 x) 446 x = layers.BatchNormalization( 447 axis=channel_axis, 448 epsilon=1e-3, 449 momentum=0.999, 450 name=prefix + 'depthwise_BN')( 451 x) 452 453 x = layers.ReLU(6., name=prefix + 'depthwise_relu')(x) 454 455 # Project 456 x = layers.Conv2D( 457 pointwise_filters, 458 kernel_size=1, 459 padding='same', 460 use_bias=False, 461 activation=None, 462 name=prefix + 'project')( 463 x) 464 x = layers.BatchNormalization( 465 axis=channel_axis, 466 epsilon=1e-3, 467 momentum=0.999, 468 name=prefix + 'project_BN')( 469 x) 470 471 if in_channels == pointwise_filters and stride == 1: 472 return layers.Add(name=prefix + 'add')([inputs, x]) 473 return x 474 475 476def _make_divisible(v, divisor, min_value=None): 477 if min_value is None: 478 min_value = divisor 479 new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) 480 # Make sure that round down does not go down by more than 10%. 481 if new_v < 0.9 * v: 482 new_v += divisor 483 return new_v 484 485 486@keras_export('keras.applications.mobilenet_v2.preprocess_input') 487def preprocess_input(x, data_format=None): 488 """Preprocesses the input (encoding a batch of images) for the model.""" 489 return imagenet_utils.preprocess_input(x, data_format=data_format, mode='tf') 490 491 492@keras_export('keras.applications.mobilenet_v2.decode_predictions') 493def decode_predictions(preds, top=5): 494 """Decodes the prediction result from the model.""" 495 return imagenet_utils.decode_predictions(preds, top=top) 496